diff -upr linux-2.6.32-504.3.3.el6.orig/COPYING.Parallels linux-2.6.32-504.3.3.el6-042stab103_6/COPYING.Parallels
--- linux-2.6.32-504.3.3.el6.orig/COPYING.Parallels	2015-01-21 12:02:41.381275303 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/COPYING.Parallels	2015-01-21 12:02:41.381275303 +0300
@@ -0,0 +1,350 @@
+
+Nothing in this license should be construed as a grant by Parallels of any rights
+beyond the rights specified in the GNU General Public License, and nothing in
+this license should be construed as a waiver by Parallels of its patent, copyright
+and/or trademark rights, beyond the waiver required by the GNU General Public
+License. This license is expressly inapplicable to any product that is not
+within the scope of the GNU General Public License.
+
+----------------------------------------
+
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff -upr linux-2.6.32-504.3.3.el6.orig/Documentation/ABI/testing/sysfs-kernel-fscaps linux-2.6.32-504.3.3.el6-042stab103_6/Documentation/ABI/testing/sysfs-kernel-fscaps
--- linux-2.6.32-504.3.3.el6.orig/Documentation/ABI/testing/sysfs-kernel-fscaps	2015-01-21 12:02:41.813263835 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/Documentation/ABI/testing/sysfs-kernel-fscaps	2015-01-21 12:02:41.813263835 +0300
@@ -0,0 +1,8 @@
+What:		/sys/kernel/fscaps
+Date:		February 2011
+KernelVersion:	2.6.38
+Contact:	Ludwig Nussel <ludwig.nussel@suse.de>
+Description:
+		Shows whether file system capabilities are honored
+		when executing a binary
+
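
The attribute is world-readable, so it can be queried without special tooling.
A minimal userspace sketch follows; the single-integer output format is an
assumption based on the description above:

    /* fscaps_check.c: read /sys/kernel/fscaps and report the setting.
     * Assumes the attribute prints one integer (0 or 1). */
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/sys/kernel/fscaps", "r");
        int honored;

        if (!f) {
            perror("/sys/kernel/fscaps");
            return 1;
        }
        if (fscanf(f, "%d", &honored) != 1) {
            fclose(f);
            return 1;
        }
        fclose(f);
        printf("file system capabilities %s honored on exec\n",
               honored ? "are" : "are not");
        return 0;
    }
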
diff -upr linux-2.6.32-504.3.3.el6.orig/Documentation/filesystems/configfs/configfs_example_explicit.c linux-2.6.32-504.3.3.el6-042stab103_6/Documentation/filesystems/configfs/configfs_example_explicit.c
--- linux-2.6.32-504.3.3.el6.orig/Documentation/filesystems/configfs/configfs_example_explicit.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/Documentation/filesystems/configfs/configfs_example_explicit.c	2015-01-21 12:02:41.348276179 +0300
@@ -464,9 +464,8 @@ static int __init configfs_example_init(
 	return 0;
 
 out_unregister:
-	for (; i >= 0; i--) {
+	for (i--; i >= 0; i--)
 		configfs_unregister_subsystem(example_subsys[i]);
-	}
 
 	return ret;
 }
@@ -475,9 +474,8 @@ static void __exit configfs_example_exit
 {
 	int i;
 
-	for (i = 0; example_subsys[i]; i++) {
+	for (i = 0; example_subsys[i]; i++)
 		configfs_unregister_subsystem(example_subsys[i]);
-	}
 }
 
 module_init(configfs_example_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/Documentation/filesystems/configfs/configfs_example_macros.c linux-2.6.32-504.3.3.el6-042stab103_6/Documentation/filesystems/configfs/configfs_example_macros.c
--- linux-2.6.32-504.3.3.el6.orig/Documentation/filesystems/configfs/configfs_example_macros.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/Documentation/filesystems/configfs/configfs_example_macros.c	2015-01-21 12:02:41.348276179 +0300
@@ -427,9 +427,8 @@ static int __init configfs_example_init(
 	return 0;
 
 out_unregister:
-	for (; i >= 0; i--) {
+	for (i--; i >= 0; i--)
 		configfs_unregister_subsystem(example_subsys[i]);
-	}
 
 	return ret;
 }
@@ -438,9 +437,8 @@ static void __exit configfs_example_exit
 {
 	int i;
 
-	for (i = 0; example_subsys[i]; i++) {
+	for (i = 0; example_subsys[i]; i++)
 		configfs_unregister_subsystem(example_subsys[i]);
-	}
 }
 
 module_init(configfs_example_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/Makefile
--- linux-2.6.32-504.3.3.el6.orig/Makefile	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/Makefile	2015-01-21 12:02:58.999807589 +0300
@@ -2,6 +2,7 @@ VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 32
 EXTRAVERSION =
+VZVERSION = 042stab103_6
 NAME = Man-Eating Seals of Antiquity
 RHEL_MAJOR = 6
 RHEL_MINOR = 6
@@ -372,8 +373,8 @@ CPP_VERS        := $(shell expr $(CPP_MA
 ifeq ($(KBUILD_EXTMOD),)
 KBUILD_CFLAGS   += $(shell if [ $(CPP_VERS) -ge 4004004 ]; then \
 		   echo "-Wno-array-bounds"; else echo ""; fi)
-KBUILD_CFLAGS   += $(shell if [ $(CPP_MAJOR) -eq 4 -a $(CPP_MINOR) -eq 4  ] ; then \
-		   echo "-Werror"; else echo ""; fi)
+#KBUILD_CFLAGS   += $(shell if [ $(CPP_MAJOR) -eq 4 -a $(CPP_MINOR) -eq 4  ] ; then \
+#		   echo "-Werror"; else echo ""; fi)
 endif ##($(KBUILD_EXTMOD),)
 endif #(,$(filter $(ARCH), i386 x86_64))
 
@@ -383,7 +384,7 @@ KBUILD_AFLAGS   := -D__ASSEMBLY__
 KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
 KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
-export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
+export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION VZVERSION
 export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
 export CPP AR NM STRIP OBJCOPY OBJDUMP
 export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
@@ -1078,7 +1079,8 @@ define filechk_utsrelease.h
 	  echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2;    \
 	  exit 1;                                                         \
 	fi;                                                               \
-	(echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";)
+	(echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; 		  \
+		echo \#define VZVERSION \"$(VZVERSION)\";)
 endef
 
 define filechk_version.h
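
The two Makefile hunks work together: VZVERSION is exported so the filechk
rule above can embed it next to UTS_RELEASE in the generated utsrelease.h.
A sketch of how in-tree code could then consume the macro; the initcall,
function name, and message text are illustrative, not part of the patch:

    /* Illustrative only: report the OpenVZ stab level at boot via the
     * VZVERSION macro that filechk_utsrelease.h now generates. On the
     * 2.6.32 series, utsrelease.h lives under include/linux/. */
    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/utsrelease.h>

    static int __init vz_banner_init(void)
    {
        printk(KERN_INFO "Linux %s (OpenVZ %s)\n", UTS_RELEASE, VZVERSION);
        return 0;
    }
    early_initcall(vz_banner_init);
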
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/alpha/include/asm/mman.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/alpha/include/asm/mman.h
--- linux-2.6.32-504.3.3.el6.orig/arch/alpha/include/asm/mman.h	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/alpha/include/asm/mman.h	2015-01-21 12:02:41.702266781 +0300
@@ -53,6 +53,9 @@
 #define MADV_MERGEABLE   12		/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13		/* KSM may not merge identical pages */
 
+#define MADV_HUGEPAGE	14		/* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE	15		/* Not worth backing with hugepages */
+
 #define MADV_DONTDUMP   16		/* Explicitly exclude from the core dump,
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_DONTDUMP flag */
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/ia64/kernel/setup.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/ia64/kernel/setup.c
--- linux-2.6.32-504.3.3.el6.orig/arch/ia64/kernel/setup.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/ia64/kernel/setup.c	2015-01-21 12:02:52.606977280 +0300
@@ -273,7 +273,7 @@ static void __init setup_crashkernel(uns
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, total,
-			&size, &base);
+			&size, &base, NULL);
 	if (ret == 0 && size > 0) {
 		if (!base) {
 			sort_regions(rsvd_region, *n);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/mips/include/asm/mman.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/mips/include/asm/mman.h
--- linux-2.6.32-504.3.3.el6.orig/arch/mips/include/asm/mman.h	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/mips/include/asm/mman.h	2015-01-21 12:02:41.702266781 +0300
@@ -77,6 +77,9 @@
 #define MADV_UNMERGEABLE 13		/* KSM may not merge identical pages */
 #define MADV_HWPOISON    100		/* poison a page for testing */
 
+#define MADV_HUGEPAGE	14		/* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE	15		/* Not worth backing with hugepages */
+
 #define MADV_DONTDUMP   16		/* Explicitly exclude from the core dump,
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_DONTDUMP flag */
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/parisc/include/asm/mman.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/parisc/include/asm/mman.h
--- linux-2.6.32-504.3.3.el6.orig/arch/parisc/include/asm/mman.h	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/parisc/include/asm/mman.h	2015-01-21 12:02:41.702266781 +0300
@@ -59,6 +59,9 @@
 #define MADV_MERGEABLE   65		/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 66		/* KSM may not merge identical pages */
 
+#define MADV_HUGEPAGE	67		/* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE	68		/* Not worth backing with hugepages */
+
 #define MADV_DONTDUMP   69		/* Explicitly exclude from the core dump,
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	70		/* Clear the MADV_DONTDUMP flag */
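
All three architectures gain the same pair of hints with arch-specific values
(14/15 on alpha and mips, 67/68 on parisc), slotted alongside the existing
MADV_MERGEABLE numbering. From userspace these are ordinary madvise(2) advice
values; a sketch, with the fallback define matching the value this patch uses
on most architectures:

    /* Sketch: ask for (advisory) transparent hugepage backing on an
     * anonymous mapping. The hint may be ignored by the kernel. */
    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MADV_HUGEPAGE
    #define MADV_HUGEPAGE 14	/* parisc differs: 67 */
    #endif

    int main(void)
    {
        size_t len = 16UL << 20;	/* 16 MB */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
            return 1;
        if (madvise(p, len, MADV_HUGEPAGE))	/* hint only; may fail */
            perror("madvise");
        /* ... touch the region; khugepaged may collapse it ... */
        return 0;
    }
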
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/powerpc/include/asm/elf.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/powerpc/include/asm/elf.h
--- linux-2.6.32-504.3.3.el6.orig/arch/powerpc/include/asm/elf.h	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/powerpc/include/asm/elf.h	2015-01-21 12:02:57.960835165 +0300
@@ -266,6 +266,7 @@ extern int ucache_bsize;
 /* vDSO has arch_setup_additional_pages */
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES
 struct linux_binprm;
+extern struct page *vdso32_pages[1];
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 				       int uses_interp);
 #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/powerpc/kernel/machine_kexec.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/powerpc/kernel/machine_kexec.c
--- linux-2.6.32-504.3.3.el6.orig/arch/powerpc/kernel/machine_kexec.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/powerpc/kernel/machine_kexec.c	2015-01-21 12:02:52.606977280 +0300
@@ -111,7 +111,7 @@ void __init reserve_crashkernel(void)
 
 	/* use common parsing */
 	ret = parse_crashkernel(boot_command_line, lmb_phys_mem_size(),
-			&crash_size, &crash_base);
+			&crash_size, &crash_base, NULL);
 	if (ret == 0 && crash_size > 0) {
 		crashk_res.start = crash_base;
 		crashk_res.end = crash_base + crash_size - 1;
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/sh/kernel/setup.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/sh/kernel/setup.c
--- linux-2.6.32-504.3.3.el6.orig/arch/sh/kernel/setup.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/sh/kernel/setup.c	2015-01-21 12:02:52.606977280 +0300
@@ -155,7 +155,7 @@ static void __init reserve_crashkernel(v
 	free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
 
 	ret = parse_crashkernel(boot_command_line, free_mem,
-			&crash_size, &crash_base);
+			&crash_size, &crash_base, NULL);
 	if (ret == 0 && crash_size) {
 		if (crash_base <= 0) {
 			vp = alloc_bootmem_nopanic(crash_size);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/Kconfig	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/Kconfig	2015-01-21 12:02:53.077964777 +0300
@@ -1525,6 +1525,21 @@ config KEXEC_AUTO_RESERVE
 	  On x86_32, 128M is reserved, on x86_64 1/32 of your memory is
 	  reserved, but it will not exceed 4G.
 
+config KEXEC_REUSE_CRASH
+	bool "try to reuse crashkernel when booting via kexec"
+	depends on KEXEC && PRAM
+	default y
+	---help---
+	  Reuse crashkernel left from the previous kernel if booting via
+	  kexec, so that early init bugs can be debugged.
+
+	  Even when built in, the feature must be armed at runtime by writing
+	  1 to /proc/sys/kernel/kexec_reuse_crash before calling kexec.
+
+	  Note that crashkernel areas (specified by the crashkernel boot
+	  option) of the new and the previous kernels must coincide for
+	  the crashkernel to be reused.
+
 config CRASH_DUMP
 	bool "kernel crash dumps"
 	depends on X86_64 || (X86_32 && HIGHMEM)
@@ -2128,6 +2143,8 @@ config HAVE_ATOMIC_IOMAP
 	def_bool y
 	depends on X86_32
 
+source "kernel/Kconfig.openvz"
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
@@ -2145,3 +2162,5 @@ source "crypto/Kconfig"
 source "arch/x86/kvm/Kconfig"
 
 source "lib/Kconfig"
+
+source "kernel/bc/Kconfig"
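
Besides hooking the OpenVZ Kconfig files into the x86 tree, the hunk above
adds KEXEC_REUSE_CRASH, which is gated at runtime through a sysctl. A sketch
of arming it before a kexec load, per the help text:

    /* Sketch: enable crashkernel reuse before invoking kexec, as the
     * KEXEC_REUSE_CRASH help text describes. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/proc/sys/kernel/kexec_reuse_crash", O_WRONLY);

        if (fd < 0) {
            perror("kexec_reuse_crash");	/* kernel lacks the option? */
            return 1;
        }
        if (write(fd, "1", 1) != 1) {
            perror("write");
            close(fd);
            return 1;
        }
        close(fd);
        /* now load and execute the new kernel with kexec as usual */
        return 0;
    }
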
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/Kconfig.debug linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/Kconfig.debug
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/Kconfig.debug	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/Kconfig.debug	2015-01-21 12:02:42.987232665 +0300
@@ -142,6 +142,10 @@ config 4KSTACKS
 	  on the VM subsystem for higher order allocations. This option
 	  will also use IRQ stacks to compensate for the reduced stackspace.
 
+config 16KSTACKS
+	bool "Use 16Kb for kernel stacks instead of 8Kb"
+	depends on X86_64
+
 config DOUBLEFAULT
 	default y
 	bool "Enable doublefault exception handler" if EMBEDDED
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/crypto/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/crypto/Makefile
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/crypto/Makefile	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/crypto/Makefile	2015-01-21 12:02:52.168988908 +0300
@@ -16,6 +16,8 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aes
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
+obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
@@ -29,4 +31,12 @@ salsa20-x86_64-y := salsa20-x86_64-asm_6
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+
+# enable AVX support only when $(AS) can actually assemble the instructions
+ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes)
+AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT
+CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT
+endif
+sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+
 crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/crypto/sha1_ssse3_asm.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/crypto/sha1_ssse3_asm.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/crypto/sha1_ssse3_asm.S	2015-01-21 12:02:52.168988908 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/crypto/sha1_ssse3_asm.S	2015-01-21 12:02:52.168988908 +0300
@@ -0,0 +1,558 @@
+/*
+ * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
+ * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
+ * processors. CPUs supporting Intel(R) AVX extensions will get an additional
+ * boost.
+ *
+ * This work was inspired by the vectorized implementation of Dean Gaudet.
+ * Additional information on it can be found at:
+ *    http://www.arctic.org/~dean/crypto/sha1.html
+ *
+ * It was improved upon with more efficient vectorization of the message
+ * scheduling. This implementation has also been optimized for all current and
+ * several future generations of Intel CPUs.
+ *
+ * See this article for more information about the implementation details:
+ *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * Copyright (C) 2010, Intel Corp.
+ *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ *            Ronen Zohar <ronen.zohar@intel.com>
+ *
+ * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
+ *   Author: Mathias Krause <minipli@googlemail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#define CTX	%rdi	// arg1
+#define BUF	%rsi	// arg2
+#define CNT	%rdx	// arg3
+
+#define REG_A	%ecx
+#define REG_B	%esi
+#define REG_C	%edi
+#define REG_D	%ebp
+#define REG_E	%edx
+
+#define REG_T1	%eax
+#define REG_T2	%ebx
+
+#define K_BASE		%r8
+#define HASH_PTR	%r9
+#define BUFFER_PTR	%r10
+#define BUFFER_END	%r11
+
+#define W_TMP1	%xmm0
+#define W_TMP2	%xmm9
+
+#define W0	%xmm1
+#define W4	%xmm2
+#define W8	%xmm3
+#define W12	%xmm4
+#define W16	%xmm5
+#define W20	%xmm6
+#define W24	%xmm7
+#define W28	%xmm8
+
+#define XMM_SHUFB_BSWAP	%xmm10
+
+/* we keep a 16-entry (64-byte) circular window of pre-calculated w[i]+K values */
+#define WK(t)	(((t) & 15) * 4)(%rsp)
+#define W_PRECALC_AHEAD	16
+
+/*
+ * This macro implements the SHA-1 function's body for a single 64-byte block
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM  name
+	.global	\name
+	.type	\name, @function
+	.align 32
+\name:
+	push	%rbx
+	push	%rbp
+	push	%r12
+
+	mov	%rsp, %r12
+	sub	$64, %rsp		# allocate workspace
+	and	$~15, %rsp		# align stack
+
+	mov	CTX, HASH_PTR
+	mov	BUF, BUFFER_PTR
+
+	shl	$6, CNT			# multiply by 64
+	add	BUF, CNT
+	mov	CNT, BUFFER_END
+
+	lea	K_XMM_AR(%rip), K_BASE
+	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
+
+	SHA1_PIPELINED_MAIN_BODY
+
+	# cleanup workspace
+	mov	$8, %ecx
+	mov	%rsp, %rdi
+	xor	%rax, %rax
+	rep stosq
+
+	mov	%r12, %rsp		# deallocate workspace
+
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+
+	.size	\name, .-\name
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 for one 64-byte block
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+	INIT_REGALLOC
+
+	mov	  (HASH_PTR), A
+	mov	 4(HASH_PTR), B
+	mov	 8(HASH_PTR), C
+	mov	12(HASH_PTR), D
+	mov	16(HASH_PTR), E
+
+  .set i, 0
+  .rept W_PRECALC_AHEAD
+	W_PRECALC i
+    .set i, (i+1)
+  .endr
+
+.align 4
+1:
+	RR F1,A,B,C,D,E,0
+	RR F1,D,E,A,B,C,2
+	RR F1,B,C,D,E,A,4
+	RR F1,E,A,B,C,D,6
+	RR F1,C,D,E,A,B,8
+
+	RR F1,A,B,C,D,E,10
+	RR F1,D,E,A,B,C,12
+	RR F1,B,C,D,E,A,14
+	RR F1,E,A,B,C,D,16
+	RR F1,C,D,E,A,B,18
+
+	RR F2,A,B,C,D,E,20
+	RR F2,D,E,A,B,C,22
+	RR F2,B,C,D,E,A,24
+	RR F2,E,A,B,C,D,26
+	RR F2,C,D,E,A,B,28
+
+	RR F2,A,B,C,D,E,30
+	RR F2,D,E,A,B,C,32
+	RR F2,B,C,D,E,A,34
+	RR F2,E,A,B,C,D,36
+	RR F2,C,D,E,A,B,38
+
+	RR F3,A,B,C,D,E,40
+	RR F3,D,E,A,B,C,42
+	RR F3,B,C,D,E,A,44
+	RR F3,E,A,B,C,D,46
+	RR F3,C,D,E,A,B,48
+
+	RR F3,A,B,C,D,E,50
+	RR F3,D,E,A,B,C,52
+	RR F3,B,C,D,E,A,54
+	RR F3,E,A,B,C,D,56
+	RR F3,C,D,E,A,B,58
+
+	add	$64, BUFFER_PTR		# move to the next 64-byte block
+	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
+	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun
+
+	RR F4,A,B,C,D,E,60
+	RR F4,D,E,A,B,C,62
+	RR F4,B,C,D,E,A,64
+	RR F4,E,A,B,C,D,66
+	RR F4,C,D,E,A,B,68
+
+	RR F4,A,B,C,D,E,70
+	RR F4,D,E,A,B,C,72
+	RR F4,B,C,D,E,A,74
+	RR F4,E,A,B,C,D,76
+	RR F4,C,D,E,A,B,78
+
+	UPDATE_HASH   (HASH_PTR), A
+	UPDATE_HASH  4(HASH_PTR), B
+	UPDATE_HASH  8(HASH_PTR), C
+	UPDATE_HASH 12(HASH_PTR), D
+	UPDATE_HASH 16(HASH_PTR), E
+
+	RESTORE_RENAMED_REGS
+	cmp	K_BASE, BUFFER_PTR	# BUFFER_PTR == K_BASE means we reached the end
+	jne	1b
+.endm
+
+.macro INIT_REGALLOC
+  .set A, REG_A
+  .set B, REG_B
+  .set C, REG_C
+  .set D, REG_D
+  .set E, REG_E
+  .set T1, REG_T1
+  .set T2, REG_T2
+.endm
+
+.macro RESTORE_RENAMED_REGS
+	# order is important (REG_C is where it should be)
+	mov	B, REG_B
+	mov	D, REG_D
+	mov	A, REG_A
+	mov	E, REG_E
+.endm
+
+.macro SWAP_REG_NAMES  a, b
+  .set _T, \a
+  .set \a, \b
+  .set \b, _T
+.endm
+
+.macro F1  b, c, d
+	mov	\c, T1
+	SWAP_REG_NAMES \c, T1
+	xor	\d, T1
+	and	\b, T1
+	xor	\d, T1
+.endm
+
+.macro F2  b, c, d
+	mov	\d, T1
+	SWAP_REG_NAMES \d, T1
+	xor	\c, T1
+	xor	\b, T1
+.endm
+
+.macro F3  b, c ,d
+	mov	\c, T1
+	SWAP_REG_NAMES \c, T1
+	mov	\b, T2
+	or	\b, T1
+	and	\c, T2
+	and	\d, T1
+	or	T2, T1
+.endm
+
+.macro F4  b, c, d
+	F2 \b, \c, \d
+.endm
+
+.macro UPDATE_HASH  hash, val
+	add	\hash, \val
+	mov	\val, \hash
+.endm
+
+/*
+ * RR does two rounds of SHA-1 back to back with W[] pre-calc
+ *   t1 = F(b, c, d);   e += w(i)
+ *   e += t1;           b <<= 30;   d  += w(i+1);
+ *   t1 = F(a, b, c);
+ *   d += t1;           a <<= 5;
+ *   e += a;
+ *   t1 = e;            a >>= 7;
+ *   t1 <<= 5;
+ *   d += t1;
+ */
+.macro RR  F, a, b, c, d, e, round
+	add	WK(\round), \e
+	\F   \b, \c, \d		# t1 = F(b, c, d);
+	W_PRECALC (\round + W_PRECALC_AHEAD)
+	rol	$30, \b
+	add	T1, \e
+	add	WK(\round + 1), \d
+
+	\F   \a, \b, \c
+	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
+	rol	$5, \a
+	add	\a, \e
+	add	T1, \d
+	ror	$7, \a		# ((a <<r 5) >>r 7) => (a <<r 30)
+
+	mov	\e, T1
+	SWAP_REG_NAMES \e, T1
+
+	rol	$5, T1
+	add	T1, \d
+
+	# write:  \a, \b
+	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
+.endm
+
+.macro W_PRECALC  r
+  .set i, \r
+
+  .if (i < 20)
+    .set K_XMM, 0
+  .elseif (i < 40)
+    .set K_XMM, 16
+  .elseif (i < 60)
+    .set K_XMM, 32
+  .elseif (i < 80)
+    .set K_XMM, 48
+  .endif
+
+  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
+    .set i, ((\r) % 80)	    # pre-compute for the next iteration
+    .if (i == 0)
+	W_PRECALC_RESET
+    .endif
+	W_PRECALC_00_15
+  .elseif (i<32)
+	W_PRECALC_16_31
+  .elseif (i < 80)   // rounds 32-79
+	W_PRECALC_32_79
+  .endif
+.endm
+
+.macro W_PRECALC_RESET
+  .set W,          W0
+  .set W_minus_04, W4
+  .set W_minus_08, W8
+  .set W_minus_12, W12
+  .set W_minus_16, W16
+  .set W_minus_20, W20
+  .set W_minus_24, W24
+  .set W_minus_28, W28
+  .set W_minus_32, W
+.endm
+
+.macro W_PRECALC_ROTATE
+  .set W_minus_32, W_minus_28
+  .set W_minus_28, W_minus_24
+  .set W_minus_24, W_minus_20
+  .set W_minus_20, W_minus_16
+  .set W_minus_16, W_minus_12
+  .set W_minus_12, W_minus_08
+  .set W_minus_08, W_minus_04
+  .set W_minus_04, W
+  .set W,          W_minus_32
+.endm
+
+.macro W_PRECALC_SSSE3
+
+.macro W_PRECALC_00_15
+	W_PRECALC_00_15_SSSE3
+.endm
+.macro W_PRECALC_16_31
+	W_PRECALC_16_31_SSSE3
+.endm
+.macro W_PRECALC_32_79
+	W_PRECALC_32_79_SSSE3
+.endm
+
+/* message scheduling pre-compute for rounds 0-15 */
+.macro W_PRECALC_00_15_SSSE3
+  .if ((i & 3) == 0)
+	movdqu	(i*4)(BUFFER_PTR), W_TMP1
+  .elseif ((i & 3) == 1)
+	pshufb	XMM_SHUFB_BSWAP, W_TMP1
+	movdqa	W_TMP1, W
+  .elseif ((i & 3) == 2)
+	paddd	(K_BASE), W_TMP1
+  .elseif ((i & 3) == 3)
+	movdqa  W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+/* message scheduling pre-compute for rounds 16-31
+ *
+ * - calculating last 32 w[i] values in 8 XMM registers
+ * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
+ *   instruction
+ *
+ * rounds 16-31 need some "heavy-lifting" vectorization because of the
+ * w[i]->w[i-3] dependency, while rounds 32-79 vectorize more cheaply
+ */
+.macro W_PRECALC_16_31_SSSE3
+  # blended scheduling of vector and scalar instruction streams, one 4-wide
+  # vector iteration / 4 scalar rounds
+  .if ((i & 3) == 0)
+	movdqa	W_minus_12, W
+	palignr	$8, W_minus_16, W	# w[i-14]
+	movdqa	W_minus_04, W_TMP1
+	psrldq	$4, W_TMP1		# w[i-3]
+	pxor	W_minus_08, W
+  .elseif ((i & 3) == 1)
+	pxor	W_minus_16, W_TMP1
+	pxor	W_TMP1, W
+	movdqa	W, W_TMP2
+	movdqa	W, W_TMP1
+	pslldq	$12, W_TMP2
+  .elseif ((i & 3) == 2)
+	psrld	$31, W
+	pslld	$1, W_TMP1
+	por	W, W_TMP1
+	movdqa	W_TMP2, W
+	psrld	$30, W_TMP2
+	pslld	$2, W
+  .elseif ((i & 3) == 3)
+	pxor	W, W_TMP1
+	pxor	W_TMP2, W_TMP1
+	movdqa	W_TMP1, W
+	paddd	K_XMM(K_BASE), W_TMP1
+	movdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+/* message scheduling pre-compute for rounds 32-79
+ *
+ * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+ * here we compute the equivalent:  w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2,
+ * which vectorizes more efficiently because the w[i]=>w[i-3] dependency is broken
+ */
+.macro W_PRECALC_32_79_SSSE3
+  .if ((i & 3) == 0)
+	movdqa	W_minus_04, W_TMP1
+	pxor	W_minus_28, W		# W is W_minus_32 before xor
+	palignr	$8, W_minus_08, W_TMP1
+  .elseif ((i & 3) == 1)
+	pxor	W_minus_16, W
+	pxor	W_TMP1, W
+	movdqa	W, W_TMP1
+  .elseif ((i & 3) == 2)
+	psrld	$30, W
+	pslld	$2, W_TMP1
+	por	W, W_TMP1
+  .elseif ((i & 3) == 3)
+	movdqa	W_TMP1, W
+	paddd	K_XMM(K_BASE), W_TMP1
+	movdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.endm		// W_PRECALC_SSSE3
+
+
+#define K1	0x5a827999
+#define K2	0x6ed9eba1
+#define K3	0x8f1bbcdc
+#define K4	0xca62c1d6
+
+.section .rodata
+.align 16
+
+K_XMM_AR:
+	.long K1, K1, K1, K1
+	.long K2, K2, K2, K2
+	.long K3, K3, K3, K3
+	.long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+
+
+.section .text
+
+W_PRECALC_SSSE3
+.macro xmm_mov a, b
+	movdqu	\a,\b
+.endm
+
+/* SSSE3 optimized implementation:
+ *  extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
+ *                                       unsigned int rounds);
+ */
+SHA1_VECTOR_ASM     sha1_transform_ssse3
+
+#ifdef SHA1_ENABLE_AVX_SUPPORT
+
+.macro W_PRECALC_AVX
+
+.purgem W_PRECALC_00_15
+.macro  W_PRECALC_00_15
+    W_PRECALC_00_15_AVX
+.endm
+.purgem W_PRECALC_16_31
+.macro  W_PRECALC_16_31
+    W_PRECALC_16_31_AVX
+.endm
+.purgem W_PRECALC_32_79
+.macro  W_PRECALC_32_79
+    W_PRECALC_32_79_AVX
+.endm
+
+.macro W_PRECALC_00_15_AVX
+  .if ((i & 3) == 0)
+	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
+  .elseif ((i & 3) == 1)
+	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
+  .elseif ((i & 3) == 2)
+	vpaddd	(K_BASE), W, W_TMP1
+  .elseif ((i & 3) == 3)
+	vmovdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.macro W_PRECALC_16_31_AVX
+  .if ((i & 3) == 0)
+	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
+	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
+	vpxor	W_minus_08, W, W
+	vpxor	W_minus_16, W_TMP1, W_TMP1
+  .elseif ((i & 3) == 1)
+	vpxor	W_TMP1, W, W
+	vpslldq	$12, W, W_TMP2
+	vpslld	$1, W, W_TMP1
+  .elseif ((i & 3) == 2)
+	vpsrld	$31, W, W
+	vpor	W, W_TMP1, W_TMP1
+	vpslld	$2, W_TMP2, W
+	vpsrld	$30, W_TMP2, W_TMP2
+  .elseif ((i & 3) == 3)
+	vpxor	W, W_TMP1, W_TMP1
+	vpxor	W_TMP2, W_TMP1, W
+	vpaddd	K_XMM(K_BASE), W, W_TMP1
+	vmovdqu	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.macro W_PRECALC_32_79_AVX
+  .if ((i & 3) == 0)
+	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
+	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
+  .elseif ((i & 3) == 1)
+	vpxor	W_minus_16, W_TMP1, W_TMP1
+	vpxor	W_TMP1, W, W
+  .elseif ((i & 3) == 2)
+	vpslld	$2, W, W_TMP1
+	vpsrld	$30, W, W
+	vpor	W, W_TMP1, W
+  .elseif ((i & 3) == 3)
+	vpaddd	K_XMM(K_BASE), W, W_TMP1
+	vmovdqu	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.endm    // W_PRECALC_AVX
+
+W_PRECALC_AVX
+.purgem xmm_mov
+.macro xmm_mov a, b
+	vmovdqu	\a,\b
+.endm
+
+
+/* AVX optimized implementation:
+ *  extern "C" void sha1_transform_avx(u32 *digest, const char *data,
+ *                                     unsigned int rounds);
+ */
+SHA1_VECTOR_ASM     sha1_transform_avx
+
+#endif
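
The message-schedule trick described in the comments above is easy to verify
in scalar code. Below is a reference sketch (not part of the patch) that
expands a 16-word block with both recurrences: the FIPS 180 form, and the
rol-2 form W_PRECALC_32_79 uses for rounds 32-79, which breaks the
w[i] -> w[i-3] dependency so four schedule words fit in one XMM operation:

    /* Reference sketch for the schedule identity used above: for i >= 32,
     *   (w[i-3]^w[i-8]^w[i-14]^w[i-16]) rol 1
     *     == (w[i-6]^w[i-16]^w[i-28]^w[i-32]) rol 2 */
    #include <assert.h>
    #include <stdint.h>

    static uint32_t rol32(uint32_t v, int n)
    {
        return (v << n) | (v >> (32 - n));
    }

    void sha1_schedule_check(const uint32_t block[16])
    {
        uint32_t a[80], b[80];
        int i;

        for (i = 0; i < 16; i++)
            a[i] = b[i] = block[i];
        for (i = 16; i < 80; i++)	/* FIPS 180 form */
            a[i] = rol32(a[i-3] ^ a[i-8] ^ a[i-14] ^ a[i-16], 1);
        for (i = 16; i < 32; i++)	/* rounds 16-31 keep the rol-1 form */
            b[i] = rol32(b[i-3] ^ b[i-8] ^ b[i-14] ^ b[i-16], 1);
        for (i = 32; i < 80; i++)	/* dependency-broken rol-2 form */
            b[i] = rol32(b[i-6] ^ b[i-16] ^ b[i-28] ^ b[i-32], 2);
        for (i = 16; i < 80; i++)
            assert(a[i] == b[i]);
    }
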
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/crypto/sha1_ssse3_glue.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/crypto/sha1_ssse3_glue.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/crypto/sha1_ssse3_glue.c	2015-01-21 12:02:52.168988908 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/crypto/sha1_ssse3_glue.c	2015-01-21 12:02:52.174988749 +0300
@@ -0,0 +1,258 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
+ * Supplemental SSE3 instructions.
+ *
+ * This file is based on sha1_generic.c
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+
+
+asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
+				     unsigned int rounds);
+#ifdef SHA1_ENABLE_AVX_SUPPORT
+asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
+				   unsigned int rounds);
+#endif
+
+static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
+
+
+static int sha1_ssse3_init(struct shash_desc *desc)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha1_state){
+		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+	};
+
+	return 0;
+}
+
+static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, unsigned int partial)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA1_BLOCK_SIZE - partial;
+		memcpy(sctx->buffer + partial, data, done);
+		sha1_transform_asm(sctx->state, sctx->buffer, 1);
+	}
+
+	if (len - done >= SHA1_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
+
+		sha1_transform_asm(sctx->state, data + done, rounds);
+		done += rounds * SHA1_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buffer, data + done, len - done);
+
+	return 0;
+}
+
+static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA1_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buffer + partial, data, len);
+
+		return 0;
+	}
+
+	if (!irq_fpu_usable()) {
+		res = crypto_sha1_update(desc, data, len);
+	} else {
+		kernel_fpu_begin();
+		res = __sha1_ssse3_update(desc, data, len, partial);
+		kernel_fpu_end();
+	}
+
+	return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA1_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
+	if (!irq_fpu_usable()) {
+		crypto_sha1_update(desc, padding, padlen);
+		crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_fpu_begin();
+		/* We need to fill a whole block for __sha1_ssse3_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buffer + index, padding, padlen);
+		} else {
+			__sha1_ssse3_update(desc, padding, padlen, index);
+		}
+		__sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+		kernel_fpu_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 5; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_ssse3_export(struct shash_desc *desc, void *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_ssse3_import(struct shash_desc *desc, const void *in)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA1_DIGEST_SIZE,
+	.init		=	sha1_ssse3_init,
+	.update		=	sha1_ssse3_update,
+	.final		=	sha1_ssse3_final,
+	.export		=	sha1_ssse3_export,
+	.import		=	sha1_ssse3_import,
+	.descsize	=	sizeof(struct sha1_state),
+	.statesize	=	sizeof(struct sha1_state),
+	.base		=	{
+		.cra_name	=	"sha1",
+		.cra_driver_name=	"sha1-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA1_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+#ifdef SHA1_ENABLE_AVX_SUPPORT
+static bool __init avx_usable(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave)
+		return false;
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+
+		return false;
+	}
+
+	return true;
+}
+#endif
+
+static void sha1_batch_ssse3(u32 *digest, const char *data, unsigned rounds)
+{
+	sha1_transform_ssse3(digest, data, rounds);
+}
+
+#ifdef SHA1_ENABLE_AVX_SUPPORT
+static void sha1_batch_avx(u32 *digest, const char *data, unsigned rounds)
+{
+	sha1_transform_avx(digest, data, rounds);
+}
+#endif
+
+static int __init sha1_ssse3_mod_init(void)
+{
+	int ret;
+
+	/* test for SSSE3 first */
+	if (cpu_has_ssse3)
+		sha1_transform_asm = sha1_transform_ssse3;
+
+#ifdef SHA1_ENABLE_AVX_SUPPORT
+	/* allow AVX to override SSSE3, it's a little faster */
+	if (avx_usable())
+		sha1_transform_asm = sha1_transform_avx;
+#endif
+
+	if (sha1_transform_asm) {
+		pr_info("Using %s optimized SHA-1 implementation\n",
+		        sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3"
+		                                                   : "AVX");
+		ret = crypto_register_shash(&alg);
+		if (ret)
+			return ret;
+
+		if (sha1_transform_asm == sha1_transform_ssse3)
+			sha_batch_transform = sha1_batch_ssse3;
+#ifdef SHA1_ENABLE_AVX_SUPPORT
+		else
+			sha_batch_transform = sha1_batch_avx;
+#endif
+		return 0;
+	}
+	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+
+	return -ENODEV;
+}
+
+module_init(sha1_ssse3_mod_init);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS("sha1");
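
The driver registers as "sha1" at priority 150, so it wins over sha1-generic
whenever SSSE3 or AVX is usable. Kernel code reaches it through the ordinary
shash interface; a sketch with abbreviated error handling:

    /* Sketch: hashing a buffer through the shash API; on this kernel the
     * "sha1" lookup resolves to sha1-ssse3 when the module registered. */
    #include <crypto/hash.h>
    #include <linux/err.h>
    #include <linux/slab.h>

    static int sha1_demo(const u8 *data, unsigned int len, u8 digest[20])
    {
        struct crypto_shash *tfm;
        struct shash_desc *desc;
        int ret;

        tfm = crypto_alloc_shash("sha1", 0, 0);
        if (IS_ERR(tfm))
            return PTR_ERR(tfm);
        desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm),
                       GFP_KERNEL);
        if (!desc) {
            crypto_free_shash(tfm);
            return -ENOMEM;
        }
        desc->tfm = tfm;
        ret = crypto_shash_digest(desc, data, len, digest);
        kfree(desc);
        crypto_free_shash(tfm);
        return ret;
    }
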
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/ia32/ia32entry.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/ia32/ia32entry.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/ia32/ia32entry.S	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/ia32/ia32entry.S	2015-01-21 12:02:53.809945349 +0300
@@ -625,7 +625,7 @@ ia32_sys_call_table:
 	.quad stub32_iopl		/* 110 */
 	.quad sys_vhangup
 	.quad quiet_ni_syscall	/* old "idle" system call */
-	.quad sys32_vm86_warning	/* vm86old */ 
+	.quad quiet_ni_syscall	/* vm86old */ 
 	.quad compat_sys_wait4
 	.quad sys_swapoff		/* 115 */
 	.quad compat_sys_sysinfo
@@ -678,7 +678,7 @@ ia32_sys_call_table:
 	.quad sys_mremap
 	.quad sys_setresuid16
 	.quad sys_getresuid16	/* 165 */
-	.quad sys32_vm86_warning	/* vm86 */ 
+	.quad quiet_ni_syscall	/* vm86 */ 
 	.quad quiet_ni_syscall	/* query_module */
 	.quad sys_poll
 	.quad compat_sys_nfsservctl
@@ -853,12 +853,33 @@ ia32_sys_call_table:
 	.quad quiet_ni_syscall		/* sys_fanotify_init */
 	.quad quiet_ni_syscall		/* sys32_fanotify_mark */
 	.quad quiet_ni_syscall		/* sys_prlimit64  340 */
-	.quad quiet_ni_syscall		/* sys_name_to_handle_at */
-	.quad quiet_ni_syscall		/* compat_sys_open_by_handle_at */
+	.quad sys_name_to_handle_at
+	.quad compat_sys_open_by_handle_at
 	.quad compat_sys_clock_adjtime
 	.quad sys_syncfs
 	.quad compat_sys_sendmmsg	/* 345 */
 	.quad sys_setns			/* setns */
 	.quad compat_sys_process_vm_readv
 	.quad compat_sys_process_vm_writev
+	.rept 500-(.-ia32_sys_call_table)/8
+		.quad sys_ni_syscall
+	.endr
+	.quad sys_fairsched_mknod	/* 500 */
+	.quad sys_fairsched_rmnod
+	.quad sys_fairsched_chwt
+	.quad sys_fairsched_mvpr
+	.quad sys_fairsched_rate
+	.quad sys_fairsched_vcpus	/* 505 */
+	.quad sys_fairsched_cpumask
+	.quad sys_fairsched_nodemask
+	.quad sys_ni_syscall
+	.quad sys_ni_syscall
+	.quad sys_getluid		/* 510 */
+	.quad sys_setluid
+	.quad compat_sys_setublimit
+	.quad compat_sys_ubstat
+	.quad sys_ni_syscall
+	.quad sys_ni_syscall		/* 515 */
+	.quad sys_lchmod
+	.quad compat_sys_lutime
 ia32_syscall_end:
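
The .rept fill pads the compat table out to slot 500, after which the OpenVZ
fairsched and user-beancounter entry points sit at fixed numbers, so 32-bit
binaries inside a container see the same ABI as the native 64-bit table. A
hedged userspace sketch follows; the slot number comes from the table above,
while the zero-argument signature of sys_getluid is an assumption:

    /* Sketch (32-bit userspace): call the OpenVZ getluid syscall wired at
     * slot 510 above. The no-argument form is an assumption. */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #define __NR_getluid 510	/* from the ia32 table in this patch */

    int main(void)
    {
        long luid = syscall(__NR_getluid);

        printf("luid: %ld\n", luid);
        return 0;
    }
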
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/ia32/sys_ia32.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/ia32/sys_ia32.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/ia32/sys_ia32.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/ia32/sys_ia32.c	2015-01-21 12:02:41.579270047 +0300
@@ -623,20 +623,6 @@ long sys32_fadvise64_64(int fd, __u32 of
 				advice);
 }
 
-long sys32_vm86_warning(void)
-{
-	struct task_struct *me = current;
-	static char lastcomm[sizeof(me->comm)];
-
-	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		compat_printk(KERN_INFO
-			      "%s: vm86 mode not supported on 64 bit kernel\n",
-			      me->comm);
-		strncpy(lastcomm, me->comm, sizeof(lastcomm));
-	}
-	return -ENOSYS;
-}
-
 long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
 			  char __user *buf, size_t len)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/compat.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/compat.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/compat.h	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/compat.h	2015-01-21 12:02:42.212253241 +0300
@@ -107,7 +107,8 @@ struct compat_statfs {
 	compat_fsid_t	f_fsid;
 	int		f_namelen;	/* SunOS ignores this field. */
 	int		f_frsize;
-	int		f_spare[5];
+	int		f_flags;
+	int		f_spare[4];
 };
 
 #define COMPAT_RLIM_OLD_INFINITY	0x7fffffff
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/cpufeature.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/cpufeature.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/cpufeature.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/cpufeature.h	2015-01-21 12:02:44.840183470 +0300
@@ -11,7 +11,7 @@
  * KABI prevents us from extending NCAPINTS.  Instead use RH_EXT_NCAPINTS and
  *  extend the array in the non-whitelisted cpuinfo_x86_rh structure.
  */
-#define RH_EXT_NCAPINTS 1
+#define RH_EXT_NCAPINTS 2
 #define RHNCAPINTS	(NCAPINTS + RH_EXT_NCAPINTS)
 
 /*
@@ -104,6 +104,7 @@
 #define X86_FEATURE_AMD_DCM     (3*32+27) /* multi-node processor */
 #define X86_FEATURE_APERFMPERF	(3*32+28) /* APERFMPERF */
 #define X86_FEATURE_UNFAIR_SPINLOCK (3*32+29) /* use unfair spinlocks */
+#define X86_FEATURE_CPUID_FAULTING (3*32+30) /* cpuid faulting */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
@@ -182,10 +183,9 @@
 #define X86_FEATURE_ARAT	(7*32+ 1) /* Always Running APIC Timer */
 #define X86_FEATURE_CPB		(7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB		(7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-#define X86_FEATURE_XSAVEOPT	(7*32+ 4) /* Optimized Xsave */
 #define X86_FEATURE_PLN		(7*32+ 5) /* Intel Power Limit Notification */
 #define X86_FEATURE_PTS		(7*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_DTS		(7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_DTHERM	(7*32+ 7) /* Digital Thermal Sensor */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
@@ -221,6 +221,12 @@
 #define X86_FEATURE_RDSEED	(9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX		(9*32+19) /* The ADCX and ADOX instructions */
 
+/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
+#define X86_FEATURE_XSAVEOPT	(10*32+ 0) /* XSAVEOPT */
+#define X86_FEATURE_XSAVEC	(10*32+ 1) /* XSAVEC */
+#define X86_FEATURE_XGETBV1	(10*32+ 2) /* XGETBV with ECX = 1 */
+#define X86_FEATURE_XSAVES	(10*32+ 3) /* XSAVES/XRSTORS */
+
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
 #include <asm/asm.h>
@@ -328,6 +334,8 @@ extern const char * const x86_power_flag
 #define cpu_has_xmm4_2		boot_cpu_has(X86_FEATURE_XMM4_2)
 #define cpu_has_x2apic		boot_cpu_has(X86_FEATURE_X2APIC)
 #define cpu_has_xsave		boot_cpu_has(X86_FEATURE_XSAVE)
+#define cpu_has_xsaveopt	boot_cpu_has(X86_FEATURE_XSAVEOPT)
+#define cpu_has_xsaves		boot_cpu_has(X86_FEATURE_XSAVES)
 #define cpu_has_osxsave		boot_cpu_has(X86_FEATURE_OSXSAVE)
 #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq	boot_cpu_has(X86_FEATURE_PCLMULQDQ)
@@ -335,6 +343,7 @@ extern const char * const x86_power_flag
 #define cpu_has_perfctr_nb	boot_cpu_has(X86_FEATURE_PERFCTR_NB)
 #define cpu_has_topoext		boot_cpu_has(X86_FEATURE_TOPOEXT)
 #define cpu_has_perfctr_l2	boot_cpu_has(X86_FEATURE_PERFCTR_L2)
+#define cpu_has_cpuid_faulting	boot_cpu_has(X86_FEATURE_CPUID_FAULTING)
 
 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/elf.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/elf.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/elf.h	2014-12-12 23:29:11.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/elf.h	2015-01-21 12:02:48.148095649 +0300
@@ -333,9 +333,14 @@ struct linux_binprm;
 
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
-				       int uses_interp);
+				       int uses_interp,
+				       unsigned long map_address);
+extern int arch_setup_additional_pages_rhel5(struct linux_binprm *bprm,
+				       int uses_interp,
+				       unsigned long map_address);
 
-extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
+extern int syscall32_setup_pages(struct linux_binprm *, int exstack,
+				 unsigned long map_address);
 #define compat_arch_setup_additional_pages	syscall32_setup_pages
 
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/entry_arch.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/entry_arch.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/entry_arch.h	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/entry_arch.h	2015-01-21 12:02:41.605269357 +0300
@@ -36,6 +36,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,I
 #endif
 
 BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
+BUILD_INTERRUPT(monitor_ipi, MONITOR_IPI_VECTOR)
+BUILD_INTERRUPT(monitor_posted_interrupt, MONITOR_POSTED_INTERRUPT_VECTOR)
 
 /*
  * every pentium local APIC has two 'local interrupts', with a
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/hw_irq.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/hw_irq.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/hw_irq.h	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/hw_irq.h	2015-01-21 12:02:41.605269357 +0300
@@ -30,6 +30,8 @@ extern void apic_timer_interrupt(void);
 extern void x86_platform_ipi(void);
 extern void error_interrupt(void);
 extern void irq_work_interrupt(void);
+extern void monitor_ipi(void);
+extern void monitor_posted_interrupt(void);
 
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
@@ -59,6 +61,8 @@ extern void trace_apic_timer_interrupt(v
 extern void trace_x86_platform_ipi(void);
 extern void trace_error_interrupt(void);
 extern void trace_irq_work_interrupt(void);
+extern void trace_monitor_ipi(void);
+extern void trace_monitor_posted_interrupt(void);
 
 extern void trace_spurious_interrupt(void);
 extern void trace_thermal_interrupt(void);
@@ -144,6 +148,8 @@ extern void smp_apic_timer_interrupt(str
 extern void smp_spurious_interrupt(struct pt_regs *);
 extern void smp_x86_platform_ipi(struct pt_regs *);
 extern void smp_error_interrupt(struct pt_regs *);
+extern void smp_monitor_ipi(struct pt_regs *);
+extern void smp_monitor_posted_interrupt(struct pt_regs *);
 #ifdef CONFIG_X86_IO_APIC
 extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/irq_vectors.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/irq_vectors.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/irq_vectors.h	2014-12-12 23:29:28.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/irq_vectors.h	2015-01-21 12:02:41.605269357 +0300
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_IRQ_VECTORS_H
 #define _ASM_X86_IRQ_VECTORS_H
 
+#include <linux/threads.h>
 /*
  * Linux IRQ vector layout.
  *
@@ -16,8 +17,8 @@
  *  Vectors   0 ...  31 : system traps and exceptions - hardcoded events
  *  Vectors  32 ... 127 : device interrupts
  *  Vector  128         : legacy int80 syscall interface
- *  Vectors 129 ... 237 : device interrupts
- *  Vectors 238 ... 255 : special interrupts
+ *  Vectors 129 ... 229 : device interrupts
+ *  Vectors 230 ... 255 : special interrupts
  *
  * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
  *
@@ -91,38 +92,42 @@
 #define THRESHOLD_APIC_VECTOR		0xf9
 #define REBOOT_VECTOR			0xf8
 
-/* f0-f7 used for spreading out TLB flushes: */
-#define INVALIDATE_TLB_VECTOR_END	0xf7
-#define INVALIDATE_TLB_VECTOR_START	0xf0
-#define NUM_INVALIDATE_TLB_VECTORS	   8
-
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR		0xef
-
 /*
  * Generic system vector for platform specific use
  */
-#define X86_PLATFORM_IPI_VECTOR		0xed
+#define X86_PLATFORM_IPI_VECTOR		0xf7
 
 /*
  * IRQ work vector:
  */
-#define IRQ_WORK_VECTOR			0xec
+#define IRQ_WORK_VECTOR			0xf6
 
-#define UV_BAU_MESSAGE			0xea
+#define UV_BAU_MESSAGE			0xf5
 
 /*
  * Self IPI vector for machine checks
  */
-#define MCE_SELF_VECTOR			0xeb
+#define MCE_SELF_VECTOR			0xf4
 
 
 /* Vector on which hypervisor callbacks will be delivered */
-#define HYPERVISOR_CALLBACK_VECTOR	0xe9
+#define HYPERVISOR_CALLBACK_VECTOR	0xf3
+
+#define MONITOR_IPI_VECTOR		0xf2
+#define MONITOR_POSTED_INTERRUPT_VECTOR	0xf1
+
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR		0xef
+
+/* f0-f7 used for spreading out TLB flushes: */
+#define NUM_INVALIDATE_TLB_VECTORS	   8
+#define INVALIDATE_TLB_VECTOR_END	0xee
+#define INVALIDATE_TLB_VECTOR_START	\
+	(INVALIDATE_TLB_VECTOR_END - NUM_INVALIDATE_TLB_VECTORS + 1)
 
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
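
Note: a quick sanity check on the reshuffled layout: with NUM_INVALIDATE_TLB_VECTORS = 8 and INVALIDATE_TLB_VECTOR_END = 0xee, INVALIDATE_TLB_VECTOR_START evaluates to 0xee - 8 + 1 = 0xe7, so the TLB-flush IPIs now occupy 0xe7..0xee, directly below the unchanged LOCAL_TIMER_VECTOR (0xef). The relocated special vectors, including the two new MONITOR_* vectors, are packed into 0xf1..0xf7, which is why the device-interrupt range in the header comment shrinks from 129..237 to 129..229 and the special region grows to 230..255.
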
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/linkage.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/linkage.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/linkage.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/linkage.h	2015-01-21 12:02:41.321276896 +0300
@@ -8,11 +8,6 @@
 
 #ifdef CONFIG_X86_32
 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
-/*
- * For 32-bit UML - mark functions implemented in assembly that use
- * regparm input parameters:
- */
-#define asmregparm __attribute__((regparm(3)))
 
 /*
  * Make sure the compiler doesn't do anything stupid with the
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/msr-index.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/msr-index.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/msr-index.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/msr-index.h	2015-01-21 12:02:44.840183470 +0300
@@ -47,6 +47,7 @@
 #define MSR_MTRRcap			0x000000fe
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
+#define MSR_MISC_FEATURES_ENABLES	0x00000140
 
 #define MSR_IA32_SYSENTER_CS		0x00000174
 #define MSR_IA32_SYSENTER_ESP		0x00000175
@@ -282,6 +283,8 @@
 #define MSR_SMI_COUNT			0x00000034
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
+#define MSR_IA32_XSS			0x00000da0
+
 #define FEATURE_CONTROL_LOCKED				(1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
 #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX	(1<<2)
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/page_64_types.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/page_64_types.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/page_64_types.h	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/page_64_types.h	2015-01-21 12:02:42.987232665 +0300
@@ -1,7 +1,12 @@
 #ifndef _ASM_X86_PAGE_64_DEFS_H
 #define _ASM_X86_PAGE_64_DEFS_H
 
-#define THREAD_ORDER	1
+#ifdef CONFIG_16KSTACKS
+# define THREAD_ORDER	2
+#else
+# define THREAD_ORDER	1
+#endif
+
 #define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
 #define CURRENT_MASK (~(THREAD_SIZE - 1))
 
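
Note: with 4 KiB pages, THREAD_SIZE = PAGE_SIZE << THREAD_ORDER works out to the stock 8 KiB kernel stack at order 1 and 16 KiB (4096 << 2 = 16384) when CONFIG_16KSTACKS is set; each task then pays two extra physically contiguous pages in exchange for headroom on deep call chains.
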
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/processor.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/processor.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/processor.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/processor.h	2015-01-21 12:02:47.974100269 +0300
@@ -952,8 +952,7 @@ extern unsigned long thread_saved_pc(str
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
-#define IA32_PAGE_OFFSET	((current->personality & ADDR_LIMIT_3GB) ? \
-					0xc0000000 : 0xFFFFe000)
+#define IA32_PAGE_OFFSET 0xc0000000
 
 #define TASK_SIZE		(test_thread_flag(TIF_IA32) ? \
 					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
@@ -1045,4 +1044,9 @@ static inline uint32_t hypervisor_cpuid_
 	return 0;
 }
 
+extern void (*set_cpuid_faulting_cb)(bool enable);
+extern void set_cpuid_faulting(bool enable);
+
+extern void get_cpu_cap_masked(u32 *val);
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/ptrace.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/ptrace.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/ptrace.h	2014-12-12 23:29:17.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/ptrace.h	2015-01-21 12:02:41.334276552 +0300
@@ -294,6 +294,22 @@ extern void user_enable_block_step(struc
 
 #define ARCH_HAS_USER_SINGLE_STEP_INFO
 
+/*
+ * When hitting ptrace_stop(), we cannot return using SYSRET because
+ * that does not restore the full CPU state, only a minimal set.  The
+ * ptracer can change arbitrary register values, which is usually okay
+ * because the usual ptrace stops run off the signal delivery path which
+ * forces IRET; however, ptrace_event() stops happen in arbitrary places
+ * in the kernel and don't force the IRET path.
+ *
+ * So force IRET path after a ptrace stop.
+ */
+#define arch_ptrace_stop_needed(code, info)				\
+({									\
+	set_thread_flag(TIF_NOTIFY_RESUME);				\
+	false;								\
+})
+
 struct user_desc;
 extern int do_get_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/thread_info.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/thread_info.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/thread_info.h	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/thread_info.h	2015-01-21 12:02:47.975100242 +0300
@@ -95,6 +95,7 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_RESUME		29
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,6 +118,7 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_RESUME		(1<<TIF_RESUME)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/tlbflush.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/tlbflush.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/tlbflush.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/tlbflush.h	2015-01-21 12:02:58.009833865 +0300
@@ -105,6 +105,7 @@ static inline void flush_tlb_page(struct
 	if (vma->vm_mm == current->active_mm)
 		__flush_tlb_one(addr);
 }
+EXPORT_SYMBOL(flush_tlb_page);
 
 static inline void flush_tlb_range(struct vm_area_struct *vma,
 				   unsigned long start, unsigned long end)
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/trace/irq_vectors.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/trace/irq_vectors.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/trace/irq_vectors.h	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/trace/irq_vectors.h	2015-01-21 12:02:41.605269357 +0300
@@ -95,6 +95,16 @@ DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
  */
 DEFINE_IRQ_VECTOR_EVENT(thermal_apic);
 
+/*
+ * monitor_ipi - called when entering/exiting a monitor ipi interrupt vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(monitor_ipi);
+
+/*
+ * monitor_posted_interrupt - called when entering/exiting a monitor posted interrupt vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(monitor_posted_interrupt);
+
 #undef TRACE_INCLUDE_PATH
 #define TRACE_INCLUDE_PATH .
 #define TRACE_INCLUDE_FILE irq_vectors
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/traps.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/traps.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/traps.h	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/traps.h	2015-01-21 12:02:44.840183470 +0300
@@ -88,6 +88,8 @@ asmlinkage void smp_thermal_interrupt(vo
 asmlinkage void mce_threshold_interrupt(void);
 #endif
 
+void do_cpuid_fault(struct pt_regs *);
+
 /* Interrupts/Exceptions */
 enum {
 	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/tsc.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/tsc.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/tsc.h	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/tsc.h	2015-01-21 12:02:41.402274746 +0300
@@ -24,7 +24,7 @@ static inline cycles_t get_cycles(void)
 	unsigned long long ret = 0;
 
 #ifndef CONFIG_X86_TSC
-	if (!cpu_has_tsc)
+	if (WARN_ON_ONCE(!cpu_has_tsc))
 		return 0;
 #endif
 	rdtscll(ret);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/unistd_32.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/unistd_32.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/unistd_32.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/unistd_32.h	2015-01-21 12:02:53.809945349 +0300
@@ -346,18 +346,32 @@
 /* #define __NR_fanotify_init		338 */
 /* #define __NR_fanotify_mark		339 */
 /* #define __NR_prlimit64		340 */
-/* #define __NR_name_to_handle_at	341 */
-/* #define __NR_open_by_handle_at  	342 */
+#define __NR_name_to_handle_at	341
+#define __NR_open_by_handle_at	342
 #define __NR_clock_adjtime	343
 #define __NR_syncfs             344
 #define __NR_sendmmsg		345
 #define __NR_setns		346
 #define __NR_process_vm_readv  347
 #define __NR_process_vm_writev 348
+#define __NR_fairsched_mknod	500	/* FairScheduler syscalls */
+#define __NR_fairsched_rmnod	501
+#define __NR_fairsched_chwt	502
+#define __NR_fairsched_mvpr	503
+#define __NR_fairsched_rate	504
+#define __NR_fairsched_vcpus	505
+#define __NR_fairsched_cpumask	506
+#define __NR_fairsched_nodemask	507
+#define __NR_getluid		510
+#define __NR_setluid		511
+#define __NR_setublimit		512
+#define __NR_ubstat		513
+#define __NR_lchmod		516
+#define __NR_lutime		517
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 349
+#define NR_syscalls 518
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
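
Note the arithmetic: the highest new syscall is __NR_lutime = 517, hence NR_syscalls = 518. Slots 349..499 are left unimplemented; the .rept block added to syscall_table_32.S further below pads them with sys_ni_syscall so that sys_fairsched_mknod lands exactly at entry 500.
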
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/unistd_64.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/unistd_64.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/unistd_64.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/unistd_64.h	2015-01-21 12:02:53.809945349 +0300
@@ -670,9 +670,9 @@ __SYSCALL(__NR_fanotify_mark, sys_ni_sys
 #define __NR_prlimit64				302
 __SYSCALL(__NR_prlimit64, sys_ni_syscall)
 #define __NR_name_to_handle_at			303
-__SYSCALL(__NR_name_to_handle_at, sys_ni_syscall)
+__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 #define __NR_open_by_handle_at			304
-__SYSCALL(__NR_open_by_handle_at, sys_ni_syscall)
+__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
 #define __NR_clock_adjtime			305
 __SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
 #define __NR_syncfs                             306
@@ -687,6 +687,35 @@ __SYSCALL(__NR_get_cpu, sys_ni_syscall)
 __SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
 #define __NR_process_vm_writev			311
 __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
+#define __NR_fairsched_nodemask			497
+__SYSCALL(__NR_fairsched_nodemask, sys_fairsched_nodemask)
+#define __NR_fairsched_cpumask			498
+__SYSCALL(__NR_fairsched_cpumask, sys_fairsched_cpumask)
+#define __NR_fairsched_vcpus			499
+__SYSCALL(__NR_fairsched_vcpus, sys_fairsched_vcpus)
+#define __NR_getluid				500
+__SYSCALL(__NR_getluid, sys_getluid)
+#define __NR_setluid				501
+__SYSCALL(__NR_setluid, sys_setluid)
+#define __NR_setublimit				502
+__SYSCALL(__NR_setublimit, sys_setublimit)
+#define __NR_ubstat				503
+__SYSCALL(__NR_ubstat, sys_ubstat)
+#define __NR_fairsched_mknod			504 /* FairScheduler syscalls */
+__SYSCALL(__NR_fairsched_mknod, sys_fairsched_mknod)
+#define __NR_fairsched_rmnod			505
+__SYSCALL(__NR_fairsched_rmnod, sys_fairsched_rmnod)
+#define __NR_fairsched_chwt			506
+__SYSCALL(__NR_fairsched_chwt, sys_fairsched_chwt)
+#define __NR_fairsched_mvpr			507
+__SYSCALL(__NR_fairsched_mvpr, sys_fairsched_mvpr)
+#define __NR_fairsched_rate			508
+__SYSCALL(__NR_fairsched_rate, sys_fairsched_rate)
+#define __NR_lchmod				509
+__SYSCALL(__NR_lchmod, sys_lchmod)
+#define __NR_lutime				510
+__SYSCALL(__NR_lutime, sys_lutime)
+
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
@@ -711,6 +740,7 @@ __SYSCALL(__NR_process_vm_writev, sys_pr
 #define __ARCH_WANT_SYS_RT_SIGSUSPEND
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_COMPAT_SYS_TIME
+#define __ARCH_WANT_SYS_RT_SIGSUSPEND
 #endif	/* __NO_STUBS */
 
 #ifdef __KERNEL__
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/vdso.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/vdso.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/vdso.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/vdso.h	2015-01-21 12:02:47.991099818 +0300
@@ -18,6 +18,7 @@ extern const char VDSO64_PRELINK[];
 #if defined CONFIG_X86_32 || defined CONFIG_COMPAT
 extern const char VDSO32_PRELINK[];
 
+extern const char VDSO32_SYSENTER_RETURN[];
 /*
  * Given a pointer to the vDSO image, find the pointer to VDSO32_name
  * as that symbol is defined in the vDSO sources or linker script.
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/vgtod.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/vgtod.h
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/include/asm/vgtod.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/include/asm/vgtod.h	2015-01-21 12:02:47.797104969 +0300
@@ -12,6 +12,7 @@ struct vsyscall_gtod_data {
 	u32		wall_time_nsec;
 
 	int		sysctl_enabled;
+	int		gettime_monotonic_enabled;
 	struct timezone sys_tz;
 	struct { /* extract of a clocksource struct */
 		cycle_t (*vread)(void);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/Makefile
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/Makefile	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/Makefile	2015-01-21 12:02:44.840183470 +0300
@@ -48,6 +48,7 @@ obj-y			+= pci-dma.o quirks.o i8237.o to
 obj-y			+= alternative.o i8253.o pci-nommu.o
 obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= resource.o
+obj-y			+= cpuid_fault.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/apic/io_apic.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/apic/io_apic.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/apic/io_apic.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/apic/io_apic.c	2015-01-21 12:02:42.782238107 +0300
@@ -3366,6 +3366,12 @@ void destroy_irq(unsigned int irq)
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
+int __irq_to_vector(int irq)
+{
+	return irq_cfg(irq)->vector;
+}
+EXPORT_SYMBOL(__irq_to_vector);
+
 /*
  * MSI message composition
  */
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/addon_cpuid_features.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/addon_cpuid_features.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/addon_cpuid_features.c	2014-12-12 23:29:08.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/addon_cpuid_features.c	2015-01-21 12:02:41.624268851 +0300
@@ -31,14 +31,13 @@ void __cpuinit init_scattered_cpuid_feat
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_DTS,              CR_EAX, 0, 0x00000006, 0 },
+		{ X86_FEATURE_DTHERM,           CR_EAX, 0, 0x00000006, 0 },
 		{ X86_FEATURE_IDA,		CR_EAX, 1, 0x00000006, 0 },
 		{ X86_FEATURE_ARAT,		CR_EAX, 2, 0x00000006, 0 },
 		{ X86_FEATURE_PLN,		CR_EAX, 4, 0x00000006, 0 },
 		{ X86_FEATURE_PTS,		CR_EAX, 6, 0x00000006, 0 },
 		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
 		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
-		{ X86_FEATURE_XSAVEOPT,		CR_EAX,	0, 0x0000000d, 1 },
 		{ X86_FEATURE_CPB,		CR_EDX, 9, 0x80000007, 0 },
 		{ X86_FEATURE_NPT,		CR_EDX, 0, 0x8000000a, 0 },
 		{ X86_FEATURE_LBRV,		CR_EDX, 1, 0x8000000a, 0 },
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/amd.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/amd.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/amd.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/amd.c	2015-01-21 12:02:41.608269276 +0300
@@ -633,6 +633,13 @@ static void __cpuinit init_amd(struct cp
 #endif
 
 	/*
+	 * Family 0x12 and above processors keep the APIC timer
+	 * running even in deep C states.
+	 */
+	if (c->x86 > 0x11)
+		set_cpu_cap(c, X86_FEATURE_ARAT);
+
+	/*
 	 * Disable GART TLB Walk Errors on Fam10h. We do this here
 	 * because this is always needed when GART is enabled, even in a
 	 * kernel which has no MCE support built in.
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/common.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/common.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/common.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/common.c	2015-01-21 12:02:41.618269012 +0300
@@ -651,6 +651,17 @@ static void __cpuinit get_cpu_cap(struct
 		rh->x86_capability[9 - NCAPINTS] = ebx;
 	}
 
+	/* Extended state features: level 0x0000000d */
+	if (c->cpuid_level >= 0x0000000d) {
+		u32 eax, ebx, ecx, edx;
+		struct cpuinfo_x86_rh *rh = get_cpuinfo_x86_rh(c);
+
+		cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+
+		/* write into "word 10" of the rh extended capability area */
+		rh->x86_capability[10 - NCAPINTS] = eax;
+	}
+
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/intel.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/intel.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/intel.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/intel.c	2015-01-21 12:02:44.909181637 +0300
@@ -365,6 +365,31 @@ static void __cpuinit detect_vmx_virtcap
 	}
 }
 
+static void intel_set_cpuid_faulting(bool enable)
+{
+	unsigned int l1, l2;
+
+	rdmsr(MSR_MISC_FEATURES_ENABLES, l1, l2);
+	l1 &= ~1;
+	if (enable)
+		l1 |= 1;
+	wrmsr(MSR_MISC_FEATURES_ENABLES, l1, l2);
+}
+
+static void __cpuinit intel_cpuid_faulting_init(struct cpuinfo_x86 *c)
+{
+	unsigned int l1, l2;
+
+	if (rdmsr_safe(MSR_PLATFORM_INFO, &l1, &l2) != 0 ||
+	    !(l1 & (1 << 31)))
+		return;
+
+	set_cpu_cap(c, X86_FEATURE_CPUID_FAULTING);
+	set_cpuid_faulting_cb = intel_set_cpuid_faulting;
+
+	intel_set_cpuid_faulting(false);
+}
+
 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 {
 	unsigned int l2 = 0;
@@ -465,6 +490,8 @@ static void __cpuinit init_intel(struct 
 
 	if (cpu_has(c, X86_FEATURE_VMX))
 		detect_vmx_virtcap(c);
+
+	intel_cpuid_faulting_init(c);
 }
 
 #ifdef CONFIG_X86_32
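
Note: intel_cpuid_faulting_init() reads MSR_PLATFORM_INFO with rdmsr_safe() and checks bit 31, which on Intel CPUs advertises CPUID faulting support. When present, setting bit 0 of MSR_MISC_FEATURES_ENABLES makes any CPUID executed at CPL > 0 raise #GP instead of executing, letting the kernel emulate it in do_cpuid_fault() with the override values. Faulting is deliberately switched off at init time and is only enabled per CPU by set_cpuid_faulting() once an override table is actually installed (see cpuid_fault.c below).
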
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/proc.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/proc.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/proc.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/proc.c	2015-01-21 12:02:54.005940145 +0300
@@ -3,6 +3,7 @@
 #include <linux/string.h>
 #include <linux/seq_file.h>
 #include <linux/cpufreq.h>
+#include <linux/sched.h>
 
 /*
  *	Get CPU information for use by the procfs.
@@ -58,10 +59,30 @@ static void show_cpuinfo_misc(struct seq
 }
 #endif
 
+extern void __do_cpuid_fault(unsigned int op, unsigned int count,
+			     unsigned int *eax, unsigned int *ebx,
+			     unsigned int *ecx, unsigned int *edx);
+
+struct cpu_flags {
+	u32 val[RHNCAPINTS];
+};
+
+static DEFINE_PER_CPU(struct cpu_flags, cpu_flags);
+
+static void init_cpu_flags(void *dummy)
+{
+	struct cpu_flags *flags;
+
+	flags = &get_cpu_var(cpu_flags);
+	get_cpu_cap_masked(flags->val);
+	put_cpu_var(cpu_flags);
+}
+
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
 	struct cpuinfo_x86 *c = v;
 	unsigned int cpu = 0;
+	int is_super = ve_is_super(get_exec_env());
 	int i;
 
 #ifdef CONFIG_SMP
@@ -90,6 +111,7 @@ static int show_cpuinfo(struct seq_file 
 
 		if (!freq)
 			freq = cpu_khz;
+		freq = sched_cpulimit_scale_cpufreq(freq);
 		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
 			   freq / 1000, (freq % 1000));
 	}
@@ -103,7 +125,10 @@ static int show_cpuinfo(struct seq_file 
 
 	seq_printf(m, "flags\t\t:");
 	for (i = 0; i < 32*RHNCAPINTS; i++)
-		if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+		if (x86_cap_flags[i] != NULL &&
+		    ((is_super && cpu_has(c, i)) ||
+		     (!is_super && test_bit(i, (unsigned long *)
+					    &per_cpu(cpu_flags, cpu)))))
 			seq_printf(m, " %s", x86_cap_flags[i]);
 
 	seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
@@ -137,21 +162,28 @@ static int show_cpuinfo(struct seq_file 
 	return 0;
 }
 
-static void *c_start(struct seq_file *m, loff_t *pos)
+static void *__c_start(struct seq_file *m, loff_t *pos)
 {
 	if (*pos == 0)	/* just in case, cpu 0 is not the first */
 		*pos = cpumask_first(cpu_online_mask);
 	else
 		*pos = cpumask_next(*pos - 1, cpu_online_mask);
-	if ((*pos) < nr_cpu_ids)
+	if (__cpus_weight(cpu_online_mask, *pos) < num_online_vcpus())
 		return &cpu_data(*pos);
 	return NULL;
 }
 
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	init_cpu_flags(NULL);
+	smp_call_function(init_cpu_flags, NULL, 1);
+	return __c_start(m, pos);
+}
+
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	(*pos)++;
-	return c_start(m, pos);
+	return __c_start(m, pos);
 }
 
 static void c_stop(struct seq_file *m, void *v)
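
Note the net effect on /proc/cpuinfo: on the host (ve_is_super()) the flags line is generated from cpu_has() exactly as before, while inside a container it is built from the per-cpu cpu_flags snapshot that c_start() refreshes on all CPUs via get_cpu_cap_masked(), so features hidden through cpuid masking/faulting also vanish from the flags a container sees. The cpu MHz field is likewise scaled down by the container's CPU limit via sched_cpulimit_scale_cpufreq().
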
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/sched.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/sched.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/sched.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/sched.c	2015-01-21 12:02:54.528926262 +0300
@@ -47,7 +47,7 @@ unsigned long arch_scale_smt_power(struc
 	 * aperf/mperf already includes the smt gain
 	 */
 	if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-		return SCHED_LOAD_SCALE;
+		return SCHED_POWER_SCALE;
 
 	return default_scale_smt_power(sd, cpu);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/transmeta.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/transmeta.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpu/transmeta.c	2014-12-12 23:28:51.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpu/transmeta.c	2015-01-21 12:02:58.019833600 +0300
@@ -1,6 +1,8 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include "cpu.h"
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpuid_fault.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpuid_fault.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/cpuid_fault.c	2015-01-21 12:02:44.841183443 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/cpuid_fault.c	2015-01-21 12:02:44.933181000 +0300
@@ -0,0 +1,346 @@
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <asm/uaccess.h>
+
+struct cpuid_override_entry {
+	unsigned int op;
+	unsigned int count;
+	bool has_count;
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+};
+
+#define MAX_CPUID_OVERRIDE_ENTRIES	16
+
+struct cpuid_override_table {
+	struct rcu_head rcu_head;
+	int size;
+	struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
+};
+
+static struct cpuid_override_table __rcu *cpuid_override __read_mostly;
+static DEFINE_SPINLOCK(cpuid_override_lock);
+
+#define cpuid_override_active		(!!rcu_access_pointer(cpuid_override))
+
+void (*set_cpuid_faulting_cb)(bool enable);
+static DEFINE_PER_CPU(bool, cpuid_faulting_enabled);
+
+void set_cpuid_faulting(bool enable)
+{
+	bool *enabled;
+
+	if (!cpu_has_cpuid_faulting)
+		return;
+	if (!cpuid_override_active)
+		enable = false;
+
+	enabled = &get_cpu_var(cpuid_faulting_enabled);
+	if (*enabled != enable) {
+		set_cpuid_faulting_cb(enable);
+		*enabled = enable;
+	}
+	put_cpu_var(cpuid_faulting_enabled);
+}
+EXPORT_SYMBOL(set_cpuid_faulting);
+
+static void cpuid_override_update(struct cpuid_override_table *new_table)
+{
+	struct cpuid_override_table *old_table;
+
+	spin_lock(&cpuid_override_lock);
+	old_table = rcu_access_pointer(cpuid_override);
+	rcu_assign_pointer(cpuid_override, new_table);
+	spin_unlock(&cpuid_override_lock);
+
+	if (old_table)
+		kfree_rcu(old_table, rcu_head);
+}
+
+static bool cpuid_override_match(unsigned int op, unsigned int count,
+				 unsigned int *eax, unsigned int *ebx,
+				 unsigned int *ecx, unsigned int *edx)
+{
+	bool ret = false;
+	struct cpuid_override_table *t;
+	struct cpuid_override_entry *e;
+	int i;
+
+	rcu_read_lock();
+	t = rcu_dereference(cpuid_override);
+	if (!t)
+		goto out;
+
+	for (i = 0; i < t->size; i++) {
+		e = &t->entries[i];
+		if (e->op != op)
+			continue;
+		if (e->has_count && e->count != count)
+			continue;
+		*eax = e->eax;
+		*ebx = e->ebx;
+		*ecx = e->ecx;
+		*edx = e->edx;
+		ret = true;
+		break;
+	}
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+static void __do_cpuid_fault(unsigned int op, unsigned int count,
+		      unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx)
+{
+	/* check if op is overridden */
+	if (cpuid_override_match(op, count, eax, ebx, ecx, edx))
+		return;
+
+	/* fallback to real cpuid */
+	cpuid_count(op, count, eax, ebx, ecx, edx);
+}
+
+void do_cpuid_fault(struct pt_regs *regs)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	__do_cpuid_fault(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx);
+
+	regs->ax = eax;
+	regs->bx = ebx;
+	regs->cx = ecx;
+	regs->dx = edx;
+}
+
+void get_cpu_cap_masked(u32 *val)
+{
+	struct cpuinfo_x86 *c = &current_cpu_data;
+	struct cpuinfo_x86_rh *c_rh = &current_cpu_data_rh;
+	unsigned int eax, ebx, ecx, edx;
+
+	memcpy(val, c->x86_capability, NCAPINTS * sizeof(u32));
+	memcpy(val + NCAPINTS, c_rh->x86_capability,
+	       RH_EXT_NCAPINTS * sizeof(u32));
+
+	/*
+	 * Clear feature bits masked using cpuid masking/faulting.
+	 */
+
+	if (c->cpuid_level >= 0x00000001) {
+		__do_cpuid_fault(0x00000001, 0, &eax, &ebx, &ecx, &edx);
+		val[4] &= ecx;
+		val[0] &= edx;
+	}
+
+	if (c->cpuid_level >= 0x00000007) {
+		__do_cpuid_fault(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+		val[9] &= ebx;
+	}
+
+	if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000 &&
+	    c->extended_cpuid_level >= 0x80000001) {
+		__do_cpuid_fault(0x80000001, 0, &eax, &ebx, &ecx, &edx);
+		val[6] &= ecx;
+		val[1] &= edx;
+	}
+
+	if (c->cpuid_level >= 0x0000000d) {
+		__do_cpuid_fault(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+		val[10] &= eax;
+	}
+}
+EXPORT_SYMBOL(get_cpu_cap_masked);
+
+/*
+ * CPUID override entry format:
+ *
+ * op[ count]: eax ebx ecx edx
+ *
+ * All values are in HEX.
+ */
+static int cpuid_override_entry_parse(const char *s, char **endp,
+				      struct cpuid_override_entry *e)
+{
+	int taken;
+	char *end;
+
+	if (sscanf(s, "%x %x: %x %x %x %x%n",
+		   &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx,
+		   &taken) == 6)
+		e->has_count = true;
+	else if (sscanf(s, "%x: %x %x %x %x%n",
+			&e->op, &e->eax, &e->ebx, &e->ecx, &e->edx,
+			&taken) == 5)
+		e->has_count = false;
+	else
+		return -EINVAL;
+
+	end = (char *)s + taken;
+	if (*end) {
+		if (*end != '\n')
+			return -EINVAL;
+		++end;
+	}
+	*endp = end;
+	return 0;
+}
+
+static ssize_t cpuid_override_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct cpuid_override_table *t = NULL;
+	void *page = NULL;
+	char *s;
+	int err;
+
+	err = -E2BIG;
+	if (count >= PAGE_SIZE)
+		goto out;
+
+	err = -ENOMEM;
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+
+	page = (void *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		goto out;
+
+	err = copy_from_user(page, buf, count);
+	if (err)
+		goto out;
+
+	s = page;
+	s[count] = '\0';
+	t->size = 0;
+	while (*(s = skip_spaces(s))) {
+		err = -E2BIG;
+		if (t->size == MAX_CPUID_OVERRIDE_ENTRIES)
+			goto out;
+		err = -EINVAL;
+		if (cpuid_override_entry_parse(s, &s, &t->entries[t->size++]))
+			goto out;
+	}
+	if (!t->size) {
+		kfree(t);
+		t = NULL;
+	}
+	err = 0;
+out:
+	free_page((unsigned long)page);
+
+	if (!err)
+		cpuid_override_update(t);
+	else
+		kfree(t);
+
+	return err ?: count;
+}
+
+static void *__cpuid_override_seq_start(loff_t pos)
+{
+	struct cpuid_override_table *t = rcu_dereference(cpuid_override);
+	return t && pos < t->size ? &t->entries[pos] : NULL;
+}
+
+static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	rcu_read_lock();
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void *cpuid_override_seq_next(struct seq_file *seq,
+				     void *v, loff_t *ppos)
+{
+	++*ppos;
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void cpuid_override_seq_stop(struct seq_file *s, void *v)
+{
+	rcu_read_unlock();
+}
+
+static int cpuid_override_seq_show(struct seq_file *s, void *v)
+{
+	struct cpuid_override_entry *e = v;
+
+	seq_printf(s, "0x%08x", e->op);
+	if (e->has_count)
+		seq_printf(s, " 0x%08x", e->count);
+	seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		   e->eax, e->ebx, e->ecx, e->edx);
+	return 0;
+}
+
+static struct seq_operations cpuid_override_seq_ops = {
+	.start = cpuid_override_seq_start,
+	.next  = cpuid_override_seq_next,
+	.stop  = cpuid_override_seq_stop,
+	.show  = cpuid_override_seq_show,
+};
+
+static int cpuid_override_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cpuid_override_seq_ops);
+}
+
+static struct file_operations proc_cpuid_override_ops = {
+	.owner   = THIS_MODULE,
+	.open    = cpuid_override_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = cpuid_override_write,
+};
+
+static void disable_cpuid_faulting_fn(void *unused)
+{
+	set_cpuid_faulting(false);
+}
+
+static int cpuid_faulting_reboot_notify(struct notifier_block *nb,
+					unsigned long code, void *unused)
+{
+	if (code == SYS_RESTART) {
+		/*
+		 * Disable cpuid faulting before loading a new kernel by kexec
+		 * in case the new kernel does not support this feature.
+		 */
+		cpuid_override_update(NULL);
+		on_each_cpu(disable_cpuid_faulting_fn, NULL, 1);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cpuid_faulting_reboot_nb = {
+	.notifier_call = cpuid_faulting_reboot_notify,
+};
+
+static int __init cpuid_fault_init(void)
+{
+	struct proc_dir_entry *proc;
+
+	if (!cpu_has_cpuid_faulting)
+		return 0;
+
+	register_reboot_notifier(&cpuid_faulting_reboot_nb);
+
+	proc = proc_create("cpuid_override", 0644, proc_vz_dir,
+			   &proc_cpuid_override_ops);
+	if (!proc)
+		return -ENOMEM;
+
+	return 0;
+}
+module_init(cpuid_fault_init);
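
Note: the override machinery above is driven entirely through the single
global table behind the proc file created in cpuid_fault_init(). Each line
written must match the format accepted by cpuid_override_entry_parse(),
"op[ count]: eax ebx ecx edx" with all values in hex; reading the file back
dumps the active table via cpuid_override_seq_show(). A minimal userspace
sketch, run as root, assuming proc_vz_dir corresponds to /proc/vz (as on a
stock OpenVZ install) and a CPU with the cpuid_faulting flag, that zeroes
leaf 0xd subleaf 1 so faulted CPUID executions report none of the word-10
XSAVE* features, then prints the resulting table:

	#include <stdio.h>
	#include <stdlib.h>

	/* path assumed from proc_create("cpuid_override", 0644, proc_vz_dir, ...) */
	#define OVERRIDE_PATH "/proc/vz/cpuid_override"

	int main(void)
	{
		FILE *f = fopen(OVERRIDE_PATH, "w");
		char line[128];

		if (!f) {
			perror(OVERRIDE_PATH);
			return EXIT_FAILURE;
		}
		/* leaf 0xd, subleaf 1: report all-zero registers for faulted
		 * CPUID, masking XSAVEOPT/XSAVEC/XGETBV1/XSAVES */
		if (fprintf(f, "d 1: 0 0 0 0\n") < 0 || fclose(f) != 0) {
			perror("write");
			return EXIT_FAILURE;
		}

		f = fopen(OVERRIDE_PATH, "r");
		if (!f) {
			perror(OVERRIDE_PATH);
			return EXIT_FAILURE;
		}
		while (fgets(line, sizeof(line), f))	/* kernel prints 0x%08x */
			fputs(line, stdout);
		fclose(f);
		return EXIT_SUCCESS;
	}

Writing an empty buffer installs a NULL table, after which
cpuid_override_active is false and set_cpuid_faulting() drops faulting the
next time it is called on each CPU.
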
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/dumpstack.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/dumpstack.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/dumpstack.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/dumpstack.c	2015-01-21 12:02:47.263119143 +0300
@@ -202,8 +202,9 @@ void dump_stack(void)
 {
 	unsigned long stack;
 
-	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-		current->pid, current->comm, print_tainted(),
+	printk("Pid: %d, comm: %.20s veid: %u %s %s %.*s\n",
+		current->pid, current->comm,
+		task_veid(current), print_tainted(),
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
@@ -341,6 +342,7 @@ die_nmi(char *str, struct pt_regs *regs,
 	printk(" on CPU%d, ip %08lx, registers:\n",
 		smp_processor_id(), regs->ip);
 	show_registers(regs);
+	nmi_show_regs(regs, 1);
 	oops_end(flags, regs, 0);
 	if (do_panic || panic_on_oops)
 		panic("Non maskable interrupt");
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/dumpstack_32.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/dumpstack_32.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/dumpstack_32.c	2014-12-12 23:29:02.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/dumpstack_32.c	2015-01-21 12:02:47.263119143 +0300
@@ -88,8 +88,9 @@ void show_registers(struct pt_regs *regs
 	print_modules();
 	__show_regs(regs, 0);
 
-	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
+	printk(KERN_EMERG "Process %.*s (pid: %d, veid: %u, ti=%p task=%p task.ti=%p)\n",
 		TASK_COMM_LEN, current->comm, task_pid_nr(current),
+		task_veid(current),
 		current_thread_info(), current, task_thread_info(current));
 	/*
 	 * When in-kernel, we also print out the stack and code at the
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/dumpstack_64.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/dumpstack_64.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/dumpstack_64.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/dumpstack_64.c	2015-01-21 12:02:47.263119143 +0300
@@ -282,8 +282,9 @@ void show_registers(struct pt_regs *regs
 	printk("CPU %d ", cpu);
 	print_modules();
 	__show_regs(regs, 1);
-	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
-		cur->comm, cur->pid, task_thread_info(cur), cur);
+	printk("Process %s (pid: %d, veid: %u, threadinfo %p, task %p)\n",
+		cur->comm, cur->pid, task_veid(cur),
+		task_thread_info(cur), cur);
 
 	/*
 	 * When in-kernel, we also print out the stack and code at the
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/entry_32.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/entry_32.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/entry_32.S	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/entry_32.S	2015-01-21 12:02:47.975100242 +0300
@@ -327,6 +327,7 @@ ENTRY(ret_from_fork)
 	GET_THREAD_INFO(%ebp)
 	popl %eax
 	CFI_ADJUST_CFA_OFFSET -4
+ret_from_fork_tail:
 	pushl $0x0202			# Reset kernel eflags
 	CFI_ADJUST_CFA_OFFSET 4
 	popfl
@@ -335,6 +336,25 @@ ENTRY(ret_from_fork)
 	CFI_ENDPROC
 END(ret_from_fork)
 
+ENTRY(i386_ret_from_resume)
+	CFI_STARTPROC
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	call schedule_tail
+	GET_THREAD_INFO(%ebp)
+	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
+	movl (%esp),%eax
+	testl %eax,%eax
+	jz    1f
+	pushl %esp
+	call  *%eax
+	addl  $4,%esp
+1:
+	addl  $256,%esp
+	jmp   ret_from_fork_tail
+	CFI_ENDPROC
+
 /*
  * Interrupt exit functions should be protected against kprobes
  */
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/entry_64.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/entry_64.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/entry_64.S	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/entry_64.S	2015-01-21 12:02:47.982100057 +0300
@@ -406,8 +406,12 @@ ENTRY(ret_from_fork)
 
 	call schedule_tail			# rdi: 'prev' task parameter
 
+ret_from_fork_tail:
 	GET_THREAD_INFO(%rcx)
+	LOCK ; btr $TIF_RESUME,TI_flags(%rcx)
+	jc  x86_64_ret_from_resume
 
+ret_from_fork_check:
 	RESTORE_REST
 
 	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
@@ -419,6 +423,18 @@ ENTRY(ret_from_fork)
 	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
 	jmp ret_from_sys_call			# go to the SYSRET fastpath
 
+x86_64_ret_from_resume:
+	movq (%rsp),%rax
+	testq %rax,%rax
+	jz 1f
+	movq  %rsp,%rdi
+	call  *%rax
+1:
+	addq $256,%rsp
+	cmpq $0,ORIG_RAX(%rsp)
+	jge  ret_from_fork_tail
+	RESTORE_REST
+	jmp  int_ret_from_sys_call
 	CFI_ENDPROC
 END(ret_from_fork)
 
@@ -1042,6 +1058,12 @@ apicinterrupt IRQ_WORK_VECTOR \
 	irq_work_interrupt smp_irq_work_interrupt
 #endif
 
+apicinterrupt MONITOR_IPI_VECTOR \
+	monitor_ipi smp_monitor_ipi
+
+apicinterrupt MONITOR_POSTED_INTERRUPT_VECTOR \
+	monitor_posted_interrupt smp_monitor_posted_interrupt
+
 /*
  * Exception entry points.
  */
@@ -1204,7 +1226,7 @@ ENTRY(kernel_thread)
 	xorl %r9d,%r9d
 
 	# clone now
-	call do_fork
+	call do_fork_kthread
 	movq %rax,RAX(%rsp)
 	xorl %edi,%edi
 
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/i387.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/i387.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/i387.c	2014-12-12 23:29:09.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/i387.c	2015-01-21 12:02:57.961835139 +0300
@@ -162,6 +162,7 @@ int init_fpu(struct task_struct *tsk)
 	set_stopped_child_used_math(tsk);
 	return 0;
 }
+EXPORT_SYMBOL(init_fpu);
 
 /*
  * The xstateregs_active() routine is the same as the fpregs_active() routine,
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/irq.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/irq.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/irq.c	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/irq.c	2015-01-21 12:02:41.606269330 +0300
@@ -286,6 +286,36 @@ void smp_trace_x86_platform_ipi(struct p
 	set_irq_regs(old_regs);
 }
 
+/*
+ * Handler for MONITOR_IPI_VECTOR
+ */
+void smp_monitor_ipi(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+}
+
+void smp_trace_monitor_ipi(struct pt_regs *regs)
+{
+	trace_monitor_ipi_entry(MONITOR_IPI_VECTOR);
+	ack_APIC_irq();
+	trace_monitor_ipi_exit(MONITOR_IPI_VECTOR);
+}
+
+/*
+ * Handler for MONITOR_POSTED_INTERRUPT_VECTOR
+ */
+void smp_monitor_posted_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+}
+
+void smp_trace_monitor_posted_interrupt(struct pt_regs *regs)
+{
+	trace_monitor_posted_interrupt_entry(MONITOR_POSTED_INTERRUPT_VECTOR);
+	ack_APIC_irq();
+	trace_monitor_posted_interrupt_exit(MONITOR_POSTED_INTERRUPT_VECTOR);
+}
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 
 #ifdef CONFIG_HOTPLUG_CPU
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/irqinit.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/irqinit.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/irqinit.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/irqinit.c	2015-01-21 12:02:41.606269330 +0300
@@ -204,6 +204,8 @@ static void __init apic_intr_init(void)
 # endif
 
 #endif
+	alloc_intr_gate(MONITOR_IPI_VECTOR, monitor_ipi);
+	alloc_intr_gate(MONITOR_POSTED_INTERRUPT_VECTOR, monitor_posted_interrupt);
 }
 
 void __init native_init_IRQ(void)
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/ldt.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/ldt.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/ldt.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/ldt.c	2015-01-21 12:02:58.672816268 +0300
@@ -13,6 +13,8 @@
 #include <linux/smp.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/module.h>
+#include <bc/kmem.h>
 
 #include <asm/system.h>
 #include <asm/ldt.h>
@@ -39,9 +41,9 @@ static int alloc_ldt(mm_context_t *pc, i
 	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
 			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
 	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
-		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+		newldt = ub_vmalloc(mincount * LDT_ENTRY_SIZE);
 	else
-		newldt = (void *)__get_free_page(GFP_KERNEL);
+		newldt = (void *)__get_free_page(GFP_KERNEL_UBC);
 
 	if (!newldt)
 		return -ENOMEM;
@@ -117,6 +119,7 @@ int init_new_context(struct task_struct 
 	}
 	return retval;
 }
+EXPORT_SYMBOL(init_new_context);
 
 /*
  * No need to lock the MM as we are the last user
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/process.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/process.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/process.c	2014-12-12 23:29:09.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/process.c	2015-01-21 12:02:47.264119117 +0300
@@ -12,6 +12,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/dmi.h>
 #include <linux/utsname.h>
+#include <linux/utsrelease.h>
 #include <trace/events/power.h>
 #include <asm/cpu.h>
 #include <asm/system.h>
@@ -102,11 +103,11 @@ void show_regs_common(void)
 	board = dmi_get_system_info(DMI_BOARD_NAME);
 
 	printk(KERN_CONT "\n");
-	printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
-		current->pid, current->comm, print_tainted(),
+	printk(KERN_DEFAULT "Pid: %d, comm: %.20s veid: %u %s %s %.*s %s",
+		current->pid, current->comm, task_veid(current), print_tainted(),
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
-		init_utsname()->version);
+		init_utsname()->version, VZVERSION);
 	printk(KERN_CONT " %s %s", vendor, product);
 	if (board)
 		printk(KERN_CONT "/%s", board);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/process_32.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/process_32.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/process_32.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/process_32.c	2015-01-21 12:02:57.961835139 +0300
@@ -38,6 +38,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
+#include <linux/sysctl.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -57,6 +58,9 @@
 #include <asm/syscalls.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+EXPORT_SYMBOL(ret_from_fork);
+asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume");
+EXPORT_SYMBOL_GPL(i386_ret_from_resume);
 
 /*
  * Return saved PC of a blocked thread.
@@ -140,7 +144,8 @@ void __show_regs(struct pt_regs *regs, i
 	printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
 			(u16)regs->cs, regs->ip, regs->flags,
 			smp_processor_id());
-	print_symbol("EIP is at %s\n", regs->ip);
+	if (decode_call_traces)
+		print_symbol("EIP is at %s\n", regs->ip);
 
 	printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
 		regs->ax, regs->bx, regs->cx, regs->dx);
@@ -176,6 +181,8 @@ void show_regs(struct pt_regs *regs)
 {
 	show_registers(regs);
 	show_trace(NULL, regs, &regs->sp);
+	if (!decode_call_traces)
+		printk(" EIP: [<%08lx>]\n", regs->ip);
 }
 
 /*
@@ -184,6 +191,7 @@ void show_regs(struct pt_regs *regs)
  * the "args".
  */
 extern void kernel_thread_helper(void);
+EXPORT_SYMBOL_GPL(kernel_thread_helper);
 
 /*
  * Create a kernel thread
@@ -192,6 +200,13 @@ int kernel_thread(int (*fn)(void *), voi
 {
 	struct pt_regs regs;
 
+	/* Don't allow kernel_thread() inside VE */
+	if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) {
+		printk("kernel_thread call inside container\n");
+		dump_stack();
+		return -EPERM;
+	}
+
 	memset(&regs, 0, sizeof(regs));
 
 	regs.bx = (unsigned long) fn;
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/process_64.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/process_64.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/process_64.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/process_64.c	2015-01-21 12:02:57.961835139 +0300
@@ -25,6 +25,7 @@
 #include <linux/smp.h>
 #include <linux/slab.h>
 #include <linux/user.h>
+#include <linux/sysctl.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/module.h>
@@ -50,7 +51,10 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 
-asmlinkage extern void ret_from_fork(void);
+
+asmlinkage void kernel_execve(const char *filename, char *const argv[], 
+				char *const envp[]) __asm__ ("kernel_execve");
+EXPORT_SYMBOL(kernel_execve);
 
 DEFINE_PER_CPU(unsigned long, old_rsp);
 static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -162,7 +166,8 @@ void __show_regs(struct pt_regs *regs, i
 
 	show_regs_common();
 	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
-	printk_address(regs->ip, 1);
+	if (decode_call_traces)
+		printk_address(regs->ip, 1);
 	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
 			regs->sp, regs->flags);
 	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
@@ -214,7 +219,9 @@ void __show_regs(struct pt_regs *regs, i
 void show_regs(struct pt_regs *regs)
 {
 	show_registers(regs);
-	show_trace(NULL, regs, (void *)(regs + 1));
+	show_trace(NULL, regs, &regs->sp);
+	if (!decode_call_traces)
+		printk(" RIP: [<%016lx>]\n", regs->ip);
 }
 
 void release_thread(struct task_struct *dead_task)
@@ -665,3 +672,20 @@ unsigned long KSTK_ESP(struct task_struc
 	return (test_tsk_thread_flag(task, TIF_IA32)) ?
 			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
 }
+
+long do_fork_kthread(unsigned long clone_flags,
+	      unsigned long stack_start,
+	      struct pt_regs *regs,
+	      unsigned long stack_size,
+	      int __user *parent_tidptr,
+	      int __user *child_tidptr)
+{
+	if (ve_allow_kthreads || ve_is_super(get_exec_env()))
+		return do_fork(clone_flags, stack_start, regs, stack_size,
+				parent_tidptr, child_tidptr);
+
+	/* Don't allow kernel_thread() inside VE */
+	printk("kernel_thread call inside container\n");
+	dump_stack();
+	return -EPERM;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/ptrace.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/ptrace.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/ptrace.c	2014-12-12 23:29:17.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/ptrace.c	2015-01-21 12:02:44.282198284 +0300
@@ -30,6 +30,7 @@
 #include <asm/desc.h>
 #include <asm/prctl.h>
 #include <asm/proto.h>
+#include <asm/unistd.h>
 
 #include "tls.h"
 
@@ -505,6 +506,25 @@ static unsigned long getreg(struct task_
 		return get_desc_base(&task->thread.tls_array[GS_TLS]);
 	}
 #endif
+#ifdef CONFIG_VE
+	case offsetof(struct user_regs_struct, ax): {
+		struct pt_regs *regs;
+		unsigned long ret;
+
+		regs = task_pt_regs(task);
+		ret = *pt_regs_access(regs, offset);
+
+		if (ve_is_super(get_exec_env()) &&
+				!ve_is_super(task->ve_task_info.owner_env) &&
+				((regs->orig_ax == __NR_vfork) ||
+				 (regs->orig_ax == __NR_clone) ||
+				 (regs->orig_ax == __NR_fork)) &&
+				(long)ret > 0)
+			ret = vpid_to_pid_ve((pid_t)ret, task->ve_task_info.owner_env);
+
+		return ret;
+	}
+#endif
 	}
 
 	return *pt_regs_access(task_pt_regs(task), offset);
@@ -1242,7 +1262,7 @@ void send_sigtrap(struct task_struct *ts
  * We must return the syscall number to actually look up in the table.
  * This can be -1L to skip running any syscall at all.
  */
-asmregparm long syscall_trace_enter(struct pt_regs *regs)
+long syscall_trace_enter(struct pt_regs *regs)
 {
 	long ret = 0;
 
@@ -1287,7 +1307,7 @@ asmregparm long syscall_trace_enter(stru
 	return ret ?: regs->orig_ax;
 }
 
-asmregparm void syscall_trace_leave(struct pt_regs *regs)
+void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/setup.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/setup.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/setup.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/setup.c	2015-01-21 12:02:52.616977015 +0300
@@ -24,6 +24,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
+#include <linux/pram.h>
 #include <linux/screen_info.h>
 #include <linux/ioport.h>
 #include <linux/acpi.h>
@@ -518,10 +519,10 @@ static void __init reserve_early_setup_d
  * Returns the base address on success, and -1ULL on failure.
  */
 static
-unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
+unsigned long long __init find_and_reserve_crashkernel(unsigned long long start,
+						       unsigned long long size)
 {
 	const unsigned long long alignment = 16<<20; 	/* 16M */
-	unsigned long long start = 0LL;
 
 	while (1) {
 		int ret;
@@ -555,18 +556,19 @@ static void __init reserve_crashkernel(v
 {
 	unsigned long long total_mem;
 	unsigned long long crash_size, crash_base;
+	int strict;
 	int ret;
 
 	total_mem = get_total_mem();
 
 	ret = parse_crashkernel(boot_command_line, total_mem,
-			&crash_size, &crash_base);
+			&crash_size, &crash_base, &strict);
 	if (ret != 0 || crash_size <= 0)
 		return;
 
-	/* 0 means: find the address automatically */
-	if (crash_base <= 0) {
-		crash_base = find_and_reserve_crashkernel(crash_size);
+	if (!strict) {
+		crash_base = find_and_reserve_crashkernel(crash_base,
+							  crash_size);
 		if (crash_base == -1ULL) {
 			pr_info("crashkernel reservation failed. "
 				"No suitable area found.\n");
@@ -1106,6 +1108,8 @@ void __init setup_arch(char **cmdline_p)
 
 	initmem_init(0, max_pfn);
 
+	pram_reserve();
+
 #ifdef CONFIG_ACPI_SLEEP
 	/*
 	 * Reserve low memory region for sleep support.
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/smp.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/smp.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/smp.c	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/smp.c	2015-01-21 12:02:41.570270286 +0300
@@ -281,6 +281,11 @@ void smp_trace_call_function_single_inte
 	exiting_irq();
 }
 
+void send_nmi_ipi_allbutself(void)
+{
+	apic->send_IPI_allbutself(NMI_VECTOR);
+}
+
 struct smp_ops smp_ops = {
 	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/smpboot.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/smpboot.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/smpboot.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/smpboot.c	2015-01-21 12:02:44.029204999 +0300
@@ -830,6 +830,12 @@ do_rest:
 	initial_code = (unsigned long)start_secondary;
 	stack_start.sp = (void *) c_idle.idle->thread.sp;
 
+#ifdef CONFIG_VE
+	/* Cosmetic: sleep_time won't be changed afterwards for the idle
+	 * thread; keep it 0 rather than -cycles. */
+	VE_TASK_INFO(c_idle.idle)->sleep_time = 0;
+#endif
+
 	/* start_ip had better be page-aligned! */
 	start_ip = setup_trampoline();
 
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/sys_i386_32.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/sys_i386_32.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/sys_i386_32.c	2014-12-12 23:28:52.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/sys_i386_32.c	2015-01-21 12:02:57.961835139 +0300
@@ -18,7 +18,7 @@
 #include <linux/file.h>
 #include <linux/utsname.h>
 #include <linux/ipc.h>
-
+#include <linux/module.h>
 #include <linux/uaccess.h>
 #include <linux/unistd.h>
 
@@ -221,3 +221,4 @@ int kernel_execve(const char *filename, 
 	: "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
 	return __res;
 }
+EXPORT_SYMBOL(kernel_execve);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/syscall_table_32.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/syscall_table_32.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/syscall_table_32.S	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/syscall_table_32.S	2015-01-21 12:02:53.810945322 +0300
@@ -340,11 +340,32 @@ ENTRY(sys_call_table)
 	.long sys_ni_syscall	/* sys_fanotify_init */
 	.long sys_ni_syscall	/* sys_fanotify_mark */
 	.long sys_ni_syscall	/* sys_prlimit64  340 */
-	.long sys_ni_syscall	/* sys_name_to_handle_at */
-	.long sys_ni_syscall	/* sys_open_by_handle_at */
+	.long sys_name_to_handle_at
+	.long sys_open_by_handle_at
 	.long sys_clock_adjtime
 	.long sys_syncfs
 	.long sys_sendmmsg		/* 345 */
 	.long sys_setns
 	.long sys_process_vm_readv
 	.long sys_process_vm_writev
+	.rept 500-(.-sys_call_table)/4
+		.long sys_ni_syscall
+	.endr
+	.long sys_fairsched_mknod	/* 500 */
+	.long sys_fairsched_rmnod
+	.long sys_fairsched_chwt
+	.long sys_fairsched_mvpr
+	.long sys_fairsched_rate
+	.long sys_fairsched_vcpus	/* 505 */
+	.long sys_fairsched_cpumask
+	.long sys_fairsched_nodemask
+	.long sys_ni_syscall
+	.long sys_ni_syscall
+	.long sys_getluid		/* 510 */
+	.long sys_setluid
+	.long sys_setublimit
+	.long sys_ubstat
+	.long sys_ni_syscall
+	.long sys_ni_syscall		/* 515 */
+	.long sys_lchmod
+	.long sys_lutime
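The `.rept` above pads the table out to a fixed slot: every entry is a 4-byte `.long`, so `(. - sys_call_table)/4` is the number of entries emitted so far. With entries 0-348 present (sys_process_vm_writev is entry 348), the directive expands to 500 - 349 = 151 `sys_ni_syscall` placeholders, so `sys_fairsched_mknod` lands exactly at syscall number 500, matching the `/* 500 */` comment. The arithmetic stays correct even if more syscalls are later added ahead of the padding.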
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/time.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/time.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/time.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/time.c	2015-01-21 12:02:42.761238665 +0300
@@ -77,7 +77,23 @@ static irqreturn_t timer_interrupt(int i
 		spin_unlock(&i8259A_lock);
 	}
 
-	global_clock_event->event_handler(global_clock_event);
+	/*
+	 * If hpet is enabled by hpet_late_init(), event_handler can be left
+	 * uninitialized by clockevents_register_device() because of
+	 * hpet_clockevent low rating (by the time hpet_late_init() is called,
+	 * high prio apic timers have already been setup). The event_handler is
+	 * then initialized a bit later by the clocksource_done_booting()
+	 * procedure.
+	 *
+	 * Normally, timer interrupts should not be delivered between these two
+	 * calls, but if e.g. the kernel is booted using kexec, there might be
+	 * some pending interrupts from the previous kernel's context, which
+	 * can lead to a NULL pointer dereference.
+	 *
+	 * So, take precautions against spurious timer interrupts.
+	 */
+	if (global_clock_event->event_handler)
+		global_clock_event->event_handler(global_clock_event);
 
 	/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
 	if (MCA_bus)
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/traps.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/traps.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/traps.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/traps.c	2015-01-21 12:02:44.841183443 +0300
@@ -310,6 +310,27 @@ dotraplinkage void do_double_fault(struc
 }
 #endif
 
+static int check_cpuid_fault(struct pt_regs *regs, long error_code)
+{
+	unsigned long addr;
+	unsigned short opcode;
+
+	if (error_code != 0)
+		return 0;
+
+	addr = convert_ip_to_linear(current, regs);
+	if (get_user(opcode, (unsigned short __user *)addr))
+		return 0;
+
+	if (opcode != 0xa20f)
+		return 0;
+
+	do_cpuid_fault(regs);
+
+	regs->ip += 2;
+	return 1;
+}
+
 dotraplinkage void __kprobes
 do_general_protection(struct pt_regs *regs, long error_code)
 {
@@ -326,6 +347,9 @@ do_general_protection(struct pt_regs *re
 	if (!user_mode(regs))
 		goto gp_in_kernel;
 
+	if (check_cpuid_fault(regs, error_code))
+		return;
+
 #ifdef CONFIG_X86_32
 {
 	int cpu;
@@ -502,12 +526,16 @@ static notrace __kprobes void default_do
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
 		 */
-		if (nmi_watchdog_tick(regs, reason))
+		if (nmi_watchdog_tick(regs, reason) +
+				do_nmi_show_regs(regs, cpu))
 			return;
 		if (!do_nmi_callback(regs, cpu))
 #endif /* !CONFIG_LOCKUP_DETECTOR */
+		if (!do_nmi_show_regs(regs, cpu))
 			unknown_nmi_error(reason, regs);
 #else
+		if (do_nmi_show_regs(regs, cpu))
+			return;
 		unknown_nmi_error(reason, regs);
 #endif
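check_cpuid_fault() above recognizes a trapping CPUID by reading two bytes at the faulting IP: CPUID encodes as the opcode bytes 0x0f 0xa2, which a 16-bit little-endian load returns as 0xa20f, and `regs->ip += 2` then steps over exactly those two bytes once do_cpuid_fault() has emulated the instruction. A standalone userspace demo of the byte-order detail (not part of the patch):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	int main(void)
	{
		/* CPUID is encoded as the two opcode bytes 0x0f 0xa2. */
		const unsigned char cpuid_insn[2] = { 0x0f, 0xa2 };
		uint16_t opcode;

		memcpy(&opcode, cpuid_insn, sizeof(opcode));
		/* Prints 0xa20f on little-endian x86, the constant the
		 * hunk above compares against. */
		printf("%#x\n", opcode);
		return 0;
	}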
 
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/tsc_sync.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/tsc_sync.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/tsc_sync.c	2014-12-12 23:29:11.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/tsc_sync.c	2015-01-21 12:02:44.029204999 +0300
@@ -120,6 +120,10 @@ void __cpuinit check_tsc_sync_source(int
 		return;
 	}
 
+#ifdef CONFIG_VE
+	/* TSC reset. Kill whatever might rely on old values. */
+	VE_TASK_INFO(current)->wakeup_stamp = 0;
+#endif
 	/*
 	 * Reset it - in case this is a second bootup:
 	 */
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/vmlinux.lds.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/vmlinux.lds.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/vmlinux.lds.S	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/vmlinux.lds.S	2015-01-21 12:02:58.504820727 +0300
@@ -200,6 +200,11 @@ SECTIONS
 	}
 	jiffies = VVIRT(.jiffies);
 
+	.fence_wdog_jiffies64 : AT(VLOAD(.fence_wdog_jiffies64)) {
+		*(.fence_wdog_jiffies64)
+	}
+	fence_wdog_jiffies64 = VVIRT(.fence_wdog_jiffies64);
+
 	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
 		*(.vsyscall_3)
 	}
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/vsyscall_64.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/vsyscall_64.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/kernel/vsyscall_64.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/kernel/vsyscall_64.c	2015-01-21 12:02:47.797104969 +0300
@@ -61,6 +61,7 @@ struct vsyscall_gtod_data __vsyscall_gto
 {
 	.lock = SEQLOCK_UNLOCKED,
 	.sysctl_enabled = 1,
+	.gettime_monotonic_enabled = 0,
 };
 
 void update_vsyscall_tz(void)
@@ -234,6 +235,10 @@ static ctl_table kernel_table2[] = {
 	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
 	  .mode = 0644,
 	  .proc_handler = proc_dointvec },
+	{ .procname = "vsyscall64_gettime_monotonic",
+	  .data = &vsyscall_gtod_data.gettime_monotonic_enabled, .maxlen = sizeof(int),
+	  .mode = 0644,
+	  .proc_handler = proc_dointvec },
 	{}
 };
 
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/fault.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/fault.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/fault.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/fault.c	2015-01-21 12:02:58.672816268 +0300
@@ -18,6 +18,8 @@
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
 
+#include <bc/oom_kill.h>
+
 /*
  * Page fault error code bits:
  *
@@ -712,7 +714,7 @@ show_signal_msg(struct pt_regs *regs, un
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+	ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
 		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 		tsk->comm, task_pid_nr(tsk), address,
 		(void *)regs->ip, (void *)regs->sp, error_code);
@@ -799,6 +801,7 @@ bad_area_access_error(struct pt_regs *re
 	__bad_area(regs, error_code, address, SEGV_ACCERR);
 }
 
+#if 0
 /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
 static void
 out_of_memory(struct pt_regs *regs, unsigned long error_code,
@@ -810,8 +813,9 @@ out_of_memory(struct pt_regs *regs, unsi
 	 */
 	up_read(&current->mm->mmap_sem);
 
-	pagefault_out_of_memory();
+	out_of_memory_in_ub(get_exec_ub(), 0);
 }
+#endif
 
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
@@ -874,7 +878,14 @@ mm_fault_error(struct pt_regs *regs, uns
 			return 1;
 		}
 
-		out_of_memory(regs, error_code, address);
+		/*
+		 * This fault can happen for two different reasons:
+		 * 1) The buddy allocator failed to allocate a page for pud and friends.
+		 * 2) The OOM killer failed to provide us the required memory.
+		 * The current task can't execute in such circumstances.
+		 */
+		up_read(&current->mm->mmap_sem);
+		send_sig(SIGKILL, current, 0);
 	} else {
 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
 			     VM_FAULT_HWPOISON_LARGE))
@@ -957,7 +968,7 @@ spurious_fault(unsigned long error_code,
 	return ret;
 }
 
-int show_unhandled_signals = 1;
+int show_unhandled_signals = 0;
 
 static inline int
 access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/hugetlbpage.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/hugetlbpage.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/hugetlbpage.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/hugetlbpage.c	2015-01-21 12:02:57.961835139 +0300
@@ -228,6 +228,7 @@ int pmd_huge(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_PSE);
 }
+EXPORT_SYMBOL(pmd_huge);
 
 int pud_huge(pud_t pud)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/init_64.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/init_64.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/init_64.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/init_64.c	2015-01-21 12:02:52.616977015 +0300
@@ -15,6 +15,7 @@
 #include <linux/ptrace.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/pram.h>
 #include <linux/swap.h>
 #include <linux/smp.h>
 #include <linux/init.h>
@@ -741,6 +742,10 @@ void __init mem_init(void)
 	reservedpages = max_pfn - totalram_pages - absent_pages;
 	after_bootmem = 1;
 
+	totalram_pages += pram_reserved_pages;
+	reservedpages -= pram_reserved_pages;
+	pram_show_banned();
+
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/ioremap.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/ioremap.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/ioremap.c	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/ioremap.c	2015-01-21 12:02:48.690081262 +0300
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mmiotrace.h>
+#include <linux/mm.h>
 
 #include <asm/cacheflush.h>
 #include <asm/e820.h>
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/mmap.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/mmap.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/mmap.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/mmap.c	2015-01-21 12:02:49.169068546 +0300
@@ -29,6 +29,7 @@
 #include <linux/random.h>
 #include <linux/limits.h>
 #include <linux/sched.h>
+#include <linux/module.h>
 #include <asm/elf.h>
 
 struct __read_mostly va_alignment va_align = {
@@ -123,7 +124,8 @@ void arch_pick_mmap_layout(struct mm_str
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		if (!(current->personality & READ_IMPLIES_EXEC)
+		if ((16 & exec_shield) &&
+		    !(current->personality & READ_IMPLIES_EXEC)
 		    && mmap_is_ia32()) {
 			mm->get_unmapped_exec_area = arch_get_unmapped_exec_area;
 			mm->shlib_base = SHLIB_BASE + mmap_rnd();
@@ -131,3 +133,4 @@ void arch_pick_mmap_layout(struct mm_str
 		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
+EXPORT_SYMBOL_GPL(arch_pick_mmap_layout);
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/pgtable.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/pgtable.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/pgtable.c	2014-12-12 23:29:10.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/pgtable.c	2015-01-21 12:02:58.673816242 +0300
@@ -4,7 +4,10 @@
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
 
+#include <bc/kmem.h>
+
 #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_KERN_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -16,7 +19,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP 
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return (pte_t *)__get_free_page(PGALLOC_GFP);
+	return (pte_t *)__get_free_page(PGALLOC_KERN_GFP);
 }
 
 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -229,6 +232,7 @@ static void pgd_mop_up_pmds(struct mm_st
 
 			paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
 			pmd_free(mm, pmd);
+			mm->nr_ptds--;
 		}
 	}
 }
@@ -253,6 +257,8 @@ static void pgd_prepopulate_pmd(struct m
 			       sizeof(pmd_t) * PTRS_PER_PMD);
 
 		pud_populate(mm, pud, pmd);
+		ub_page_table_charge(mm, 0);
+		mm->nr_ptds++;
 	}
 }
 
@@ -261,6 +267,9 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
 
+	if (ub_page_table_precharge(mm, 1 + PREALLOCATED_PMDS))
+		return NULL;
+
 	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
 
 	if (pgd == NULL)
@@ -286,6 +295,9 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 
 	spin_unlock(&pgd_lock);
 
+	ub_page_table_charge(mm, 0);
+	mm->nr_ptds++;
+
 	return pgd;
 
 out_free_pmds:
@@ -293,6 +305,7 @@ out_free_pmds:
 out_free_pgd:
 	free_page((unsigned long)pgd);
 out:
+	ub_page_table_commit(mm);
 	return NULL;
 }
 
@@ -302,6 +315,7 @@ void pgd_free(struct mm_struct *mm, pgd_
 	pgd_dtor(pgd);
 	paravirt_pgd_free(mm, pgd);
 	free_page((unsigned long)pgd);
+	mm->nr_ptds--;
 }
 
 int ptep_set_access_flags(struct vm_area_struct *vma,
@@ -352,6 +366,7 @@ int ptep_test_and_clear_young(struct vm_
 
 	return ret;
 }
+EXPORT_SYMBOL(ptep_test_and_clear_young);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
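The beancounter hooks above appear to follow a precharge/charge/commit discipline: ub_page_table_precharge() reserves the worst case up front (one pgd page plus PREALLOCATED_PMDS pmds) before anything is allocated, each ub_page_table_charge(mm, 0) converts part of that reservation into a real charge as a page-table page is actually consumed, and ub_page_table_commit() on the error path presumably returns whatever precharge was never used. The ub_* semantics are inferred from these call sites; their implementation is outside this section.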
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/tlb.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/tlb.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/mm/tlb.c	2014-12-12 23:28:50.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/mm/tlb.c	2015-01-21 12:02:58.010833838 +0300
@@ -262,6 +262,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 
 	preempt_enable();
 }
+EXPORT_SYMBOL(flush_tlb_mm);
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 {
@@ -281,6 +282,7 @@ void flush_tlb_page(struct vm_area_struc
 
 	preempt_enable();
 }
+EXPORT_SYMBOL(flush_tlb_page);
 
 static void do_flush_tlb_all(void *info)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/Makefile
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/Makefile	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/Makefile	2015-01-21 12:02:48.142095808 +0300
@@ -6,7 +6,7 @@ VDSO64-$(CONFIG_X86_64)		:= y
 VDSO32-$(CONFIG_X86_32)		:= y
 VDSO32-$(CONFIG_COMPAT)		:= y
 
-vdso-install-$(VDSO64-y)	+= vdso.so
+vdso-install-$(VDSO64-y)	+= vdso-rhel5.so vdso.so
 vdso-install-$(VDSO32-y)	+= $(vdso32-images)
 
 
@@ -14,24 +14,31 @@ vdso-install-$(VDSO32-y)	+= $(vdso32-ima
 vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
 
 # files to link into kernel
-obj-$(VDSO64-y)			+= vma.o vdso.o
+obj-$(VDSO64-y)			+= vma.o vdso.o vdso-rhel5.o
 obj-$(VDSO32-y)			+= vdso32.o vdso32-setup.o
 
 vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
 
 $(obj)/vdso.o: $(obj)/vdso.so
+$(obj)/vdso-rhel5.o: $(obj)/vdso-rhel5.so
 
-targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
+targets += vdso-rhel5.so vdso.so vdso-rhel5.so.dbg vdso.so.dbg vdso.lds vdso-rhel5.lds $(vobjs-y)
 
 export CPPFLAGS_vdso.lds += -P -C
 
 VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
 		      	-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
 
+VDSO_LDFLAGS_vdso-rhel5.lds = ${VDSO_LDFLAGS_vdso.lds}
+
 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
 
+$(obj)/vdso-rhel5.o: $(src)/vdso-rhel5.S $(obj)/vdso-rhel5.so
+
 $(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
 	$(call if_changed,vdso)
+$(obj)/vdso-rhel5.so.dbg: $(src)/vdso-rhel5.lds $(vobjs) FORCE
+	$(call if_changed,vdso)
 
 $(obj)/%.so: OBJCOPYFLAGS := -S
 $(obj)/%.so: $(obj)/%.so.dbg FORCE
@@ -44,6 +51,7 @@ $(vobjs): KBUILD_CFLAGS += $(CFL)
 
 targets += vdso-syms.lds
 obj-$(VDSO64-y)			+= vdso-syms.lds
+obj-$(VDSO64-y)			+= vdso-rhel5-syms.lds
 
 #
 # Match symbols in the DSO that look like VDSO*; produce a file of constants.
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vclock_gettime.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vclock_gettime.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vclock_gettime.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vclock_gettime.c	2015-01-21 12:02:48.136095969 +0300
@@ -111,7 +111,7 @@ notrace static noinline int do_monotonic
 	return 0;
 }
 
-notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+notrace noinline int ___vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	if (likely(gtod->sysctl_enabled))
 		switch (clock) {
@@ -120,20 +120,29 @@ notrace int __vdso_clock_gettime(clockid
 				return do_realtime(ts);
 			break;
 		case CLOCK_MONOTONIC:
-			if (likely(gtod->clock.vread))
+			if (likely(gtod->clock.vread &&
+				   gtod->gettime_monotonic_enabled))
 				return do_monotonic(ts);
 			break;
 		case CLOCK_REALTIME_COARSE:
 			return do_realtime_coarse(ts);
 		case CLOCK_MONOTONIC_COARSE:
-			return do_monotonic_coarse(ts);
+			if (likely(gtod->gettime_monotonic_enabled))
+				return do_monotonic_coarse(ts);
 		}
 	return vdso_fallback_gettime(clock, ts);
 }
+
+int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) __attribute__((section("CLOCKGETTIME")));
+notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+{
+	return ___vdso_clock_gettime(clock, ts);
+}
+
 int clock_gettime(clockid_t, struct timespec *)
 	__attribute__((weak, alias("__vdso_clock_gettime")));
 
-notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+notrace int ___vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
 	long ret;
 	if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
@@ -155,5 +164,12 @@ notrace int __vdso_gettimeofday(struct t
 	    "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
 	return ret;
 }
+
+int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) __attribute__((section("GETTIMEOFDAY")));
+notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+	return ___vdso_gettimeofday(tv, tz);
+}
+
 int gettimeofday(struct timeval *, struct timezone *)
 	__attribute__((weak, alias("__vdso_gettimeofday")));
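The wrapper functions above exist so that each exported vDSO entry point can live in its own named section ("CLOCKGETTIME", "GETTIMEOFDAY"), which the linker script then pins at a fixed virtual address; per the comment added to vdso-layout.lds.S below, the fixed addresses are what let processes migrate between different kernel versions without their vDSO entry points moving. A minimal sketch of the pattern, with hypothetical names:

	/* entry.c: route the real work through a wrapper placed in a
	 * dedicated section.  All names here are illustrative. */
	static int do_work(void)
	{
		return 42;
	}

	__attribute__((section("MYENTRY")))
	int my_entry(void)
	{
		return do_work();
	}

	/* entry.lds would then pin the section at an ABI-stable address:
	 *
	 *	.myentry 0xffffffffff700800 : { *(MYENTRY) }
	 */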
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso-layout.lds.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso-layout.lds.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso-layout.lds.S	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso-layout.lds.S	2015-01-21 12:02:48.142095808 +0300
@@ -21,6 +21,12 @@ SECTIONS
 	.eh_frame_hdr	: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
 	.eh_frame	: { KEEP (*(.eh_frame)) }	:text
 
+#ifdef VDSO_RHEL5
+	.gettimeofday 0xffffffffff7006f0 : { *(GETTIMEOFDAY) } :text   =0x90909090
+	.clockgettime 0xffffffffff700780 : { *(CLOCKGETTIME) } :text   =0x90909090
+	.getcpu 0xffffffffff7007c0 : { *(GETCPU) } :text   =0x90909090
+#endif
+
 	.dynamic	: { *(.dynamic) }		:text	:dynamic
 
 	.rodata		: { *(.rodata*) }		:text
@@ -43,6 +49,15 @@ SECTIONS
 	 */
 	. = ALIGN(0x100);
 
+#ifndef VDSO_RHEL5
+	/*
+	 * All global symbols have fixed addresses so that processes are
+	 * able to migrate between different kernel versions.
+	 */
+	.gettimeofday 0xffffffffff700800 : { *(GETTIMEOFDAY) } :text   =0x90909090
+	.clockgettime 0xffffffffff700820 : { *(CLOCKGETTIME) } :text   =0x90909090
+	.getcpu 0xffffffffff700840 : { *(GETCPU) } :text   =0x90909090
+#endif
 	.text		: { *(.text*) }			:text	=0x90909090
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso-note.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso-note.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso-note.S	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso-note.S	2015-01-21 12:02:48.155095464 +0300
@@ -7,6 +7,8 @@
 #include <linux/version.h>
 #include <linux/elfnote.h>
 
+	.globl vdso_linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+vdso_linux_version_code:
 	.long LINUX_VERSION_CODE
 ELFNOTE_END
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso-rhel5.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso-rhel5.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso-rhel5.S	2015-01-21 12:02:48.142095808 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso-rhel5.S	2015-01-21 12:02:48.142095808 +0300
@@ -0,0 +1,10 @@
+#include <linux/init.h>
+
+__INITDATA
+
+	.globl vdso_rhel5_start, vdso_rhel5_end
+vdso_rhel5_start:
+	.incbin "arch/x86/vdso/vdso-rhel5.so"
+vdso_rhel5_end:
+
+__FINIT
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso-rhel5.lds.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso-rhel5.lds.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso-rhel5.lds.S	2015-01-21 12:02:48.142095808 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso-rhel5.lds.S	2015-01-21 12:02:48.142095808 +0300
@@ -0,0 +1,38 @@
+/*
+ * Linker script for 64-bit vDSO.
+ * We #include the file to define the layout details.
+ * Here we only choose the prelinked virtual address.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.  We can define local symbols here called VDSO* to make their
+ * values visible using the asm-x86/vdso.h macros from the kernel proper.
+ */
+
+#define VDSO_PRELINK 0xffffffffff700000
+#define VDSO_RHEL5
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+	LINUX_2.6 {
+	global:
+		clock_gettime;
+		__vdso_clock_gettime;
+		gettimeofday;
+		__vdso_gettimeofday;
+		getcpu;
+		__vdso_getcpu;
+	local: *;
+	};
+}
+
+VDSO64_PRELINK = VDSO_PRELINK;
+
+/*
+ * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
+ */
+#define VEXTERN(x)	VDSO64_rhel5_ ## x = vdso_ ## x;
+#include "vextern.h"
+#undef	VEXTERN
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso.lds.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso.lds.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso.lds.S	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso.lds.S	2015-01-21 12:02:48.155095464 +0300
@@ -34,4 +34,5 @@ VDSO64_PRELINK = VDSO_PRELINK;
  */
 #define VEXTERN(x)	VDSO64_ ## x = vdso_ ## x;
 #include "vextern.h"
+VEXTERN(linux_version_code)
 #undef	VEXTERN
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/int80.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/int80.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/int80.S	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/int80.S	2015-01-21 12:02:48.130096128 +0300
@@ -1,15 +1,15 @@
 /*
  * Code for the vDSO.  This version uses the old int $0x80 method.
  *
- * First get the common code for the sigreturn entry points.
- * This must come first.
+ * NOTE:
+ * 1) __kernel_vsyscall _must_ be first in this page.
+ * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
+ *    for details.
  */
-#include "sigreturn.S"
 
 	.text
 	.globl __kernel_vsyscall
 	.type __kernel_vsyscall,@function
-	ALIGN
 __kernel_vsyscall:
 .LSTART_vsyscall:
 	int $0x80
@@ -54,3 +54,8 @@ VDSO32_vsyscall_eh_frame_size = 0x40
 	.section .data,"aw",@progbits
 	.space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0
 	.previous
+
+/*
+ * Get the common code for the sigreturn entry points.
+ */
+#include "sigreturn.S"
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/note.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/note.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/note.S	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/note.S	2015-01-21 12:02:48.162095278 +0300
@@ -9,7 +9,9 @@
 /* Ideally this would use UTS_NAME, but using a quoted string here
    doesn't work. Remember to change this when changing the
    kernel's name. */
+	.globl vdso_linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+vdso_linux_version_code:
 	.long LINUX_VERSION_CODE
 ELFNOTE_END
 
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/sigreturn.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/sigreturn.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/sigreturn.S	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/sigreturn.S	2015-01-21 12:02:48.130096128 +0300
@@ -15,6 +15,7 @@
 #endif
 
 	.text
+	.org __kernel_vsyscall+0x100,0x90
 	.globl __kernel_sigreturn
 	.type __kernel_sigreturn,@function
 	ALIGN
@@ -27,6 +28,7 @@ __kernel_sigreturn:
 	nop
 	.size __kernel_sigreturn,.-.LSTART_sigreturn
 
+	.org __kernel_vsyscall+0x200,0x90
 	.globl __kernel_rt_sigreturn
 	.type __kernel_rt_sigreturn,@function
 	ALIGN
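The two `.org` directives above pad with 0x90 (NOP) bytes so that `__kernel_sigreturn` always sits at `__kernel_vsyscall + 0x100` and `__kernel_rt_sigreturn` at `__kernel_vsyscall + 0x200`, no matter how the code before them changes size. Pinning these offsets presumably serves the same checkpoint/restore (CPT) compatibility purpose spelled out in the sysenter stub below: a restored process must find the trampolines at the addresses already recorded in its signal frames.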
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/syscall.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/syscall.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/syscall.S	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/syscall.S	2015-01-21 12:02:48.130096128 +0300
@@ -1,11 +1,12 @@
 /*
  * Code for the vDSO.  This version uses the syscall instruction.
  *
- * First get the common code for the sigreturn entry points.
- * This must come first.
+ * NOTE:
+ * 1) __kernel_vsyscall _must_ be first in this page.
+ * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
+ *    for details.
  */
 #define SYSCALL_ENTER_KERNEL	syscall
-#include "sigreturn.S"
 
 #include <asm/segment.h>
 
@@ -75,3 +76,7 @@ VDSO32_vsyscall_eh_frame_size = 0x40
 	.section .data,"aw",@progbits
 	.space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0
 	.previous
+/*
+ * Get the common code for the sigreturn entry points.
+ */
+#include "sigreturn.S"
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/sysenter.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/sysenter.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/sysenter.S	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/sysenter.S	2015-01-21 12:02:48.130096128 +0300
@@ -1,10 +1,11 @@
 /*
  * Code for the vDSO.  This version uses the sysenter instruction.
  *
- * First get the common code for the sigreturn entry points.
- * This must come first.
+ * NOTE:
+ * 1) __kernel_vsyscall _must_ be first in this page.
+ * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
+ *    for details.
  */
-#include "sigreturn.S"
 
 /*
  * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
@@ -27,7 +28,6 @@
 	.text
 	.globl __kernel_vsyscall
 	.type __kernel_vsyscall,@function
-	ALIGN
 __kernel_vsyscall:
 .LSTART_vsyscall:
 	push %ecx
@@ -38,9 +38,12 @@ __kernel_vsyscall:
 .Lenter_kernel:
 	movl %esp,%ebp
 	sysenter
+__kernel_vsyscall_end:
 
 	/* 7: align return point with nop's to make disassembly easier */
-	.space 7,0x90
+	/* For CPT: the VSYSCALL32_SYSEXIT value must be 0x420 for backward
+	   compatibility with other kernel versions. */
+	.space __kernel_vsyscall + 0x20 - __kernel_vsyscall_end - 2, 0x90
 
 	/* 14: System call restart point is here! (SYSENTER_RETURN-2) */
 	jmp .Lenter_kernel
@@ -114,3 +117,8 @@ VDSO32_SYSENTER_RETURN:	/* Symbol used b
 	 * to verify it matches the other versions.
 	 */
 VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI)
+
+/*
+ * Get the common code for the sigreturn entry points.
+ */
+#include "sigreturn.S"
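The `.space` expression above is the same fixed-offset trick worked out explicitly: padding runs from `__kernel_vsyscall_end` up to `__kernel_vsyscall + 0x20 - 2`, the 2-byte `jmp .Lenter_kernel` then occupies offsets 0x1e-0x1f, and the post-sysenter return point lands exactly at `__kernel_vsyscall + 0x20`, two bytes past the restart point -- which is what the CPT compatibility comment requires.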
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/vdso32.lds.S linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/vdso32.lds.S
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32/vdso32.lds.S	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32/vdso32.lds.S	2015-01-21 12:02:48.162095278 +0300
@@ -35,3 +35,4 @@ VDSO32_PRELINK		= VDSO_PRELINK;
 VDSO32_vsyscall		= __kernel_vsyscall;
 VDSO32_sigreturn	= __kernel_sigreturn;
 VDSO32_rt_sigreturn	= __kernel_rt_sigreturn;
+VDSO32_linux_version_code = vdso_linux_version_code;
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32-setup.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32-setup.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vdso32-setup.c	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vdso32-setup.c	2015-01-21 12:02:58.673816242 +0300
@@ -16,6 +16,11 @@
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/ve.h>
+
+#include <bc/vmpages.h>
 
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
@@ -193,7 +198,8 @@ static __init void relocate_vdso(Elf32_E
 	}
 }
 
-static struct page *vdso32_pages[1];
+struct page *vdso32_pages[1];
+EXPORT_SYMBOL(vdso32_pages);
 
 #ifdef CONFIG_X86_64
 
@@ -303,16 +309,139 @@ int __init sysenter_setup(void)
 	return 0;
 }
 
+EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN);
+EXPORT_SYMBOL_GPL(VDSO32_PRELINK);
+
+static DEFINE_MUTEX(vdso32_mutex);
+
+static struct page **uts_prep_vdso_pages_locked(int map)
+{
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+	struct mm_struct *mm = current->mm;
+	struct ve_struct *ve = get_exec_env();
+	struct page **pages = vdso32_pages;
+	int n1, n2, n3, new_version;
+	struct page **new_pages, **p;
+	void *addr;
+
+	/*
+	 * Simply reuse vDSO pages if we can.
+	 */
+	if (uts_ns == &init_uts_ns)
+		return vdso32_pages;
+
+	/*
+	 * Dirty lockless hack. Strictly speaking we need to return
+	 * @p here if it's non-NULL, but since only one transition
+	 * { NULL -> non-NULL } is possible, we simply return
+	 * @uts_ns->vdso32.pages.
+	 */
+	p = ACCESS_ONCE(uts_ns->vdso32.pages);
+	smp_read_barrier_depends();
+	if (p)
+		return uts_ns->vdso32.pages;
+
+	up_write(&mm->mmap_sem);
+
+	if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) == 3) {
+		/*
+		 * If there were no changes on version simply reuse
+		 * preallocated one.
+		 */
+		new_version = KERNEL_VERSION(n1, n2, n3);
+		if (new_version == LINUX_VERSION_CODE)
+			goto out;
+#ifdef CONFIG_X86_32
+		else {
+			/*
+			 * Native x86-32 mode requires runtime vDSO
+			 * relocations, which the old vanilla kernels do
+			 * not support; even if it were ported, we would
+			 * break compatibility with the rhel5 vDSO, which
+			 * has its addresses hardcoded. Thus simply warn
+			 * about this problem once and continue execution
+			 * without virtualization. After all, i686 is
+			 * pretty outdated nowadays.
+			 */
+			pr_warn_once("x86-32 vDSO virtualization is not supported.");
+			goto out;
+		}
+#endif
+	} else {
+		/*
+		 * If the admin passed a malformed release string
+		 * here, warn once but continue working without
+		 * vDSO virtualization at all. That's better than
+		 * bailing out with an error.
+		 */
+		pr_warn_once("Wrong release uts name format detected."
+			     " Ignoring vDSO virtualization.\n");
+		goto out;
+	}
+
+	mutex_lock(&vdso32_mutex);
+	if (uts_ns->vdso32.pages) {
+		pages = uts_ns->vdso32.pages;
+		goto out_unlock;
+	}
+
+	uts_ns->vdso32.nr_pages	= 1;
+	uts_ns->vdso32.size	= PAGE_SIZE;
+	uts_ns->vdso32.version_off = (unsigned long)VDSO32_SYMBOL(0, linux_version_code);
+	new_pages		= kmalloc(sizeof(struct page *), GFP_KERNEL);
+	if (!new_pages) {
+		pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid);
+		pages = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
+
+	new_pages[0] = alloc_page(GFP_KERNEL);
+	if (!new_pages[0]) {
+		pr_err("Can't allocate page for VE %d\n", ve->veid);
+		kfree(new_pages);
+		pages = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
+
+	copy_page(page_address(new_pages[0]), page_address(vdso32_pages[0]));
+
+	addr = page_address(new_pages[0]);
+	*((int *)(addr + uts_ns->vdso32.version_off)) = new_version;
+	smp_wmb();
+
+	pages = uts_ns->vdso32.pages = new_pages;
+
+	pr_debug("vDSO version transition %d -> %d for VE %d\n",
+		 LINUX_VERSION_CODE, new_version, ve->veid);
+
+out_unlock:
+	mutex_unlock(&vdso32_mutex);
+out:
+	down_write(&mm->mmap_sem);
+	return pages;
+}
+
 /* Setup a VMA at program startup for the vsyscall page */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
+				unsigned long map_address)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long addr;
+	unsigned long addr = map_address;
 	int ret = 0;
 	bool compat;
+	unsigned long flags;
 
-	if (vdso_enabled == VDSO_DISABLED)
+	if (vdso_enabled == VDSO_DISABLED && map_address == 0) {
+		current->mm->context.vdso = NULL;
 		return 0;
+	}
+
+	flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE |
+		mm->def_flags;
+
+	ret = -ENOMEM;
+	if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT))
+		goto err_charge;
 
 	down_write(&mm->mmap_sem);
 
@@ -322,26 +451,31 @@ int arch_setup_additional_pages(struct l
 
 	map_compat_vdso(compat);
 
-	if (compat)
-		addr = VDSO_HIGH_BASE;
-	else {
-		addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1);
+	if (!compat || map_address) {
+		addr = get_unmapped_area_prot(NULL, addr, PAGE_SIZE, 0, 0, 1);
 		if (IS_ERR_VALUE(addr)) {
 			ret = addr;
 			goto up_fail;
 		}
-	}
+	} else
+		addr = VDSO_HIGH_BASE;
 
 	current->mm->context.vdso = (void *)addr;
 
-	if (compat_uses_vma || !compat) {
+	if (compat_uses_vma || !compat || map_address) {
+		struct page **pages = uts_prep_vdso_pages_locked(compat);
+		if (IS_ERR(pages)) {
+			ret = PTR_ERR(pages);
+			goto up_fail;
+		}
+
 		/*
 		 * MAYWRITE to allow gdb to COW and set breakpoints
 		 */
 		ret = install_special_mapping(mm, addr, PAGE_SIZE,
 					      VM_READ|VM_EXEC|
 					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-					      vdso32_pages);
+					      pages);
 
 		if (ret)
 			goto up_fail;
@@ -355,9 +489,13 @@ int arch_setup_additional_pages(struct l
 		current->mm->context.vdso = NULL;
 
 	up_write(&mm->mmap_sem);
+	if (ret < 0)
+		ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL);
+err_charge:
 
 	return ret;
 }
+EXPORT_SYMBOL(arch_setup_additional_pages);
 
 #ifdef CONFIG_X86_64
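uts_prep_vdso_pages_locked() above gives each UTS namespace whose release string differs from the host a private copy of the vDSO page, with the LINUX_VERSION_CODE word at version_off rewritten so that software reading the vDSO's ELF version note sees the containerized release. A standalone demo of the KERNEL_VERSION() packing it relies on (not part of the patch):

	#include <stdio.h>

	#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c))

	int main(void)
	{
		/* e.g. a container reporting 2.6.18 on a 2.6.32 host */
		printf("container: %#x\n", KERNEL_VERSION(2, 6, 18)); /* 0x20612 */
		printf("host:      %#x\n", KERNEL_VERSION(2, 6, 32)); /* 0x20620 */
		return 0;
	}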
 
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vgetcpu.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vgetcpu.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vgetcpu.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vgetcpu.c	2015-01-21 12:02:48.136095969 +0300
@@ -13,8 +13,8 @@
 #include <asm/vgtod.h>
 #include "vextern.h"
 
-notrace long
-__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+notrace noinline long
+___vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
 	unsigned int p;
 
@@ -32,5 +32,13 @@ __vdso_getcpu(unsigned *cpu, unsigned *n
 	return 0;
 }
 
+long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+						__attribute__((section("GETCPU")));
+notrace long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+{
+	return ___vdso_getcpu(cpu, node, unused);
+}
+
 long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 	__attribute__((weak, alias("__vdso_getcpu")));
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vma.c linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vma.c
--- linux-2.6.32-504.3.3.el6.orig/arch/x86/vdso/vma.c	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/x86/vdso/vma.c	2015-01-21 12:02:57.962835113 +0300
@@ -4,6 +4,7 @@
  * Subject to the GPL, v.2
  */
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/init.h>
@@ -14,17 +15,25 @@
 #include <asm/proto.h>
 #include <asm/vdso.h>
 
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/ve.h>
+
 #include "vextern.h"		/* Just for VMAGIC.  */
 #undef VEXTERN
 
 unsigned int __read_mostly vdso_enabled = 1;
 
 extern char vdso_start[], vdso_end[];
+extern char vdso_rhel5_start[], vdso_rhel5_end[];
 extern unsigned short vdso_sync_cpuid;
 
 static struct page **vdso_pages;
 static unsigned vdso_size;
 
+static struct page **vdso_rhel5_pages;
+static unsigned vdso_rhel5_size;
+
 static inline void *var_ref(void *p, char *name)
 {
 	if (*(void **)p != (void *)VMAGIC) {
@@ -62,6 +71,12 @@ static int __init init_vdso_vars(void)
 		vdso_enabled = 0;
 	}
 
+	init_uts_ns.vdso.addr		= vbase;
+	init_uts_ns.vdso.pages		= vdso_pages;
+	init_uts_ns.vdso.nr_pages	= npages;
+	init_uts_ns.vdso.size		= vdso_size;
+	init_uts_ns.vdso.version_off	= (unsigned long)VDSO64_SYMBOL(0, linux_version_code);
+
 #define VEXTERN(x) \
 	*(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
 #include "vextern.h"
@@ -75,6 +90,47 @@ static int __init init_vdso_vars(void)
 }
 __initcall(init_vdso_vars);
 
+static int __init init_vdso_rhel5_vars(void)
+{
+	int npages = (vdso_rhel5_end - vdso_rhel5_start + PAGE_SIZE - 1) / PAGE_SIZE;
+	int i;
+	char *vbase;
+
+	vdso_rhel5_size = npages << PAGE_SHIFT;
+	vdso_rhel5_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
+	if (!vdso_rhel5_pages)
+		goto oom;
+	for (i = 0; i < npages; i++) {
+		struct page *p;
+		p = alloc_page(GFP_KERNEL);
+		if (!p)
+			goto oom;
+		vdso_rhel5_pages[i] = p;
+		copy_page(page_address(p), vdso_rhel5_start + i*PAGE_SIZE);
+	}
+
+	vbase = vmap(vdso_rhel5_pages, npages, 0, PAGE_KERNEL);
+	if (!vbase)
+		goto oom;
+
+	if (memcmp(vbase, "\177ELF", 4)) {
+		printk("VDSO: I'm broken; not ELF\n");
+		vdso_enabled = 0;
+	}
+
+#define VEXTERN(x) \
+	*(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, rhel5_ ## x), #x) = &__ ## x;
+#include "vextern.h"
+#undef VEXTERN
+	return 0;
+
+ oom:
+	printk("Cannot allocate vdso\n");
+	vdso_enabled = 0;
+	return -ENOMEM;
+}
+__initcall(init_vdso_rhel5_vars);
+
 struct linux_binprm;
 
 /* Put the vdso above the (randomized) stack with another randomized offset.
@@ -108,17 +164,24 @@ static unsigned long vdso_addr(unsigned 
 
 /* Setup a VMA at program startup for the vsyscall page.
    Not called for compat tasks */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
+				unsigned long map_address, struct page ** vdso_pages,
+				unsigned vdso_size)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr;
 	int ret;
 
-	if (!vdso_enabled)
+	if (!vdso_enabled && map_address == 0) {
+		current->mm->context.vdso = NULL;
 		return 0;
+	}
 
 	down_write(&mm->mmap_sem);
-	addr = vdso_addr(mm->start_stack, vdso_size);
+	if (map_address)
+		addr = map_address;
+	else
+		addr = vdso_addr(mm->start_stack, vdso_size);
 	addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
 	if (IS_ERR_VALUE(addr)) {
 		ret = addr;
@@ -141,6 +204,135 @@ up_fail:
 	return ret;
 }
 
+static DEFINE_MUTEX(vdso_mutex);
+
+static int uts_arch_setup_additional_pages(struct linux_binprm *bprm,
+					   int uses_interp,
+					   unsigned long map_address)
+{
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+	struct ve_struct *ve = get_exec_env();
+	int i, n1, n2, n3, new_version;
+	struct page **new_pages, **p;
+
+	/*
+	 * For the node, or in case the UTS release is unchanged,
+	 * simply map the preallocated original vDSO.
+	 *
+	 * In turn, if we have already allocated one for this UTS
+	 * namespace, simply reuse it. It improves speed significantly.
+	 */
+	if (uts_ns == &init_uts_ns)
+		goto map_init_uts;
+
+	/*
+	 * Dirty lockless hack. Strictly speaking we need to return
+	 * @p here if it's non-NULL, but since only one transition
+	 * { NULL -> non-NULL } is possible, we simply return
+	 * @uts_ns->vdso.pages.
+	 */
+	p = ACCESS_ONCE(uts_ns->vdso.pages);
+	smp_read_barrier_depends();
+	if (p)
+		goto map_uts;
+
+	if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) == 3) {
+		/*
+		 * If there were no changes on version simply reuse
+		 * preallocated one.
+		 */
+		new_version = KERNEL_VERSION(n1, n2, n3);
+		if (new_version == LINUX_VERSION_CODE)
+			goto map_init_uts;
+	} else {
+		/*
+		 * If the admin passed a malformed release string
+		 * here, warn once but continue working without
+		 * vDSO virtualization at all. That's better than
+		 * bailing out with an error.
+		 */
+		pr_warn_once("Wrong release uts name format detected."
+			     " Ignoring vDSO virtualization.\n");
+		goto map_init_uts;
+	}
+
+	mutex_lock(&vdso_mutex);
+	if (uts_ns->vdso.pages) {
+		mutex_unlock(&vdso_mutex);
+		goto map_uts;
+	}
+
+	uts_ns->vdso.nr_pages	= init_uts_ns.vdso.nr_pages;
+	uts_ns->vdso.size	= init_uts_ns.vdso.size;
+	uts_ns->vdso.version_off = init_uts_ns.vdso.version_off;
+	new_pages		= kmalloc(sizeof(struct page *) * init_uts_ns.vdso.nr_pages, GFP_KERNEL);
+	if (!new_pages) {
+		pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid);
+		goto out_unlock;
+	}
+
+	for (i = 0; i < uts_ns->vdso.nr_pages; i++) {
+		struct page *p = alloc_page(GFP_KERNEL);
+		if (!p) {
+			pr_err("Can't allocate page for VE %d\n", ve->veid);
+			for (; i > 0; i--)
+				put_page(new_pages[i - 1]);
+			kfree(new_pages);
+			goto out_unlock;
+		}
+		new_pages[i] = p;
+		copy_page(page_address(p), page_address(init_uts_ns.vdso.pages[i]));
+	}
+
+	uts_ns->vdso.addr = vmap(new_pages, uts_ns->vdso.nr_pages, 0, PAGE_KERNEL);
+	if (!uts_ns->vdso.addr) {
+		pr_err("Can't map vDSO pages for VE %d\n", ve->veid);
+		for (i = 0; i < uts_ns->vdso.nr_pages; i++)
+			put_page(new_pages[i]);
+		kfree(new_pages);
+		goto out_unlock;
+	}
+
+	*((int *)(uts_ns->vdso.addr + uts_ns->vdso.version_off)) = new_version;
+	smp_wmb();
+	uts_ns->vdso.pages = new_pages;
+	mutex_unlock(&vdso_mutex);
+
+	pr_debug("vDSO version transition %d -> %d for VE %d\n",
+		 LINUX_VERSION_CODE, new_version, ve->veid);
+
+map_uts:
+	return __arch_setup_additional_pages(bprm, uses_interp, map_address,
+					     uts_ns->vdso.pages, uts_ns->vdso.size);
+map_init_uts:
+	return __arch_setup_additional_pages(bprm, uses_interp, map_address,
+					     init_uts_ns.vdso.pages, init_uts_ns.vdso.size);
+out_unlock:
+	mutex_unlock(&vdso_mutex);
+	return -ENOMEM;
+}
+
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
+				unsigned long map_address)
+{
+	return uts_arch_setup_additional_pages(bprm, uses_interp, map_address);
+}
+EXPORT_SYMBOL(arch_setup_additional_pages);
+
+int arch_setup_additional_pages_rhel5(struct linux_binprm *bprm, int uses_interp,
+				unsigned long map_address)
+{
+	return __arch_setup_additional_pages(bprm, uses_interp, map_address,
+					vdso_rhel5_pages, vdso_rhel5_size);
+}
+EXPORT_SYMBOL(arch_setup_additional_pages_rhel5);
+
+int vdso_is_rhel5(struct page *page)
+{
+	return page == vdso_rhel5_pages[0];
+}
+EXPORT_SYMBOL(vdso_is_rhel5);
+
 static __init int vdso_setup(char *s)
 {
 	vdso_enabled = simple_strtoul(s, NULL, 0);
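uts_arch_setup_additional_pages() above publishes the per-namespace vDSO with a classic lockless publish/consume pairing: the writer fully initializes the copied pages and patches the version word, issues smp_wmb() so those stores are ordered before the single store that sets uts_ns->vdso.pages, and readers pair with ACCESS_ONCE() plus smp_read_barrier_depends(). A minimal kernel-style sketch of the pattern in isolation (hypothetical names):

	struct payload {
		int data;
	};

	static struct payload *shared;	/* NULL until published */

	/* Writer: runs under vdso_mutex in the code above. */
	static void publish(struct payload *p)
	{
		p->data = 42;		/* initialize everything first ... */
		smp_wmb();		/* ... order it before the pointer store */
		shared = p;		/* single-store publish */
	}

	/* Reader: lockless, may race with publish(). */
	static int consume(void)
	{
		struct payload *p = ACCESS_ONCE(shared);

		smp_read_barrier_depends();
		return p ? p->data : -1;
	}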
diff -upr linux-2.6.32-504.3.3.el6.orig/arch/xtensa/include/asm/mman.h linux-2.6.32-504.3.3.el6-042stab103_6/arch/xtensa/include/asm/mman.h
--- linux-2.6.32-504.3.3.el6.orig/arch/xtensa/include/asm/mman.h	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/arch/xtensa/include/asm/mman.h	2015-01-21 12:02:41.702266781 +0300
@@ -83,6 +83,9 @@
 #define MADV_MERGEABLE   12		/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13		/* KSM may not merge identical pages */
 
+#define MADV_HUGEPAGE	14		/* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE	15		/* Not worth backing with hugepages */
+
 #define MADV_DONTDUMP   16		/* Explicitly exclude from the core dump,
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
diff -upr linux-2.6.32-504.3.3.el6.orig/block/blk-cgroup.c linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-cgroup.c
--- linux-2.6.32-504.3.3.el6.orig/block/blk-cgroup.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-cgroup.c	2015-01-21 12:02:43.567217267 +0300
@@ -173,6 +173,19 @@ static void blkio_add_stat(uint64_t *sta
 		stat[BLKIO_STAT_ASYNC] += add;
 }
 
+static void blkio_max_stat(uint64_t *stat, uint64_t cur, bool direction,
+				bool sync)
+{
+	if (direction)
+		stat[BLKIO_STAT_WRITE] = max(stat[BLKIO_STAT_WRITE], cur);
+	else
+		stat[BLKIO_STAT_READ] = max(stat[BLKIO_STAT_READ], cur);
+	if (sync)
+		stat[BLKIO_STAT_SYNC] = max(stat[BLKIO_STAT_SYNC], cur);
+	else
+		stat[BLKIO_STAT_ASYNC] = max(stat[BLKIO_STAT_ASYNC], cur);
+}
+
 /*
  * Decrements the appropriate stat variable if non-zero depending on the
  * request type. Panics on value being zero.
@@ -404,9 +417,12 @@ void blkiocg_update_completion_stats(str
 	if (time_after64(now, io_start_time))
 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
 				now - io_start_time, direction, sync);
-	if (time_after64(io_start_time, start_time))
+	if (time_after64(io_start_time, start_time)) {
 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
 				io_start_time - start_time, direction, sync);
+		blkio_max_stat(stats->stat_arr[BLKIO_STAT_WAIT_MAX],
+				io_start_time - start_time, direction, sync);
+	}
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
@@ -423,20 +439,76 @@ void blkiocg_update_io_merged_stats(stru
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
-/*
- * This function allocates the per cpu stats for blkio_group. Should be called
- * from sleepable context as alloc_per_cpu() requires that.
- */
+static LIST_HEAD(stats_alloc_list);
+static DEFINE_SPINLOCK(stats_alloc_lock);
+
+static void blkio_stats_alloc_fn(struct work_struct *work)
+{
+	struct delayed_work *dw = container_of(work, struct delayed_work, work);
+	struct blkio_group_stats_cpu __percpu *stats;
+	struct blkio_group *blkg;
+
+	spin_lock_irq(&stats_alloc_lock);
+	while (!list_empty(&stats_alloc_list)) {
+		spin_unlock_irq(&stats_alloc_lock);
+
+		stats = alloc_percpu(struct blkio_group_stats_cpu);
+		if (!stats) {
+			/* We must not fail here, so retry after a timeout */
+			schedule_delayed_work(dw, HZ);
+			return;
+		}
+
+		spin_lock_irq(&stats_alloc_lock);
+		if (list_empty(&stats_alloc_list)) {
+			free_percpu(stats);
+			break;
+		}
+		blkg = list_first_entry(&stats_alloc_list,
+				struct blkio_group, stats_alloc_list);
+		list_del_init(&blkg->stats_alloc_list);
+		blkg->stats_cpu = stats;
+	}
+	spin_unlock_irq(&stats_alloc_lock);
+}
+
+static DECLARE_DELAYED_WORK(stats_alloc_work, blkio_stats_alloc_fn);
+static DEFINE_PER_CPU(struct blkio_group_stats_cpu, stats_plug);
+
 int blkio_alloc_blkg_stats(struct blkio_group *blkg)
 {
-	/* Allocate memory for per cpu stats */
-	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-	if (!blkg->stats_cpu)
-		return -ENOMEM;
+	unsigned long flags;
+
+	/* Set temporary plug */
+	blkg->stats_cpu = &per_cpu_var(stats_plug);
+
+	/* Queue per cpu stat allocation from worker thread. */
+	spin_lock_irqsave(&stats_alloc_lock, flags);
+	list_add(&blkg->stats_alloc_list, &stats_alloc_list);
+	spin_unlock_irqrestore(&stats_alloc_lock, flags);
+
+	schedule_delayed_work(&stats_alloc_work, 0);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
 
+void blkio_free_blkg_stats(struct blkio_group *blkg)
+{
+	unsigned long flags;
+
+	if (!blkg->stats_cpu)
+		return;
+
+	/* Cancel pending stats allocation */
+	spin_lock_irqsave(&stats_alloc_lock, flags);
+	list_del_init(&blkg->stats_alloc_list);
+	spin_unlock_irqrestore(&stats_alloc_lock, flags);
+
+	if (blkg->stats_cpu != &per_cpu_var(stats_plug))
+		free_percpu(blkg->stats_cpu);
+}
+EXPORT_SYMBOL_GPL(blkio_free_blkg_stats);
+
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 		struct blkio_group *blkg, void *key, dev_t dev,
 		enum blkio_policy_id plid)
@@ -449,6 +521,7 @@ void blkiocg_add_blkio_group(struct blki
 	blkg->blkcg_id = css_id(&blkcg->css);
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 	blkg->plid = plid;
+	blkg->blk_ub = blkcg->blk_ub;
 	spin_unlock_irqrestore(&blkcg->lock, flags);
 	/* Need to take css reference ? */
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -626,7 +699,7 @@ static uint64_t blkio_fill_stat(char *st
 }
 
 
-static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
+uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
 			enum stat_type_cpu type, enum stat_sub_type sub_type)
 {
 	int cpu;
@@ -1202,6 +1275,9 @@ static int blkiocg_file_read_map(struct 
 		case BLKIO_PROP_io_wait_time:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_WAIT_TIME, 1, 0);
+		case BLKIO_PROP_io_wait_max:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_WAIT_MAX, 0, 0);
 		case BLKIO_PROP_io_merged:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_MERGED, 1, 0);
@@ -1317,6 +1393,25 @@ blkiocg_file_write_u64(struct cgroup *cg
 	return 0;
 }
 
+int blkio_cgroup_set_weight(struct cgroup *cgroup, u64 weight)
+{
+	return blkio_weight_write(cgroup_to_blkio_cgroup(cgroup), weight);
+}
+
+void blkio_cgroup_set_ub(struct cgroup *cgroup, struct user_beancounter *ub)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkcg->lock, flags);
+	blkcg->blk_ub = ub;
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
+		blkg->blk_ub = ub;
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+}
+
 struct cftype blkio_files[] = {
 	{
 		.name = "weight_device",
@@ -1370,6 +1465,12 @@ struct cftype blkio_files[] = {
 		.read_map = blkiocg_file_read_map,
 	},
 	{
+		.name = "io_wait_max",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_io_wait_max),
+		.read_map = blkiocg_file_read_map,
+	},
+	{
 		.name = "io_merged",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
 				BLKIO_PROP_io_merged),
@@ -1545,6 +1646,7 @@ done:
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
 	INIT_LIST_HEAD(&blkcg->policy_list);
+	blkcg->blk_ub = &ub0;
 	return &blkcg->css;
 }
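The reworked blkio_alloc_blkg_stats() above defers the real alloc_percpu() -- which may sleep -- to a delayed work item, installing the shared static per-CPU `stats_plug` as a placeholder in the meantime and retrying every HZ if the allocation fails; this suggests blkg creation can now happen in contexts where sleeping is not allowed. Correspondingly, blkio_free_blkg_stats() must both unlink a group whose allocation is still pending and take care never to free_percpu() the shared plug itself.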
 
diff -upr linux-2.6.32-504.3.3.el6.orig/block/blk-cgroup.h linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-cgroup.h
--- linux-2.6.32-504.3.3.el6.orig/block/blk-cgroup.h	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-cgroup.h	2015-01-21 12:02:43.513218700 +0300
@@ -33,6 +33,8 @@ enum stat_type {
 	BLKIO_STAT_SERVICE_TIME = 0,
 	/* Total time spent waiting in scheduler queue in ns */
 	BLKIO_STAT_WAIT_TIME,
+	/* Maximum time spent waiting in scheduler queue in ns */
+	BLKIO_STAT_WAIT_MAX,
 	/* Number of IOs merged */
 	BLKIO_STAT_MERGED,
 	/* Number of IOs queued up */
@@ -83,6 +85,7 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_sectors,
 	BLKIO_PROP_io_service_time,
 	BLKIO_PROP_io_wait_time,
+	BLKIO_PROP_io_wait_max,
 	BLKIO_PROP_io_merged,
 	BLKIO_PROP_io_queued,
 	BLKIO_PROP_avg_queue_size,
@@ -108,6 +111,7 @@ struct blkio_cgroup {
 	spinlock_t lock;
 	struct hlist_head blkg_list;
 	struct list_head policy_list; /* list of blkio_policy_node */
+	struct user_beancounter *blk_ub;
 };
 
 struct blkio_group_stats {
@@ -155,6 +159,9 @@ struct blkio_group {
 	char path[128];
 	/* The device MKDEV(major, minor), this group has been created for */
 	dev_t dev;
+	char *dev_name;
+	struct user_beancounter *blk_ub;
+
 	/* policy which owns this blk group */
 	enum blkio_policy_id plid;
 
@@ -163,6 +170,8 @@ struct blkio_group {
 	struct blkio_group_stats stats;
 	/* Per cpu stats pointer */
 	struct blkio_group_stats_cpu __percpu *stats_cpu;
+
+	struct list_head stats_alloc_list;
 };
 
 struct blkio_policy_node {
@@ -208,6 +217,8 @@ typedef void (blkio_update_group_read_io
 			struct blkio_group *blkg, unsigned int read_iops);
 typedef void (blkio_update_group_write_iops_fn) (void *key,
 			struct blkio_group *blkg, unsigned int write_iops);
+uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
+			enum stat_type_cpu type, enum stat_sub_type sub_type);
 
 struct blkio_policy_ops {
 	blkio_unlink_group_fn *blkio_unlink_group_fn;
@@ -299,6 +310,7 @@ extern void blkiocg_add_blkio_group(stru
 	struct blkio_group *blkg, void *key, dev_t dev,
 	enum blkio_policy_id plid);
 extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
+extern void blkio_free_blkg_stats(struct blkio_group *blkg);
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
 						void *key);
@@ -314,6 +326,8 @@ void blkiocg_update_io_add_stats(struct 
 		struct blkio_group *curr_blkg, bool direction, bool sync);
 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 					bool direction, bool sync);
+int blkio_cgroup_set_weight(struct cgroup *cgroup, u64 weight);
+void blkio_cgroup_set_ub(struct cgroup *cgroup, struct user_beancounter *ub);
 #else
 struct cgroup;
 static inline struct blkio_cgroup *
@@ -326,6 +340,7 @@ static inline void blkiocg_add_blkio_gro
 		enum blkio_policy_id plid) {}
 
 static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
+static inline void blkio_free_blkg_stats(struct blkio_group *blkg) { }
 
 static inline int
 blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
@@ -345,5 +360,11 @@ static inline void blkiocg_update_io_add
 		struct blkio_group *curr_blkg, bool direction, bool sync) {}
 static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 						bool direction, bool sync) {}
+static inline int blkio_cgroup_set_weight(struct cgroup *cgroup, u64 weight)
+{
+	return -EINVAL;
+}
+static inline void blkio_cgroup_set_ub(struct cgroup *cgroup,
+		struct user_beancounter *ub) { }
 #endif
 #endif /* _BLK_CGROUP_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/block/blk-core.c linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-core.c
--- linux-2.6.32-504.3.3.el6.orig/block/blk-core.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-core.c	2015-01-21 12:02:52.548978821 +0300
@@ -632,6 +632,7 @@ struct request_queue *blk_alloc_queue_no
 	q->orderr = q->ordcolor = 0;
 	q->orig_bar_rq = NULL;
 
+	atomic_set(&q->flush_tag, 0);
 	/*
 	 * By default initialize queue_lock to internal lock and driver can
 	 * override it later if need be.
@@ -914,16 +915,15 @@ static struct request *get_request(struc
 			if (!blk_queue_full(q, is_sync)) {
 				ioc_set_batching(q, ioc);
 				blk_set_queue_full(q, is_sync);
-			} else {
-				if (may_queue != ELV_MQUEUE_MUST
-						&& !ioc_batching(q, ioc)) {
-					/*
-					 * The queue is full and the allocating
-					 * process is not a "batcher", and not
-					 * exempted by the IO scheduler
-					 */
-					goto out;
-				}
+			} else if (may_queue == ELV_MQUEUE_MUST) {
+				ioc_set_batching(q, ioc);
+			} else if (!ioc_batching(q, ioc)) {
+				/*
+				 * The queue is full and the allocating
+				 * process is not a "batcher", and not
+				 * exempted by the IO scheduler
+				 */
+				goto out;
 			}
 		}
 		blk_set_queue_congested(q, is_sync);
@@ -1682,10 +1682,9 @@ static inline void __generic_make_reques
 	old_sector = -1;
 	old_dev = 0;
 	do {
-		char b[BDEVNAME_SIZE];
-
 		q = bdev_get_queue(bio->bi_bdev);
 		if (unlikely(!q)) {
+			char b[BDEVNAME_SIZE];
 			printk(KERN_ERR
 			       "generic_make_request: Trying to access "
 				"nonexistent block-device %s (%Lu)\n",
@@ -1696,6 +1695,7 @@ static inline void __generic_make_reques
 
 		if (likely(bio_is_rw(bio) &&
 			   nr_sectors > queue_max_hw_sectors(q))) {
+			char b[BDEVNAME_SIZE];
 			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
 			       bdevname(bio->bi_bdev, b),
 			       bio_sectors(bio),
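
The get_request() rework above is behaviour-preserving except in one case: a request the elevator marks ELV_MQUEUE_MUST now also turns the submitting process into a batcher, instead of merely slipping past the full-queue check once. A condensed sketch of the resulting gate (simplified types, not the kernel code):

enum may_queue { ELV_MQUEUE_MAY, ELV_MQUEUE_MUST };

static int may_allocate(int queue_full, enum may_queue mq, int *is_batcher)
{
	if (!queue_full) {
		*is_batcher = 1;        /* the task that fills the queue batches */
		return 1;
	}
	if (mq == ELV_MQUEUE_MUST) {
		*is_batcher = 1;        /* new: MUST also enables batching */
		return 1;
	}
	return *is_batcher;             /* full queue: only batchers proceed */
}
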
diff -upr linux-2.6.32-504.3.3.el6.orig/block/blk-exec.c linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-exec.c
--- linux-2.6.32-504.3.3.el6.orig/block/blk-exec.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-exec.c	2015-01-21 12:02:42.972233063 +0300
@@ -109,7 +109,7 @@ int blk_execute_rq(struct request_queue 
 
 	rq->end_io_data = &wait;
 	blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
-	wait_for_completion(&wait);
+	wait_for_completion_io(&wait);
 
 	if (rq->errors)
 		err = -EIO;
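
wait_for_completion_io() behaves like wait_for_completion() but charges the sleep to iowait, so time blocked on synchronous requests shows up correctly in top/vmstat. Roughly, as a sketch of the intent (the real helper sleeps via io_schedule()):

#include <linux/completion.h>
#include <linux/sched.h>

static void wait_for_completion_io_sketch(struct completion *x)
{
	current->in_iowait = 1;         /* counted as iowait while asleep */
	wait_for_completion(x);         /* real code uses io_schedule() */
	current->in_iowait = 0;
}
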
diff -upr linux-2.6.32-504.3.3.el6.orig/block/blk-flush.c linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-flush.c
--- linux-2.6.32-504.3.3.el6.orig/block/blk-flush.c	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-flush.c	2015-01-21 12:02:52.549978794 +0300
@@ -203,7 +203,7 @@ static void flush_end_io(struct request 
 	/* account completion of the flush request */
 	q->flush_running_idx ^= 1;
 	elv_completed_request(q, flush_rq);
-
+	atomic_inc(&q->flush_tag);
 	/* and push the waiting requests to the next stage */
 	list_for_each_entry_safe(rq, n, running, flush.list) {
 		unsigned int seq = blk_flush_cur_seq(rq);
@@ -278,6 +278,7 @@ static bool blk_kick_flush(struct reques
 	q->flush_rq.end_io = flush_end_io;
 
 	q->flush_pending_idx ^= 1;
+	atomic_inc(&q->flush_tag);
 	list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
 	return true;
 }
@@ -436,7 +437,7 @@ int __blkdev_issue_flush(struct block_de
 
 	bio_get(bio);
 	submit_bio(WRITE_FLUSH, bio);
-	wait_for_completion(&wait);
+	wait_for_completion_io(&wait);
 
 	/*
 	 * The driver must store the error location in ->bi_sector, if
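
The new q->flush_tag counter is incremented once when a flush is issued (blk_kick_flush) and once when it completes (flush_end_io), so an odd value means a flush is in flight. Its consumer is outside this excerpt; presumably a caller compares two reads to learn whether a complete flush cycle happened in between, along these lines (hypothetical helper):

static bool flush_completed_since(struct request_queue *q, int old_tag)
{
	int tag = atomic_read(&q->flush_tag);

	/* even = no flush in flight; advanced past old_tag = one finished */
	return !(tag & 1) && tag != old_tag;
}
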
diff -upr linux-2.6.32-504.3.3.el6.orig/block/blk-ioc.c linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-ioc.c
--- linux-2.6.32-504.3.3.el6.orig/block/blk-ioc.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-ioc.c	2015-01-21 12:02:55.651896454 +0300
@@ -43,7 +43,9 @@ int put_io_context(struct io_context *io
 			ioc->aic->dtor(ioc->aic);
 		cfq_dtor(ioc);
 		rcu_read_unlock();
-
+#ifdef CONFIG_BEANCOUNTERS
+		put_beancounter(ioc->ioc_ub);
+#endif
 		kmem_cache_free(iocontext_cachep, ioc);
 		return 1;
 	}
@@ -75,6 +77,11 @@ void exit_io_context(struct task_struct 
 	task->io_context = NULL;
 	task_unlock(task);
 
+	ioc_task_unlink(ioc);
+}
+
+void ioc_task_unlink(struct io_context *ioc)
+{
 	if (atomic_dec_and_test(&ioc->nr_tasks)) {
 		if (ioc->aic && ioc->aic->exit)
 			ioc->aic->exit(ioc->aic);
@@ -83,6 +90,7 @@ void exit_io_context(struct task_struct 
 	}
 	put_io_context(ioc);
 }
+EXPORT_SYMBOL(ioc_task_unlink);
 
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 {
@@ -104,6 +112,9 @@ struct io_context *alloc_io_context(gfp_
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
 		ret->cgroup_changed = 0;
 #endif
+#ifdef CONFIG_BEANCOUNTERS
+		ret->ioc_ub = get_beancounter(get_exec_ub());
+#endif
 	}
 
 	return ret;
@@ -135,6 +146,7 @@ struct io_context *current_io_context(gf
 
 	return ret;
 }
+EXPORT_SYMBOL(current_io_context);
 
 /*
  * If the current task has no IO context then create one and initialise it.
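
The beancounter hunks pin the allocating container for the io_context's whole life: a reference is taken in alloc_io_context() and dropped only when the final put frees the structure. A self-contained user-space model of that ownership rule (simplified, non-atomic refcounts):

#include <stdlib.h>

struct beancounter { int refs; };
struct io_context  { int refs; struct beancounter *ioc_ub; };

static struct beancounter *get_beancounter(struct beancounter *ub)
{
	ub->refs++;
	return ub;
}

static void put_beancounter(struct beancounter *ub)
{
	if (--ub->refs == 0)
		free(ub);
}

static struct io_context *alloc_io_context(struct beancounter *exec_ub)
{
	struct io_context *ioc = calloc(1, sizeof(*ioc));

	if (!ioc)
		return NULL;
	ioc->refs = 1;
	ioc->ioc_ub = get_beancounter(exec_ub); /* pin the owner */
	return ioc;
}

static void put_io_context(struct io_context *ioc)
{
	if (--ioc->refs == 0) {
		put_beancounter(ioc->ioc_ub);   /* released only at the end */
		free(ioc);
	}
}
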
diff -upr linux-2.6.32-504.3.3.el6.orig/block/blk-lib.c linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-lib.c
--- linux-2.6.32-504.3.3.el6.orig/block/blk-lib.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-lib.c	2015-01-21 12:02:42.972233063 +0300
@@ -125,7 +125,7 @@ int blkdev_issue_discard(struct block_de
 
 	/* Wait for bios in-flight */
 	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		ret = -EIO;
@@ -187,7 +187,7 @@ int blkdev_issue_zeroout(struct block_de
 
 	/* Wait for bios in-flight */
 	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		/* One of bios in the batch was completed with error.*/
diff -upr linux-2.6.32-504.3.3.el6.orig/block/blk-throttle.c linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-throttle.c
--- linux-2.6.32-504.3.3.el6.orig/block/blk-throttle.c	2014-12-12 23:29:13.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/blk-throttle.c	2015-01-21 12:02:43.567217267 +0300
@@ -159,7 +159,7 @@ static void throtl_free_tg(struct rcu_he
 	struct throtl_grp *tg;
 
 	tg = container_of(head, struct throtl_grp, rcu_head);
-	free_percpu(tg->blkg.stats_cpu);
+	blkio_free_blkg_stats(&tg->blkg);
 	kfree(tg);
 }
 
@@ -336,7 +336,7 @@ static struct throtl_grp * throtl_get_tg
 
 	/* Make sure @q is still alive */
 	if (unlikely(blk_queue_dead(q))) {
-		kfree(tg);
+		throtl_free_tg(&tg->rcu_head);
 		return NULL;
 	}
 
@@ -353,7 +353,7 @@ static struct throtl_grp * throtl_get_tg
 	__tg = throtl_find_tg(td, blkcg);
 
 	if (__tg) {
-		kfree(tg);
+		throtl_free_tg(&tg->rcu_head);
 		rcu_read_unlock();
 		return __tg;
 	}
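
The two error-path changes above fix a leak: a bare kfree(tg) freed the group but not its per-cpu blkg stats. Routing every exit through throtl_free_tg() keeps a single free routine, reused directly on error paths via the embedded rcu_head. The pattern, sketched with illustrative names:

#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct grp {
	void __percpu  *stats_cpu;
	struct rcu_head rcu_head;
};

static void grp_free(struct rcu_head *head)
{
	struct grp *g = container_of(head, struct grp, rcu_head);

	free_percpu(g->stats_cpu);
	kfree(g);
}

static void grp_destroy(struct grp *g, bool published)
{
	if (published)
		call_rcu(&g->rcu_head, grp_free); /* readers may still hold it */
	else
		grp_free(&g->rcu_head);           /* never visible: free now */
}
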
diff -upr linux-2.6.32-504.3.3.el6.orig/block/cfq-iosched.c linux-2.6.32-504.3.3.el6-042stab103_6/block/cfq-iosched.c
--- linux-2.6.32-504.3.3.el6.orig/block/cfq-iosched.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/cfq-iosched.c	2015-01-21 12:02:43.567217267 +0300
@@ -13,6 +13,8 @@
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
+#include <bc/io_acct.h>
+#include <linux/sched.h>
 #include "cfq.h"
 
 /*
@@ -32,6 +34,7 @@ static int cfq_slice_idle = HZ / 125;
 static int cfq_group_idle = HZ / 125;
 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
 static const int cfq_hist_divisor = 4;
+static int cfq_fast_slow_expiration_rate = 10; /* 10 seconds */
 
 /*
  * offset from end of service tree
@@ -52,6 +55,12 @@ static const int cfq_hist_divisor = 4;
 #define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
 #define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
 
+#define CFQD_IDLE_AUTODETECT(cfqd) \
+	((cfqd)->hw_tag == 1 && !(cfqd)->cfq_enable_idle_for_deep)
+#define CFQD_DISK_LOOKS_FAST(cfqd) \
+	(CFQD_IDLE_AUTODETECT(cfqd) && cfqd->cfq_disk_looks_fast > cfqd->cfq_disk_looks_slow)
+#define CFQQ_DEEP_THR		4
+
 #define RQ_CIC(rq)		\
 	((struct cfq_io_context *) (rq)->elevator_private[0])
 #define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private[1])
@@ -146,6 +155,11 @@ struct cfq_queue {
 	struct cfq_group *orig_cfqg;
 	/* Number of sectors dispatched from queue in single dispatch round */
 	unsigned long nr_sectors;
+
+	/* When the first dispatch in a dispatch round happened */
+	unsigned long first_dispatch;
+	/* Number of dispatches since first_dispatch was set (counts from 1) */
+	int n_dispatched;
 };
 
 /*
@@ -290,8 +304,93 @@ struct cfq_data {
 
 	/* Number of groups which are on blkcg->blkg_list */
 	unsigned int nr_blkcg_linked_grps;
+
+	/*
+	 * Revert to the former behaviour, which ensures fairness
+	 * between seeky and seeky&deep tasks at the cost of overall
+	 * performance. Also disables fast/slow prediction.
+	 */
+	int cfq_enable_idle_for_deep;
+
+	/*
+	 * # of times the disk was claimed fast and slow, respectively
+	 */
+	int cfq_disk_looks_fast;
+	int cfq_disk_looks_slow;
+
+	/*
+	 * when the fast/slow fields were last updated
+	 */
+	unsigned long cfq_disk_last_updated;
+
+	/*
+	 * If no events for fast/slow prediction happen in this
+	 * time-frame (measured in seconds), fast/slow counters
+	 * are divided by two.
+	 *
+	 * Zero or negative value turns expiration off.
+	 */
+	int cfq_fast_slow_expiration_rate;
+
+	/*
+	 * Sum of vectors:
+	 * <cfqg->service_trees[0][0].count, ..., cfqg->service_tree_idle.count>
+	 * for all cfqg-s
+	 */
+	unsigned st_counts[sizeof(((struct cfq_group *)NULL)->service_trees) /
+			   sizeof(((struct cfq_group *)NULL)->service_trees[0][0]) + 1];
+
+	/* decaying averages, one entry per EXP_ARR row */
+	unsigned long cfq_avg_queued[3];
+	unsigned long cfq_avg_indriver[3];
+
+	/* when (in jiffies) to update averages next time */
+	unsigned long cfq_calc_load_update;
+
+	/* last values seen */
+	int cfq_queued_last;
+	int cfq_indriver_last;
+};
+
+#define EXP_ARR_SIZ 12
+/*
+ * EXP_ARR[i][j] == ((EXP_I / FIXED_1) ^ (2^j)) * FIXED_1
+ * where I == 1 for i=0, I == 5 for i=1, I == 15 for i=2
+ */
+const u16 EXP_ARR[3][EXP_ARR_SIZ] = {
+	{ EXP_1, 1733, 1466, 1050, 539, 142, 10 },
+	{ EXP_5, 1981, 1915, 1791, 1567, 1199, 701, 240, 28 },
+	{ EXP_15, 2026, 2004, 1962, 1879, 1723, 1451, 1028, 516, 130, 8 }
 };
 
+/*
+ * update averages every LOAD_FREQ/60 (roughly every 83 ms)
+ */
+#define CFQD_LOAD_FREQ max(LOAD_FREQ / 60, 1)
+
+/*
+ * Assumption:
+ * cfqq->service_tree always points to cfqq->cfqg->service_trees[i][j]
+ * for some 'i' and 'j'. The exception is service_tree_idle, handled
+ * according to the isolation logic in cfq_service_tree_add.
+ */
+static inline int cfq_get_idx_by_st(struct cfq_queue *cfqq, struct cfq_group *cfqg)
+{
+	int max_idx = sizeof(((struct cfq_data *)NULL)->st_counts) /
+		sizeof(((struct cfq_data *)NULL)->st_counts[0]) - 1;
+	int idx;
+
+	if (unlikely(cfqq->service_tree == &cfqg->service_tree_idle))
+		return max_idx;
+
+	idx = cfqq->service_tree - &cfqg->service_trees[0][0];
+	BUG_ON(idx < 0);
+	BUG_ON(idx >= max_idx);
+	return idx;
+}
+
+static void cfq_update_stats(struct cfq_data *cfqd, int indrv_delta);
+
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
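
These counters feed the scheduler's fixed-point loadavg machinery: FSHIFT, FIXED_1 and CALC_LOAD are the <linux/sched.h> definitions, and with the ~83 ms tick used here EXP_1/EXP_5/EXP_15 correspond to roughly 1 s, 5 s and 15 s windows rather than the usual minutes. A runnable user-space illustration of one decay chain (constants copied from the kernel):

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)                  /* 2048 */
#define EXP_1    1884                           /* ~exp(-1/12) * FIXED_1 */

#define CALC_LOAD(load, exp, n) \
	load *= (exp);          \
	load += (n) * (FIXED_1 - (exp)); \
	load >>= FSHIFT;

int main(void)
{
	unsigned long avg = 0;
	unsigned long sample = 8UL << FSHIFT;   /* 8 requests queued */
	int i;

	/* each tick moves avg ~8% of the way toward the sample */
	for (i = 0; i < 5; i++) {
		CALC_LOAD(avg, EXP_1, sample);
		printf("tick %d: %lu.%02lu\n", i + 1, avg >> FSHIFT,
		       ((avg & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;                               /* 0.64, 1.22, ... -> 8.00 */
}
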
@@ -567,6 +666,26 @@ static inline unsigned cfq_group_get_avg
 	return cfqg->busy_queues_avg[rt];
 }
 
+static inline u64
+cfq_group_vslice(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	u64 vslice;
+
+	/* FIXME no group slices in iops mode? */
+	if (iops_mode(cfqd))
+		return 0;
+
+	/*
+	 * Equal to cfq_scale_slice(cfq_group_slice(cfqd, cfqg), cfqg).
+	 * Add the group weight because it is not on the service tree yet.
+	 */
+	vslice = (u64)cfq_target_latency << CFQ_SERVICE_SHIFT;
+	vslice *= BLKIO_WEIGHT_DEFAULT;
+	do_div(vslice, st->total_weight + cfqg->weight);
+	return vslice;
+}
+
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
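
A worked example for cfq_group_vslice(), with illustrative numbers: at HZ=1000, cfq_target_latency is 300 jiffies; with CFQ_SERVICE_SHIFT = 12 and BLKIO_WEIGHT_DEFAULT = 500, a group of weight 500 joining a tree whose other groups' weights total 1000 gets

	vslice = (300 << 12) * 500 / (1000 + 500) = 409600

vdisktime units, i.e. one default-weight share of a full service round. This bounds how much credit a re-queued group may keep from its unused time in cfq_group_notify_queue_add().
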
@@ -863,16 +982,20 @@ cfq_group_notify_queue_add(struct cfq_da
 		return;
 
 	/*
-	 * Currently put the group at the end. Later implement something
-	 * so that groups get lesser vtime based on their weights, so that
-	 * if group does not loose all if it was not continously backlogged.
+	 * Bump vdisktime so it is greater than or equal to min_vdisktime.
+	 */
+	cfqg->vdisktime = max_vdisktime(cfqg->vdisktime, st->min_vdisktime);
+
+	/*
+	 * Put the group at the end, but let it keep up to one slice of
+	 * credit from its unused time.
 	 */
 	n = rb_last(&st->rb);
 	if (n) {
 		__cfqg = rb_entry_cfqg(n);
-		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
-	} else
-		cfqg->vdisktime = st->min_vdisktime;
+		cfqg->vdisktime = max_vdisktime(cfqg->vdisktime,
+				__cfqg->vdisktime -
+					cfq_group_vslice(cfqd, cfqg));
+	}
 	cfq_group_service_tree_add(st, cfqg);
 }
 
@@ -1012,6 +1135,9 @@ static void cfq_init_add_cfqg_lists(stru
 
 	/* Add group on cfqd list */
 	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+	if (!cfqg->blkg.dev_name && cfqd->queue->kobj.parent)
+		cfqg->blkg.dev_name = kstrdup(kobject_name(
+					cfqd->queue->kobj.parent), GFP_ATOMIC);
 }
 
 /*
@@ -1050,6 +1176,13 @@ static struct cfq_group * cfq_alloc_cfqg
 	return cfqg;
 }
 
+static void cfq_free_cfqg(struct cfq_group *cfqg)
+{
+	blkio_free_blkg_stats(&cfqg->blkg);
+	kfree(cfqg->blkg.dev_name);
+	kfree(cfqg);
+}
+
 static struct cfq_group *
 cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
 {
@@ -1120,7 +1253,7 @@ static struct cfq_group *cfq_get_cfqg(st
 	__cfqg = cfq_find_cfqg(cfqd, blkcg);
 
 	if (__cfqg) {
-		kfree(cfqg);
+		cfq_free_cfqg(cfqg);
 		rcu_read_unlock();
 		return __cfqg;
 	}
@@ -1161,8 +1294,7 @@ static void cfq_put_cfqg(struct cfq_grou
 		return;
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
-	free_percpu(cfqg->blkg.stats_cpu);
-	kfree(cfqg);
+	cfq_free_cfqg(cfqg);
 }
 
 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
@@ -1258,6 +1390,7 @@ static void cfq_service_tree_add(struct 
 	int left;
 	int new_cfqq = 1;
 	int group_changed = 0;
+	struct cfq_group *orig_group = cfqq->cfqg;
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	if (!cfqd->cfq_group_isolation
@@ -1320,6 +1453,7 @@ static void cfq_service_tree_add(struct 
 		    cfqq->service_tree == service_tree)
 			return;
 
+		cfqd->st_counts[cfq_get_idx_by_st(cfqq, orig_group)]--;
 		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
 		cfqq->service_tree = NULL;
 	}
@@ -1354,6 +1488,7 @@ static void cfq_service_tree_add(struct 
 	rb_link_node(&cfqq->rb_node, parent, p);
 	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
 	service_tree->count++;
+	cfqd->st_counts[cfq_get_idx_by_st(cfqq, cfqq->cfqg)]++;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
 	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
@@ -1459,6 +1594,7 @@ static void cfq_del_cfqq_rr(struct cfq_d
 	cfq_clear_cfqq_on_rr(cfqq);
 
 	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+		cfqd->st_counts[cfq_get_idx_by_st(cfqq, cfqq->cfqg)]--;
 		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
 		cfqq->service_tree = NULL;
 	}
@@ -1563,6 +1699,7 @@ static void cfq_activate_request(struct 
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 
+	cfq_update_stats(cfqd, 1);
 	cfqd->rq_in_driver++;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
 						cfqd->rq_in_driver);
@@ -1575,6 +1712,7 @@ static void cfq_deactivate_request(struc
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 
 	WARN_ON(!cfqd->rq_in_driver);
+	cfq_update_stats(cfqd, -1);
 	cfqd->rq_in_driver--;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
 						cfqd->rq_in_driver);
@@ -1629,6 +1767,9 @@ static void cfq_bio_merged(struct reques
 {
 	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
 					bio_data_dir(bio), cfq_bio_sync(bio));
+
+	if (get_exec_ub() != (RQ_CFQG(req))->blkg.blk_ub)
+		ub_writeback_io(0, bio_sectors(bio));
 }
 
 static void
@@ -1697,6 +1838,12 @@ static void __cfq_set_active_queue(struc
 		cfqq->slice_dispatch = 0;
 		cfqq->nr_sectors = 0;
 
+		cfqq->first_dispatch = 0;
+		cfqq->n_dispatched = 0;
+		if (cfqq->queued[0] + cfqq->queued[1] >= CFQQ_DEEP_THR &&
+		    CFQD_IDLE_AUTODETECT(cfqd))
+			cfq_mark_cfqq_deep(cfqq);
+
 		cfq_clear_cfqq_wait_request(cfqq);
 		cfq_clear_cfqq_must_dispatch(cfqq);
 		cfq_clear_cfqq_must_alloc_slice(cfqq);
@@ -1718,6 +1865,12 @@ __cfq_slice_expired(struct cfq_data *cfq
 {
 	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
 
+	if (cfq_cfqq_deep(cfqq) && CFQD_IDLE_AUTODETECT(cfqd)) {
+		cfqq->first_dispatch = 0;
+		cfqq->n_dispatched = 0;
+		cfq_clear_cfqq_deep(cfqq);
+	}
+
 	if (cfq_cfqq_wait_request(cfqq))
 		cfq_del_timer(cfqd, cfqq);
 
@@ -1944,6 +2097,7 @@ static bool cfq_should_idle(struct cfq_d
 {
 	enum wl_prio_t prio = cfqq_prio(cfqq);
 	struct cfq_rb_root *service_tree = cfqq->service_tree;
+	unsigned count;
 
 	BUG_ON(!service_tree);
 	BUG_ON(!service_tree->count);
@@ -1964,8 +2118,16 @@ static bool cfq_should_idle(struct cfq_d
 	/*
 	 * Otherwise, we do only if they are the last ones
 	 * in their service tree.
+	 *
+	 * If the disk is fast enough, we should be the last one in this
+	 * type of service tree among all cfq groups as well.
 	 */
-	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
+	if (CFQD_DISK_LOOKS_FAST(cfqd))
+		count = cfqd->st_counts[cfq_get_idx_by_st(cfqq, cfqq->cfqg)];
+	else
+		count = service_tree->count;
+
+	if (count == 1 && cfq_cfqq_sync(cfqq))
 		return 1;
 	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
 			service_tree->count);
@@ -2052,6 +2214,12 @@ static void cfq_dispatch_insert(struct r
 
 	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
 
+	if (cfq_cfqq_deep(cfqq) && CFQD_IDLE_AUTODETECT(cfqd)) {
+		cfqq->n_dispatched++;
+		if (!cfqq->first_dispatch)
+			cfqq->first_dispatch = jiffies;
+	}
+
 	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
 	cfq_remove_request(rq);
 	cfqq->dispatched++;
@@ -2328,6 +2496,19 @@ static struct cfq_queue *cfq_select_queu
 			goto check_group_idle;
 	}
 
+	if (CFQD_IDLE_AUTODETECT(cfqd) && cfq_cfqq_deep(cfqq) &&
+	    cfqq->n_dispatched >= CFQQ_DEEP_THR) {
+		if (cfqq->first_dispatch == jiffies)
+			cfqd->cfq_disk_looks_fast++;
+		else
+			cfqd->cfq_disk_looks_slow++;
+
+		cfqq->first_dispatch = 0;
+		cfqq->n_dispatched = 0;
+		cfq_clear_cfqq_deep(cfqq);
+		cfqd->cfq_disk_last_updated = jiffies;
+	}
+
 	/*
 	 * The active queue has requests and isn't expired, allow it to
 	 * dispatch.
@@ -2335,6 +2516,25 @@ static struct cfq_queue *cfq_select_queu
 	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
 		goto keep_queue;
 
+	if (CFQD_IDLE_AUTODETECT(cfqd)) {
+		if (cfq_cfqq_deep(cfqq)) {
+			cfqq->first_dispatch = 0;
+			cfqq->n_dispatched = 0;
+			cfq_clear_cfqq_deep(cfqq);
+		}
+
+		if ((cfqd->cfq_disk_last_updated &&
+		     cfqd->cfq_fast_slow_expiration_rate > 0 &&
+		     jiffies - cfqd->cfq_disk_last_updated >
+		     HZ * cfqd->cfq_fast_slow_expiration_rate) ||
+		    cfqd->cfq_disk_looks_fast > 128 ||
+		    cfqd->cfq_disk_looks_slow > 128) {
+			cfqd->cfq_disk_looks_fast >>= 1;
+			cfqd->cfq_disk_looks_slow >>= 1;
+			cfqd->cfq_disk_last_updated = jiffies;
+		}
+	}
+
 	/*
 	 * If another queue has a request waiting within our mean seek
 	 * distance, let it run.  The expire code will check for close
@@ -2369,7 +2569,7 @@ static struct cfq_queue *cfq_select_queu
 	 */
 check_group_idle:
 	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
-	    && cfqq->cfqg->dispatched) {
+	    && cfqq->cfqg->dispatched && !CFQD_DISK_LOOKS_FAST(cfqd)) {
 		cfqq = NULL;
 		goto keep_queue;
 	}
@@ -3255,13 +3455,17 @@ cfq_update_idle_window(struct cfq_data *
 
 	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 
-	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
+	if (cfqq->queued[0] + cfqq->queued[1] >= CFQQ_DEEP_THR &&
+	    (!CFQD_IDLE_AUTODETECT(cfqd) || cfq_cfqq_slice_new(cfqq)))
 		cfq_mark_cfqq_deep(cfqq);
 
-	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
+	if (CFQD_DISK_LOOKS_FAST(cfqd))
+		enable_idle = 0;
+	else if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
 		enable_idle = 0;
 	else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
-	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
+	    ((!cfq_cfqq_deep(cfqq) || CFQD_IDLE_AUTODETECT(cfqd))
+	     && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime_samples)) {
 		if (cic->ttime_mean > cfqd->cfq_slice_idle)
@@ -3306,19 +3510,19 @@ cfq_should_preempt(struct cfq_data *cfqd
 	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
 		return true;
 
-	if (new_cfqq->cfqg != cfqq->cfqg)
-		return false;
-
-	if (cfq_slice_used(cfqq))
-		return true;
-
 	/* Allow preemption only if we are idling on sync-noidle tree */
 	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
 	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
-	    new_cfqq->service_tree->count == 2 &&
+	    new_cfqq->service_tree->count == 1 + (new_cfqq->cfqg == cfqq->cfqg) &&
 	    RB_EMPTY_ROOT(&cfqq->sort_list))
 		return true;
 
+	if (new_cfqq->cfqg != cfqq->cfqg)
+		return false;
+
+	if (cfq_slice_used(cfqq))
+		return true;
+
 	/*
 	 * So both queues are sync. Let the new request get disk time if
 	 * it's a metadata request and the current queue is doing regular IO.
@@ -3435,6 +3639,12 @@ static void cfq_insert_request(struct re
 	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
 			&cfqd->serving_group->blkg, rq_data_dir(rq),
 			rq_is_sync(rq));
+
+	if (get_exec_ub() != (RQ_CFQG(rq))->blkg.blk_ub)
+		ub_writeback_io(1, blk_rq_sectors(rq));
+
+	virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_OP_ACCOUNT, NULL);
+
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
@@ -3504,6 +3714,44 @@ static bool cfq_should_wait_busy(struct 
 	return false;
 }
 
+static void
+avg_calc_load(struct cfq_data *cfqd, int exp_idx,
+	      unsigned long queued, unsigned long indriver)
+{
+	queued <<= FSHIFT;
+	indriver <<= FSHIFT;
+
+	CALC_LOAD(cfqd->cfq_avg_queued[0], EXP_ARR[0][exp_idx], queued);
+	CALC_LOAD(cfqd->cfq_avg_queued[1], EXP_ARR[1][exp_idx], queued);
+	CALC_LOAD(cfqd->cfq_avg_queued[2], EXP_ARR[2][exp_idx], queued);
+
+	CALC_LOAD(cfqd->cfq_avg_indriver[0], EXP_ARR[0][exp_idx], indriver);
+	CALC_LOAD(cfqd->cfq_avg_indriver[1], EXP_ARR[1][exp_idx], indriver);
+	CALC_LOAD(cfqd->cfq_avg_indriver[2], EXP_ARR[2][exp_idx], indriver);
+}
+
+static void cfq_update_stats(struct cfq_data *cfqd, int indrv_delta)
+{
+	unsigned long now = jiffies;
+
+	if (time_before(now, cfqd->cfq_calc_load_update))
+		goto done;
+
+	if (now - cfqd->cfq_calc_load_update >= CFQD_LOAD_FREQ) {
+		int idx = clamp(fls((now - cfqd->cfq_calc_load_update)
+					/ CFQD_LOAD_FREQ)-1, 0, EXP_ARR_SIZ-1);
+		avg_calc_load(cfqd, idx, cfqd->cfq_queued_last,
+			      cfqd->cfq_indriver_last);
+	}
+
+	avg_calc_load(cfqd, 0, cfqd->rq_queued, cfqd->rq_in_driver);
+
+	cfqd->cfq_calc_load_update = now + CFQD_LOAD_FREQ;
+done:
+	cfqd->cfq_queued_last = cfqd->rq_queued;
+	cfqd->cfq_indriver_last = cfqd->rq_in_driver + indrv_delta;
+}
+
 static void cfq_completed_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
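
On the catch-up path in cfq_update_stats(), EXP_ARR[i][j] stores the per-tick decay factor raised to the 2^j-th power, so after an idle gap one multiply stands in for fls(gap/CFQD_LOAD_FREQ) - 1 iterations; rounding the gap down to a power of two is an approximation the heuristic tolerates. A small demonstration of which entry gets picked (user-space sketch; the kernel has its own fls()):

#include <stdio.h>

static int fls_sketch(unsigned int v)   /* highest set bit, 1-based */
{
	int r = 0;

	while (v) {
		r++;
		v >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned int missed;

	/* missed ticks -> one decay with factor^(2^idx) */
	for (missed = 1; missed <= 16; missed <<= 1) {
		int idx = fls_sketch(missed) - 1;

		printf("missed %2u ticks -> EXP_ARR[.][%d] = factor^%u\n",
		       missed, idx, 1u << idx);
	}
	return 0;
}
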
@@ -3516,6 +3764,7 @@ static void cfq_completed_request(struct
 		     !!(rq->cmd_flags & REQ_NOIDLE));
 
 	cfq_update_hw_tag(cfqd);
+	cfq_update_stats(cfqd, -1);
 
 	WARN_ON(!cfqd->rq_in_driver);
 	WARN_ON(!cfqq->dispatched);
@@ -3610,6 +3859,17 @@ static inline int __cfq_may_queue(struct
 		return ELV_MQUEUE_MUST;
 	}
 
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+	/*
+	 * force queueing of the first queue in the group after a long sleep
+	 */
+	if (!cfq_cfqq_must_alloc_slice(cfqq) && cfqq->cfqg->nr_cfqq == 0 &&
+	    cfqg_key(&cfqq->cfqd->grp_service_tree, cfqq->cfqg) < 0) {
+		cfq_mark_cfqq_must_alloc_slice(cfqq);
+		return ELV_MQUEUE_MUST;
+	}
+#endif
+
 	return ELV_MQUEUE_MAY;
 }
 
@@ -3902,9 +4162,11 @@ static void cfq_exit_queue(struct elevat
 	if (wait)
 		synchronize_rcu();
 
+	BUG_ON(cfqd->nr_blkcg_linked_grps);
+
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	/* Free up per cpu stats for root group */
-	free_percpu(cfqd->root_group.blkg.stats_cpu);
+	blkio_free_blkg_stats(&cfqd->root_group.blkg);
 #endif
 	kfree(cfqd);
 }
@@ -4002,6 +4264,8 @@ static void *cfq_init_queue(struct reque
 	cfqd->cfq_latency = 1;
 	cfqd->cfq_group_isolation = 1;
 	cfqd->hw_tag = -1;
+	cfqd->cfq_fast_slow_expiration_rate = cfq_fast_slow_expiration_rate;
+	cfqd->cfq_calc_load_update = jiffies + CFQD_LOAD_FREQ;
 	/*
 	 * we optimistically start assuming sync ops weren't delayed in last
 	 * second, in order to have larger depth for async operations.
@@ -4047,6 +4311,21 @@ cfq_var_show(unsigned int var, char *pag
 	return sprintf(page, "%d\n", var);
 }
 
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static ssize_t
+cfq_var_avg_show(unsigned long *var, char *page)
+{
+	unsigned long avg1 = var[0] + FIXED_1/200;
+	unsigned long avg2 = var[1] + FIXED_1/200;
+	unsigned long avg3 = var[2] + FIXED_1/200;
+	return sprintf(page, "%lu.%02lu %lu.%02lu %lu.%02lu\n",
+		       LOAD_INT(avg1), LOAD_FRAC(avg1),
+		       LOAD_INT(avg2), LOAD_FRAC(avg2),
+		       LOAD_INT(avg3), LOAD_FRAC(avg3));
+}
+
 static ssize_t
 cfq_var_store(unsigned int *var, const char *page, size_t count)
 {
@@ -4065,6 +4344,17 @@ static ssize_t __FUNC(struct elevator_qu
 		__data = jiffies_to_msecs(__data);			\
 	return cfq_var_show(__data, (page));				\
 }
+
+#define SHOW_FUNCTION_AVG(__FUNC, __VAR)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct cfq_data *cfqd = e->elevator_data;			\
+	unsigned long *__data = __VAR;					\
+	spin_lock_irq(cfqd->queue->queue_lock);				\
+	cfq_update_stats(cfqd, 0);					\
+	spin_unlock_irq(cfqd->queue->queue_lock);			\
+	return cfq_var_avg_show(__data, (page));			\
+}
 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
@@ -4077,6 +4367,13 @@ SHOW_FUNCTION(cfq_slice_async_show, cfqd
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
 SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
+SHOW_FUNCTION(cfq_enable_idle_for_deep_show, cfqd->cfq_enable_idle_for_deep, 0);
+SHOW_FUNCTION(cfq_disk_looks_fast_show, cfqd->cfq_disk_looks_fast, 0);
+SHOW_FUNCTION(cfq_disk_looks_slow_show, cfqd->cfq_disk_looks_slow, 0);
+SHOW_FUNCTION(cfq_fast_slow_expiration_rate_show, cfqd->cfq_fast_slow_expiration_rate, 0);
+SHOW_FUNCTION_AVG(cfq_queued_avg_show, cfqd->cfq_avg_queued);
+SHOW_FUNCTION_AVG(cfq_in_driver_avg_show, cfqd->cfq_avg_indriver);
+SHOW_FUNCTION(cfq_hw_tag_show, cfqd->hw_tag, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -4095,6 +4392,14 @@ static ssize_t __FUNC(struct elevator_qu
 		*(__PTR) = __data;					\
 	return ret;							\
 }
+
+#define STORE_FUNCTION_AVG(__FUNC, __PTR)				\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
+{									\
+	struct cfq_data *cfqd = e->elevator_data;			\
+	__PTR[0] = __PTR[1] = __PTR[2] = 0;				\
+	return count;							\
+}
 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
 		UINT_MAX, 1);
@@ -4111,6 +4416,13 @@ STORE_FUNCTION(cfq_slice_async_rq_store,
 		UINT_MAX, 0);
 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
 STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
+STORE_FUNCTION(cfq_enable_idle_for_deep_store, &cfqd->cfq_enable_idle_for_deep, 0, UINT_MAX, 0);
+STORE_FUNCTION(cfq_disk_looks_fast_store, &cfqd->cfq_disk_looks_fast, 0, UINT_MAX, 0);
+STORE_FUNCTION(cfq_disk_looks_slow_store, &cfqd->cfq_disk_looks_slow, 0, UINT_MAX, 0);
+STORE_FUNCTION(cfq_fast_slow_expiration_rate_store, &cfqd->cfq_fast_slow_expiration_rate, 0, UINT_MAX, 0);
+STORE_FUNCTION_AVG(cfq_queued_avg_store, cfqd->cfq_avg_queued);
+STORE_FUNCTION_AVG(cfq_in_driver_avg_store, cfqd->cfq_avg_indriver);
+STORE_FUNCTION(cfq_hw_tag_store, &cfqd->hw_tag, 0, UINT_MAX, 0);
 #undef STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
@@ -4129,6 +4441,13 @@ static struct elv_fs_entry cfq_attrs[] =
 	CFQ_ATTR(group_idle),
 	CFQ_ATTR(low_latency),
 	CFQ_ATTR(group_isolation),
+	CFQ_ATTR(enable_idle_for_deep),
+	CFQ_ATTR(disk_looks_fast),
+	CFQ_ATTR(disk_looks_slow),
+	CFQ_ATTR(fast_slow_expiration_rate),
+	CFQ_ATTR(queued_avg),
+	CFQ_ATTR(in_driver_avg),
+	CFQ_ATTR(hw_tag),
 	__ATTR_NULL
 };
 
diff -upr linux-2.6.32-504.3.3.el6.orig/block/deadline-iosched.c linux-2.6.32-504.3.3.el6-042stab103_6/block/deadline-iosched.c
--- linux-2.6.32-504.3.3.el6.orig/block/deadline-iosched.c	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/deadline-iosched.c	2015-01-21 12:02:43.336223398 +0300
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/rbtree.h>
+#include <bc/io_acct.h>
 
 /*
  * See Documentation/block/deadline-iosched.txt
@@ -108,6 +109,8 @@ deadline_add_request(struct request_queu
 	 */
 	rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
 	list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+	ub_writeback_io(1, blk_rq_sectors(rq));
+	virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_OP_ACCOUNT, NULL);
 }
 
 /*
@@ -186,6 +189,12 @@ deadline_merged_requests(struct request_
 	deadline_remove_request(q, next);
 }
 
+static void deadline_bio_merged(struct request_queue *q, struct request *req,
+				struct bio *bio)
+{
+	ub_writeback_io(0, bio_sectors(bio));
+}
+
 /*
  * move request from sort list to dispatch queue.
  */
@@ -441,6 +450,7 @@ static struct elevator_type iosched_dead
 		.elevator_merge_fn = 		deadline_merge,
 		.elevator_merged_fn =		deadline_merged_request,
 		.elevator_merge_req_fn =	deadline_merged_requests,
+		.elevator_bio_merged_fn =	deadline_bio_merged,
 		.elevator_dispatch_fn =		deadline_dispatch_requests,
 		.elevator_add_req_fn =		deadline_add_request,
 		.elevator_queue_empty_fn =	deadline_queue_empty,
diff -upr linux-2.6.32-504.3.3.el6.orig/block/elevator.c linux-2.6.32-504.3.3.el6-042stab103_6/block/elevator.c
--- linux-2.6.32-504.3.3.el6.orig/block/elevator.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/elevator.c	2015-01-21 12:02:43.790211345 +0300
@@ -868,12 +868,12 @@ void elv_unregister(struct elevator_type
 	 */
 	if (e->ops.trim) {
 		read_lock(&tasklist_lock);
-		do_each_thread(g, p) {
+		do_each_thread_all(g, p) {
 			task_lock(p);
 			if (p->io_context)
 				e->ops.trim(p->io_context);
 			task_unlock(p);
-		} while_each_thread(g, p);
+		} while_each_thread_all(g, p);
 		read_unlock(&tasklist_lock);
 	}
 
diff -upr linux-2.6.32-504.3.3.el6.orig/block/genhd.c linux-2.6.32-504.3.3.el6-042stab103_6/block/genhd.c
--- linux-2.6.32-504.3.3.el6.orig/block/genhd.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/genhd.c	2015-01-21 12:02:47.211120524 +0300
@@ -18,13 +18,12 @@
 #include <linux/buffer_head.h>
 #include <linux/mutex.h>
 #include <linux/idr.h>
+#include <linux/device_cgroup.h>
 
 #include "blk.h"
 
 static DEFINE_MUTEX(block_class_lock);
-#ifndef CONFIG_SYSFS_DEPRECATED
 struct kobject *block_depr;
-#endif
 
 /* for extended dynamic devt allocation, currently only one major is used */
 #define MAX_EXT_DEVT		(1 << MINORBITS)
@@ -35,7 +34,7 @@ struct kobject *block_depr;
 static DEFINE_MUTEX(ext_devt_mutex);
 static DEFINE_IDR(ext_devt_idr);
 
-static struct device_type disk_type;
+struct device_type disk_type;
 
 /**
  * disk_get_part - get partition
@@ -151,7 +150,7 @@ struct hd_struct *disk_part_iter_next(st
 		part = rcu_dereference(ptbl->part[piter->idx]);
 		if (!part)
 			continue;
-		if (!part->nr_sects &&
+		if (!part_nr_sects_read(part) &&
 		    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
 		    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
 		      piter->idx == 0))
@@ -188,7 +187,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 static inline int sector_in_part(struct hd_struct *part, sector_t sector)
 {
 	return part->start_sect <= sector &&
-		sector < part->start_sect + part->nr_sects;
+		sector < part->start_sect + part_nr_sects_read(part);
 }
 
 /**
@@ -284,8 +283,12 @@ void blkdev_show(struct seq_file *seqf, 
 
 	if (offset < BLKDEV_MAJOR_HASH_SIZE) {
 		mutex_lock(&block_class_lock);
-		for (dp = major_names[offset]; dp; dp = dp->next)
+		for (dp = major_names[offset]; dp; dp = dp->next) {
+			if (!devcgroup_device_visible(S_IFBLK, dp->major,
+						0, INT_MAX))
+				continue;
 			seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
+		}
 		mutex_unlock(&block_class_lock);
 	}
 }
@@ -704,7 +707,7 @@ void __init printk_all_partitions(void)
 
 			printk("%s%s %10llu %s", is_part0 ? "" : "  ",
 			       bdevt_str(part_devt(part), devt_buf),
-			       (unsigned long long)part->nr_sects >> 1,
+			       (unsigned long long)part_nr_sects_read(part) >> 1,
 			       disk_name(disk, part->partno, name_buf));
 			if (is_part0) {
 				if (disk->driverfs_dev != NULL &&
@@ -796,7 +799,7 @@ static int show_partition(struct seq_fil
 	while ((part = disk_part_iter_next(&piter)))
 		seq_printf(seqf, "%4d  %7d %10llu %s\n",
 			   MAJOR(part_devt(part)), MINOR(part_devt(part)),
-			   (unsigned long long)part->nr_sects >> 1,
+			   (unsigned long long)part_nr_sects_read(part) >> 1,
 			   disk_name(sgp, part->partno, buf));
 	disk_part_iter_exit(&piter);
 
@@ -836,7 +839,7 @@ static int __init genhd_device_init(void
 {
 	int error;
 
-	block_class.dev_kobj = sysfs_dev_block_kobj;
+	block_class.dev_kobj = ve_sysfs_dev_block_kobj;
 	error = class_register(&block_class);
 	if (unlikely(error))
 		return error;
@@ -845,10 +848,10 @@ static int __init genhd_device_init(void
 
 	register_blkdev(BLOCK_EXT_MAJOR, "blkext");
 
-#ifndef CONFIG_SYSFS_DEPRECATED
-	/* create top-level block dir */
-	block_depr = kobject_create_and_add("block", NULL);
-#endif
+	if (!sysfs_deprecated)
+		/* create top-level block dir */
+		block_depr = kobject_create_and_add("block", NULL);
+
 	return 0;
 }
 
@@ -1054,6 +1057,7 @@ static void disk_release(struct device *
 struct class block_class = {
 	.name		= "block",
 };
+EXPORT_SYMBOL(block_class);
 
 static char *block_devnode(struct device *dev, mode_t *mode)
 {
@@ -1064,12 +1068,13 @@ static char *block_devnode(struct device
 	return NULL;
 }
 
-static struct device_type disk_type = {
+struct device_type disk_type = {
 	.name		= "disk",
 	.groups		= disk_attr_groups,
 	.release	= disk_release,
 	.devnode	= block_devnode,
 };
+EXPORT_SYMBOL(disk_type);
 
 #ifdef CONFIG_PROC_FS
 /*
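
The part->nr_sects reads are converted to part_nr_sects_read() throughout genhd.c; the helper itself is outside this excerpt, but it presumably mirrors the mainline seqcount scheme that keeps 32-bit readers from observing a torn 64-bit size while BLKPG_RESIZE_PARTITION rewrites it. A sketch under that assumption (the nr_sects_seq field name is the mainline one):

#include <linux/genhd.h>
#include <linux/seqlock.h>

static inline sector_t part_nr_sects_read_sketch(struct hd_struct *part)
{
#if BITS_PER_LONG == 32
	sector_t nr_sects;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&part->nr_sects_seq);
		nr_sects = part->nr_sects;
	} while (read_seqcount_retry(&part->nr_sects_seq, seq));
	return nr_sects;
#else
	return part->nr_sects;          /* 64-bit loads are not torn here */
#endif
}
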
diff -upr linux-2.6.32-504.3.3.el6.orig/block/ioctl.c linux-2.6.32-504.3.3.el6-042stab103_6/block/ioctl.c
--- linux-2.6.32-504.3.3.el6.orig/block/ioctl.c	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/block/ioctl.c	2015-01-21 12:02:42.924234336 +0300
@@ -12,12 +12,13 @@ static int blkpg_ioctl(struct block_devi
 {
 	struct block_device *bdevp;
 	struct gendisk *disk;
-	struct hd_struct *part;
+	struct hd_struct *part, *lpart;
 	struct blkpg_ioctl_arg a;
 	struct blkpg_partition p;
 	struct disk_part_iter piter;
 	long long start, length;
 	int partno;
+	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -91,6 +92,71 @@ static int blkpg_ioctl(struct block_devi
 			bdput(bdevp);
 
 			return 0;
+		case BLKPG_RESIZE_PARTITION:
+			start = p.start >> 9;
+			length = p.length >> 9;
+			/* check for fit in a hd_struct */
+			if (sizeof(sector_t) == sizeof(long) &&
+			    sizeof(long long) > sizeof(long)) {
+				long pstart = start, plength = length;
+				if (pstart != start || plength != length
+				    || pstart < 0 || plength < 0)
+					return -EINVAL;
+			}
+
+			part = disk_get_part(disk, partno);
+			if (!part)
+				return -ENXIO;
+			bdevp = bdget(part_devt(part));
+			if (!bdevp) {
+				disk_put_part(part);
+				return -ENOMEM;
+			}
+
+			err = 0;
+			mutex_lock(&bdevp->bd_mutex);
+			mutex_lock_nested(&bdev->bd_mutex, 1);
+
+			if (start != part->start_sect) {
+				err = -EINVAL;
+				goto resize_done;
+			}
+			/* overlap? */
+			disk_part_iter_init(&piter, disk,
+					    DISK_PITER_INCL_EMPTY);
+			while ((lpart = disk_part_iter_next(&piter))) {
+				if (lpart->partno != partno &&
+				    !(start + length <= lpart->start_sect ||
+				      start >= lpart->start_sect + lpart->nr_sects)) {
+					disk_part_iter_exit(&piter);
+					err = -EBUSY;
+					goto resize_done;
+				}
+			}
+			disk_part_iter_exit(&piter);
+			part_nr_sects_write(part, length);
+			i_size_write(bdevp->bd_inode, p.length);
+	resize_done:
+			mutex_unlock(&bdevp->bd_mutex);
+			mutex_unlock(&bdev->bd_mutex);
+			bdput(bdevp);
+			disk_put_part(part);
+			return err;
+		case BLKPG_GET_PARTITION:
+			mutex_lock(&bdev->bd_mutex);
+			part = disk_get_part(disk, partno);
+			if (!part) {
+				mutex_unlock(&bdev->bd_mutex);
+				return -ENXIO;
+			}
+			p.start = part->start_sect << 9;
+			p.length = part->nr_sects << 9;
+			disk_put_part(part);
+			mutex_unlock(&bdev->bd_mutex);
+			if (copy_to_user(a.data, &p, sizeof(struct blkpg_partition)))
+				return -EFAULT;
+			return 0;
 		default:
 			return -EINVAL;
 	}
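
For reference, the new BLKPG_RESIZE_PARTITION op takes start/length in bytes and shifts them to sectors internally, refusing a resize whose start moved (-EINVAL) or that would overlap a neighbour (-EBUSY). A user-space sketch of driving it (BLKPG and the structs are from <linux/blkpg.h>; the op constant itself is added by this patch's headers, not shown here):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkpg.h>

static int resize_partition(const char *disk, int partno,
			    long long start, long long length)
{
	struct blkpg_partition p;
	struct blkpg_ioctl_arg a;
	int fd, ret;

	fd = open(disk, O_RDONLY);
	if (fd < 0)
		return -1;

	memset(&p, 0, sizeof(p));
	p.pno    = partno;
	p.start  = start;               /* must equal the current start */
	p.length = length;              /* new size, in bytes */

	memset(&a, 0, sizeof(a));
	a.op      = BLKPG_RESIZE_PARTITION;
	a.datalen = sizeof(p);
	a.data    = &p;

	ret = ioctl(fd, BLKPG, &a);     /* -EBUSY if it would overlap */
	close(fd);
	return ret;
}
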
diff -upr linux-2.6.32-504.3.3.el6.orig/crypto/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/crypto/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/crypto/Kconfig	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/crypto/Kconfig	2015-01-21 12:02:52.169988881 +0300
@@ -414,6 +414,16 @@ config CRYPTO_SHA1
 	help
 	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).
 
+config CRYPTO_SHA1_SSSE3
+	tristate "SHA1 digest algorithm (SSSE3/AVX)"
+	depends on X86 && 64BIT
+	select CRYPTO_SHA1
+	select CRYPTO_HASH
+	help
+	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
+	  using Supplemental SSE3 (SSSE3) instructions or Advanced Vector
+	  Extensions (AVX), when available.
+
 config CRYPTO_SHA256
 	tristate "SHA224 and SHA256 digest algorithm"
 	select CRYPTO_HASH
diff -upr linux-2.6.32-504.3.3.el6.orig/crypto/crc32c.c linux-2.6.32-504.3.3.el6-042stab103_6/crypto/crc32c.c
--- linux-2.6.32-504.3.3.el6.orig/crypto/crc32c.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/crypto/crc32c.c	2015-01-21 12:02:53.063965151 +0300
@@ -141,6 +141,12 @@ static u32 crc32c(u32 crc, const u8 *dat
 	return crc;
 }
 
+u32 crc32c_generic(u32 crc, const void *address, unsigned int length)
+{
+	return crc32c(crc, address, length);
+}
+EXPORT_SYMBOL(crc32c_generic);
+
 /*
  * Steps through buffer one byte at at time, calculates reflected 
  * crc using table.
diff -upr linux-2.6.32-504.3.3.el6.orig/crypto/sha1_generic.c linux-2.6.32-504.3.3.el6-042stab103_6/crypto/sha1_generic.c
--- linux-2.6.32-504.3.3.el6.orig/crypto/sha1_generic.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/crypto/sha1_generic.c	2015-01-21 12:02:52.161989093 +0300
@@ -36,7 +36,7 @@ static int sha1_init(struct shash_desc *
 	return 0;
 }
 
-static int sha1_update(struct shash_desc *desc, const u8 *data,
+int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
 			unsigned int len)
 {
 	struct sha1_state *sctx = shash_desc_ctx(desc);
@@ -70,6 +70,7 @@ static int sha1_update(struct shash_desc
 
 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha1_update);
 
 
 /* Add padding and return the message digest. */
@@ -86,10 +87,10 @@ static int sha1_final(struct shash_desc 
 	/* Pad out to 56 mod 64 */
 	index = sctx->count & 0x3f;
 	padlen = (index < 56) ? (56 - index) : ((64+56) - index);
-	sha1_update(desc, padding, padlen);
+	crypto_sha1_update(desc, padding, padlen);
 
 	/* Append length */
-	sha1_update(desc, (const u8 *)&bits, sizeof(bits));
+	crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 5; i++)
@@ -120,7 +121,7 @@ static int sha1_import(struct shash_desc
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_init,
-	.update		=	sha1_update,
+	.update		=	crypto_sha1_update,
 	.final		=	sha1_final,
 	.export		=	sha1_export,
 	.import		=	sha1_import,
diff -upr linux-2.6.32-504.3.3.el6.orig/crypto/signature/dsa.c linux-2.6.32-504.3.3.el6-042stab103_6/crypto/signature/dsa.c
--- linux-2.6.32-504.3.3.el6.orig/crypto/signature/dsa.c	2014-12-12 23:29:03.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/crypto/signature/dsa.c	2015-01-21 12:02:58.133830573 +0300
@@ -94,4 +94,4 @@ cleanup:
 	mpi_free(v);
 	return rc;
 }
-EXPORT_SYMBOL_GPL(DSA_verify);
+EXPORT_SYMBOL(DSA_verify);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/drivers/Makefile
--- linux-2.6.32-504.3.3.el6.orig/drivers/Makefile	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/Makefile	2015-01-21 12:02:54.725921033 +0300
@@ -67,6 +67,7 @@ obj-$(CONFIG_ZORRO)		+= zorro/
 obj-$(CONFIG_MAC)		+= macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)	+= block/aoe/
 obj-$(CONFIG_PARIDE) 		+= block/paride/
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= block/ploop/
 obj-$(CONFIG_TC)		+= tc/
 obj-$(CONFIG_UWB)		+= uwb/
 obj-$(CONFIG_USB_OTG_UTILS)	+= usb/otg/
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/base/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/Makefile
--- linux-2.6.32-504.3.3.el6.orig/drivers/base/Makefile	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/Makefile	2015-01-21 12:02:44.450193825 +0300
@@ -3,7 +3,7 @@
 obj-y			:= core.o sys.o bus.o dd.o \
 			   driver.o class.o platform.o \
 			   cpu.o firmware.o init.o map.o devres.o \
-			   attribute_container.o transport_class.o
+			   attribute_container.o transport_class.o vedev.o
 obj-$(CONFIG_DEVTMPFS)	+= devtmpfs.o
 obj-y			+= power/
 obj-$(CONFIG_HAS_DMA)	+= dma-mapping.o
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/base/base.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/base.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/base/base.h	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/base.h	2015-01-21 12:02:43.667214612 +0300
@@ -130,7 +130,12 @@ extern char *make_class_name(const char 
 
 extern int devres_release_all(struct device *dev);
 
+#ifndef CONFIG_VE
 extern struct kset *devices_kset;
+#define ve_devices_kset devices_kset
+#else
+#define ve_devices_kset (get_exec_env()->devices_kset)
+#endif
 
 #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS)
 extern void module_add_driver(struct module *mod, struct device_driver *drv);
@@ -146,3 +151,17 @@ extern int devtmpfs_init(void);
 #else
 static inline int devtmpfs_init(void) { return 0; }
 #endif
+
+extern struct device_type part_type;
+#ifdef CONFIG_BLOCK
+static inline int device_is_not_partition(struct device *dev)
+{
+	return !(dev->type == &part_type);
+}
+#else
+static inline int device_is_not_partition(struct device *dev)
+{
+	return 1;
+}
+#endif
+
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/base/bus.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/bus.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/base/bus.c	2014-12-12 23:28:56.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/bus.c	2015-01-21 12:02:57.996834210 +0300
@@ -15,6 +15,7 @@
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/string.h>
+#include <linux/sched.h>
 #include "base.h"
 #include "power/power.h"
 
@@ -299,7 +300,7 @@ int bus_for_each_dev(struct bus_type *bu
 	klist_iter_exit(&i);
 	return error;
 }
-EXPORT_SYMBOL_GPL(bus_for_each_dev);
+EXPORT_SYMBOL(bus_for_each_dev);
 
 /**
  * bus_find_device - device iterator for locating a particular device.
@@ -439,21 +440,20 @@ static void device_remove_attrs(struct b
 	}
 }
 
-#ifdef CONFIG_SYSFS_DEPRECATED
 static int make_deprecated_bus_links(struct device *dev)
 {
-	return sysfs_create_link(&dev->kobj,
-				 &dev->bus->p->subsys.kobj, "bus");
+	if (sysfs_deprecated)
+		return sysfs_create_link(&dev->kobj,
+					 &dev->bus->p->subsys.kobj, "bus");
+	else
+		return 0;
 }
 
 static void remove_deprecated_bus_links(struct device *dev)
 {
-	sysfs_remove_link(&dev->kobj, "bus");
+	if (sysfs_deprecated)
+		sysfs_remove_link(&dev->kobj, "bus");
 }
-#else
-static inline int make_deprecated_bus_links(struct device *dev) { return 0; }
-static inline void remove_deprecated_bus_links(struct device *dev) { }
-#endif
 
 /**
  * bus_add_device - add device to bus
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/base/class.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/class.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/base/class.c	2014-12-12 23:28:56.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/class.c	2015-01-21 12:02:55.330904974 +0300
@@ -19,6 +19,8 @@
 #include <linux/slab.h>
 #include <linux/genhd.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
 #include "base.h"
 
 #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr)
@@ -74,8 +76,14 @@ static struct kobj_type class_ktype = {
 };
 
 /* Hotplug events for classes go to the class class_subsys */
-static struct kset *class_kset;
+#ifndef CONFIG_VE
+struct kset *class_kset;
+EXPORT_SYMBOL_GPL(class_kset);
 
+#define visible_class_kset class_kset
+#else
+#define visible_class_kset (get_exec_env()->class_kset)
+#endif
 
 int class_create_file(struct class *cls, const struct class_attribute *attr)
 {
@@ -173,14 +181,14 @@ int __class_register(struct class *cls, 
 
 	/* set the default /sys/dev directory for devices of this class */
 	if (!cls->dev_kobj)
-		cls->dev_kobj = sysfs_dev_char_kobj;
+		cls->dev_kobj = ve_sysfs_dev_char_kobj;
 
-#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
+#if defined(CONFIG_BLOCK)
 	/* let the block class directory show up in the root of sysfs */
-	if (cls != &block_class)
-		cp->class_subsys.kobj.kset = class_kset;
+	if (!sysfs_deprecated || cls != &block_class)
+		cp->class_subsys.kobj.kset = visible_class_kset;
 #else
-	cp->class_subsys.kobj.kset = class_kset;
+	cp->class_subsys.kobj.kset = visible_class_kset;
 #endif
 	cp->class_subsys.kobj.ktype = &class_ktype;
 	cp->class = cls;
@@ -248,7 +256,7 @@ error:
 	kfree(cls);
 	return ERR_PTR(retval);
 }
-EXPORT_SYMBOL_GPL(__class_create);
+EXPORT_SYMBOL(__class_create);
 
 /**
  * class_destroy - destroys a struct class structure
@@ -265,7 +273,6 @@ void class_destroy(struct class *cls)
 	class_unregister(cls);
 }
 
-#ifdef CONFIG_SYSFS_DEPRECATED
 char *make_class_name(const char *name, struct kobject *kobj)
 {
 	char *class_name;
@@ -282,7 +289,6 @@ char *make_class_name(const char *name, 
 	strcat(class_name, kobject_name(kobj));
 	return class_name;
 }
-#endif
 
 /**
  * class_dev_iter_init - initialize class device iterator
@@ -508,7 +514,7 @@ struct class_compat *class_compat_regist
 	cls = kmalloc(sizeof(struct class_compat), GFP_KERNEL);
 	if (!cls)
 		return NULL;
-	cls->kobj = kobject_create_and_add(name, &class_kset->kobj);
+	cls->kobj = kobject_create_and_add(name, &visible_class_kset->kobj);
 	if (!cls->kobj) {
 		kfree(cls);
 		return NULL;
@@ -577,18 +583,25 @@ void class_compat_remove_link(struct cla
 }
 EXPORT_SYMBOL_GPL(class_compat_remove_link);
 
-int __init classes_init(void)
+int classes_init(void)
 {
-	class_kset = kset_create_and_add("class", NULL, NULL);
-	if (!class_kset)
+	visible_class_kset = kset_create_and_add("class", NULL, NULL);
+	if (!visible_class_kset)
 		return -ENOMEM;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(classes_init);
+
+void classes_fini(void)
+{
+	kset_unregister(visible_class_kset);
+}
+EXPORT_SYMBOL_GPL(classes_fini);
 
 EXPORT_SYMBOL_GPL(class_create_file);
 EXPORT_SYMBOL_GPL(class_remove_file);
 EXPORT_SYMBOL_GPL(class_unregister);
-EXPORT_SYMBOL_GPL(class_destroy);
+EXPORT_SYMBOL(class_destroy);
 
 EXPORT_SYMBOL_GPL(class_interface_register);
 EXPORT_SYMBOL_GPL(class_interface_unregister);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/base/core.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/core.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/base/core.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/core.c	2015-01-21 12:02:55.330904974 +0300
@@ -23,26 +23,22 @@
 #include <linux/semaphore.h>
 #include <linux/mutex.h>
 #include <linux/async.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
 
 #include "base.h"
 #include "power/power.h"
 
 int (*platform_notify)(struct device *dev) = NULL;
 int (*platform_notify_remove)(struct device *dev) = NULL;
+#ifndef CONFIG_VE
 static struct kobject *dev_kobj;
+#define ve_dev_kobj	dev_kobj
 struct kobject *sysfs_dev_char_kobj;
 struct kobject *sysfs_dev_block_kobj;
-
-#ifdef CONFIG_BLOCK
-static inline int device_is_not_partition(struct device *dev)
-{
-	return !(dev->type == &part_type);
-}
+struct kobject *sysfs_block_kobj;
 #else
-static inline int device_is_not_partition(struct device *dev)
-{
-	return 1;
-}
+#define ve_dev_kobj	(get_exec_env()->dev_kobj)
 #endif
 
 /**
@@ -192,7 +188,9 @@ static int dev_uevent(struct kset *kset,
 	if (dev->driver)
 		add_uevent_var(env, "DRIVER=%s", dev->driver->name);
 
-#ifdef CONFIG_SYSFS_DEPRECATED
+	if (!sysfs_deprecated)
+		goto skip;
+
 	if (dev->class) {
 		struct device *parent = dev->parent;
 
@@ -221,7 +219,7 @@ static int dev_uevent(struct kset *kset,
 			add_uevent_var(env, "PHYSDEVDRIVER=%s",
 				       dev->driver->name);
 	}
-#endif
+skip:
 
 	/* have the bus specific function add its stuff */
 	if (dev->bus && dev->bus->uevent) {
@@ -307,17 +305,22 @@ static ssize_t store_uevent(struct devic
 	enum kobject_action action;
 
 	if (kobject_action_type(buf, count, &action) == 0) {
-		kobject_uevent(&dev->kobj, action);
+		kobject_uevent_env_one(&dev->kobj, action, NULL);
 		goto out;
 	}
 
 	dev_err(dev, "uevent: unsupported action-string; this will "
 		     "be ignored in a future kernel version\n");
-	kobject_uevent(&dev->kobj, KOBJ_ADD);
+	kobject_uevent_env_one(&dev->kobj, KOBJ_ADD, NULL);
 out:
 	return count;
 }
 
+extern ssize_t ve_device_handler(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count);
+static struct device_attribute ve_device_attr =
+	__ATTR(ve_device_add, S_IWUSR, NULL, ve_device_handler);
+
 static struct device_attribute uevent_attr =
 	__ATTR(uevent, S_IRUGO | S_IWUSR, show_uevent, store_uevent);
 
@@ -438,8 +441,9 @@ static ssize_t show_dev(struct device *d
 static struct device_attribute devt_attr =
 	__ATTR(dev, S_IRUGO, show_dev, NULL);
 
-/* kset to create /sys/devices/  */
+#ifndef CONFIG_VE
 struct kset *devices_kset;
+#endif
 
 /**
  * device_create_file - create sysfs attribute file for device.
@@ -557,7 +561,7 @@ static void klist_children_put(struct kl
  */
 void device_initialize(struct device *dev)
 {
-	dev->kobj.kset = devices_kset;
+	dev->kobj.kset = ve_devices_kset;
 	kobject_init(&dev->kobj, &device_ktype);
 	INIT_LIST_HEAD(&dev->dma_pools);
 	init_MUTEX(&dev->sem);
@@ -568,8 +572,7 @@ void device_initialize(struct device *de
 	set_dev_node(dev, -1);
 }
 
-#ifdef CONFIG_SYSFS_DEPRECATED
-static struct kobject *get_device_parent(struct device *dev,
+static struct kobject *get_device_parent_dep(struct device *dev,
 					 struct device *parent)
 {
 	/* class devices without a parent live in /sys/class/<classname>/ */
@@ -582,22 +585,25 @@ static struct kobject *get_device_parent
 	return NULL;
 }
 
-static inline void cleanup_device_parent(struct device *dev) {}
-static inline void cleanup_glue_dir(struct device *dev,
+static inline void cleanup_device_parent_dep(struct device *dev) {}
+static inline void cleanup_glue_dir_dep(struct device *dev,
 				    struct kobject *glue_dir) {}
+#ifndef CONFIG_VE
+static struct kobject *virtual_dir = NULL;
 #else
+# define virtual_dir (get_exec_env()->_virtual_dir)
+#endif
+
 static struct kobject *virtual_device_parent(struct device *dev)
 {
-	static struct kobject *virtual_dir = NULL;
-
 	if (!virtual_dir)
 		virtual_dir = kobject_create_and_add("virtual",
-						     &devices_kset->kobj);
+						     &ve_devices_kset->kobj);
 
 	return virtual_dir;
 }
 
-static struct kobject *get_device_parent(struct device *dev,
+static struct kobject *get_device_parent_nodep(struct device *dev,
 					 struct device *parent)
 {
 	int retval;
@@ -658,7 +664,7 @@ static struct kobject *get_device_parent
 	return NULL;
 }
 
-static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
+static void cleanup_glue_dir_nodep(struct device *dev, struct kobject *glue_dir)
 {
 	/* see if we live in a "glue" directory */
 	if (!glue_dir || !dev->class ||
@@ -668,11 +674,36 @@ static void cleanup_glue_dir(struct devi
 	kobject_put(glue_dir);
 }
 
+static void cleanup_device_parent_nodep(struct device *dev)
+{
+	cleanup_glue_dir_nodep(dev, dev->kobj.parent);
+}
+
+static struct kobject *get_device_parent(struct device *dev,
+		struct device *parent)
+{
+	if (sysfs_deprecated)
+		return get_device_parent_dep(dev, parent);
+	else
+		return get_device_parent_nodep(dev, parent);
+}
+
+static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
+{
+	if (sysfs_deprecated)
+		cleanup_glue_dir_dep(dev, glue_dir);
+	else
+		cleanup_glue_dir_nodep(dev, glue_dir);
+}
+
 static void cleanup_device_parent(struct device *dev)
 {
-	cleanup_glue_dir(dev, dev->kobj.parent);
+	if (sysfs_deprecated)
+		cleanup_device_parent_dep(dev);
+	else
+		cleanup_device_parent_nodep(dev);
 }
-#endif
+
 
 static void setup_parent(struct device *dev, struct device *parent)
 {
@@ -695,7 +726,9 @@ static int device_add_class_symlinks(str
 	if (error)
 		goto out;
 
-#ifdef CONFIG_SYSFS_DEPRECATED
+	if (!sysfs_deprecated)
+		goto nodep;
+
 	/* stacked class devices need a symlink in the class directory */
 	if (dev->kobj.parent != &dev->class->p->class_subsys.kobj &&
 	    device_is_not_partition(dev)) {
@@ -720,7 +753,7 @@ static int device_add_class_symlinks(str
 					  &parent->kobj,
 					  "device");
 		if (error)
-			goto out_busid;
+			goto out_busid_dep;
 
 		class_name = make_class_name(dev->class->name,
 						&dev->kobj);
@@ -736,12 +769,14 @@ static int device_add_class_symlinks(str
 out_device:
 	if (dev->parent && device_is_not_partition(dev))
 		sysfs_remove_link(&dev->kobj, "device");
-out_busid:
+out_busid_dep:
 	if (dev->kobj.parent != &dev->class->p->class_subsys.kobj &&
 	    device_is_not_partition(dev))
 		sysfs_remove_link(&dev->class->p->class_subsys.kobj,
 				  dev_name(dev));
-#else
+	goto out_subsys;
+
+nodep:
 	/* link in the class directory pointing to the device */
 	error = sysfs_create_link(&dev->class->p->class_subsys.kobj,
 				  &dev->kobj, dev_name(dev));
@@ -752,14 +787,12 @@ out_busid:
 		error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
 					  "device");
 		if (error)
-			goto out_busid;
+			goto out_busid_nodep;
 	}
 	return 0;
 
-out_busid:
+out_busid_nodep:
 	sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev));
-#endif
-
 out_subsys:
 	sysfs_remove_link(&dev->kobj, "subsystem");
 out:
@@ -771,7 +804,9 @@ static void device_remove_class_symlinks
 	if (!dev->class)
 		return;
 
-#ifdef CONFIG_SYSFS_DEPRECATED
+	if (!sysfs_deprecated)
+		goto nodep;
+
 	if (dev->parent && device_is_not_partition(dev)) {
 		char *class_name;
 
@@ -787,13 +822,14 @@ static void device_remove_class_symlinks
 	    device_is_not_partition(dev))
 		sysfs_remove_link(&dev->class->p->class_subsys.kobj,
 				  dev_name(dev));
-#else
+	goto done;
+
+nodep:
 	if (dev->parent && device_is_not_partition(dev))
 		sysfs_remove_link(&dev->kobj, "device");
 
 	sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev));
-#endif
-
+done:
 	sysfs_remove_link(&dev->kobj, "subsystem");
 }
 
@@ -832,7 +868,7 @@ static struct kobject *device_to_dev_kob
 	if (dev->class)
 		kobj = dev->class->dev_kobj;
 	else
-		kobj = sysfs_dev_char_kobj;
+		kobj = ve_sysfs_dev_char_kobj;
 
 	return kobj;
 }
@@ -941,6 +977,17 @@ int device_add(struct device *dev)
 	if (platform_notify)
 		platform_notify(dev);
 
+	if (!is_dev_netdev(dev)) {
+		/*
+		 * Net devices must be moved with the namespace switching
+		 * machinery rather than through the standard ve_device
+		 * attributes, so ve_device_attr is not provided for them.
+		 */
+		error = device_create_file(dev, &ve_device_attr);
+		if (error)
+			goto veError;
+	}
+
 	error = device_create_file(dev, &uevent_attr);
 	if (error)
 		goto attrError;
@@ -1015,6 +1062,8 @@ done:
  ueventattrError:
 	device_remove_file(dev, &uevent_attr);
  attrError:
+	device_remove_file(dev, &ve_device_attr);
+ veError:
 	kobject_uevent(&dev->kobj, KOBJ_REMOVE);
 	kobject_del(&dev->kobj);
  Error:
@@ -1087,6 +1136,7 @@ void put_device(struct device *dev)
  * NOTE: this should be called manually _iff_ device_add() was
  * also called manually.
  */
+extern void ve_device_del(struct device *dev, struct ve_struct *ve);
 void device_del(struct device *dev)
 {
 	struct device *parent = dev->parent;
@@ -1121,6 +1171,7 @@ void device_del(struct device *dev)
 		mutex_unlock(&dev->class->p->class_mutex);
 	}
 	device_remove_file(dev, &uevent_attr);
+	device_remove_file(dev, &ve_device_attr);
 	device_remove_attrs(dev);
 	bus_remove_device(dev);
 	driver_deferred_probe_del(dev);
@@ -1137,6 +1188,7 @@ void device_del(struct device *dev)
 	 */
 	if (platform_notify_remove)
 		platform_notify_remove(dev);
+	ve_device_del(dev, NULL);
 	kobject_uevent(&dev->kobj, KOBJ_REMOVE);
 	cleanup_device_parent(dev);
 	kobject_del(&dev->kobj);
@@ -1278,31 +1330,61 @@ struct device *device_find_child(struct 
 	return child;
 }
 
-int __init devices_init(void)
+int devices_init(void)
 {
-	devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
-	if (!devices_kset)
-		return -ENOMEM;
-	dev_kobj = kobject_create_and_add("dev", NULL);
-	if (!dev_kobj)
+	ve_devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
+	if (!ve_devices_kset)
+		goto dev_kset_err;
+	ve_dev_kobj = kobject_create_and_add("dev", NULL);
+	if (!ve_dev_kobj)
 		goto dev_kobj_err;
-	sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
-	if (!sysfs_dev_block_kobj)
-		goto block_kobj_err;
-	sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
-	if (!sysfs_dev_char_kobj)
-		goto char_kobj_err;
-
+	ve_sysfs_dev_block_kobj = kobject_create_and_add("block", ve_dev_kobj);
+	if (!ve_sysfs_dev_block_kobj)
+		goto dev_block_kobj_err;
+	ve_sysfs_dev_char_kobj = kobject_create_and_add("char", ve_dev_kobj);
+	if (!ve_sysfs_dev_char_kobj)
+		goto dev_char_kobj_err;
+	/*
+	 * Create the "/sys/block" kobject for any ve other than ve0.
+	 * For ve0 this kobject is already created at init time by
+	 * genhd_device_init() (block/genhd.c).
+	 */
+	if (!ve_is_super(get_exec_env())) {
+		ve_sysfs_block_kobj = kobject_create_and_add("block", NULL);
+		if (!ve_sysfs_block_kobj)
+			goto block_kobj_err;
+	}
 	return 0;
 
- char_kobj_err:
-	kobject_put(sysfs_dev_block_kobj);
- block_kobj_err:
-	kobject_put(dev_kobj);
- dev_kobj_err:
-	kset_unregister(devices_kset);
+block_kobj_err:
+	kobject_put(ve_sysfs_dev_char_kobj);
+dev_char_kobj_err:
+	kobject_put(ve_sysfs_dev_block_kobj);
+dev_block_kobj_err:
+	kobject_put(ve_dev_kobj);
+dev_kobj_err:
+	kset_unregister(ve_devices_kset);
+dev_kset_err:
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(devices_init);
+
+void devices_fini(void)
+{
+	/*
+	 * Drop the "/sys/block" kobject for ve != ve0 only,
+	 * because the kernel never deletes /sys/block for ve0.
+	 */
+	if (!ve_is_super(get_exec_env()))
+		kobject_put(ve_sysfs_block_kobj);
+	kobject_put(ve_sysfs_dev_char_kobj);
+	kobject_put(ve_sysfs_dev_block_kobj);
+	kobject_put(ve_dev_kobj);
+	kset_unregister(ve_devices_kset);
+	kobject_put(virtual_dir);
+}
+EXPORT_SYMBOL_GPL(devices_fini);
+
 
 EXPORT_SYMBOL_GPL(device_for_each_child);
 EXPORT_SYMBOL_GPL(device_find_child);
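
The rewritten devices_init() above follows the kernel's goto-based unwind ladder: each allocation gets its own error label, and a failure jumps to the label that releases everything allocated before it, in reverse order. A minimal userspace sketch of the idiom, with illustrative names only:

/* A minimal sketch of the unwind ladder; init_three() and its labels
 * are illustrative, not the kernel code. */
#include <stdio.h>
#include <stdlib.h>

static int init_three(void)
{
	void *a, *b, *c;

	a = malloc(16);
	if (!a)
		goto a_err;
	b = malloc(16);
	if (!b)
		goto b_err;
	c = malloc(16);
	if (!c)
		goto c_err;

	/* success: in the kernel the resources stay allocated */
	free(c);
	free(b);
	free(a);
	return 0;

c_err:
	free(b);	/* undo step 2 */
b_err:
	free(a);	/* undo step 1 */
a_err:
	return -1;	/* the kernel code returns -ENOMEM here */
}

int main(void)
{
	printf("init_three() -> %d\n", init_three());
	return 0;
}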
@@ -1511,7 +1593,7 @@ struct device *device_create(struct clas
 	va_end(vargs);
 	return dev;
 }
-EXPORT_SYMBOL_GPL(device_create);
+EXPORT_SYMBOL(device_create);
 
 static int __match_devt(struct device *dev, void *data)
 {
@@ -1538,7 +1620,7 @@ void device_destroy(struct class *class,
 		device_unregister(dev);
 	}
 }
-EXPORT_SYMBOL_GPL(device_destroy);
+EXPORT_SYMBOL(device_destroy);
 
 /**
  * device_rename - renames a device
@@ -1564,10 +1646,8 @@ int device_rename(struct device *dev, co
 	pr_debug("device: '%s': %s: renaming to '%s'\n", dev_name(dev),
 		 __func__, new_name);
 
-#ifdef CONFIG_SYSFS_DEPRECATED
-	if ((dev->class) && (dev->parent))
+	if (sysfs_deprecated && (dev->class) && (dev->parent))
 		old_class_name = make_class_name(dev->class->name, &dev->kobj);
-#endif
 
 	old_device_name = kstrdup(dev_name(dev), GFP_KERNEL);
 	if (!old_device_name) {
@@ -1579,8 +1659,7 @@ int device_rename(struct device *dev, co
 	if (error)
 		goto out;
 
-#ifdef CONFIG_SYSFS_DEPRECATED
-	if (old_class_name) {
+	if (sysfs_deprecated && old_class_name) {
 		new_class_name = make_class_name(dev->class->name, &dev->kobj);
 		if (new_class_name) {
 			error = sysfs_create_link_nowarn(&dev->parent->kobj,
@@ -1591,8 +1670,7 @@ int device_rename(struct device *dev, co
 			sysfs_remove_link(&dev->parent->kobj, old_class_name);
 		}
 	}
-#else
-	if (dev->class) {
+	if (!sysfs_deprecated && dev->class) {
 		error = sysfs_create_link_nowarn(&dev->class->p->class_subsys.kobj,
 						 &dev->kobj, dev_name(dev));
 		if (error)
@@ -1600,7 +1678,6 @@ int device_rename(struct device *dev, co
 		sysfs_remove_link(&dev->class->p->class_subsys.kobj,
 				  old_device_name);
 	}
-#endif
 
 out:
 	put_device(dev);
@@ -1618,9 +1695,11 @@ static int device_move_class_links(struc
 				   struct device *new_parent)
 {
 	int error = 0;
-#ifdef CONFIG_SYSFS_DEPRECATED
 	char *class_name;
 
+	if (!sysfs_deprecated)
+		goto nodep;
+
 	class_name = make_class_name(dev->class->name, &dev->kobj);
 	if (!class_name) {
 		error = -ENOMEM;
@@ -1644,14 +1723,14 @@ static int device_move_class_links(struc
 out:
 	kfree(class_name);
 	return error;
-#else
+
+nodep:
 	if (old_parent)
 		sysfs_remove_link(&dev->kobj, "device");
 	if (new_parent)
 		error = sysfs_create_link(&dev->kobj, &new_parent->kobj,
 					  "device");
 	return error;
-#endif
 }
 
 /**
@@ -1742,7 +1821,12 @@ void device_shutdown(void)
 {
 	struct device *dev, *devn;
 
-	list_for_each_entry_safe_reverse(dev, devn, &devices_kset->list,
+	if (!ve_is_super(get_exec_env())) {
+		printk("BUG: device_shutdown call from inside VE\n");
+		return;
+	}
+
+	list_for_each_entry_safe_reverse(dev, devn, &ve_devices_kset->list,
 				kobj.entry) {
 		if (dev->bus && dev->bus->shutdown) {
 			dev_dbg(dev, "shutdown\n");
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/base/devtmpfs.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/devtmpfs.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/base/devtmpfs.c	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/devtmpfs.c	2015-01-21 12:02:47.435114578 +0300
@@ -24,8 +24,20 @@
 #include <linux/sched.h>
 #include <linux/init_task.h>
 
+#ifdef CONFIG_VE
+static inline struct vfsmount *ve_devmnt(void)
+{
+	return get_exec_env()->devtmpfs_mnt;
+}
+#else
 static struct vfsmount *dev_mnt;
 
+static inline struct vfsmount *ve_devmnt(void)
+{
+	return dev_mnt;
+}
+#endif
+
 #if defined CONFIG_DEVTMPFS_MOUNT
 static int dev_mount = 1;
 #else
@@ -45,11 +57,12 @@ static int dev_get_sb(struct file_system
 	return get_sb_single(fs_type, flags, data, shmem_fill_super, mnt);
 }
 
-static struct file_system_type dev_fs_type = {
+struct file_system_type dev_fs_type = {
 	.name = "devtmpfs",
 	.get_sb = dev_get_sb,
 	.kill_sb = kill_litter_super,
 };
+EXPORT_SYMBOL(dev_fs_type);
 
 #ifdef CONFIG_BLOCK
 static inline int is_blockdev(struct device *dev)
@@ -65,6 +78,7 @@ static int dev_mkdir(const char *name, m
 	struct nameidata nd;
 	struct dentry *dentry;
 	int err;
+	struct vfsmount *dev_mnt = ve_devmnt();
 
 	err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
 			      name, LOOKUP_PARENT, &nd);
@@ -89,6 +103,7 @@ static int create_path(const char *nodep
 	char *path;
 	struct nameidata nd;
 	int err = 0;
+	struct vfsmount *dev_mnt = ve_devmnt();
 
 	path = kstrdup(nodepath, GFP_KERNEL);
 	if (!path)
@@ -136,10 +151,12 @@ int devtmpfs_create_node(struct device *
 	const char *tmp = NULL;
 	const char *nodename;
 	const struct cred *curr_cred;
+	struct user_beancounter *curr_ub;
 	mode_t mode = 0;
 	struct nameidata nd;
 	struct dentry *dentry;
 	int err;
+	struct vfsmount *dev_mnt = ve_devmnt();
 
 	if (!dev_mnt)
 		return 0;
@@ -155,6 +172,7 @@ int devtmpfs_create_node(struct device *
 	else
 		mode |= S_IFCHR;
 
+	curr_ub = set_exec_ub(&ub0);
 	curr_cred = override_creds(&init_cred);
 	err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
 			      nodename, LOOKUP_PARENT, &nd);
@@ -188,6 +206,7 @@ int devtmpfs_create_node(struct device *
 out:
 	kfree(tmp);
 	revert_creds(curr_cred);
+	(void)set_exec_ub(curr_ub);
 	return err;
 }
 
@@ -196,6 +215,7 @@ static int dev_rmdir(const char *name)
 	struct nameidata nd;
 	struct dentry *dentry;
 	int err;
+	struct vfsmount *dev_mnt = ve_devmnt();
 
 	err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
 			      name, LOOKUP_PARENT, &nd);
@@ -246,6 +266,8 @@ static int delete_path(const char *nodep
 
 static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat)
 {
+	struct vfsmount *dev_mnt = ve_devmnt();
+
 	/* did we create it */
 	if (inode->i_private != &dev_mnt)
 		return 0;
@@ -275,6 +297,7 @@ int devtmpfs_delete_node(struct device *
 	struct kstat stat;
 	int deleted = 1;
 	int err;
+	struct vfsmount *dev_mnt = ve_devmnt();
 
 	if (!dev_mnt)
 		return 0;
@@ -339,6 +362,7 @@ int devtmpfs_mount(const char *mountpoin
 {
 	struct path path;
 	int err;
+	struct vfsmount *dev_mnt = ve_devmnt();
 
 	if (!dev_mount)
 		return 0;
@@ -382,7 +406,11 @@ int __init devtmpfs_init(void)
 		unregister_filesystem(&dev_fs_type);
 		return err;
 	}
+#ifdef CONFIG_VE
+	get_ve0()->devtmpfs_mnt = mnt;
+#else
 	dev_mnt = mnt;
+#endif
 
 	printk(KERN_INFO "devtmpfs: initialized\n");
 	return 0;
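
A recurring pattern in the devtmpfs hunks above (and in vedev.c below) is the save-and-restore idiom around set_exec_ub() and set_exec_env(): the setter returns the previous context, which the caller stashes and restores on every exit path. A small userspace sketch of that idiom, with stand-in names:

/* A minimal sketch of the save-restore idiom; current_ctx, set_ctx()
 * and the ctx ids are illustrative stand-ins, not kernel interfaces. */
#include <stdio.h>

static int current_ctx;			/* stands in for the current UB/VE */

static int set_ctx(int new_ctx)
{
	int old = current_ctx;		/* the setter returns the old context */
	current_ctx = new_ctx;
	return old;
}

static void create_node_in_ctx(int ctx)
{
	int old = set_ctx(ctx);		/* switch, remembering what was there */

	printf("creating node while charged to ctx %d\n", current_ctx);
	(void)set_ctx(old);		/* restore on every exit path */
}

int main(void)
{
	current_ctx = 0;		/* "ub0"/"ve0": the host context */
	create_node_in_ctx(101);
	printf("back in ctx %d\n", current_ctx);
	return 0;
}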
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/base/sys.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/sys.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/base/sys.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/sys.c	2015-01-21 12:02:43.668214585 +0300
@@ -20,6 +20,8 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pm.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
 #include <linux/interrupt.h>
@@ -567,7 +569,7 @@ EXPORT_SYMBOL_GPL(sysdev_resume);
 
 int __init system_bus_init(void)
 {
-	system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj);
+	system_kset = kset_create_and_add("system", NULL, &ve_devices_kset->kobj);
 	if (!system_kset)
 		return -ENOMEM;
 	return 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/base/vedev.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/vedev.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/base/vedev.c	2015-01-21 12:02:44.450193825 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/base/vedev.c	2015-01-21 12:02:47.520112321 +0300
@@ -0,0 +1,838 @@
+#include <linux/sched.h>
+#include <linux/kobject.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/genhd.h>
+#include <linux/vzcalluser.h>
+
+#include "base.h"
+
+struct ve_device_link {
+	char *name;
+	struct kobject *kobj;
+	struct list_head list;
+};
+
+struct ve_device {
+	struct ve_struct *ve;
+	struct device *dev;
+	struct list_head kobj_list;
+	struct list_head ve_list;
+	struct kobject *kobj;
+	struct list_head links;
+	struct kobject *net_link;
+	unsigned int perms_set : 1,
+		     devtmpfs_node_created : 1;
+};
+
+static DECLARE_MUTEX(vedev_lock);
+
+extern struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
+extern void sysfs_put_active(struct sysfs_dirent *sd);
+
+static void __ve_remove_netdev(struct ve_device *ve_dev);
+
+static struct kobject *ve_kobj_path_lookup(char *path, bool create)
+{
+	char *e, *p = path;
+	struct sysfs_dirent *sd, *parent_sd = get_exec_env()->_sysfs_root;
+	struct kobject *k, *pk = NULL;
+
+	if (*p == '/')
+		p++;
+
+	while (1) {
+		e = strchr(p, '/');
+		if (e)
+			*e = '\0';
+		sd = sysfs_get_dirent(parent_sd, p);
+		if (sd == NULL) {
+new:			if (!create) {
+				kobject_put(pk);
+				return ERR_PTR(-ENOENT);
+			}
+			k = kobject_create_and_add(p, pk);
+			kobject_put(pk);
+			if (!k)
+				return ERR_PTR(-ENOMEM);
+		} else {
+			unsigned int f;
+			bool my_parent_is_symlink = false;
+
+follow_symlink:		f = sd->s_flags;
+			if (!(f & SYSFS_DIR) && !(f & SYSFS_KOBJ_LINK)) {
+				sysfs_put(sd);
+
+				return (f & SYSFS_DIR_LINK) && create ?
+					ERR_PTR(-EEXIST) : ERR_PTR(-EINVAL);
+			}
+
+			/* the directory may have been deleted */
+			if (!sysfs_get_active(sd)) {
+				sysfs_put(sd);
+				if (my_parent_is_symlink) {
+					kobject_put(pk);
+					return create ? ERR_PTR(-EINVAL) :
+						        ERR_PTR(-ENOENT);
+				}
+				goto new;
+			}
+
+			if (f & SYSFS_KOBJ_LINK) {
+				struct sysfs_dirent *old_sd = sd;
+
+				sd = sd->s_symlink.target_sd;
+				sysfs_get(sd);
+
+				sysfs_put_active(old_sd);
+				sysfs_put(old_sd);
+				my_parent_is_symlink = true;
+				goto follow_symlink;
+			}
+
+			k = sd->s_dir.kobj;
+
+			kobject_get(k);
+			kobject_put(pk);
+			sysfs_put_active(sd);
+			sysfs_put(sd);
+		}
+		pk = k;
+		parent_sd = k->sd;
+		if (!e)
+			break;
+
+		p = e + 1;
+	}
+
+	return k;
+}
+
+static inline struct kobject *vedev_kobj_path_create(char *path, struct ve_device *ve_dev)
+{
+	struct kobject *obj;
+	struct ve_struct *old_ve = set_exec_env(ve_dev->ve);
+	obj = ve_kobj_path_lookup(path, true);
+	set_exec_env(old_ve);
+	return obj;
+}
+
+static int ve_device_add_symlink(struct kobject *kobj, const char *name,
+			struct ve_device *ve_dev)
+{
+	char *path;
+	int ret = -ENOMEM;
+	struct kobject *dev_kobj, *ve_kobj = NULL;
+	struct ve_device_link *ve_link;
+
+	path = kobject_get_path(kobj, GFP_KERNEL);
+	if (!path)
+		goto out;
+
+	ve_kobj = vedev_kobj_path_create(path, ve_dev);
+	kfree(path);
+	if (IS_ERR(ve_kobj)) {
+		ret = PTR_ERR(ve_kobj);
+		ve_kobj = NULL;
+		goto out;
+	}
+
+	ve_link = kmalloc(sizeof(struct ve_device_link), GFP_KERNEL);
+	if (!ve_link)
+		goto out;
+
+	ve_link->name = kstrdup(name, GFP_KERNEL);
+	if (!ve_link->name)
+		goto out_free;
+
+	if (ve_dev->kobj)
+		dev_kobj = ve_dev->kobj;
+	else
+		dev_kobj = &ve_dev->dev->kobj;
+
+	ret = sysfs_create_link(ve_kobj, dev_kobj, ve_link->name);
+	if (ret)
+		goto out_free_name;
+
+	ve_link->kobj = ve_kobj;
+	list_add(&ve_link->list, &ve_dev->links);
+
+	return 0;
+
+out_free_name:
+	kfree(ve_link->name);
+out_free:
+	kfree(ve_link);
+out:
+	kobject_put(ve_kobj);
+	return ret;
+}
+
+static void dirlink_kobj_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
+static struct kobj_type dirlink_kobj_ktype = {
+	.release	= dirlink_kobj_release,
+};
+
+static struct kobject *kobject_link_create(struct kobject *parent, struct kobject *target)
+{
+	struct sysfs_dirent *sd;
+	struct kobject *kobj;
+
+	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+	if (!kobj)
+		return ERR_PTR(-ENOMEM);
+
+	kobject_init(kobj, &dirlink_kobj_ktype);
+
+	kobject_set_name(kobj, "%s", kobject_name(target));
+	sd = sysfs_create_dirlink(parent->sd, target);
+	if (IS_ERR(sd)) {
+		kobject_put(kobj);
+		kobj = (struct kobject *) sd;
+		goto out;
+	}
+	kobj->sd = sd;
+	kobj->parent = kobject_get(parent);
+out:
+	return kobj;
+}
+
+static void kobject_link_del(struct kobject *kobj, struct ve_struct *ve)
+{
+	struct ve_struct *old_ve;
+	if (!kobj)
+		return;
+	if (kobj->sd) {
+		old_ve = set_exec_env(ve);
+		sysfs_remove_dirlink(kobj->sd);
+		set_exec_env(old_ve);
+	}
+	kobj->sd = NULL;
+	kobject_put(kobj->parent);
+	kobject_put(kobj);
+}
+
+static int ve_device_link_kobj(struct ve_device *ve_dev)
+{
+	char *path, *p;
+	int ret = 0;
+	struct sysfs_dirent *sd;
+	struct kobject *k = NULL, *pk = NULL;
+
+	path = kobject_get_path(&ve_dev->dev->kobj, GFP_KERNEL);
+	if (!path) {
+		return -ENOMEM;
+	}
+	p = strrchr(path, '/');
+	if (p && p != path) {
+		*p = '\0';
+		p++;
+		pk = vedev_kobj_path_create(path, ve_dev);
+		if (IS_ERR(pk)) {
+			ret = PTR_ERR(pk);
+			pk = NULL;
+			goto out;
+		}
+	} else {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	sd = sysfs_get_dirent(pk->sd, p);
+	if (sd != NULL) {
+		sysfs_put(sd);
+		ret = -EEXIST;
+		goto out;
+	}
+
+	k = kobject_link_create(pk, &ve_dev->dev->kobj);
+	if (IS_ERR(k)) {
+		ret = PTR_ERR(k);
+		goto out;
+	}
+	ve_dev->kobj = k;
+
+out:
+	kobject_put(pk);
+	kfree(path);
+	return ret;
+}
+
+static int ve_device_link_bus(struct ve_device *ve_dev)
+{
+	struct kobject *devs_kobj = NULL;
+	int ret = 0;
+
+	if (ve_dev->dev->bus) {
+		devs_kobj = &ve_dev->dev->bus->p->devices_kset->kobj;
+		ret = ve_device_add_symlink(devs_kobj, dev_name(ve_dev->dev), ve_dev);
+	}
+
+	return ret;
+}
+
+static int ve_device_link_class(struct ve_device *ve_dev)
+{
+	struct kobject *devs_kobj = NULL;
+	struct device *dev = ve_dev->dev;
+	int ret = 0;
+
+	if (!dev->class)
+		return 0;
+
+	if (dev->kobj.parent != &dev->class->p->class_subsys.kobj) {
+		devs_kobj = &dev->class->p->class_subsys.kobj;
+		ret = ve_device_add_symlink(devs_kobj, dev_name(dev), ve_dev);
+	}
+
+	return ret;
+}
+
+static int ve_device_link_dev_block(struct ve_device *ve_dev)
+{
+	struct device *dev = ve_dev->dev;
+	char dev_nums[20];
+
+	snprintf(dev_nums, sizeof(dev_nums), "%d:%d", MAJOR(dev->devt),
+			MINOR(dev->devt));
+	return ve_device_add_symlink(ve_dev->ve->dev_block_kobj, dev_nums,
+					ve_dev);
+}
+
+static void ve_device_del_link(struct ve_device *ve_dev)
+{
+	struct ve_device_link *l, *t;
+	list_for_each_entry_safe(l, t, &ve_dev->links, list) {
+		sysfs_remove_link(l->kobj, l->name);
+		kobject_put(l->kobj);
+		kfree(l->name);
+		kfree(l);
+	}
+	kobject_link_del(ve_dev->kobj, ve_dev->ve);
+}
+
+static int ve_device_create_link(struct ve_device *ve_dev)
+{
+	int ret;
+	ret = ve_device_link_kobj(ve_dev);
+	if (ret == -EEXIST)
+		goto out;
+	if (ret)
+		goto err;
+	ret = ve_device_link_bus(ve_dev);
+	if (ret)
+		goto err;
+out:
+	return 0;
+err:
+	ve_device_del_link(ve_dev);
+	return ret;
+}
+
+static inline struct ve_device *__ve_device_find(struct list_head *head,
+						struct ve_struct *ve)
+{
+	struct ve_device *ve_dev;
+
+	list_for_each_entry(ve_dev, head, kobj_list)
+		if (ve_dev->ve == ve)
+			return ve_dev;
+
+	return NULL;
+}
+
+static struct ve_device *__ve_device_subscribe(struct device *dev, struct ve_struct *ve)
+{
+	struct ve_device *ve_dev;
+
+	ve_dev = kzalloc(sizeof(struct ve_device), GFP_KERNEL);
+
+	if (!ve_dev)
+		return ERR_PTR(-ENOMEM);
+
+	ve_dev->ve = ve;
+	ve_dev->dev = dev;
+	get_device(dev);
+	INIT_LIST_HEAD(&ve_dev->links);
+
+	list_add(&ve_dev->kobj_list, &dev->kobj.env_head);
+	list_add(&ve_dev->ve_list, &ve->devices);
+	return ve_dev;
+}
+
+static struct ve_device *ve_device_subscribe(struct device *dev, struct ve_struct *ve)
+{
+	struct ve_device *ve_dev;
+
+	down(&vedev_lock);
+
+	if (__ve_device_find(&dev->kobj.env_head, ve)) {
+		ve_dev = ERR_PTR(-EEXIST);
+		goto out;
+	}
+
+	ve_dev = __ve_device_subscribe(dev, ve);
+out:
+	up(&vedev_lock);
+	return ve_dev;
+}
+
+static int ve_devtmpfs_delete_node(struct ve_struct *ve, struct device *dev)
+{
+	struct ve_struct *old_ve = set_exec_env(ve);
+	int err = devtmpfs_delete_node(dev);
+	set_exec_env(old_ve);
+	return err;
+}
+
+static void ve_device_del_one(struct ve_device *ve_dev, int event)
+{
+	struct ve_struct *old_ve;
+	unsigned type;
+
+	list_del(&ve_dev->ve_list);
+	list_del(&ve_dev->kobj_list);
+
+	if (event) {
+		old_ve = set_exec_env(ve_dev->ve);
+		kobject_uevent_env_one(&ve_dev->dev->kobj, KOBJ_REMOVE, NULL);
+		set_exec_env(old_ve);
+	}
+
+	ve_device_del_link(ve_dev);
+
+	if (ve_dev->perms_set) {
+		type = ve_dev->dev->class == &block_class ? S_IFBLK : S_IFCHR;
+		set_device_perms_ve(ve_dev->ve, type, ve_dev->dev->devt, 00);
+	}
+
+	if (ve_dev->devtmpfs_node_created)
+		ve_devtmpfs_delete_node(ve_dev->ve, ve_dev->dev);
+
+	put_device(ve_dev->dev);
+	kfree(ve_dev);
+}
+
+void ve_device_del(struct device *dev, struct ve_struct *ve)
+{
+	struct ve_device *ve_dev, *tmp;
+	down(&vedev_lock);
+	list_for_each_entry_safe(ve_dev, tmp, &dev->kobj.env_head, kobj_list) {
+		if (ve && ve_dev->ve != ve)
+			continue;
+
+		ve_device_del_one(ve_dev, 1);
+	}
+	up(&vedev_lock);
+}
+
+/*
+ * Check whether the physical device is a NIC
+ * (i.e. whether it has a "net" subdirectory in sysfs)
+ */
+static inline int is_phydev_net(struct device *dev)
+{
+	struct sysfs_dirent *sd;
+
+	sd = sysfs_get_dirent(dev->kobj.sd, "net");
+	if (!sd)
+		return 0;
+
+	sysfs_put(sd);
+	return 1;
+}
+
+static int ve_devtmpfs_create_node(struct ve_struct *ve, struct device *dev)
+{
+	struct ve_struct *old_ve = set_exec_env(ve);
+	int err = devtmpfs_create_node(dev);
+	set_exec_env(old_ve);
+	return err;
+}
+
+static int ve_device_add(struct device *dev, struct ve_struct *ve,
+			 unsigned mask)
+{
+	int ret = 0;
+	struct ve_device *ve_dev;
+
+	if (is_phydev_net(dev))
+		return -EPERM;
+
+	ve_dev = ve_device_subscribe(dev, ve);
+	if (IS_ERR(ve_dev))
+		return PTR_ERR(ve_dev);
+
+	ret = ve_device_create_link(ve_dev);
+	if (ret < 0)
+		goto err;
+
+	ret = ve_device_link_class(ve_dev);
+	if (ret)
+		goto err;
+
+	/*
+	 * Block devices need the symlinks below so that
+	 * lsblk is able to work inside a ve.
+	 */
+	if (dev->class == &block_class) {
+
+		/*
+		 * Make link /sys/block/devName ->
+		 * ../devices/virtual/block/devName
+		 */
+		ret = ve_device_add_symlink(ve_dev->ve->block_kobj,
+				dev_name(ve_dev->dev), ve_dev);
+		if (ret)
+			goto err;
+
+		/*
+		 * Make link /sys/dev/block/devMAJOR:devMINOR ->
+		 * ../devices/virtual/block/devName
+		 */
+		ret = ve_device_link_dev_block(ve_dev);
+		if (ret)
+			goto err;
+	}
+
+	if (MAJOR(dev->devt)) {
+		unsigned type = dev->class == &block_class ? S_IFBLK : S_IFCHR;
+		type |= VE_USE_MINOR; /* see switch in set_device_perms_ve() */
+
+		ret = ve_devtmpfs_create_node(ve, dev);
+		if (ret < 0 && ret != -EEXIST)
+			goto err;
+		ve_dev->devtmpfs_node_created = 1;
+
+		ret = set_device_perms_ve(ve, type, dev->devt, mask);
+		if (ret < 0)
+			goto err;
+		ve_dev->perms_set = 1;
+	}
+
+	return ret;
+err:
+	down(&vedev_lock);
+	ve_device_del_one(ve_dev, 0);
+	up(&vedev_lock);
+	return ret;
+}
+
+static void ve_device_uevent_add(struct device *dev, struct ve_struct *ve)
+{
+	struct ve_struct *old_ve = set_exec_env(ve);
+	kobject_uevent_env_one(&dev->kobj, KOBJ_ADD, NULL);
+	set_exec_env(old_ve);
+}
+
+ssize_t ve_device_handler(struct device *dev, struct device_attribute *attr,
+			  const char *buf, size_t count)
+{
+	int ret;
+	struct ve_struct *ve;
+	envid_t veid;
+	char cmd;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	if (buf[count] != '\0')
+		return -EINVAL;
+
+	if (!strchr("+-", *buf))
+		return -EINVAL;
+	cmd = *buf;
+	buf++;
+
+	if (sscanf(buf, "%u", &veid) != 1)
+		return -EINVAL;
+
+	ve = get_ve_by_id(veid);
+
+	ret = -ENOENT;
+	if (!ve || !ve->is_running)
+		goto out;
+
+	if (cmd == '+') {
+		ret = ve_device_add(dev, ve, 06);
+		if (!ret)
+			ve_device_uevent_add(dev, ve);
+	} else {
+		ve_device_del(dev, ve);
+		ret = 0;
+	}
+out:
+	put_ve(ve);
+	if (unlikely(ret))
+		return ret;
+
+	return count;
+}
+
+static struct kobject *devt2kobj(dev_t devt, unsigned type)
+{
+	const int size = 32;
+	char symlink_path[size];
+	char *type_str;
+
+	if ((type & S_IFMT) == S_IFBLK)
+		type_str = "block";
+	else if ((type & S_IFMT) == S_IFCHR)
+		type_str = "char";
+	else
+		return ERR_PTR(-EINVAL);
+
+	snprintf(symlink_path, size, "dev/%s/%d:%d",
+		 type_str, MAJOR(devt), MINOR(devt));
+	symlink_path[size-1] = 0;
+
+	return ve_kobj_path_lookup(symlink_path, 0);
+}
+
+extern int devcgroup_device_exist(struct cgroup *cgrp, unsigned type,
+				  dev_t device);
+
+int ve_devt_add(struct ve_struct *ve, unsigned type, dev_t devt, unsigned mask)
+{
+	struct kobject *dev_kobj;
+	struct device *dev;
+	bool del = !(mask & (S_IRWXO | S_IXGRP));
+	int err = 0;
+
+	dev_kobj = devt2kobj(devt, type);
+	if (IS_ERR(dev_kobj)) {
+		err = PTR_ERR(dev_kobj);
+
+		/* If a special device (like vzlinkdev) was not registered as
+		 * /sys/dev/{block|char}/MAJOR:MINOR on the host, skip the
+		 * sysfs work and only update the permissions. */
+		if (err == -ENOENT)
+			err = set_device_perms_ve(ve, type, devt, mask);
+
+		return err;
+	}
+
+	if (devcgroup_device_exist(ve->ve_cgroup, type, devt)) {
+		err = set_device_perms_ve(ve, type, devt, mask);
+		if (err || !del)
+			goto err;
+	}
+
+	dev = container_of(dev_kobj, struct device, kobj);
+	if (dev->devt != devt ||
+	    (dev->class == &block_class ? S_IFBLK : S_IFCHR) !=
+	    (type & S_IFMT)) {
+		printk(KERN_ERR "/sys/dev/%s/%d:%d points to %d:%d block=%d\n",
+		       (type & S_IFMT) == S_IFBLK ? "block" : "char",
+		       MAJOR(devt), MINOR(devt),
+		       MAJOR(dev->devt), MINOR(dev->devt),
+		       dev->class == &block_class);
+		err = -EINVAL;
+		goto err;
+	}
+
+	if (del)
+		ve_device_del(dev, ve);
+	else
+		err = ve_device_add(dev, ve, mask);
+err:
+	kobject_put(dev_kobj);
+	return err;
+}
+EXPORT_SYMBOL(ve_devt_add);
+
+void fini_ve_devices(struct ve_struct *ve)
+{
+	struct ve_device *ve_dev, *tmp;
+	down(&vedev_lock);
+	list_for_each_entry_safe(ve_dev, tmp, &ve->devices, ve_list) {
+		/* network devices are removed in netdev_fixup_sysfs() */
+		if (!ve_dev->net_link)
+			ve_device_del_one(ve_dev, 0);
+	}
+	up(&vedev_lock);
+}
+EXPORT_SYMBOL(fini_ve_devices);
+
+int ve_kobject_uevent_env(struct kobject *kobj,
+			enum kobject_action action, char *envp_ext[])
+{
+	int err, ret = 0;
+	struct ve_device *ve_dev;
+	struct ve_struct *ve_old;
+
+	down(&vedev_lock);
+	list_for_each_entry(ve_dev, &kobj->env_head, kobj_list) {
+		ve_old = set_exec_env(ve_dev->ve);
+		err = kobject_uevent_env_one(kobj, action, envp_ext);
+		if (err)
+			ret = err;
+		set_exec_env(ve_old);
+	}
+	up(&vedev_lock);
+
+	return ret;
+}
+
+static int ve_netdev_create(struct kobject *net_obj, struct ve_struct *ve)
+{
+	struct ve_device *ve_dev;
+	struct ve_struct *old_ve;
+	struct kobject *phy_obj = net_obj->parent;
+	struct device *phy_dev;
+	char *path, *p;
+	int err;
+	struct kobject *k = NULL, *pk = NULL;
+
+	/*
+	 * ve_netdev_create() must not be called for a network
+	 * interface that is not attached to a physical device.
+	 */
+	phy_dev = container_of(phy_obj, struct device, kobj);
+
+	ve_dev = __ve_device_subscribe(phy_dev, ve);
+
+	if (IS_ERR(ve_dev))
+		return PTR_ERR(ve_dev);
+
+	path = kobject_get_path(net_obj, GFP_KERNEL);
+
+	if (!path) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	p = strrchr(path, '/');
+
+	if (!p || (p == path)) {
+		err = -EINVAL;
+		kfree(path);
+		goto error;
+	}
+
+	*p = '\0';
+	old_ve = set_exec_env(ve);
+	pk = ve_kobj_path_lookup(path, true);
+	set_exec_env(old_ve);
+
+	kfree(path);
+
+	if (IS_ERR(pk)) {
+		err = PTR_ERR(pk);
+		goto error;
+	}
+
+	k = kobject_link_create(pk, net_obj);
+	kobject_put(pk);
+
+	if (IS_ERR(k)) {
+		err = PTR_ERR(k);
+		goto error;
+	}
+
+	ve_dev->net_link = k;
+	return 0;
+
+error:
+	ve_device_del_one(ve_dev, 0);
+	return err;
+}
+
+/*
+ * Search for the "device" symlink in the object's directory.
+ * If it exists, return the physical device; otherwise the
+ * device is a virtual one.
+ */
+static struct kobject *netdev_get_phy(struct device *dev)
+{
+	struct sysfs_dirent *sd, *link;
+	struct kobject *target = NULL;
+
+	sd = sysfs_get_dirent(dev->kobj.sd, "device");
+
+	if (!sd)
+		return NULL;
+
+	if (unlikely(!(sd->s_flags & SYSFS_KOBJ_LINK))) {
+		printk(KERN_ERR "device dirent of dev %s is not symlink.\n",
+			dev_name(dev));
+		goto exit;
+	}
+
+	link = sd->s_symlink.target_sd;
+
+	if (!(link->s_flags & (SYSFS_DIR | SYSFS_DIR_LINK))) {
+		printk(KERN_ERR "device link of %s is not describe phy dev\n",
+			dev_name(dev));
+		goto exit;
+	}
+
+	target = link->s_dir.kobj;
+
+exit:
+	sysfs_put(sd);
+	return target;
+}
+
+int ve_netdev_add(struct device *dev, struct ve_struct *ve)
+{
+	int err = -EINVAL;
+	struct ve_device *ve_dev;
+	struct kobject *phy_dev;
+
+	phy_dev = netdev_get_phy(dev);
+	if (!phy_dev)
+		/* Assume no phy object - virtual device */
+		return 0;
+
+	down(&vedev_lock);
+	ve_dev = __ve_device_find(&phy_dev->env_head, ve);
+
+	if (!ve_dev)
+		err = ve_netdev_create(dev->kobj.parent, ve);
+
+	up(&vedev_lock);
+	return err;
+}
+
+static void __ve_remove_netdev(struct ve_device *ve_dev)
+{
+	kobject_link_del(ve_dev->net_link, ve_dev->ve);
+
+	list_del(&ve_dev->ve_list);
+	list_del(&ve_dev->kobj_list);
+
+	put_device(ve_dev->dev);
+	kfree(ve_dev);
+}
+
+int ve_netdev_delete(struct device *dev, struct ve_struct *ve)
+{
+	int err = 0;
+	struct ve_device *ve_dev;
+	struct kobject *phy_dev;
+
+	phy_dev = netdev_get_phy(dev);
+	if (!phy_dev)
+		/* Assume no phy object - virtual device */
+		return 0;
+
+	down(&vedev_lock);
+
+	ve_dev = __ve_device_find(&phy_dev->env_head, ve);
+
+	if (!ve_dev || !ve_dev->net_link) {
+		printk(KERN_ERR "Can't delete virtual device %s in case "
+			"it is not present in VE.\n", kobject_name(phy_dev));
+		err = -EINVAL;
+		goto out;
+	}
+
+	__ve_remove_netdev(ve_dev);
+
+out:
+	up(&vedev_lock);
+	return err;
+}
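
ve_device_handler() above is the store method backing ve_device_attr: writing "+VEID" assigns the device to a running container, and "-VEID" revokes it (the attribute's file name is defined elsewhere in the patch, so the exact sysfs path is left unspecified here). A self-contained userspace sketch of the same command parsing:

/* Sketch of the "+VEID"/"-VEID" parsing performed by ve_device_handler();
 * parse_ve_cmd() is a hypothetical helper mirroring the kernel logic,
 * not part of the patch. */
#include <stdio.h>
#include <string.h>

static int parse_ve_cmd(const char *buf, char *cmd, unsigned int *veid)
{
	if (!strchr("+-", buf[0]))
		return -1;			/* must start with '+' or '-' */
	*cmd = buf[0];
	if (sscanf(buf + 1, "%u", veid) != 1)
		return -1;			/* the VE id must be decimal */
	return 0;
}

int main(void)
{
	char cmd;
	unsigned int veid;

	if (parse_ve_cmd("+101", &cmd, &veid) == 0)
		printf("cmd=%c veid=%u\n", cmd, veid);	/* cmd=+ veid=101 */
	if (parse_ve_cmd("x101", &cmd, &veid) != 0)
		printf("rejected, as the handler would (-EINVAL)\n");
	return 0;
}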
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/Kconfig	2014-12-12 23:29:31.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/Kconfig	2015-01-21 12:02:58.396823594 +0300
@@ -273,6 +273,23 @@ config BLK_DEV_CRYPTOLOOP
 	  instead, which can be configured to be on-disk compatible with the
 	  cryptoloop device.
 
+config BLK_DEV_PLOOP
+	tristate "Parallels loopback device support"
+	---help---
+	  Saying Y here will allow you to use a regular file as a block
+	  device; you can then create a file system on that block device and
+	  mount it just as you would mount other block devices such as hard
+	  drive partitions, CD-ROM drives or floppy drives. The ploop devices
+	  are block special device files with major number 182 and are
+	  typically called /dev/ploop0, /dev/ploop1 etc.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called ploop.
+
+	  Most users will answer N here.
+
+source "drivers/block/drbd/Kconfig"
+
 config BLK_DEV_NBD
 	tristate "Network block device support"
 	depends on NET
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/Makefile
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/Makefile	2014-12-12 23:29:31.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/Makefile	2015-01-21 12:02:58.396823594 +0300
@@ -37,6 +37,7 @@ obj-$(CONFIG_BLK_DEV_UB)	+= ub.o
 obj-$(CONFIG_BLK_DEV_HD)	+= hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
 obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/Kconfig	2015-01-21 12:02:58.374824177 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/Kconfig	2015-01-21 12:02:58.396823594 +0300
@@ -0,0 +1,49 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
+	depends on PROC_FS='n' || INET='n' || CONNECTOR='n'
+
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed Replicated Block Device support"
+	depends on PROC_FS && INET && CONNECTOR
+	select LRU_CACHE
+	default m
+	help
+
+	  NOTE: In order to authenticate connections you have to select
+	  CRYPTO_HMAC and a hash function as well.
+
+	  DRBD is a shared-nothing, synchronously replicated block device. It
+	  is designed to serve as a building block for high availability
+	  clusters and in this context, is a "drop-in" replacement for shared
+	  storage. Simplistically, you could see it as a network RAID 1.
+
+	  Each minor device has a role, which can be 'primary' or 'secondary'.
+	  On the node with the primary device the application is supposed to
+	  run and to access the device (/dev/drbdX). Every write is sent to
+	  the local 'lower level block device' and, across the network, to the
+	  node with the device in 'secondary' state.  The secondary device
+	  simply writes the data to its lower level block device.
+
+	  DRBD can also be used in dual-Primary mode (device writable on both
+	  nodes), which means it can exhibit shared disk semantics in a
+	  shared-nothing cluster.  Needless to say, on top of dual-Primary
+	  DRBD, using a cluster file system is necessary to maintain
+	  cache coherency.
+
+	  For automatic failover you need a cluster manager (e.g. heartbeat).
+	  See also: http://www.drbd.org/, http://www.linux-ha.org
+
+	  If unsure, say N.
+
+config DRBD_TRACE
+	tristate "DRBD tracing"
+	depends on BLK_DEV_DRBD
+	select TRACEPOINTS
+	help
+
+	  Say Y here if you want to be able to trace various events in DRBD.
+
+	  If unsure, say N.
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/Makefile
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/Makefile	2015-01-21 12:02:58.396823594 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/Makefile	2015-01-21 12:02:58.396823594 +0300
@@ -0,0 +1,7 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += lru_cache.o
+drbd-y += drbd_buildtag.o
+
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/cn_queue.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/cn_queue.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/cn_queue.c	2015-01-21 12:02:58.374824177 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/cn_queue.c	2015-01-21 12:02:58.374824177 +0300
@@ -0,0 +1,212 @@
+/*
+ * 	cn_queue.c
+ * 
+ * 2004-2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * Modified by Philipp Reisner to work on older 2.6.x kernels.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/suspend.h>
+#include <linux/connector.h>
+#include <linux/delay.h>
+
+#include <linux/drbd_config.h> /* In case kzalloc() is missing. */
+
+#ifdef NEED_BACKPORT_OF_KZALLOC
+static inline void *kzalloc(size_t size, int flags)
+{
+	void *rv = kmalloc(size, flags);
+	if (rv)
+		memset(rv, 0, size);
+
+	return rv;
+}
+#endif
+
+
+#ifndef KERNEL_HAS_MSLEEP
+/**
+ * msleep - sleep safely even with waitqueue interruptions
+ * @msecs: Time in milliseconds to sleep for
+ */
+static inline void msleep(unsigned int msecs)
+{
+	unsigned long timeout = (msecs * HZ + 999) / 1000;
+
+	while (timeout) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		timeout = schedule_timeout(timeout);
+	}
+}
+
+#endif
+
+void cn_queue_wrapper(void *data)
+{
+	struct cn_callback_data *d = data;
+
+	d->callback(d->callback_priv);
+
+	d->destruct_data(d->ddata);
+	d->ddata = NULL;
+
+	kfree(d->free);
+}
+
+static struct cn_callback_entry *cn_queue_alloc_callback_entry(char *name, struct cb_id *id, void (*callback)(void *))
+{
+	struct cn_callback_entry *cbq;
+
+	cbq = kzalloc(sizeof(*cbq), GFP_KERNEL);
+	if (!cbq) {
+		printk(KERN_ERR "Failed to create new callback queue.\n");
+		return NULL;
+	}
+
+	snprintf(cbq->id.name, sizeof(cbq->id.name), "%s", name);
+	memcpy(&cbq->id.id, id, sizeof(struct cb_id));
+	cbq->data.callback = callback;
+	
+	INIT_WORK(&cbq->work, &cn_queue_wrapper, &cbq->data);
+	return cbq;
+}
+
+static void cn_queue_free_callback(struct cn_callback_entry *cbq)
+{
+	cancel_delayed_work(&cbq->work);
+	flush_workqueue(cbq->pdev->cn_queue);
+
+	kfree(cbq);
+}
+
+int cn_cb_equal(struct cb_id *i1, struct cb_id *i2)
+{
+	return ((i1->idx == i2->idx) && (i1->val == i2->val));
+}
+
+int cn_queue_add_callback(struct cn_queue_dev *dev, char *name, struct cb_id *id, void (*callback)(void *))
+{
+	struct cn_callback_entry *cbq, *__cbq;
+	int found = 0;
+
+	cbq = cn_queue_alloc_callback_entry(name, id, callback);
+	if (!cbq)
+		return -ENOMEM;
+
+	atomic_inc(&dev->refcnt);
+	cbq->pdev = dev;
+
+	spin_lock_bh(&dev->queue_lock);
+	list_for_each_entry(__cbq, &dev->queue_list, callback_entry) {
+		if (cn_cb_equal(&__cbq->id.id, id)) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		list_add_tail(&cbq->callback_entry, &dev->queue_list);
+	spin_unlock_bh(&dev->queue_lock);
+
+	if (found) {
+		atomic_dec(&dev->refcnt);
+		cn_queue_free_callback(cbq);
+		return -EINVAL;
+	}
+
+	cbq->nls = dev->nls;
+	cbq->seq = 0;
+	cbq->group = cbq->id.id.idx;
+
+	return 0;
+}
+
+void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id)
+{
+	struct cn_callback_entry *cbq, *n;
+	int found = 0;
+
+	spin_lock_bh(&dev->queue_lock);
+	list_for_each_entry_safe(cbq, n, &dev->queue_list, callback_entry) {
+		if (cn_cb_equal(&cbq->id.id, id)) {
+			list_del(&cbq->callback_entry);
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock_bh(&dev->queue_lock);
+
+	if (found) {
+		cn_queue_free_callback(cbq);
+		atomic_dec_and_test(&dev->refcnt);
+	}
+}
+
+struct cn_queue_dev *cn_queue_alloc_dev(char *name, struct sock *nls)
+{
+	struct cn_queue_dev *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
+
+	snprintf(dev->name, sizeof(dev->name), "%s", name);
+	atomic_set(&dev->refcnt, 0);
+	INIT_LIST_HEAD(&dev->queue_list);
+	spin_lock_init(&dev->queue_lock);
+
+	dev->nls = nls;
+	dev->netlink_groups = 0;
+
+	dev->cn_queue = create_workqueue(dev->name);
+	if (!dev->cn_queue) {
+		kfree(dev);
+		return NULL;
+	}
+
+	return dev;
+}
+
+void cn_queue_free_dev(struct cn_queue_dev *dev)
+{
+	struct cn_callback_entry *cbq, *n;
+
+	flush_workqueue(dev->cn_queue);
+	destroy_workqueue(dev->cn_queue);
+
+	spin_lock_bh(&dev->queue_lock);
+	list_for_each_entry_safe(cbq, n, &dev->queue_list, callback_entry)
+		list_del(&cbq->callback_entry);
+	spin_unlock_bh(&dev->queue_lock);
+
+	while (atomic_read(&dev->refcnt)) {
+		printk(KERN_INFO "Waiting for %s to become free: refcnt=%d.\n",
+		       dev->name, atomic_read(&dev->refcnt));
+		msleep(1000);
+	}
+
+	kfree(dev);
+	dev = NULL;
+}
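
The msleep() backport above converts milliseconds to jiffies as (msecs * HZ + 999) / 1000, rounding up so that a non-zero delay never truncates to zero ticks. A quick userspace check of that arithmetic, with HZ fixed at 100 purely for illustration:

/* Userspace check of the round-up ms->jiffies conversion used by the
 * msleep() backport; HZ is assumed to be 100 here for illustration. */
#include <stdio.h>

#define HZ 100

static unsigned long ms_to_jiffies_roundup(unsigned int msecs)
{
	return (msecs * (unsigned long)HZ + 999) / 1000;
}

int main(void)
{
	/* 1 ms must still sleep at least one tick: (1*100+999)/1000 == 1 */
	printf("1ms  -> %lu jiffies\n", ms_to_jiffies_roundup(1));
	/* 15 ms at HZ=100 rounds up to 2 ticks: (1500+999)/1000 == 2 */
	printf("15ms -> %lu jiffies\n", ms_to_jiffies_roundup(15));
	return 0;
}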
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/bitops.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/bitops.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/bitops.h	2015-01-21 12:02:58.374824177 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/bitops.h	2015-01-21 12:02:58.374824177 +0300
@@ -0,0 +1,139 @@
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) && !defined(_ASM_GENERIC_BITOPS_LE_H_)
+/* compatibility for before the bitops/le.h split {{{ */
+
+#include <asm/types.h>
+#include <asm/byteorder.h>
+
+#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
+#define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
+
+#if defined(__LITTLE_ENDIAN)
+
+#define generic_test_le_bit(nr, addr) test_bit(nr, addr)
+#define generic___set_le_bit(nr, addr) __set_bit(nr, addr)
+#define generic___clear_le_bit(nr, addr) __clear_bit(nr, addr)
+
+#define generic_test_and_set_le_bit(nr, addr) test_and_set_bit(nr, addr)
+#define generic_test_and_clear_le_bit(nr, addr) test_and_clear_bit(nr, addr)
+
+#define generic___test_and_set_le_bit(nr, addr) __test_and_set_bit(nr, addr)
+#define generic___test_and_clear_le_bit(nr, addr) __test_and_clear_bit(nr, addr)
+
+#define generic_find_next_zero_le_bit(addr, size, offset) find_next_zero_bit(addr, size, offset)
+#define generic_find_next_le_bit(addr, size, offset) \
+			find_next_bit(addr, size, offset)
+
+#elif defined(__BIG_ENDIAN)
+
+#define generic_test_le_bit(nr, addr) \
+	test_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic___set_le_bit(nr, addr) \
+	__set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic___clear_le_bit(nr, addr) \
+	__clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+
+#define generic_test_and_set_le_bit(nr, addr) \
+	test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic_test_and_clear_le_bit(nr, addr) \
+	test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+
+#define generic___test_and_set_le_bit(nr, addr) \
+	__test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+#define generic___test_and_clear_le_bit(nr, addr) \
+	__test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr))
+
+extern unsigned long generic_find_next_zero_le_bit(const unsigned long *addr,
+		unsigned long size, unsigned long offset);
+extern unsigned long generic_find_next_le_bit(const unsigned long *addr,
+		unsigned long size, unsigned long offset);
+
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+
+#define generic_find_first_zero_le_bit(addr, size) \
+        generic_find_next_zero_le_bit((addr), (size), 0)
+
+#endif /* before 2.6.17 }}} */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+/* did not yet include generic_find_next_le_bit() {{{ */
+
+#if defined(__LITTLE_ENDIAN)
+
+#define generic_find_next_le_bit(addr, size, offset) \
+		find_next_bit(addr, size, offset)
+
+#elif defined(__BIG_ENDIAN)
+/* from 2.6.33 lib/find_bit.c */
+
+/* include/linux/byteorder does not support "unsigned long" type */
+static inline unsigned long ext2_swabp(const unsigned long * x)
+{
+#if BITS_PER_LONG == 64
+	return (unsigned long) __swab64p((u64 *) x);
+#elif BITS_PER_LONG == 32
+	return (unsigned long) __swab32p((u32 *) x);
+#else
+#error BITS_PER_LONG not defined
+#endif
+}
+
+/* include/linux/byteorder doesn't support "unsigned long" type */
+static inline unsigned long ext2_swab(const unsigned long y)
+{
+#if BITS_PER_LONG == 64
+	return (unsigned long) __swab64((u64) y);
+#elif BITS_PER_LONG == 32
+	return (unsigned long) __swab32((u32) y);
+#else
+#error BITS_PER_LONG not defined
+#endif
+}
+
+unsigned long generic_find_next_le_bit(const unsigned long *addr, unsigned
+		long size, unsigned long offset)
+{
+	const unsigned long *p = addr + BITOP_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG - 1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset &= (BITS_PER_LONG - 1UL);
+	if (offset) {
+		tmp = ext2_swabp(p++);
+		tmp &= (~0UL << offset);
+		if (size < BITS_PER_LONG)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= BITS_PER_LONG;
+		result += BITS_PER_LONG;
+	}
+
+	while (size & ~(BITS_PER_LONG - 1)) {
+		tmp = *(p++);
+		if (tmp)
+			goto found_middle_swap;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = ext2_swabp(p);
+found_first:
+	tmp &= (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size; /* Nope. */
+found_middle:
+	return result + __ffs(tmp);
+
+found_middle_swap:
+	return result + __ffs(ext2_swab(tmp));
+}
+#else
+#error "unknown byte order"
+#endif
+#endif /* compatibility for generic_find_next_le_bit }}} */
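
The big-endian branch above depends on the BITOP_LE_SWIZZLE trick: XOR-ing a bit number with (BITS_PER_LONG - 1) & ~0x7 swaps the byte index within the word while keeping the bit's position inside its byte, which is how a little-endian bitmap maps onto a native big-endian long. A small userspace demonstration:

/* Demonstration of the BITOP_LE_SWIZZLE remapping from the big-endian
 * branch above; the printed values assume nothing beyond the macro. */
#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(long))
#define BITOP_LE_SWIZZLE ((BITS_PER_LONG - 1) & ~0x7)

int main(void)
{
	/* On a 64-bit build the swizzle constant is 56: bit 0 of the
	 * little-endian view lives in the most significant byte of a
	 * big-endian long, but keeps its position within that byte. */
	printf("BITOP_LE_SWIZZLE = %d\n", BITOP_LE_SWIZZLE);
	printf("le bit 0 -> native bit %d\n", 0 ^ BITOP_LE_SWIZZLE);
	printf("le bit 9 -> native bit %d\n", 9 ^ BITOP_LE_SWIZZLE);
	return 0;
}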
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/autoconf.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/autoconf.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/autoconf.h	2015-01-21 12:02:58.375824150 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/autoconf.h	2015-01-21 12:02:58.375824150 +0300
@@ -0,0 +1 @@
+/* empty file, for compat reasons */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/connector.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/connector.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/connector.h	2015-01-21 12:02:58.375824150 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/connector.h	2015-01-21 12:02:58.375824150 +0300
@@ -0,0 +1,186 @@
+/*
+ * 	connector.h
+ * 
+ * 2004-2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * Modified by Philipp Reisner to work on older 2.6.x kernels.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __CONNECTOR_H
+#define __CONNECTOR_H
+#define DRBD_CONNECTOR_BACKPORT_HEADER
+
+#include <asm/types.h>
+
+#define NETLINK_CONNECTOR       11
+
+#define CN_IDX_CONNECTOR		0xffffffff
+#define CN_VAL_CONNECTOR		0xffffffff
+
+/*
+ * Process Events connector unique ids -- used for message routing
+ */
+#define CN_IDX_PROC			0x1
+#define CN_VAL_PROC			0x1
+#define CN_IDX_CIFS			0x2
+#define CN_VAL_CIFS                     0x1
+
+#define CN_NETLINK_USERS		1
+
+/*
+ * Maximum connector's message size.
+ */
+#define CONNECTOR_MAX_MSG_SIZE 	1024
+
+/*
+ * idx and val are unique identifiers which 
+ * are used for message routing and 
+ * must be registered in connector.h for in-kernel usage.
+ */
+
+struct cb_id {
+	__u32 idx;
+	__u32 val;
+};
+
+struct cn_msg {
+	struct cb_id id;
+
+	__u32 seq;
+	__u32 ack;
+
+	__u16 len;		/* Length of the following data */
+	__u16 flags;
+	__u8 data[0];
+};
+
+/*
+ * Notify structure - requests notification about
+ * registering/unregistering idx/val in range [first, first+range].
+ */
+struct cn_notify_req {
+	__u32 first;
+	__u32 range;
+};
+
+/*
+ * Main notification control message
+ * *_notify_num 	- number of appropriate cn_notify_req structures after 
+ *				this struct.
+ * group 		- notification receiver's idx.
+ * len 			- total length of the attached data.
+ */
+struct cn_ctl_msg {
+	__u32 idx_notify_num;
+	__u32 val_notify_num;
+	__u32 group;
+	__u32 len;
+	__u8 data[0];
+};
+
+#ifdef __KERNEL__
+#include <linux/drbd_config.h>
+
+#ifndef KERNEL_HAS_GFP_T
+#define KERNEL_HAS_GFP_T
+typedef unsigned gfp_t;
+#endif
+
+#include <asm/atomic.h>
+
+#include <linux/list.h>
+#include <linux/workqueue.h>
+
+#include <net/sock.h>
+
+#define CN_CBQ_NAMELEN		32
+
+struct cn_queue_dev {
+	atomic_t refcnt;
+	unsigned char name[CN_CBQ_NAMELEN];
+
+	struct workqueue_struct *cn_queue;
+
+	struct list_head queue_list;
+	spinlock_t queue_lock;
+
+	int netlink_groups;
+	struct sock *nls;
+};
+
+struct cn_callback_id {
+	unsigned char name[CN_CBQ_NAMELEN];
+	struct cb_id id;
+};
+
+struct cn_callback_data {
+	void (*destruct_data) (void *);
+	void *ddata;
+	
+	void *callback_priv;
+	void (*callback) (void *);
+
+	void *free;
+};
+
+struct cn_callback_entry {
+	struct list_head callback_entry;
+	struct cn_callback *cb;
+	struct work_struct work;
+	struct cn_queue_dev *pdev;
+
+	struct cn_callback_id id;
+	struct cn_callback_data data;
+
+	int seq, group;
+	struct sock *nls;
+};
+
+struct cn_ctl_entry {
+	struct list_head notify_entry;
+	struct cn_ctl_msg *msg;
+};
+
+struct cn_dev {
+	struct cb_id id;
+
+	u32 seq, groups;
+	struct sock *nls;
+	void (*input) (struct sock * sk, int len);
+
+	struct cn_queue_dev *cbdev;
+};
+
+int cn_add_callback(struct cb_id *, char *, void (*callback) (void *));
+void cn_del_callback(struct cb_id *);
+int cn_netlink_send(struct cn_msg *, u32, gfp_t);
+
+int cn_queue_add_callback(struct cn_queue_dev *dev, char *name, struct cb_id *id, void (*callback)(void *));
+void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id);
+
+struct cn_queue_dev *cn_queue_alloc_dev(char *name, struct sock *);
+void cn_queue_free_dev(struct cn_queue_dev *dev);
+
+int cn_cb_equal(struct cb_id *, struct cb_id *);
+
+void cn_queue_wrapper(void *data);
+
+extern int cn_already_initialized;
+
+#endif				/* __KERNEL__ */
+#endif				/* __CONNECTOR_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/dynamic_debug.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/dynamic_debug.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/dynamic_debug.h	2015-01-21 12:02:58.375824150 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/dynamic_debug.h	2015-01-21 12:02:58.375824150 +0300
@@ -0,0 +1,8 @@
+#ifndef _DYNAMIC_DEBUG_H
+#define _DYNAMIC_DEBUG_H
+
+#ifndef dynamic_dev_dbg
+#define dynamic_dev_dbg(dev, fmt, ...)
+#endif
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/hardirq.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/hardirq.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/hardirq.h	2015-01-21 12:02:58.375824150 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/hardirq.h	2015-01-21 12:02:58.375824150 +0300
@@ -0,0 +1 @@
+/* Just an empty file. */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/memcontrol.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/memcontrol.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/memcontrol.h	2015-01-21 12:02:58.375824150 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/memcontrol.h	2015-01-21 12:02:58.375824150 +0300
@@ -0,0 +1,3 @@
+/* Just an empty file.
+ * memcontrol.h did not exist prior to 2.6.25,
+ * but mm_inline.h needs it on more recent kernels. */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/mutex.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/mutex.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/mutex.h	2015-01-21 12:02:58.375824150 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/mutex.h	2015-01-21 12:02:58.375824150 +0300
@@ -0,0 +1,41 @@
+/* "Backport" of the mutex to older Linux-2.6.x kernels.
+ */
+#ifndef __LINUX_MUTEX_H
+#define __LINUX_MUTEX_H
+
+#include <asm/semaphore.h>
+
+struct mutex {
+	struct semaphore sem;
+};
+
+static inline void mutex_init(struct mutex *m)
+{
+	sema_init(&m->sem, 1);
+}
+
+static inline void mutex_lock(struct mutex *m)
+{
+	down(&m->sem);
+}
+
+static inline int mutex_lock_interruptible(struct mutex *m)
+{
+	return down_interruptible(&m->sem);
+}
+
+static inline void mutex_unlock(struct mutex *m)
+{
+	up(&m->sem);
+}
+
+static inline int mutex_is_locked(struct mutex *lock)
+{
+	return atomic_read(&lock->sem.count) != 1;
+}
+
+static inline int mutex_trylock(struct mutex *lock)
+{
+	return !down_trylock(&lock->sem);
+}
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/tracepoint.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/tracepoint.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/compat/linux/tracepoint.h	2015-01-21 12:02:58.375824150 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/compat/linux/tracepoint.h	2015-01-21 12:02:58.375824150 +0300
@@ -0,0 +1 @@
+struct tracepoint;
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/connector.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/connector.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/connector.c	2015-01-21 12:02:58.376824123 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/connector.c	2015-01-21 12:02:58.376824123 +0300
@@ -0,0 +1,513 @@
+/*
+ * 	connector.c
+ * 
+ * 2004-2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * Modified by Philipp Reisner to work on older 2.6.x kernels.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/moduleparam.h>
+#include <linux/connector.h>
+#include <linux/capability.h>
+
+#ifndef DRBD_CONNECTOR_BACKPORT_HEADER
+#error "drbd backported connector.c compiled against kernel connector.h will not work"
+#error "enable CONFIG_CONNECTOR in your kernel and try again"
+#endif
+
+#include <net/sock.h>
+
+#ifdef DRBD_NL_DST_GROUPS
+   /* pre 2.6.16 */
+#  define NETLINK_GROUP(skb) NETLINK_CB(skb).dst_groups
+#else
+#  define NETLINK_GROUP(skb) NETLINK_CB(skb).dst_group
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Evgeniy Polyakov <johnpol@2ka.mipt.ru>");
+MODULE_DESCRIPTION("Generic userspace <-> kernelspace connector.");
+
+static u32 cn_idx = CN_IDX_CONNECTOR;
+static u32 cn_val = CN_VAL_CONNECTOR;
+
+module_param(cn_idx, uint, 0);
+module_param(cn_val, uint, 0);
+MODULE_PARM_DESC(cn_idx, "Connector's main device idx.");
+MODULE_PARM_DESC(cn_val, "Connector's main device val.");
+
+static DECLARE_MUTEX(notify_lock);
+static LIST_HEAD(notify_list);
+
+static struct cn_dev cdev;
+
+int cn_already_initialized = 0;
+
+/*
+ * msg->seq and msg->ack are used to determine message genealogy.
+ * When someone sends a message, it carries a locally unique sequence
+ * number and a random acknowledge number.  The sequence number may be
+ * copied into nlmsghdr->nlmsg_seq too.
+ *
+ * The sequence number is incremented with each message sent.
+ *
+ * If we expect a reply to our message, then the sequence number in the
+ * received message MUST be the same as in the original message, and
+ * the acknowledge number MUST be the same + 1.
+ *
+ * If we receive a message and its sequence number is not equal to the
+ * one we are expecting, then it is a new message.
+ *
+ * If we receive a message and its sequence number is the same as the
+ * one we are expecting, but its acknowledgement number is not equal to
+ * the acknowledgement number in the original message + 1, then it is
+ * a new message.
+ */
+int cn_netlink_send(struct cn_msg *msg, u32 __group, gfp_t gfp_mask)
+{
+	struct cn_callback_entry *__cbq;
+	unsigned int size;
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	struct cn_msg *data;
+	struct cn_dev *dev = &cdev;
+	u32 group = 0;
+	int found = 0;
+
+	if (!__group) {
+		spin_lock_bh(&dev->cbdev->queue_lock);
+		list_for_each_entry(__cbq, &dev->cbdev->queue_list,
+				    callback_entry) {
+			if (cn_cb_equal(&__cbq->id.id, &msg->id)) {
+				found = 1;
+				group = __cbq->group;
+			}
+		}
+		spin_unlock_bh(&dev->cbdev->queue_lock);
+
+		if (!found)
+			return -ENODEV;
+	} else {
+		group = __group;
+	}
+
+	size = NLMSG_SPACE(sizeof(*msg) + msg->len);
+
+	skb = alloc_skb(size, gfp_mask);
+	if (!skb)
+		return -ENOMEM;
+
+	nlh = NLMSG_PUT(skb, 0, msg->seq, NLMSG_DONE, size - sizeof(*nlh));
+
+	data = NLMSG_DATA(nlh);
+
+	memcpy(data, msg, sizeof(*data) + msg->len);
+
+	NETLINK_GROUP(skb) = group;
+
+	netlink_broadcast(dev->nls, skb, 0, group, gfp_mask);
+
+	return 0;
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+/*
+ * Callback helper - queues work and sets up the destructor for the given data.
+ */
+static int cn_call_callback(struct cn_msg *msg, void (*destruct_data)(void *), void *data)
+{
+	struct cn_callback_entry *__cbq;
+	struct cn_dev *dev = &cdev;
+	int err = -ENODEV;
+
+	spin_lock_bh(&dev->cbdev->queue_lock);
+	list_for_each_entry(__cbq, &dev->cbdev->queue_list, callback_entry) {
+		if (cn_cb_equal(&__cbq->id.id, &msg->id)) {
+			if (likely(!test_bit(0, &__cbq->work.pending) &&
+					__cbq->data.ddata == NULL)) {
+				__cbq->data.callback_priv = msg;
+
+				__cbq->data.ddata = data;
+				__cbq->data.destruct_data = destruct_data;
+
+				if (queue_work(dev->cbdev->cn_queue,
+						&__cbq->work))
+					err = 0;
+			} else {
+				struct work_struct *w;
+				struct cn_callback_data *d;
+				
+				w = kmalloc(sizeof(*w) + sizeof(*d), GFP_ATOMIC);
+				if (w) {
+					memset(w,0,sizeof(*w) + sizeof(*d));
+					d = (struct cn_callback_data *)(w+1);
+
+					d->callback_priv = msg;
+					d->callback = __cbq->data.callback;
+					d->ddata = data;
+					d->destruct_data = destruct_data;
+					d->free = w;
+
+					INIT_LIST_HEAD(&w->entry);
+					w->pending = 0;
+					w->func = &cn_queue_wrapper;
+					w->data = d;
+					init_timer(&w->timer);
+					
+					if (queue_work(dev->cbdev->cn_queue, w))
+						err = 0;
+					else {
+						kfree(w);
+						err = -EINVAL;
+					}
+				} else
+					err = -ENOMEM;
+			}
+			break;
+		}
+	}
+	spin_unlock_bh(&dev->cbdev->queue_lock);
+
+	return err;
+}
+
+/*
+ * Skb receive helper - checks skb and msg size and calls callback
+ * helper.
+ */
+static int __cn_rx_skb(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	u32 pid, uid, seq, group;
+	struct cn_msg *msg;
+
+	pid = NETLINK_CREDS(skb)->pid;
+	uid = NETLINK_CREDS(skb)->uid;
+	seq = nlh->nlmsg_seq;
+	group = NETLINK_GROUP(skb);
+	msg = NLMSG_DATA(nlh);
+
+	/* DRBD specific change: Only allow packets from ROOT */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return cn_call_callback(msg, (void (*)(void *))kfree_skb, skb);
+}
+
+/*
+ * Main netlink receiving function.
+ *
+ * It checks skb and netlink header sizes and calls the skb receive
+ * helper with a shared skb.
+ */
+static void cn_rx_skb(struct sk_buff *__skb)
+{
+	struct nlmsghdr *nlh;
+	u32 len;
+	int err;
+	struct sk_buff *skb;
+
+	skb = skb_get(__skb);
+
+	if (skb->len >= NLMSG_SPACE(0)) {
+		nlh = (struct nlmsghdr *)skb->data;
+
+		if (nlh->nlmsg_len < sizeof(struct cn_msg) ||
+		    skb->len < nlh->nlmsg_len ||
+		    nlh->nlmsg_len > CONNECTOR_MAX_MSG_SIZE) {
+			kfree_skb(skb);
+			goto out;
+		}
+
+		len = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (len > skb->len)
+			len = skb->len;
+
+		err = __cn_rx_skb(skb, nlh);
+		if (err < 0)
+			kfree_skb(skb);
+	}
+
+out:
+	kfree_skb(__skb);
+}
+
+/*
+ * Netlink socket input callback - dequeues the skbs and calls the
+ * main netlink receiving function.
+ */
+static void cn_input(struct sock *sk, int len)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL)
+		cn_rx_skb(skb);
+}
+
+/*
+ * Notification routing.
+ *
+ * Gets an id and checks whether there are notification requests for its
+ * idx and val.  If there are such requests, notifies the listeners with
+ * the given notify event.
+ */
+static void cn_notify(struct cb_id *id, u32 notify_event)
+{
+	struct cn_ctl_entry *ent;
+
+	down(&notify_lock);
+	list_for_each_entry(ent, &notify_list, notify_entry) {
+		int i;
+		struct cn_notify_req *req;
+		struct cn_ctl_msg *ctl = ent->msg;
+		int idx_found, val_found;
+
+		idx_found = val_found = 0;
+
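+		/* ctl->data carries idx_notify_num idx ranges immediately
+		 * followed by val_notify_num val ranges; req walks across
+		 * both arrays in the two loops below. */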
+		req = (struct cn_notify_req *)ctl->data;
+		for (i = 0; i < ctl->idx_notify_num; ++i, ++req) {
+			if (id->idx >= req->first && 
+					id->idx < req->first + req->range) {
+				idx_found = 1;
+				break;
+			}
+		}
+
+		for (i = 0; i < ctl->val_notify_num; ++i, ++req) {
+			if (id->val >= req->first && 
+					id->val < req->first + req->range) {
+				val_found = 1;
+				break;
+			}
+		}
+
+		if (idx_found && val_found) {
+			struct cn_msg m = { .ack = notify_event, };
+
+			memcpy(&m.id, id, sizeof(m.id));
+			cn_netlink_send(&m, ctl->group, GFP_KERNEL);
+		}
+	}
+	up(&notify_lock);
+}
+
+/*
+ * Callback add routine - adds a callback with the given ID and name.
+ * If a callback with the same ID is already registered, it will not be added.
+ *
+ * May sleep.
+ */
+int cn_add_callback(struct cb_id *id, char *name, void (*callback)(void *))
+{
+	int err;
+	struct cn_dev *dev = &cdev;
+
+	err = cn_queue_add_callback(dev->cbdev, name, id, callback);
+	if (err)
+		return err;
+
+	cn_notify(id, 0);
+
+	return 0;
+}
+
+/*
+ * Callback remove routine - removes the callback
+ * with the given ID.
+ * If no callback is registered under the given
+ * ID, nothing happens.
+ *
+ * May sleep while waiting for the reference counter to become zero.
+ */
+void cn_del_callback(struct cb_id *id)
+{
+	struct cn_dev *dev = &cdev;
+
+	cn_queue_del_callback(dev->cbdev, id);
+	cn_notify(id, 1);
+}
+
+/*
+ * Checks whether two connector control messages are the same.
+ * Returns 1 if they are the same or if the first one is corrupted.
+ */
+static int cn_ctl_msg_equals(struct cn_ctl_msg *m1, struct cn_ctl_msg *m2)
+{
+	int i;
+	struct cn_notify_req *req1, *req2;
+
+	if (m1->idx_notify_num != m2->idx_notify_num)
+		return 0;
+
+	if (m1->val_notify_num != m2->val_notify_num)
+		return 0;
+
+	if (m1->len != m2->len)
+		return 0;
+
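+	/* Length does not match the request counts: m1 is corrupted, so
+	 * report "equal" (see the comment above) and let the caller drop
+	 * the stale entry. */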
+	if ((m1->idx_notify_num + m1->val_notify_num) * sizeof(*req1) !=
+	    m1->len)
+		return 1;
+
+	req1 = (struct cn_notify_req *)m1->data;
+	req2 = (struct cn_notify_req *)m2->data;
+
+	for (i = 0; i < m1->idx_notify_num; ++i) {
+		if (req1->first != req2->first || req1->range != req2->range)
+			return 0;
+		req1++;
+		req2++;
+	}
+
+	for (i = 0; i < m1->val_notify_num; ++i) {
+		if (req1->first != req2->first || req1->range != req2->range)
+			return 0;
+		req1++;
+		req2++;
+	}
+
+	return 1;
+}
+
+/*
+ * Main connector device's callback.
+ *
+ * Used to add and remove notification requests.
+ */
+static void cn_callback(void *data)
+{
+	struct cn_msg *msg = data;
+	struct cn_ctl_msg *ctl;
+	struct cn_ctl_entry *ent;
+	u32 size;
+
+	if (msg->len < sizeof(*ctl))
+		return;
+
+	ctl = (struct cn_ctl_msg *)msg->data;
+
+	size = (sizeof(*ctl) + ((ctl->idx_notify_num +
+				 ctl->val_notify_num) *
+				sizeof(struct cn_notify_req)));
+
+	if (msg->len != size)
+		return;
+
+	if (ctl->len + sizeof(*ctl) != msg->len)
+		return;
+
+	/*
+	 * Remove notification.
+	 */
+	if (ctl->group == 0) {
+		struct cn_ctl_entry *n;
+
+		down(&notify_lock);
+		list_for_each_entry_safe(ent, n, &notify_list, notify_entry) {
+			if (cn_ctl_msg_equals(ent->msg, ctl)) {
+				list_del(&ent->notify_entry);
+				kfree(ent);
+			}
+		}
+		up(&notify_lock);
+
+		return;
+	}
+
+	size += sizeof(*ent);
+
+	ent = kmalloc(size, GFP_KERNEL);
+	if (!ent)
+		return;
+
+	memset(ent, 0, size);
+	ent->msg = (struct cn_ctl_msg *)(ent + 1);
+
+	memcpy(ent->msg, ctl, size - sizeof(*ent));
+
+	down(&notify_lock);
+	list_add(&ent->notify_entry, &notify_list);
+	up(&notify_lock);
+}
+
+int __init cn_init(void)
+{
+	struct cn_dev *dev = &cdev;
+	int err;
+
+	dev->input = cn_input;
+	dev->id.idx = cn_idx;
+	dev->id.val = cn_val;
+
+#ifdef DRBD_NL_DST_GROUPS
+	/* history of upstream commits between kernel.org 2.6.13 and 2.6.14-rc1:
+	 * 4fdb3bb723db469717c6d38fda667d8b0fa86ebd 2005-08-10 adds module parameter
+	 * d629b836d151d43332492651dd841d32e57ebe3b 2005-08-15 renames dst_groups to dst_group
+	 * 066286071d3542243baa68166acb779187c848b3 2005-08-15 adds groups parameter
+	 * so it is not exactly correct to trigger on the rename dst_groups to dst_group,
+	 * but sufficiently close.
+	 */
+	dev->nls = netlink_kernel_create(NETLINK_CONNECTOR,dev->input);
+#else
+	dev->nls = netlink_kernel_create(NETLINK_CONNECTOR,
+					 CN_NETLINK_USERS + 0xf,
+					 dev->input, THIS_MODULE);
+#endif
+	if (!dev->nls)
+		return -EIO;
+
+	dev->cbdev = cn_queue_alloc_dev("cqueue", dev->nls);
+	if (!dev->cbdev) {
+		if (dev->nls->sk_socket)
+			sock_release(dev->nls->sk_socket);
+		return -EINVAL;
+	}
+
+	err = cn_add_callback(&dev->id, "connector", &cn_callback);
+	if (err) {
+		cn_queue_free_dev(dev->cbdev);
+		if (dev->nls->sk_socket)
+			sock_release(dev->nls->sk_socket);
+		return -EINVAL;
+	}
+
+	cn_already_initialized = 1;
+
+	return 0;
+}
+
+void __exit cn_fini(void)
+{
+	struct cn_dev *dev = &cdev;
+
+	cn_already_initialized = 0;
+
+	cn_del_callback(&dev->id);
+	cn_queue_free_dev(dev->cbdev);
+	if (dev->nls->sk_socket)
+		sock_release(dev->nls->sk_socket);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_actlog.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_actlog.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_actlog.c	2015-01-21 12:02:58.376824123 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_actlog.c	2015-01-21 12:02:58.376824123 +0300
@@ -0,0 +1,1400 @@
+/*
+   drbd_actlog.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include <linux/dynamic_debug.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_wrappers.h"
+
+/* We maintain a trivial checksum in our on disk activity log.
+ * With that we can ensure correct operation even when the storage
+ * device might do a partial (last) sector write while losing power.
+ */
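+/* Sizing sketch, assuming the usual 512-byte meta data sector: magic,
+ * tr_number and xor_sum take 12 bytes, leaving room for 62 eight-byte
+ * pos/extent pairs -- updates[0] for the current change plus AL_EXTENTS_PT
+ * cyclic slot updates -- so one transaction fits a single sector write. */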
+struct __packed al_transaction {
+	u32       magic;
+	u32       tr_number;
+	struct __packed {
+		u32 pos;
+		u32 extent; } updates[1 + AL_EXTENTS_PT];
+	u32       xor_sum;
+};
+
+struct update_odbm_work {
+	struct drbd_work w;
+	unsigned int enr;
+};
+
+struct update_al_work {
+	struct drbd_work w;
+	struct lc_element *al_ext;
+	struct completion event;
+	unsigned int enr;
+	/* if old_enr != LC_FREE, write corresponding bitmap sector, too */
+	unsigned int old_enr;
+};
+
+
+int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+
+/* The actual tracepoint needs to have a constant number of known arguments...
+ */
+void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	trace__drbd_resync(mdev, level, fmt, ap);
+	va_end(ap);
+}
+
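+/* md_io_in_use acts as a one-bit lock around the single preallocated
+ * meta data IO page: atomic_cmpxchg(0 -> 1) either takes it, or we sleep
+ * on misc_wait until it is released (or the disk fails, returning NULL). */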
+void *drbd_md_get_buffer(struct drbd_conf *mdev)
+{
+	int r;
+
+	wait_event(mdev->misc_wait,
+		   (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
+		   mdev->state.disk <= D_FAILED);
+
+	return r ? NULL : page_address(mdev->md_io_page);
+}
+
+void drbd_md_put_buffer(struct drbd_conf *mdev)
+{
+	if (atomic_dec_and_test(&mdev->md_io_in_use))
+		wake_up(&mdev->misc_wait);
+}
+
+static bool md_io_allowed(struct drbd_conf *mdev)
+{
+	enum drbd_disk_state ds = mdev->state.disk;
+	return ds >= D_NEGOTIATING || ds == D_ATTACHING;
+}
+
+void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+				     unsigned int *done)
+{
+	long dt = bdev->dc.disk_timeout * HZ / 10;
+	if (dt == 0)
+		dt = MAX_SCHEDULE_TIMEOUT;
+
+	dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt);
+	if (dt == 0)
+		dev_err(DEV, "meta-data IO operation timed out\n");
+}
+
+STATIC int _drbd_md_sync_page_io(struct drbd_conf *mdev,
+				 struct drbd_backing_dev *bdev,
+				 struct page *page, sector_t sector,
+				 int rw, int size)
+{
+	struct bio *bio;
+	int ok;
+
+	if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
+		rw |= DRBD_REQ_FUA | DRBD_REQ_FLUSH;
+	rw |= DRBD_REQ_UNPLUG | DRBD_REQ_SYNC;
+
+#ifndef REQ_FLUSH
+	/* < 2.6.36, "barrier" semantic may fail with EOPNOTSUPP */
+ retry:
+#endif
+	mdev->md_io.done = 0;
+	mdev->md_io.error = -ENODEV;
+
+	bio = bio_alloc_drbd(GFP_NOIO);
+	bio->bi_bdev = bdev->md_bdev;
+	bio->bi_sector = sector;
+	ok = (bio_add_page(bio, page, size, 0) == size);
+	if (!ok)
+		goto out;
+	bio->bi_private = &mdev->md_io;
+	bio->bi_end_io = drbd_md_io_complete;
+	bio->bi_rw = rw;
+
+	trace_drbd_bio(mdev, "Md", bio, 0, NULL);
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* Corresponding put_ldev in drbd_md_io_complete() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+		ok = 0;
+		goto out;
+	}
+
+	bio_get(bio); /* one bio_put() is in the completion handler */
+	atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
+	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+		bio_endio(bio, -EIO);
+	else
+		submit_bio(rw, bio);
+	wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done);
+	ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;
+
+#ifndef REQ_FLUSH
+	/* check for unsupported barrier op.
+	 * would rather check on EOPNOTSUPP, but that is not reliable.
+	 * don't try again for ANY return value != 0 */
+	if (mdev->md_io.done && unlikely((bio->bi_rw & DRBD_REQ_HARDBARRIER) && !ok)) {
+		/* Try again with no barrier */
+		dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+		rw &= ~DRBD_REQ_HARDBARRIER;
+		bio_put(bio);
+		goto retry;
+	}
+#endif
+ out:
+	bio_put(bio);
+	return ok;
+}
+
+int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+			 sector_t sector, int rw)
+{
+	int logical_block_size, mask, ok;
+	int offset = 0;
+	struct page *iop = mdev->md_io_page;
+
+	D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
+
+	if (!bdev->md_bdev) {
+		if (DRBD_ratelimit(5*HZ, 5)) {
+			dev_err(DEV, "bdev->md_bdev==NULL\n");
+			dump_stack();
+		}
+		return 0;
+	}
+
+	logical_block_size = bdev_logical_block_size(bdev->md_bdev);
+	if (logical_block_size == 0)
+		logical_block_size = MD_SECTOR_SIZE;
+
+	/* in case logical_block_size != 512 [ s390 only? ] */
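+	/* Worked example with a hypothetical 4096-byte logical block size:
+	 * mask = (4096 / 512) - 1 = 7, offset = sector & 7 picks the
+	 * 512-byte sub-sector, sector &= ~7 aligns to the logical block.
+	 * For a WRITE we first READ the whole logical block into the
+	 * md_io_tmpp bounce page, patch the one 512-byte piece, and the
+	 * final _drbd_md_sync_page_io() below writes the block back. */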
+	if (logical_block_size != MD_SECTOR_SIZE) {
+		mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
+		D_ASSERT(mask == 1 || mask == 3 || mask == 7);
+		D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
+		offset = sector & mask;
+		sector = sector & ~mask;
+		iop = mdev->md_io_tmpp;
+
+		if (rw & WRITE) {
+			/* these are GFP_KERNEL pages, pre-allocated
+			 * on device initialization */
+			void *p = page_address(mdev->md_io_page);
+			void *hp = page_address(mdev->md_io_tmpp);
+
+			ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
+					READ, logical_block_size);
+
+			if (unlikely(!ok)) {
+				dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
+				    "READ [logical_block_size!=512]) failed!\n",
+				    (unsigned long long)sector);
+				return 0;
+			}
+
+			memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
+		}
+	}
+
+#if DUMP_MD >= 3
+	dev_info(DEV, "%s [%d]:%s(,%llus,%s)\n",
+	     current->comm, current->pid, __func__,
+	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+#endif
+
+	if (sector < drbd_md_first_sector(bdev) ||
+	    sector > drbd_md_last_sector(bdev))
+		dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+		     current->comm, current->pid, __func__,
+		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+
+	ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
+	if (unlikely(!ok)) {
+		dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
+		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+		return 0;
+	}
+
+	if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
+		void *p = page_address(mdev->md_io_page);
+		void *hp = page_address(mdev->md_io_tmpp);
+
+		memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
+	}
+
+	return ok;
+}
+
+static
+struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+{
+	struct lc_element *al_ext;
+	struct lc_element *tmp;
+	unsigned long     al_flags = 0;
+	int wake;
+
+	spin_lock_irq(&mdev->al_lock);
+	tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+	if (unlikely(tmp != NULL)) {
+		struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
+			spin_unlock_irq(&mdev->al_lock);
+			if (wake)
+				wake_up(&mdev->al_wait);
+			return NULL;
+		}
+	}
+	al_ext   = lc_get(mdev->act_log, enr);
+	al_flags = mdev->act_log->flags;
+	spin_unlock_irq(&mdev->al_lock);
+
+	/*
+	if (!al_ext) {
+		if (al_flags & LC_STARVING)
+			dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
+		if (al_flags & LC_DIRTY)
+			dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
+	}
+	*/
+
+	return al_ext;
+}
+
+void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+	struct lc_element *al_ext;
+	struct update_al_work al_work;
+
+	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+	trace_drbd_actlog(mdev, sector, "al_begin_io");
+
+	wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
+
+	if (al_ext->lc_number != enr) {
+		/* drbd_al_write_transaction(mdev,al_ext,enr);
+		 * recurses into generic_make_request(), which
+		 * disallows recursion, bios being serialized on the
+		 * current->bio_tail list now.
+		 * we have to delegate updates to the activity log
+		 * to the worker thread. */
+		init_completion(&al_work.event);
+		al_work.al_ext = al_ext;
+		al_work.enr = enr;
+		al_work.old_enr = al_ext->lc_number;
+		al_work.w.cb = w_al_write_transaction;
+		drbd_queue_work_front(&mdev->data.work, &al_work.w);
+		wait_for_completion(&al_work.event);
+
+		mdev->al_writ_cnt++;
+
+		/*
+		DUMPI(al_ext->lc_number);
+		DUMPI(mdev->act_log->new_number);
+		*/
+		spin_lock_irq(&mdev->al_lock);
+		lc_changed(mdev->act_log, al_ext);
+		spin_unlock_irq(&mdev->al_lock);
+		wake_up(&mdev->al_wait);
+	}
+}
+
+void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+	struct lc_element *extent;
+	unsigned long flags;
+
+	trace_drbd_actlog(mdev, sector, "al_complete_io");
+
+	spin_lock_irqsave(&mdev->al_lock, flags);
+
+	extent = lc_find(mdev->act_log, enr);
+
+	if (!extent) {
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+		dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
+		return;
+	}
+
+	if (lc_put(mdev->act_log, extent) == 0)
+		wake_up(&mdev->al_wait);
+
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
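+/* Shift arithmetic sketch for the two converters below, assuming 4 KiB
+ * pages (PAGE_SHIFT == 12), 4 KiB bitmap blocks (BM_BLOCK_SHIFT == 12),
+ * 4 MiB AL extents (AL_EXTENT_SHIFT == 22) and 16 MiB resync extents
+ * (BM_EXT_SHIFT == 24): one bitmap page holds 1 << (12 + 3) = 32768 bits,
+ * an AL extent covers 1024 bits (32 extents per page, shift by 5) and a
+ * resync extent covers 4096 bits (8 extents per page, shift by 3). */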
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+	return al_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		/* al extent number to bit */
+		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
+{
+	return rs_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		/* resync extent number to bit */
+		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+int
+w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct update_al_work *aw = container_of(w, struct update_al_work, w);
+	struct lc_element *updated = aw->al_ext;
+	const unsigned int new_enr = aw->enr;
+	const unsigned int evicted = aw->old_enr;
+	struct al_transaction *buffer;
+	sector_t sector;
+	int i, n, mx;
+	unsigned int extent_nr;
+	u32 xor_sum = 0;
+
+	if (!get_ldev(mdev)) {
+		dev_err(DEV,
+			"disk is %s, cannot start al transaction (-%d +%d)\n",
+			drbd_disk_str(mdev->state.disk), evicted, new_enr);
+		complete(&((struct update_al_work *)w)->event);
+		return 1;
+	}
+	/* do we have to do a bitmap write, first?
+	 * TODO reduce maximum latency:
+	 * submit both bios, then wait for both,
+	 * instead of doing two synchronous sector writes.
+	 * For now, we must not write the transaction,
+	 * if we cannot write out the bitmap of the evicted extent. */
+	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
+		drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
+
+	/* The bitmap write may have failed, causing a state change. */
+	if (mdev->state.disk < D_INCONSISTENT) {
+		dev_err(DEV,
+			"disk is %s, cannot write al transaction (-%d +%d)\n",
+			drbd_disk_str(mdev->state.disk), evicted, new_enr);
+		complete(&((struct update_al_work *)w)->event);
+		put_ldev(mdev);
+		return 1;
+	}
+
+	buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
+	if (!buffer) {
+		dev_err(DEV, "disk failed while waiting for md_io buffer\n");
+		complete(&((struct update_al_work *)w)->event);
+		put_ldev(mdev);
+		return 1;
+	}
+
+	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
+	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
+
+	n = lc_index_of(mdev->act_log, updated);
+
+	buffer->updates[0].pos = cpu_to_be32(n);
+	buffer->updates[0].extent = cpu_to_be32(new_enr);
+
+	xor_sum ^= new_enr;
+
+	mx = min_t(int, AL_EXTENTS_PT,
+		   mdev->act_log->nr_elements - mdev->al_tr_cycle);
+	for (i = 0; i < mx; i++) {
+		unsigned idx = mdev->al_tr_cycle + i;
+		extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
+		buffer->updates[i+1].pos = cpu_to_be32(idx);
+		buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
+		xor_sum ^= extent_nr;
+	}
+	for (; i < AL_EXTENTS_PT; i++) {
+		buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
+		buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
+		xor_sum ^= LC_FREE;
+	}
+	mdev->al_tr_cycle += AL_EXTENTS_PT;
+	if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
+		mdev->al_tr_cycle = 0;
+
+	buffer->xor_sum = cpu_to_be32(xor_sum);
+
+	sector =  mdev->ldev->md.md_offset
+		+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
+
+	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
+		drbd_chk_io_error(mdev, 1, true);
+
+	if (++mdev->al_tr_pos >
+	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+		mdev->al_tr_pos = 0;
+
+	D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
+	mdev->al_tr_number++;
+
+	drbd_md_put_buffer(mdev);
+
+	complete(&((struct update_al_work *)w)->event);
+	put_ldev(mdev);
+
+	return 1;
+}
+
+/**
+ * drbd_al_read_tr() - Read a single transaction from the on disk activity log
+ * @mdev:	DRBD device.
+ * @bdev:	Block device to read from.
+ * @b:		pointer to an al_transaction.
+ * @index:	On disk slot of the transaction to read.
+ *
+ * Returns -1 on IO error, 0 on checksum error and 1 upon success.
+ */
+STATIC int drbd_al_read_tr(struct drbd_conf *mdev,
+			   struct drbd_backing_dev *bdev,
+			   struct al_transaction *b,
+			   int index)
+{
+	sector_t sector;
+	int rv, i;
+	u32 xor_sum = 0;
+
+	sector = bdev->md.md_offset + bdev->md.al_offset + index;
+
+	/* Don't process errors the normal way,
+	 * as this is done before the disk is attached! */
+	if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
+		return -1;
+
+	rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
+
+	for (i = 0; i < AL_EXTENTS_PT + 1; i++)
+		xor_sum ^= be32_to_cpu(b->updates[i].extent);
+	rv &= (xor_sum == be32_to_cpu(b->xor_sum));
+
+	return rv;
+}
+
+/**
+ * drbd_al_read_log() - Restores the activity log from its on disk representation.
+ * @mdev:	DRBD device.
+ * @bdev:	Block device to read from.
+ *
+ * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
+ */
+int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+	struct al_transaction *buffer;
+	int i;
+	int rv;
+	int mx;
+	int active_extents = 0;
+	int transactions = 0;
+	int found_valid = 0;
+	int from = 0;
+	int to = 0;
+	u32 from_tnr = 0;
+	u32 to_tnr = 0;
+	u32 cnr;
+
+	mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
+
+	/* lock out all other meta data io for now,
+	 * and make sure the page is mapped.
+	 */
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		return 0;
+
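+	/* The transaction slots form a ring and tr_number grows
+	 * monotonically, so the slot holding the smallest tr_number is the
+	 * oldest transaction ("from") and the largest is the newest ("to").
+	 * Illustrative example with mx == 4 and on-disk tr_numbers
+	 * [8 9 5 6 7]: from == slot 2, to == slot 1, and the replay loop
+	 * further down walks slots 2,3,4,0,1 in that order. */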
+	/* Find the valid transaction in the log */
+	for (i = 0; i <= mx; i++) {
+		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+		if (rv == 0)
+			continue;
+		if (rv == -1) {
+			drbd_md_put_buffer(mdev);
+			return 0;
+		}
+		cnr = be32_to_cpu(buffer->tr_number);
+
+		if (++found_valid == 1) {
+			from = i;
+			to = i;
+			from_tnr = cnr;
+			to_tnr = cnr;
+			continue;
+		}
+		if ((int)cnr - (int)from_tnr < 0) {
+			D_ASSERT(from_tnr - cnr + i - from == mx+1);
+			from = i;
+			from_tnr = cnr;
+		}
+		if ((int)cnr - (int)to_tnr > 0) {
+			D_ASSERT(cnr - to_tnr == i - to);
+			to = i;
+			to_tnr = cnr;
+		}
+	}
+
+	if (!found_valid) {
+		dev_warn(DEV, "No usable activity log found.\n");
+		drbd_md_put_buffer(mdev);
+		return 1;
+	}
+
+	/* Read the valid transactions.
+	 * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
+	i = from;
+	while (1) {
+		int j, pos;
+		unsigned int extent_nr;
+		unsigned int trn;
+
+		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+		ERR_IF(rv == 0) goto cancel;
+		if (rv == -1) {
+			drbd_md_put_buffer(mdev);
+			return 0;
+		}
+
+		trn = be32_to_cpu(buffer->tr_number);
+
+		spin_lock_irq(&mdev->al_lock);
+
+		/* This loop runs backwards because in the cyclic
+		   elements there might be an old version of the
+		   updated element (in slot 0). So the element in slot 0
+		   can overwrite old versions. */
+		for (j = AL_EXTENTS_PT; j >= 0; j--) {
+			pos = be32_to_cpu(buffer->updates[j].pos);
+			extent_nr = be32_to_cpu(buffer->updates[j].extent);
+
+			if (extent_nr == LC_FREE)
+				continue;
+
+			lc_set(mdev->act_log, extent_nr, pos);
+			active_extents++;
+		}
+		spin_unlock_irq(&mdev->al_lock);
+
+		transactions++;
+
+cancel:
+		if (i == to)
+			break;
+		i++;
+		if (i > mx)
+			i = 0;
+	}
+
+	mdev->al_tr_number = to_tnr+1;
+	mdev->al_tr_pos = to;
+	if (++mdev->al_tr_pos >
+	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+		mdev->al_tr_pos = 0;
+
+	/* ok, we are done with it */
+	drbd_md_put_buffer(mdev);
+
+	dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
+	     transactions, active_extents);
+
+	return 1;
+}
+
+/**
+ * drbd_al_apply_to_bm() - Sets the bitmap to dirty (1) where covered by active AL extents
+ * @mdev:	DRBD device.
+ */
+void drbd_al_apply_to_bm(struct drbd_conf *mdev)
+{
+	unsigned int enr;
+	unsigned long add = 0;
+	char ppb[10];
+	int i, tmp;
+
+	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+	for (i = 0; i < mdev->act_log->nr_elements; i++) {
+		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+		if (enr == LC_FREE)
+			continue;
+		tmp = drbd_bm_ALe_set_all(mdev, enr);
+		dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
+		add += tmp;
+	}
+
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+
+	dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
+	     ppsize(ppb, Bit2KB(add)));
+}
+
+static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
+{
+	int rv;
+
+	spin_lock_irq(&mdev->al_lock);
+	rv = (al_ext->refcnt == 0);
+	if (likely(rv))
+		lc_del(mdev->act_log, al_ext);
+	spin_unlock_irq(&mdev->al_lock);
+
+	return rv;
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents from the activity log
+ * @mdev:	DRBD device.
+ *
+ * Removes all active extents from the activity log, waiting until
+ * the reference count of each entry has dropped to 0 first, of course.
+ *
+ * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_conf *mdev)
+{
+	struct lc_element *al_ext;
+	int i;
+
+	D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
+
+	for (i = 0; i < mdev->act_log->nr_elements; i++) {
+		al_ext = lc_element_by_index(mdev->act_log, i);
+		if (al_ext->lc_number == LC_FREE)
+			continue;
+		wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
+	}
+
+	wake_up(&mdev->al_wait);
+}
+
+STATIC int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+
+	if (!get_ldev(mdev)) {
+		if (DRBD_ratelimit(5*HZ, 5))
+			dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
+		kfree(udw);
+		return 1;
+	}
+
+	drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
+	put_ldev(mdev);
+
+	kfree(udw);
+
+	if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
+		switch (mdev->state.conn) {
+		case C_SYNC_SOURCE:  case C_SYNC_TARGET:
+		case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
+			drbd_resync_finished(mdev);
+		default:
+			/* nothing to do */
+			break;
+		}
+	}
+	drbd_bcast_sync_progress(mdev);
+
+	return 1;
+}
+
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold a get_ldev() reference.
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+STATIC void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
+				      int count, int success)
+{
+	struct lc_element *e;
+	struct update_odbm_work *udw;
+
+	unsigned int enr;
+
+	D_ASSERT(atomic_read(&mdev->local_cnt));
+
+	/* I simply assume that a sector/size pair never crosses
+	 * a 16 MB extent border. (Currently this is true...) */
+	enr = BM_SECT_TO_EXT(sector);
+
+	e = lc_get(mdev->resync, enr);
+	if (e) {
+		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+		if (ext->lce.lc_number == enr) {
+			if (success)
+				ext->rs_left -= count;
+			else
+				ext->rs_failed += count;
+			if (ext->rs_left < ext->rs_failed) {
+				dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
+				    "rs_failed=%d count=%d cstate=%s\n",
+				     (unsigned long long)sector,
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->rs_failed, count,
+				     drbd_conn_str(mdev->state.conn));
+
+				/* We don't expect to be able to clear more bits
+				 * than have been set when we originally counted
+				 * the set bits to cache that value in ext->rs_left.
+				 * Whatever the reason (disconnect during resync,
+				 * delayed local completion of an application write),
+				 * try to fix it up by recounting here. */
+				ext->rs_left = drbd_bm_e_weight(mdev, enr);
+			}
+		} else {
+			/* Normally this element should be in the cache,
+			 * since drbd_rs_begin_io() pulled it already in.
+			 *
+			 * But maybe an application write finished, and we set
+			 * something outside the resync lru_cache in sync.
+			 */
+			int rs_left = drbd_bm_e_weight(mdev, enr);
+			if (ext->flags != 0) {
+				dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
+				     " -> %d[%u;00]\n",
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->flags, enr, rs_left);
+				ext->flags = 0;
+			}
+			if (ext->rs_failed) {
+				dev_warn(DEV, "Kicking resync_lru element enr=%u "
+				     "out with rs_failed=%d\n",
+				     ext->lce.lc_number, ext->rs_failed);
+			}
+			ext->rs_left = rs_left;
+			ext->rs_failed = success ? 0 : count;
+			lc_changed(mdev->resync, &ext->lce);
+		}
+		lc_put(mdev->resync, &ext->lce);
+		/* no race, we are within the al_lock! */
+
+		if (ext->rs_left == ext->rs_failed) {
+			ext->rs_failed = 0;
+
+			udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
+			if (udw) {
+				udw->enr = ext->lce.lc_number;
+				udw->w.cb = w_update_odbm;
+				drbd_queue_work_front(&mdev->data.work, &udw->w);
+			} else {
+				dev_warn(DEV, "Could not kmalloc an udw\n");
+			}
+		}
+	} else {
+		dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
+		    mdev->resync_locked,
+		    mdev->resync->nr_elements,
+		    mdev->resync->flags);
+	}
+}
+
+void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)
+{
+	unsigned long now = jiffies;
+	unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
+	int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+	if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+		if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&
+		    mdev->state.conn != C_PAUSED_SYNC_T &&
+		    mdev->state.conn != C_PAUSED_SYNC_S) {
+			mdev->rs_mark_time[next] = now;
+			mdev->rs_mark_left[next] = still_to_go;
+			mdev->rs_last_mark = next;
+		}
+	}
+}
+
+/* clear the bits corresponding to the piece of storage in question:
+ * size bytes of data starting from sector.  Only clear bits of the affected
+ * one or more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on C_SYNC_SOURCE.
+ *
+ */
+void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
+		       const char *file, const unsigned int line)
+{
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count = 0;
+	sector_t esector, nr_sectors;
+	int wake_up = 0;
+	unsigned long flags;
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+		dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
+				(unsigned long long)sector, size);
+		return;
+	}
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	ERR_IF(sector >= nr_sectors) return;
+	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/* we clear it (in sync).
+	 * round up start sector, round down end sector.  we make sure we only
+	 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
+	if (unlikely(esector < BM_SECT_PER_BIT-1))
+		return;
+	if (unlikely(esector == (nr_sectors-1)))
+		ebnr = lbnr;
+	else
+		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
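+	/* Illustrative example, assuming BM_SECT_PER_BIT == 8 (one 4 KiB
+	 * bitmap block == eight 512-byte sectors): sector=9, size=16 KiB
+	 * gives esector=40; sbnr = (9+7)>>3 = 2, ebnr = (40-7)>>3 = 4, so
+	 * bits 2..4 are cleared while the partially covered bits 1 and 5
+	 * stay out of sync. */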
+	trace_drbd_resync(mdev, TRACE_LVL_METRICS,
+			  "drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n",
+			  (unsigned long long)sector, size, sbnr, ebnr);
+
+	if (sbnr > ebnr)
+		return;
+
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
+	if (count && get_ldev(mdev)) {
+		drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
+		spin_lock_irqsave(&mdev->al_lock, flags);
+		drbd_try_clear_on_disk_bm(mdev, sector, count, true);
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+
+		/* just wake_up unconditionally now, after the various
+		 * lc_changed(), lc_put() in drbd_try_clear_on_disk_bm(). */
+		wake_up = 1;
+		put_ldev(mdev);
+	}
+	if (wake_up)
+		wake_up(&mdev->al_wait);
+}
+
+/*
+ * this is intended to set one request's worth of data out of sync.
+ * affects at least 1 bit,
+ * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
+ *
+ * called by tl_clear and drbd_send_dblock (==drbd_make_request).
+ * so this can be _any_ process.
+ */
+int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
+			    const char *file, const unsigned int line)
+{
+	unsigned long sbnr, ebnr, lbnr, flags;
+	sector_t esector, nr_sectors;
+	unsigned int enr, count = 0;
+	struct lc_element *e;
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+		dev_err(DEV, "sector: %llus, size: %d\n",
+			(unsigned long long)sector, size);
+		return 0;
+	}
+
+	if (!get_ldev(mdev))
+		return 0; /* no disk, no metadata, no bitmap to set bits in */
+
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	ERR_IF(sector >= nr_sectors)
+		goto out;
+	ERR_IF(esector >= nr_sectors)
+		esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/* we set it out of sync,
+	 * we do not need to round anything here */
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	trace_drbd_resync(mdev, TRACE_LVL_METRICS,
+			  "drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n",
+			  (unsigned long long)sector, size, sbnr, ebnr);
+
+	/* ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.  */
+	spin_lock_irqsave(&mdev->al_lock, flags);
+	count = drbd_bm_set_bits(mdev, sbnr, ebnr);
+
+	enr = BM_SECT_TO_EXT(sector);
+	e = lc_find(mdev->resync, enr);
+	if (e)
+		lc_entry(e, struct bm_extent, lce)->rs_left += count;
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+
+out:
+	put_ldev(mdev);
+
+	return count;
+}
+
+static
+struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
+{
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int wakeup = 0;
+	unsigned long rs_flags;
+
+	spin_lock_irq(&mdev->al_lock);
+	if (mdev->resync_locked > mdev->resync->nr_elements/2) {
+		spin_unlock_irq(&mdev->al_lock);
+		return NULL;
+	}
+	e = lc_get(mdev->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (bm_ext) {
+		if (bm_ext->lce.lc_number != enr) {
+			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+			bm_ext->rs_failed = 0;
+			lc_changed(mdev->resync, &bm_ext->lce);
+			wakeup = 1;
+		}
+		if (bm_ext->lce.refcnt == 1)
+			mdev->resync_locked++;
+		set_bit(BME_NO_WRITES, &bm_ext->flags);
+	}
+	rs_flags = mdev->resync->flags;
+	spin_unlock_irq(&mdev->al_lock);
+	if (wakeup)
+		wake_up(&mdev->al_wait);
+
+	if (!bm_ext) {
+		if (rs_flags & LC_STARVING)
+			dev_warn(DEV, "Have to wait for element"
+			     " (resync LRU too small?)\n");
+		BUG_ON(rs_flags & LC_DIRTY);
+	}
+
+	return bm_ext;
+}
+
+static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
+{
+	struct lc_element *al_ext;
+	int rv = 0;
+
+	spin_lock_irq(&mdev->al_lock);
+	if (unlikely(enr == mdev->act_log->new_number))
+		rv = 1;
+	else {
+		al_ext = lc_find(mdev->act_log, enr);
+		if (al_ext) {
+			if (al_ext->refcnt)
+				rv = 1;
+		}
+	}
+	spin_unlock_irq(&mdev->al_lock);
+
+	/*
+	if (unlikely(rv)) {
+		dev_info(DEV, "Delaying sync read until app's write is done\n");
+	}
+	*/
+	return rv;
+}
+
+/**
+ * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
+ * @mdev:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * This function sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
+ */
+int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct bm_extent *bm_ext;
+	int i, sig;
+	int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
+			 200 times -> 20 seconds. */
+
+	trace_drbd_resync(mdev, TRACE_LVL_ALL,
+			  "drbd_rs_begin_io: sector=%llus (rs_end=%d)\n",
+			  (unsigned long long)sector, enr);
+retry:
+	sig = wait_event_interruptible(mdev->al_wait,
+			(bm_ext = _bme_get(mdev, enr)));
+	if (sig)
+		return -EINTR;
+
+	if (test_bit(BME_LOCKED, &bm_ext->flags))
+		return 0;
+
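+	/* One resync extent spans AL_EXT_PER_BM_SECT activity log extents
+	 * (16 MiB vs. 4 MiB, see the comment before drbd_try_clear_on_disk_bm()
+	 * above); resync may only lock the extent once none of them is in
+	 * active use by application writes. */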
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		sig = wait_event_interruptible(mdev->al_wait,
+					       !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||
+					       test_bit(BME_PRIORITY, &bm_ext->flags));
+
+		if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
+			spin_lock_irq(&mdev->al_lock);
+			if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+				bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
+				mdev->resync_locked--;
+				wake_up(&mdev->al_wait);
+			}
+			spin_unlock_irq(&mdev->al_lock);
+			if (sig)
+				return -EINTR;
+			if (schedule_timeout_interruptible(HZ/10))
+				return -EINTR;
+			if (sa && --sa == 0)
+				dev_warn(DEV, "drbd_rs_begin_io() stepped aside for 20sec. "
+					 "Resync stalled?\n");
+			goto retry;
+		}
+	}
+	set_bit(BME_LOCKED, &bm_ext->flags);
+	return 0;
+}
+
+/**
+ * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
+ * @mdev:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
+ * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
+ * if there is still application IO going on in this area.
+ */
+int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+
+	trace_drbd_resync(mdev, TRACE_LVL_ALL, "drbd_try_rs_begin_io: sector=%llus\n",
+			  (unsigned long long)sector);
+
+	spin_lock_irq(&mdev->al_lock);
+	if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
+		/* in case you have very heavy scattered io, it may
+		 * stall the syncer indefinitely if we give up the ref count
+		 * when we try again and requeue.
+		 *
+		 * if we don't give up the refcount, but the next time
+		 * we are scheduled this extent has been "synced" by new
+		 * application writes, we'd miss the lc_put on the
+		 * extent we keep the refcount on.
+		 * so we remember which extent we had to try again, and
+		 * if the next requested one is something else, we do
+		 * the lc_put here...
+		 * we also have to wake_up
+		 */
+
+		trace_drbd_resync(mdev, TRACE_LVL_ALL,
+				  "dropping %u, apparently got 'synced' by application io\n",
+				  mdev->resync_wenr);
+
+		e = lc_find(mdev->resync, mdev->resync_wenr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (bm_ext) {
+			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+			clear_bit(BME_NO_WRITES, &bm_ext->flags);
+			mdev->resync_wenr = LC_FREE;
+			if (lc_put(mdev->resync, &bm_ext->lce) == 0)
+				mdev->resync_locked--;
+			wake_up(&mdev->al_wait);
+		} else {
+			dev_alert(DEV, "LOGIC BUG\n");
+		}
+	}
+	/* TRY. */
+	e = lc_try_get(mdev->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (bm_ext) {
+		if (test_bit(BME_LOCKED, &bm_ext->flags))
+			goto proceed;
+		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			mdev->resync_locked++;
+		} else {
+			/* we did set the BME_NO_WRITES,
+			 * but then could not set BME_LOCKED,
+			 * so we tried again.
+			 * drop the extra reference. */
+			trace_drbd_resync(mdev, TRACE_LVL_ALL,
+					  "dropping extra reference on %u\n", enr);
+
+			bm_ext->lce.refcnt--;
+			D_ASSERT(bm_ext->lce.refcnt > 0);
+		}
+		goto check_al;
+	} else {
+		/* do we rather want to try later? */
+		if (mdev->resync_locked > mdev->resync->nr_elements-3) {
+			trace_drbd_resync(mdev, TRACE_LVL_ALL,
+					  "resync_locked = %u!\n", mdev->resync_locked);
+
+			goto try_again;
+		}
+		/* Do or do not. There is no try. -- Yoda */
+		e = lc_get(mdev->resync, enr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (!bm_ext) {
+			const unsigned long rs_flags = mdev->resync->flags;
+			if (rs_flags & LC_STARVING)
+				dev_warn(DEV, "Have to wait for element"
+				     " (resync LRU too small?)\n");
+			BUG_ON(rs_flags & LC_DIRTY);
+			goto try_again;
+		}
+		if (bm_ext->lce.lc_number != enr) {
+			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+			bm_ext->rs_failed = 0;
+			lc_changed(mdev->resync, &bm_ext->lce);
+			wake_up(&mdev->al_wait);
+			D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+		}
+		set_bit(BME_NO_WRITES, &bm_ext->flags);
+		D_ASSERT(bm_ext->lce.refcnt == 1);
+		mdev->resync_locked++;
+		goto check_al;
+	}
+check_al:
+	trace_drbd_resync(mdev, TRACE_LVL_ALL, "checking al for %u\n", enr);
+
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		if (unlikely(al_enr+i == mdev->act_log->new_number))
+			goto try_again;
+		if (lc_is_used(mdev->act_log, al_enr+i))
+			goto try_again;
+	}
+	set_bit(BME_LOCKED, &bm_ext->flags);
+proceed:
+	mdev->resync_wenr = LC_FREE;
+	spin_unlock_irq(&mdev->al_lock);
+	return 0;
+
+try_again:
+	trace_drbd_resync(mdev, TRACE_LVL_ALL, "need to try again for %u\n", enr);
+	if (bm_ext)
+		mdev->resync_wenr = enr;
+	spin_unlock_irq(&mdev->al_lock);
+	return -EAGAIN;
+}
+
+void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	unsigned long flags;
+
+	trace_drbd_resync(mdev, TRACE_LVL_ALL,
+			  "drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n",
+			  (long long)sector, enr);
+
+	spin_lock_irqsave(&mdev->al_lock, flags);
+	e = lc_find(mdev->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (!bm_ext) {
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+		if (DRBD_ratelimit(5*HZ, 5))
+			dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
+		return;
+	}
+
+	if (bm_ext->lce.refcnt == 0) {
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+		dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
+		    "but refcnt is 0!?\n",
+		    (unsigned long long)sector, enr);
+		return;
+	}
+
+	if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+		bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
+		mdev->resync_locked--;
+		wake_up(&mdev->al_wait);
+	}
+
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+/**
+ * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
+ * @mdev:	DRBD device.
+ */
+void drbd_rs_cancel_all(struct drbd_conf *mdev)
+{
+	trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_cancel_all\n");
+
+	spin_lock_irq(&mdev->al_lock);
+
+	if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
+		lc_reset(mdev->resync);
+		put_ldev(mdev);
+	}
+	mdev->resync_locked = 0;
+	mdev->resync_wenr = LC_FREE;
+	spin_unlock_irq(&mdev->al_lock);
+	wake_up(&mdev->al_wait);
+}
+
+/**
+ * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
+ * @mdev:	DRBD device.
+ *
+ * Returns 0 upon success, -EAGAIN if at least one reference count was
+ * not zero.
+ */
+int drbd_rs_del_all(struct drbd_conf *mdev)
+{
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+
+	trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_del_all\n");
+
+	spin_lock_irq(&mdev->al_lock);
+
+	if (get_ldev_if_state(mdev, D_FAILED)) {
+		/* ok, ->resync is there. */
+		for (i = 0; i < mdev->resync->nr_elements; i++) {
+			e = lc_element_by_index(mdev->resync, i);
+			bm_ext = lc_entry(e, struct bm_extent, lce);
+			if (bm_ext->lce.lc_number == LC_FREE)
+				continue;
+			if (bm_ext->lce.lc_number == mdev->resync_wenr) {
+				dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
+				     " got 'synced' by application io\n",
+				     mdev->resync_wenr);
+				D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+				D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+				clear_bit(BME_NO_WRITES, &bm_ext->flags);
+				mdev->resync_wenr = LC_FREE;
+				lc_put(mdev->resync, &bm_ext->lce);
+			}
+			if (bm_ext->lce.refcnt != 0) {
+				dev_info(DEV, "Retrying drbd_rs_del_all() later. "
+				     "refcnt=%d\n", bm_ext->lce.refcnt);
+				put_ldev(mdev);
+				spin_unlock_irq(&mdev->al_lock);
+				return -EAGAIN;
+			}
+			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
+			lc_del(mdev->resync, &bm_ext->lce);
+		}
+		D_ASSERT(mdev->resync->used == 0);
+		put_ldev(mdev);
+	}
+	spin_unlock_irq(&mdev->al_lock);
+	wake_up(&mdev->al_wait);
+
+	return 0;
+}
+
+/**
+ * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
+ * @mdev:	DRBD device.
+ * @sector:	The sector number.
+ * @size:	Size of failed IO operation, in byte.
+ */
+void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count;
+	sector_t esector, nr_sectors;
+	int wake_up = 0;
+
+	trace_drbd_resync(mdev, TRACE_LVL_SUMMARY,
+			  "drbd_rs_failed_io: sector=%llus, size=%u\n",
+			  (unsigned long long)sector, size);
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+		dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
+				(unsigned long long)sector, size);
+		return;
+	}
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	ERR_IF(sector >= nr_sectors) return;
+	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/*
+	 * round up start sector, round down end sector.  we make sure we only
+	 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
+	if (unlikely(esector < BM_SECT_PER_BIT-1))
+		return;
+	if (unlikely(esector == (nr_sectors-1)))
+		ebnr = lbnr;
+	else
+		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+	if (sbnr > ebnr)
+		return;
+
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	spin_lock_irq(&mdev->al_lock);
+	count = drbd_bm_count_bits(mdev, sbnr, ebnr);
+	if (count) {
+		mdev->rs_failed += count;
+
+		if (get_ldev(mdev)) {
+			drbd_try_clear_on_disk_bm(mdev, sector, count, false);
+			put_ldev(mdev);
+		}
+
+		/* just wake_up unconditionally now, after the various
+		 * lc_changed(), lc_put() in drbd_try_clear_on_disk_bm(). */
+		wake_up = 1;
+	}
+	spin_unlock_irq(&mdev->al_lock);
+	if (wake_up)
+		wake_up(&mdev->al_wait);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_bitmap.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_bitmap.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_bitmap.c	2015-01-21 12:02:58.377824096 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_bitmap.c	2015-01-21 12:02:58.377824096 +0300
@@ -0,0 +1,1677 @@
+/*
+   drbd_bitmap.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bitops.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/drbd.h>
+#include <linux/slab.h>
+#include <linux/dynamic_debug.h>
+#include <asm/kmap_types.h>
+#include <linux/kref.h>
+
+#include "drbd_int.h"
+
+/* See the ifdefs and comments inside that header file.
+ * On recent kernels this is not needed. */
+#include "compat/bitops.h"
+
+/* OPAQUE outside this file!
+ * interface defined in drbd_int.h
+
+ * convention:
+ * function name drbd_bm_... => used elsewhere, "public".
+ * function name      bm_... => internal to implementation, "private".
+ */
+
+
+/*
+ * LIMITATIONS:
+ * We want to support >= peta byte of backend storage, while for now still using
+ * a granularity of one bit per 4KiB of storage.
+ * 1 << 50		bytes backend storage (1 PiB)
+ * 1 << (50 - 12)	bits needed
+ *	38 --> we need u64 to index and count bits
+ * 1 << (38 - 3)	bitmap bytes needed
+ *	35 --> we still need u64 to index and count bytes
+ *			(that's 32 GiB of bitmap for 1 PiB storage)
+ * 1 << (35 - 2)	32bit longs needed
+ *	33 --> we'd even need u64 to index and count 32bit long words.
+ * 1 << (35 - 3)	64bit longs needed
+ *	32 --> we could get away with a 32bit unsigned int to index and count
+ *	64bit long words, but I rather stay with unsigned long for now.
+ *	We probably should neither count nor point to bytes or long words
+ *	directly, but either by bitnumber, or by page index and offset.
+ * 1 << (35 - 12)
+ *	22 --> we need that much 4KiB pages of bitmap.
+ *	1 << (22 + 3) --> on a 64bit arch,
+ *	we need 32 MiB to store the array of page pointers.
+ *
+ * Because I'm lazy, and because the resulting patch was too large, too ugly
+ * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
+ * (1 << 32) bits * 4k storage.
+ *
+
+ * bitmap storage and IO:
+ *	Bitmap is stored little endian on disk, and is kept little endian in
+ *	core memory. Currently we still hold the full bitmap in core as long
+ *	as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
+ *	seems excessive.
+ *
+ *	We plan to reduce the amount of in-core bitmap pages by paging them in
+ *	and out against their on-disk location as necessary, but need to make
+ *	sure we don't cause too much meta data IO, and must not deadlock in
+ *	tight memory situations. This needs some more work.
+ */
+
+/*
+ * NOTE
+ *  Access to the *bm_pages is protected by bm_lock.
+ *  It is safe to read the other members within the lock.
+ *
+ *  drbd_bm_set_bits is called from bio_endio callbacks,
+ *  We may be called with irq already disabled,
+ *  so we need spin_lock_irqsave().
+ *  And we need the kmap_atomic.
+ */
+struct drbd_bitmap {
+	struct page **bm_pages;
+	spinlock_t bm_lock;
+
+	/* see LIMITATIONS: above */
+
+	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
+	unsigned long bm_bits;
+	size_t   bm_words;
+	size_t   bm_number_of_pages;
+	sector_t bm_dev_capacity;
+	struct mutex bm_change; /* serializes resize operations */
+
+	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
+
+	enum bm_flag bm_flags;
+
+	/* debugging aid, in case we are still racy somewhere */
+	char          *bm_why;
+	struct task_struct *bm_task;
+};
+
+#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
+static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	if (!DRBD_ratelimit(5*HZ, 5))
+		return;
+	dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
+	    current == mdev->receiver.task ? "receiver" :
+	    current == mdev->asender.task  ? "asender"  :
+	    current == mdev->worker.task   ? "worker"   : current->comm,
+	    func, b->bm_why ?: "?",
+	    b->bm_task == mdev->receiver.task ? "receiver" :
+	    b->bm_task == mdev->asender.task  ? "asender"  :
+	    b->bm_task == mdev->worker.task   ? "worker"   : "?");
+}
+
+void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	int trylock_failed;
+
+	if (!b) {
+		dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
+		return;
+	}
+
+	trylock_failed = !mutex_trylock(&b->bm_change);
+
+	if (trylock_failed) {
+		dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
+		    current == mdev->receiver.task ? "receiver" :
+		    current == mdev->asender.task  ? "asender"  :
+		    current == mdev->worker.task   ? "worker"   : current->comm,
+		    why, b->bm_why ?: "?",
+		    b->bm_task == mdev->receiver.task ? "receiver" :
+		    b->bm_task == mdev->asender.task  ? "asender"  :
+		    b->bm_task == mdev->worker.task   ? "worker"   : "?");
+		mutex_lock(&b->bm_change);
+	}
+	if (BM_LOCKED_MASK & b->bm_flags)
+		dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
+	b->bm_flags |= flags & BM_LOCKED_MASK;
+
+	b->bm_why  = why;
+	b->bm_task = current;
+}
+
+void drbd_bm_unlock(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	if (!b) {
+		dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
+		return;
+	}
+
+	if (!(BM_LOCKED_MASK & mdev->bitmap->bm_flags))
+		dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
+
+	b->bm_flags &= ~BM_LOCKED_MASK;
+	b->bm_why  = NULL;
+	b->bm_task = NULL;
+	mutex_unlock(&b->bm_change);
+}
+
+/* we store some "meta" info about our pages in page->private */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ *  1<<38 bits,
+ *  1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK	((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK		31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR	30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT	29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * set if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT	28
+
+/* store_page_idx uses non-atomic assignment. It is only used directly after
+ * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
+ * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
+ * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
+ * requires it all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
+{
+	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
+	set_page_private(page, idx);
+}
+
+static unsigned long bm_page_to_idx(struct page *page)
+{
+	return page_private(page) & BM_PAGE_IDX_MASK;
+}
+
+/* As it is very unlikely that the same page is under IO from more than one
+ * context, we can get away with a bit per page and one wait queue per bitmap.
+ */
+static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
+}
+
+static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	clear_bit(BM_PAGE_IO_LOCK, addr);
+	smp_mb__after_clear_bit();
+	wake_up(&mdev->bitmap->bm_io_wait);
+}
+
+/* set _before_ submit_io, so it may be reset due to being changed
+ * while this page is in flight... will get submitted later again */
+static void bm_set_page_unchanged(struct page *page)
+{
+	/* use cmpxchg? */
+	clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+	clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static void bm_set_page_need_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_unchanged(struct page *page)
+{
+	volatile const unsigned long *addr = &page_private(page);
+	return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
+}
+
+static void bm_set_page_io_err(struct page *page)
+{
+	set_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_clear_page_io_err(struct page *page)
+{
+	clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+	return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+/* on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
+	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
+	unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	return page_nr;
+}
+
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
+	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	return page_nr;
+}
+
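+/* Worked example, assuming PAGE_SHIFT == 12 and 64bit longs (LN2_BPL == 6):
+ * bm_word_to_page_idx() shifts by 12 - 6 + 3 == 9, i.e. 512 longs per page;
+ * bm_bit_to_page_idx() shifts by 12 + 3 == 15, i.e. 32768 bits per page.
+ * Bit number 40000 thus lives in page 40000 >> 15 == 1, and word number 700
+ * lives in page 700 >> 9 == 1 as well. */
+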
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
+{
+	struct page *page = b->bm_pages[idx];
+	return (unsigned long *) kmap_atomic(page, km);
+}
+
+static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+	return __bm_map_pidx(b, idx, KM_IRQ1);
+}
+
+static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
+{
+	kunmap_atomic(p_addr, km);
+}
+
+static void bm_unmap(unsigned long *p_addr)
+{
+	__bm_unmap(p_addr, KM_IRQ1);
+}
+
+/* long word offset of _bitmap_ sector */
+#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* word offset from start of bitmap to word number _in_page_
+ * modulo longs per page
+#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
+ hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
+ so do it explicitly:
+ */
+#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
+
+/* Long words per page */
+#define LWPP (PAGE_SIZE/sizeof(long))
+
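+/* Worked example for the helpers above, assuming PAGE_SIZE == 4096,
+ * sizeof(long) == 8 (LN2_BPL == 6), BM_EXT_SHIFT == 24 and
+ * BM_BLOCK_SHIFT == 12: LWPP == 512, so word offset 1000 maps to in-page
+ * word MLPP(1000) == 1000 & 511 == 488 on page
+ * bm_word_to_page_idx(b, 1000) == 1000 >> 9 == 1.  S2W(enr) then shifts by
+ * 24 - 12 - 6 == 6, i.e. one 512 byte bitmap sector holds 64 long words. */
+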
+/*
+ * actually most functions herein should take a struct drbd_bitmap*, not a
+ * struct drbd_conf*, but for the debug macros I like to have the mdev around
+ * to be able to report device specific messages.
+ */
+
+
+STATIC void bm_free_pages(struct page **pages, unsigned long number)
+{
+	unsigned long i;
+	if (!pages)
+		return;
+
+	for (i = 0; i < number; i++) {
+		if (!pages[i]) {
+			printk(KERN_ALERT "drbd: bm_free_pages tried to free "
+					  "a NULL pointer; i=%lu n=%lu\n",
+					  i, number);
+			continue;
+		}
+		__free_page(pages[i]);
+		pages[i] = NULL;
+	}
+}
+
+STATIC void bm_vk_free(void *ptr, int v)
+{
+	if (v)
+		vfree(ptr);
+	else
+		kfree(ptr);
+}
+
+/*
+ * "have" and "want" are NUMBER OF PAGES.
+ */
+STATIC struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
+{
+	struct page **old_pages = b->bm_pages;
+	struct page **new_pages, *page;
+	unsigned int i, bytes, vmalloced = 0;
+	unsigned long have = b->bm_number_of_pages;
+
+	BUG_ON(have == 0 && old_pages != NULL);
+	BUG_ON(have != 0 && old_pages == NULL);
+
+	if (have == want)
+		return old_pages;
+
+	/* Trying kmalloc first, falling back to vmalloc.
+	 * GFP_KERNEL is ok, as this is done when a lower level disk is
+	 * "attached" to the drbd.  Context is receiver thread or cqueue
+	 * thread.  As we have no disk yet, we are not in the IO path,
+	 * not even the IO path of the peer. */
+	bytes = sizeof(struct page *)*want;
+	new_pages = kmalloc(bytes, GFP_KERNEL);
+	if (!new_pages) {
+		new_pages = vmalloc(bytes);
+		if (!new_pages)
+			return NULL;
+		vmalloced = 1;
+	}
+
+	memset(new_pages, 0, bytes);
+	if (want >= have) {
+		for (i = 0; i < have; i++)
+			new_pages[i] = old_pages[i];
+		for (; i < want; i++) {
+			page = alloc_page(GFP_HIGHUSER);
+			if (!page) {
+				bm_free_pages(new_pages + have, i - have);
+				bm_vk_free(new_pages, vmalloced);
+				return NULL;
+			}
+			/* we want to know which page it is
+			 * from the endio handlers */
+			bm_store_page_idx(page, i);
+			new_pages[i] = page;
+		}
+	} else {
+		for (i = 0; i < want; i++)
+			new_pages[i] = old_pages[i];
+		/* NOT HERE, we are outside the spinlock!
+		bm_free_pages(old_pages + want, have - want);
+		*/
+	}
+
+	if (vmalloced)
+		b->bm_flags |= BM_P_VMALLOCED;
+	else
+		b->bm_flags &= ~BM_P_VMALLOCED;
+
+	return new_pages;
+}
+
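+/* Ownership sketch for bm_realloc_pages() (informative only): if want ==
+ * have, the old array is returned unchanged.  Otherwise a new array is
+ * allocated; when shrinking, the surplus pages stay referenced by the old
+ * array and must be freed by the caller outside the spinlock, and the old
+ * array itself is freed by the caller once it is unreachable (see
+ * drbd_bm_resize()). */
+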
+/*
+ * called on driver init only. TODO: call when a device is created.
+ * allocates the drbd_bitmap, and stores it in mdev->bitmap.
+ */
+int drbd_bm_init(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	WARN_ON(b != NULL);
+	b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
+	if (!b)
+		return -ENOMEM;
+	spin_lock_init(&b->bm_lock);
+	mutex_init(&b->bm_change);
+	init_waitqueue_head(&b->bm_io_wait);
+
+	mdev->bitmap = b;
+
+	return 0;
+}
+
+sector_t drbd_bm_capacity(struct drbd_conf *mdev)
+{
+	ERR_IF(!mdev->bitmap) return 0;
+	return mdev->bitmap->bm_dev_capacity;
+}
+
+/* called on driver unload. TODO: call when a device is destroyed.
+ */
+void drbd_bm_cleanup(struct drbd_conf *mdev)
+{
+	ERR_IF (!mdev->bitmap) return;
+	bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
+	bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
+	kfree(mdev->bitmap);
+	mdev->bitmap = NULL;
+}
+
+/*
+ * if (b->bm_bits % BITS_PER_LONG) != 0,
+ * this masks out the surplus bits.
+ * Returns the number of bits cleared.
+ */
+#define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
+#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE - 1)
+#define BITS_PER_LONG_MASK	(BITS_PER_LONG - 1)
+STATIC int bm_clear_surplus(struct drbd_bitmap *b)
+{
+	unsigned long mask;
+	unsigned long *p_addr, *bm;
+	int tmp;
+	int cleared = 0;
+
+	/* number of bits modulo bits per page */
+	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+	/* mask the used bits of the word containing the last bit */
+	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) - 1;
+	/* bitmap is always stored little endian,
+	 * on disk and in core memory alike */
+	mask = cpu_to_lel(mask);
+
+	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+	bm = p_addr + (tmp/BITS_PER_LONG);
+	if (mask) {
+		/* If mask != 0, we are not exactly aligned, so bm now points
+		 * to the long containing the last bit.
+		 * If mask == 0, bm already points to the word immediately
+		 * after the last (long word aligned) bit. */
+		cleared = hweight_long(*bm & ~mask);
+		*bm &= mask;
+		bm++;
+	}
+
+	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+		/* on a 32bit arch, we may need to zero out
+		 * a padding long to align with a 64bit remote */
+		cleared += hweight_long(*bm);
+		*bm = 0;
+	}
+	bm_unmap(p_addr);
+	return cleared;
+}
+
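+/* Worked example for the surplus masking above, assuming 64bit longs and
+ * bm_bits == 100 on a single page: tmp == 100, so bm points at word
+ * 100/64 == 1, and mask == cpu_to_lel((1UL << (100 & 63)) - 1) keeps the
+ * low 36 bits of that word; any set bits among bits 36..63 are cleared and
+ * counted.  If bm_bits were an exact multiple of 64, mask would be 0 and
+ * the word-masking step would be skipped entirely. */
+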
+STATIC void bm_set_surplus(struct drbd_bitmap *b)
+{
+	unsigned long mask;
+	unsigned long *p_addr, *bm;
+	int tmp;
+
+	/* number of bits modulo bits per page */
+	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+	/* mask the used bits of the word containing the last bit */
+	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) - 1;
+	/* bitmap is always stored little endian,
+	 * on disk and in core memory alike */
+	mask = cpu_to_lel(mask);
+
+	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+	bm = p_addr + (tmp/BITS_PER_LONG);
+	if (mask) {
+		/* If mask != 0, we are not exactly aligned, so bm now points
+		 * to the long containing the last bit.
+		 * If mask == 0, bm already points to the word immediately
+		 * after the last (long word aligned) bit. */
+		*bm |= ~mask;
+		bm++;
+	}
+
+	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+		/* on a 32bit arch, we may need to fill a padding long
+		 * with all ones to align with a 64bit remote */
+		*bm = ~0UL;
+	}
+	bm_unmap(p_addr);
+}
+
+/* You had better not modify the bitmap while this is running,
+ * or its result will be stale. */
+STATIC unsigned long bm_count_bits(struct drbd_bitmap *b)
+{
+	unsigned long *p_addr;
+	unsigned long bits = 0;
+	unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) - 1;
+	int idx, i, last_word;
+
+	/* all but last page */
+	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
+		p_addr = __bm_map_pidx(b, idx, KM_USER0);
+		for (i = 0; i < LWPP; i++)
+			bits += hweight_long(p_addr[i]);
+		__bm_unmap(p_addr, KM_USER0);
+		cond_resched();
+	}
+	/* last (or only) page */
+	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
+	p_addr = __bm_map_pidx(b, idx, KM_USER0);
+	for (i = 0; i < last_word; i++)
+		bits += hweight_long(p_addr[i]);
+	p_addr[last_word] &= cpu_to_lel(mask);
+	bits += hweight_long(p_addr[last_word]);
+	/* 32bit arch, may have an unused padding long */
+	if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
+		p_addr[last_word+1] = 0;
+	__bm_unmap(p_addr, KM_USER0);
+	return bits;
+}
+
+/* offset and len in long words.*/
+STATIC void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
+{
+	unsigned long *p_addr, *bm;
+	unsigned int idx;
+	size_t do_now, end;
+
+	end = offset + len;
+
+	if (end > b->bm_words) {
+		printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+		return;
+	}
+
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
+		bm = p_addr + MLPP(offset);
+		if (bm+do_now > p_addr + LWPP) {
+			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+			       p_addr, bm, (int)do_now);
+		} else
+			memset(bm, c, do_now * sizeof(long));
+		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
+		offset += do_now;
+	}
+}
+
+/*
+ * make sure the bitmap has enough room for the attached storage,
+ * if necessary, resize.
+ * called whenever we may have changed the device size.
+ * returns -ENOMEM if we could not allocate enough memory, 0 on success.
+ * In case this is actually a resize, we copy the old bitmap into the new one.
+ * Otherwise, the bitmap is initialized to all bits set.
+ */
+int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long bits, words, owords, obits;
+	unsigned long want, have, onpages; /* number of pages */
+	struct page **npages, **opages = NULL;
+	int err = 0, growing;
+	int opages_vmalloced;
+
+	ERR_IF(!b) return -ENOMEM;
+
+	drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
+
+	dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
+			(unsigned long long)capacity);
+
+	if (capacity == b->bm_dev_capacity)
+		goto out;
+
+	opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
+
+	if (capacity == 0) {
+		spin_lock_irq(&b->bm_lock);
+		opages = b->bm_pages;
+		onpages = b->bm_number_of_pages;
+		owords = b->bm_words;
+		b->bm_pages = NULL;
+		b->bm_number_of_pages =
+		b->bm_set   =
+		b->bm_bits  =
+		b->bm_words =
+		b->bm_dev_capacity = 0;
+		spin_unlock_irq(&b->bm_lock);
+		bm_free_pages(opages, onpages);
+		bm_vk_free(opages, opages_vmalloced);
+		goto out;
+	}
+	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
+
+	/* if we would use
+	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
+	   a 32bit host could present the wrong number of words
+	   to a 64bit host.
+	*/
+	words = ALIGN(bits, 64) >> LN2_BPL;
+
+	if (get_ldev(mdev)) {
+		u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
+		put_ldev(mdev);
+		if (bits > bits_on_disk) {
+			DUMPLLU(bits);
+			DUMPLLU(bits_on_disk);
+			err = -ENOSPC;
+			goto out;
+		}
+	}
+
+	want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+	have = b->bm_number_of_pages;
+	if (want == have) {
+		D_ASSERT(b->bm_pages != NULL);
+		npages = b->bm_pages;
+	} else {
+		if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC))
+			npages = NULL;
+		else
+			npages = bm_realloc_pages(b, want);
+	}
+
+	if (!npages) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock_irq(&b->bm_lock);
+	opages = b->bm_pages;
+	owords = b->bm_words;
+	obits  = b->bm_bits;
+
+	growing = bits > obits;
+	if (opages && growing && set_new_bits)
+		bm_set_surplus(b);
+
+	b->bm_pages = npages;
+	b->bm_number_of_pages = want;
+	b->bm_bits  = bits;
+	b->bm_words = words;
+	b->bm_dev_capacity = capacity;
+
+	if (growing) {
+		if (set_new_bits) {
+			bm_memset(b, owords, 0xff, words-owords);
+			b->bm_set += bits - obits;
+		} else
+			bm_memset(b, owords, 0x00, words-owords);
+
+	}
+
+	if (want < have) {
+		/* implicit: (opages != NULL) && (opages != npages) */
+		bm_free_pages(opages + want, have - want);
+	}
+
+	(void)bm_clear_surplus(b);
+
+	spin_unlock_irq(&b->bm_lock);
+	if (opages != npages)
+		bm_vk_free(opages, opages_vmalloced);
+	if (!growing)
+		b->bm_set = bm_count_bits(b);
+	dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
+
+ out:
+	drbd_bm_unlock(mdev);
+	return err;
+}
+
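+/* Worked sizing example for drbd_bm_resize(), assuming 4k pages, 64bit
+ * longs and BM_SECT_PER_BIT == 8: a 1 TiB device has capacity == 1 << 31
+ * sectors, giving bits == 1 << 28, words == 1 << 22, and
+ * want == ALIGN(words * 8, 4096) >> 12 == 1 << 13 pages, i.e. a 32 MiB
+ * in-core bitmap. */
+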
+/* inherently racy:
+ * if not protected by other means, return value may be out of date when
+ * leaving this function...
+ * we still need to lock it, since it is important that this returns
+ * bm_set == 0 precisely.
+ *
+ * maybe bm_set should be atomic_t ?
+ */
+unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long s;
+	unsigned long flags;
+
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	s = b->bm_set;
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+
+	return s;
+}
+
+unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
+{
+	unsigned long s;
+	/* if I don't have a disk, I don't know about out-of-sync status */
+	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
+		return 0;
+	s = _drbd_bm_total_weight(mdev);
+	put_ldev(mdev);
+	return s;
+}
+
+size_t drbd_bm_words(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	return b->bm_words;
+}
+
+unsigned long drbd_bm_bits(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return 0;
+
+	return b->bm_bits;
+}
+
+/* Merge "number" words from buffer into the bitmap, starting at offset.
+ * buffer[i] is expected to be little endian unsigned long.
+ * The bitmap must be locked by drbd_bm_lock().
+ * Currently only used from receive_bitmap.
+ */
+void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+			unsigned long *buffer)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
+	unsigned long word, bits;
+	unsigned int idx;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+	if (number == 0)
+		return;
+	WARN_ON(offset >= b->bm_words);
+	WARN_ON(end    >  b->bm_words);
+
+	spin_lock_irq(&b->bm_lock);
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
+		bm = p_addr + MLPP(offset);
+		offset += do_now;
+		while (do_now--) {
+			bits = hweight_long(*bm);
+			word = *bm | *buffer++;
+			*bm++ = word;
+			b->bm_set += hweight_long(word) - bits;
+		}
+		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
+	}
+	/* with 32bit <-> 64bit cross-platform connect
+	 * this is only correct for current usage,
+	 * where we _know_ that we are 64 bit aligned,
+	 * and know that this function is used in this way, too...
+	 */
+	if (end == b->bm_words)
+		b->bm_set -= bm_clear_surplus(b);
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* Copy "number" words from the bitmap, starting at offset, into the buffer.
+ * buffer[i] will be little endian unsigned long.
+ */
+void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+		     unsigned long *buffer)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	if ((offset >= b->bm_words) ||
+	    (end    >  b->bm_words) ||
+	    (number <= 0))
+		dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
+			(unsigned long)	offset,
+			(unsigned long)	number,
+			(unsigned long) b->bm_words);
+	else {
+		while (offset < end) {
+			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
+			bm = p_addr + MLPP(offset);
+			offset += do_now;
+			while (do_now--)
+				*buffer++ = *bm++;
+			bm_unmap(p_addr);
+		}
+	}
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* set all bits in the bitmap */
+void drbd_bm_set_all(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	bm_memset(b, 0, 0xff, b->bm_words);
+	(void)bm_clear_surplus(b);
+	b->bm_set = b->bm_bits;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* clear all bits in the bitmap */
+void drbd_bm_clear_all(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	bm_memset(b, 0, 0, b->bm_words);
+	b->bm_set = 0;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+struct bm_aio_ctx {
+	struct drbd_conf *mdev;
+	atomic_t in_flight;
+	unsigned int done;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+	int error;
+	struct kref kref;
+};
+
+static void bm_aio_ctx_destroy(struct kref *kref)
+{
+	struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+
+	put_ldev(ctx->mdev);
+	kfree(ctx);
+}
+
+/* bv_page may be a copy, or may be the original */
+static BIO_ENDIO_TYPE bm_async_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error)
+{
+	struct bm_aio_ctx *ctx = bio->bi_private;
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+	BIO_ENDIO_FN_START;
+
+	/* strange behavior of some lower level drivers...
+	 * fail the request by clearing the uptodate flag,
+	 * but do not return any error?!
+	 * do we want to WARN() on this? */
+	if (!error && !uptodate)
+		error = -EIO;
+
+	if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
+	    !bm_test_page_unchanged(b->bm_pages[idx]))
+		dev_warn(DEV, "bitmap page idx %u changed during IO!\n", idx);
+
+	if (error) {
+		/* ctx->error will hold the error code of whichever failed
+		 * request completed last, in case the error codes differ. */
+		ctx->error = error;
+		bm_set_page_io_err(b->bm_pages[idx]);
+		/* The in-core page is no longer identical to its on-disk version.
+		 * Is BM_PAGE_IO_ERROR enough? */
+		if (DRBD_ratelimit(5*HZ, 5))
+			dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
+					error, idx);
+	} else {
+		bm_clear_page_io_err(b->bm_pages[idx]);
+		dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
+	}
+
+	bm_page_unlock_io(mdev, idx);
+
+	if (ctx->flags & BM_AIO_COPY_PAGES)
+		mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
+
+	bio_put(bio);
+
+	if (atomic_dec_and_test(&ctx->in_flight)) {
+		ctx->done = 1;
+		wake_up(&mdev->misc_wait);
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	}
+
+	BIO_ENDIO_FN_RETURN;
+}
+
+STATIC void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
+{
+	struct bio *bio = bio_alloc_drbd(GFP_NOIO);
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	struct page *page;
+	unsigned int len;
+
+	sector_t on_disk_sector =
+		mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
+	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
+
+	/* this might happen with a very small
+	 * flexible external meta data device,
+	 * or with PAGE_SIZE > 4k */
+	len = min_t(unsigned int, PAGE_SIZE,
+		(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
+
+	/* serialize IO on this page */
+	bm_page_lock_io(mdev, page_nr);
+	/* before memcpy and submit,
+	 * so it can be redirtied any time */
+	bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+	if (ctx->flags & BM_AIO_COPY_PAGES) {
+		void *src, *dest;
+		page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+		dest = kmap_atomic(page, KM_USER0);
+		src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
+		memcpy(dest, src, PAGE_SIZE);
+		kunmap_atomic(src, KM_USER1);
+		kunmap_atomic(dest, KM_USER0);
+		bm_store_page_idx(page, page_nr);
+	} else
+		page = b->bm_pages[page_nr];
+
+	bio->bi_bdev = mdev->ldev->md_bdev;
+	bio->bi_sector = on_disk_sector;
+	/* bio_add_page of a single page to an empty bio will always succeed,
+	 * according to the API.  Do we want to assert that? */
+	bio_add_page(bio, page, len, 0);
+	bio->bi_private = ctx;
+	bio->bi_end_io = bm_async_io_complete;
+
+	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+		bio->bi_rw |= rw;
+		bio_endio(bio, -EIO);
+	} else {
+		submit_bio(rw, bio);
+		/* this should not count as user activity and cause the
+		 * resync to throttle -- see drbd_rs_should_slow_down(). */
+		atomic_add(len >> 9, &mdev->rs_sect_ev);
+	}
+}
+
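+/* Worked example for the on-disk placement above, assuming 4k pages: each
+ * bitmap page spans PAGE_SHIFT - 9 == 3 sector-address bits, i.e. 8 sectors,
+ * so page_nr 5 starts at md_offset + bm_offset + 40 sectors.  len is clamped
+ * so the last page of a small external meta data area does not run past
+ * drbd_md_last_sector(). */
+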
+/*
+ * bm_rw: read/write the whole bitmap from/to its on disk location.
+ */
+STATIC int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+{
+	struct bm_aio_ctx *ctx;
+	struct drbd_bitmap *b = mdev->bitmap;
+	int num_pages, i, count = 0;
+	unsigned long now;
+	char ppb[10];
+	int err = 0;
+
+	/*
+	 * We are protected against bitmap disappearing/resizing by holding an
+	 * ldev reference (caller must have called get_ldev()).
+	 * For read/write, we are protected against changes to the bitmap by
+	 * the bitmap lock (see drbd_bitmap_io).
+	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
+	 * as we submit copies of pages anyways.
+	 */
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
+		.mdev = mdev,
+		.in_flight = ATOMIC_INIT(1),
+		.done = 0,
+		.flags = flags,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },
+	};
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+		kfree(ctx);
+		return -ENODEV;
+	}
+
+	if (!ctx->flags)
+		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
+
+	num_pages = b->bm_number_of_pages;
+
+	now = jiffies;
+
+	/* let the layers below us try to merge these bios... */
+	for (i = 0; i < num_pages; i++) {
+		/* ignore completely unchanged pages */
+		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+			break;
+		if (rw & WRITE) {
+			if (bm_test_page_unchanged(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
+				continue;
+			}
+			/* during lazy writeout,
+			 * ignore those pages not marked for lazy writeout. */
+			if (lazy_writeout_upper_idx &&
+			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
+				continue;
+			}
+		}
+		atomic_inc(&ctx->in_flight);
+		bm_page_io_async(ctx, i, rw);
+		++count;
+		cond_resched();
+	}
+
+	/*
+	 * We initialize ctx->in_flight to one to make sure bm_async_io_complete
+	 * will not set ctx->done early, and decrement / test it here.  If there
+	 * are still some bios in flight, we need to wait for them here.
+	 * If all IO is done already (or nothing had been submitted), there is
+	 * no need to wait.  Still, we need to put the kref associated with the
+	 * "in_flight reached zero, all done" event.
+	 */
+	if (!atomic_dec_and_test(&ctx->in_flight)) {
+		drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
+		wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
+	} else
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+
+	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
+			rw == WRITE ? "WRITE" : "READ",
+			count, jiffies - now);
+
+	if (ctx->error) {
+		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
+		drbd_chk_io_error(mdev, 1, true);
+		err = -EIO; /* ctx->error ? */
+	}
+
+	if (atomic_read(&ctx->in_flight))
+		err = -EIO; /* Disk failed during IO... */
+
+	now = jiffies;
+	if (rw == WRITE) {
+		drbd_md_flush(mdev);
+	} else /* rw == READ */ {
+		b->bm_set = bm_count_bits(b);
+		dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
+		     jiffies - now);
+	}
+	now = b->bm_set;
+
+	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+	     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	return err;
+}
+
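+/* The completion accounting in bm_rw() follows a common pattern; a minimal
+ * sketch (informal, not part of the driver):
+ *
+ *	in_flight = 1;			// submitter holds one count
+ *	kref = 2;			// one ref for bm_rw(), one for the
+ *					// "in_flight reached zero" event
+ *	for each page: in_flight++; submit_bio();
+ *	if (!atomic_dec_and_test(&in_flight))
+ *		wait for ctx->done;	// some bios still in flight
+ *	else
+ *		kref_put();		// nothing was submitted
+ *
+ * The endio handler mirrors this: the last atomic_dec_and_test() sets
+ * ctx->done, wakes the waiter, and drops the event's kref. */
+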
+/**
+ * drbd_bm_read() - Read the whole bitmap from its on disk location.
+ * @mdev:	DRBD device.
+ */
+int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, READ, 0, 0);
+}
+
+/**
+ * drbd_bm_write() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ */
+int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, 0, 0);
+}
+
+/**
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
+ * @mdev:	DRBD device.
+ * @upper_idx:	0: write all changed pages; positive: page index at which to
+ *		stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx);
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
+}
+
+
+/**
+ * drbd_bm_write_page() - Write a PAGE_SIZE aligned piece of the bitmap
+ * @mdev:	DRBD device.
+ * @idx:	bitmap page index
+ *
+ * We don't want to special case on logical_block_size of the backend device,
+ * so we submit PAGE_SIZE aligned pieces.
+ * Note that on "most" systems, PAGE_SIZE is 4k.
+ *
+ * In case this becomes an issue on systems with larger PAGE_SIZE,
+ * we may want to change this again to write 4k aligned 4k pieces.
+ */
+int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
+{
+	struct bm_aio_ctx *ctx;
+	int err;
+
+	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
+		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
+		return 0;
+	}
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
+		.mdev = mdev,
+		.in_flight = ATOMIC_INIT(1),
+		.done = 0,
+		.flags = BM_AIO_COPY_PAGES,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },
+	};
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
+		kfree(ctx);
+		return -ENODEV;
+	}
+
+	bm_page_io_async(ctx, idx, WRITE_SYNC);
+	wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
+
+	if (ctx->error)
+		drbd_chk_io_error(mdev, 1, true);
+		/* that should force detach, so the in memory bitmap will be
+		 * gone in a moment as well. */
+
+	mdev->bm_writ_cnt++;
+	err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	return err;
+}
+
+/* NOTE
+ * find_first_bit returns int, we return unsigned long.
+ * For this to work on 32bit arch with bit numbers > 2^32,
+ * we'd need to return u64, and get a whole lot of other places
+ * fixed where we still use unsigned long.
+ *
+ * this returns a bit number, NOT a sector!
+ */
+static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
+	const int find_zero_bit, const enum km_type km)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr;
+	unsigned long bit_offset;
+	unsigned i;
+
+
+	if (bm_fo > b->bm_bits) {
+		dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+		bm_fo = DRBD_END_OF_BITMAP;
+	} else {
+		while (bm_fo < b->bm_bits) {
+			/* bit offset of the first bit in the page */
+			bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
+			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
+
+			if (find_zero_bit)
+				i = find_next_zero_bit_le(p_addr,
+						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+			else
+				i = find_next_bit_le(p_addr,
+						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+
+			__bm_unmap(p_addr, km);
+			if (i < PAGE_SIZE*8) {
+				bm_fo = bit_offset + i;
+				if (bm_fo >= b->bm_bits)
+					break;
+				goto found;
+			}
+			bm_fo = bit_offset + PAGE_SIZE*8;
+		}
+		bm_fo = DRBD_END_OF_BITMAP;
+	}
+ found:
+	return bm_fo;
+}
+
+static unsigned long bm_find_next(struct drbd_conf *mdev,
+	unsigned long bm_fo, const int find_zero_bit)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long i = DRBD_END_OF_BITMAP;
+
+	ERR_IF(!b) return i;
+	ERR_IF(!b->bm_pages) return i;
+
+	spin_lock_irq(&b->bm_lock);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(mdev);
+
+	i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
+
+	spin_unlock_irq(&b->bm_lock);
+	return i;
+}
+
+unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	return bm_find_next(mdev, bm_fo, 0);
+}
+
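+#if 0
+/* Usage sketch (hypothetical, not referenced anywhere): walk all currently
+ * set bits, e.g. to enumerate out-of-sync 4k blocks. */
+static void example_walk_set_bits(struct drbd_conf *mdev)
+{
+	unsigned long bit = 0;
+
+	while ((bit = drbd_bm_find_next(mdev, bit)) != DRBD_END_OF_BITMAP) {
+		/* "bit" is a bit number, NOT a sector;
+		 * one bit covers 4k of storage */
+		bit++;
+	}
+}
+#endif
+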
+#if 0
+/* not yet needed for anything. */
+unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	return bm_find_next(mdev, bm_fo, 1);
+}
+#endif
+
+/* does not spin_lock_irqsave.
+ * you must take drbd_bm_lock() first */
+unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
+	return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
+}
+
+unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
+	return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector.
+ * expected to be called for only a few bits (e - s about BITS_PER_LONG).
+ * Must hold bitmap lock already. */
+STATIC int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+	unsigned long e, int val)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr = NULL;
+	unsigned long bitnr;
+	unsigned int last_page_nr = -1U;
+	int c = 0;
+	int changed_total = 0;
+
+	if (e >= b->bm_bits) {
+		dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
+				s, e, b->bm_bits);
+		e = b->bm_bits ? b->bm_bits - 1 : 0;
+	}
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != last_page_nr) {
+			if (p_addr)
+				__bm_unmap(p_addr, KM_IRQ1);
+			if (c < 0)
+				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+			else if (c > 0)
+				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+			changed_total += c;
+			c = 0;
+			p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);
+			last_page_nr = page_nr;
+		}
+		if (val)
+			c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+		else
+			c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+	}
+	if (p_addr)
+		__bm_unmap(p_addr, KM_IRQ1);
+	if (c < 0)
+		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+	else if (c > 0)
+		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+	changed_total += c;
+	b->bm_set += changed_total;
+	return changed_total;
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector */
+STATIC int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+	const unsigned long e, int val)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = mdev->bitmap;
+	int c = 0;
+
+	ERR_IF(!b) return 1;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
+		bm_print_lock_info(mdev);
+
+	c = __bm_change_bits_to(mdev, s, e, val);
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+/* returns number of bits changed 0 -> 1 */
+int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	return bm_change_bits_to(mdev, s, e, 1);
+}
+
+/* returns number of bits changed 1 -> 0 */
+int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	return -bm_change_bits_to(mdev, s, e, 0);
+}
+
+/* sets all bits in full words,
+ * from first_word up to, but not including, last_word */
+static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
+		int page_nr, int first_word, int last_word)
+{
+	int i;
+	int bits;
+	int changed = 0;
+	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
+	for (i = first_word; i < last_word; i++) {
+		bits = hweight_long(paddr[i]);
+		paddr[i] = ~0UL;
+		changed += BITS_PER_LONG - bits;
+	}
+	kunmap_atomic(paddr, KM_IRQ1);
+	if (changed) {
+		/* We only need lazy writeout, the information is still in the
+		 * remote bitmap as well, and is reconstructed during the next
+		 * bitmap exchange, if lost locally due to a crash. */
+		bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+		b->bm_set += changed;
+	}
+}
+
+/* Same thing as drbd_bm_set_bits,
+ * but more efficient for a large bit range.
+ * You must first drbd_bm_lock().
+ * Can be called to set the whole bitmap in one go.
+ * Sets bits from s to e _inclusive_. */
+void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	/* First set_bit from the first bit (s)
+	 * up to the next long boundary (sl),
+	 * then assign full words up to the last long boundary (el),
+	 * then set_bit up to and including the last bit (e).
+	 *
+	 * Do not use memset, because we must account for changes,
+	 * so we need to loop over the words with hweight() anyways.
+	 */
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long sl = ALIGN(s, BITS_PER_LONG);
+	unsigned long el = (e + 1) & ~((unsigned long)BITS_PER_LONG - 1);
+	int first_page;
+	int last_page;
+	int page_nr;
+	int first_word;
+	int last_word;
+
+	if (e - s <= 3*BITS_PER_LONG) {
+		/* don't bother; el and sl may even be wrong. */
+		spin_lock_irq(&b->bm_lock);
+		__bm_change_bits_to(mdev, s, e, 1);
+		spin_unlock_irq(&b->bm_lock);
+		return;
+	}
+
+	/* difference is large enough that we can trust sl and el */
+
+	spin_lock_irq(&b->bm_lock);
+
+	/* bits filling the current long */
+	if (sl)
+		__bm_change_bits_to(mdev, s, sl-1, 1);
+
+	first_page = sl >> (3 + PAGE_SHIFT);
+	last_page = el >> (3 + PAGE_SHIFT);
+
+	/* MLPP: modulo longs per page */
+	/* LWPP: long words per page */
+	first_word = MLPP(sl >> LN2_BPL);
+	last_word = LWPP;
+
+	/* first and full pages, unless first page == last page */
+	for (page_nr = first_page; page_nr < last_page; page_nr++) {
+		bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
+		spin_unlock_irq(&b->bm_lock);
+		cond_resched();
+		first_word = 0;
+		spin_lock_irq(&b->bm_lock);
+	}
+
+	/* last page (respectively only page, for first page == last page) */
+	last_word = MLPP(el >> LN2_BPL);
+	bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
+
+	/* possibly trailing bits.
+	 * example: (e & 63) == 63, el will be e+1.
+	 * if that even was the very last bit,
+	 * it would trigger an assert in __bm_change_bits_to()
+	 */
+	if (el <= e)
+		__bm_change_bits_to(mdev, el, e, 1);
+	spin_unlock_irq(&b->bm_lock);
+}
+
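+/* Worked example for the split above, assuming 64bit longs: for s == 5 and
+ * e == 200, sl == ALIGN(5, 64) == 64 and el == 201 & ~63UL == 192.  Bits
+ * 5..63 and 192..200 go through __bm_change_bits_to() bit by bit, while
+ * bits 64..191 (words 1 and 2 of page 0) are assigned as full words via
+ * bm_set_full_words_within_one_page(). */
+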
+/* returns bit state
+ * wants bitnr, NOT sector.
+ * inherently racy... area needs to be locked by means of {al,rs}_lru
+ *  1 ... bit set
+ *  0 ... bit not set
+ * -1 ... first out of bounds access, stop testing for bits!
+ */
+int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr;
+	int i;
+
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(mdev);
+	if (bitnr < b->bm_bits) {
+		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
+		i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
+		bm_unmap(p_addr);
+	} else if (bitnr == b->bm_bits) {
+		i = -1;
+	} else { /* (bitnr > b->bm_bits) */
+		dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
+		i = 0;
+	}
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return i;
+}
+
+/* returns number of bits set in the range [s, e] */
+int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr = NULL;
+	unsigned long bitnr;
+	unsigned int page_nr = -1U;
+	int c = 0;
+
+	/* If this is called without a bitmap, that is a bug.  But just to be
+	 * robust in case we screwed up elsewhere, in that case pretend there
+	 * was one dirty bit in the requested area, so we won't try to do a
+	 * local read there (no bitmap probably implies no disk) */
+	ERR_IF(!b) return 1;
+	ERR_IF(!b->bm_pages) return 1;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(mdev);
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != idx) {
+			page_nr = idx;
+			if (p_addr)
+				bm_unmap(p_addr);
+			p_addr = bm_map_pidx(b, idx);
+		}
+		ERR_IF (bitnr >= b->bm_bits) {
+			dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
+		} else {
+			c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+		}
+	}
+	if (p_addr)
+		bm_unmap(p_addr);
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+
+/* inherently racy...
+ * the return value may already be out of date when this function returns.
+ * But the general usage is that this is only used during a cstate when bits
+ * are only cleared, not set, and callers typically only care about the case
+ * when the return value is zero, or have already "locked" this "bitmap
+ * extent" by other means.
+ *
+ * enr is bm-extent number, since we chose to name one sector (512 bytes)
+ * worth of the bitmap a "bitmap extent".
+ *
+ * TODO
+ * I think since we use it like a reference count, we should use the real
+ * reference count of some bitmap extent element from some lru instead...
+ *
+ */
+int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	int count, s, e;
+	unsigned long flags;
+	unsigned long *p_addr, *bm;
+
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(mdev);
+
+	s = S2W(enr);
+	e = min((size_t)S2W(enr+1), b->bm_words);
+	count = 0;
+	if (s < b->bm_words) {
+		int n = e-s;
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
+		bm = p_addr + MLPP(s);
+		while (n--)
+			count += hweight_long(*bm++);
+		bm_unmap(p_addr);
+	} else {
+		dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
+	}
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+#if DUMP_MD >= 3
+	dev_info(DEV, "enr=%lu weight=%d e=%d s=%d\n", enr, count, e, s);
+#endif
+	return count;
+}
+
+/* Set all bits covered by the AL-extent al_enr.
+ * Returns number of bits changed. */
+unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
+	unsigned long weight;
+	unsigned long s, e;
+	int count, i, do_now;
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irq(&b->bm_lock);
+	if (BM_DONT_SET & b->bm_flags)
+		bm_print_lock_info(mdev);
+	weight = b->bm_set;
+
+	s = al_enr * BM_WORDS_PER_AL_EXT;
+	e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
+	/* assert that s and e are on the same page */
+	D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
+	      ==  s    >> (PAGE_SHIFT - LN2_BPL + 3));
+	count = 0;
+	if (s < b->bm_words) {
+		i = do_now = e-s;
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
+		bm = p_addr + MLPP(s);
+		while (i--) {
+			count += hweight_long(*bm);
+			*bm = -1UL;
+			bm++;
+		}
+		bm_unmap(p_addr);
+		b->bm_set += do_now*BITS_PER_LONG - count;
+		if (e == b->bm_words)
+			b->bm_set -= bm_clear_surplus(b);
+	} else {
+		dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
+	}
+	weight = b->bm_set - weight;
+	spin_unlock_irq(&b->bm_lock);
+	return weight;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_buildtag.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_buildtag.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_buildtag.c	2015-01-21 12:02:58.377824096 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_buildtag.c	2015-01-21 12:02:58.377824096 +0300
@@ -0,0 +1,7 @@
+/* automatically generated. DO NOT EDIT. */
+#include <linux/drbd.h>
+const char *drbd_buildtag(void)
+{
+	return "GIT-hash: 83ca112086600faacab2f157bc5a9324f7bd7f77"
+		" build by root@sighted, 2012-10-09 12:47:51";
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_int.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_int.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_int.h	2015-01-21 12:02:58.379824044 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_int.h	2015-01-21 12:02:58.379824044 +0300
@@ -0,0 +1,2698 @@
+/*
+  drbd_int.h
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#ifndef _DRBD_INT_H
+#define _DRBD_INT_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/tcp.h>
+#include <linux/mutex.h>
+#include <linux/genhd.h>
+#include <net/tcp.h>
+#include <linux/lru_cache.h>
+#include <linux/drbd_config.h>
+
+#ifdef __CHECKER__
+# define __protected_by(x)       __attribute__((require_context(x,1,999,"rdwr")))
+# define __protected_read_by(x)  __attribute__((require_context(x,1,999,"read")))
+# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
+# define __must_hold(x)       __attribute__((context(x,1,1), require_context(x,1,999,"call")))
+#else
+# define __protected_by(x)
+# define __protected_read_by(x)
+# define __protected_write_by(x)
+# define __must_hold(x)
+#endif
+
+#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
+
+/* Compatibility for older kernels */
+#ifndef __acquires
+# ifdef __CHECKER__
+#  define __acquires(x)	__attribute__((context(x,0,1)))
+#  define __releases(x)	__attribute__((context(x,1,0)))
+#  define __acquire(x)	__context__(x,1)
+#  define __release(x)	__context__(x,-1)
+#  define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
+# else
+#  define __acquires(x)
+#  define __releases(x)
+#  define __acquire(x)	(void)0
+#  define __release(x)	(void)0
+#  define __cond_lock(x,c) (c)
+# endif
+#endif
+
+/* module parameter, defined in drbd_main.c */
+extern unsigned int minor_count;
+extern int disable_sendpage;
+extern int allow_oos;
+extern unsigned int cn_idx;
+
+#ifdef DRBD_ENABLE_FAULTS
+extern int enable_faults;
+extern int fault_rate;
+extern int fault_devs;
+#endif
+
+extern char usermode_helper[];
+
+#include <linux/major.h>
+#ifndef DRBD_MAJOR
+# define DRBD_MAJOR 147
+#endif
+
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+
+/* I don't remember why XCPU ...
+ * This is used to wake the asender,
+ * and to interrupt the sending task
+ * on disconnect.
+ */
+#define DRBD_SIG SIGXCPU
+
+/* This is used to stop/restart our threads.
+ * Cannot use SIGTERM nor SIGKILL, since these
+ * are sent out by init on runlevel changes
+ * I chose SIGHUP for now.
+ *
+ * FIXME btw, we should register some reboot notifier.
+ */
+#define DRBD_SIGKILL SIGHUP
+
+/* All EEs on the free list should have ID_VACANT (== 0)
+ * freshly allocated EEs get !ID_VACANT (== 1)
+ * so if it says "cannot dereference null pointer at address 0x00000001",
+ * it is most likely one of these :( */
+
+#define ID_IN_SYNC      (4711ULL)
+#define ID_OUT_OF_SYNC  (4712ULL)
+
+#define ID_SYNCER (-1ULL)
+#define ID_VACANT 0
+#define is_syncer_block_id(id) ((id) == ID_SYNCER)
+#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
+
+struct drbd_conf;
+
+#ifdef DBG_ALL_SYMBOLS
+# define STATIC
+#else
+# define STATIC static
+#endif
+
+#ifdef PARANOIA
+# define PARANOIA_BUG_ON(x) BUG_ON(x)
+#else
+# define PARANOIA_BUG_ON(x)
+#endif
+
+/*
+ * Some Message Macros
+ *************************/
+
+/* handy macro: DUMPP(somepointer) */
+#define DUMPP(A)   dev_err(DEV, #A " = %p in %s:%d\n", (A), __FILE__, __LINE__);
+#define DUMPLU(A)  dev_err(DEV, #A " = %lu in %s:%d\n", (unsigned long)(A), __FILE__, __LINE__);
+#define DUMPLLU(A) dev_err(DEV, #A " = %llu in %s:%d\n", (unsigned long long)(A), __FILE__, __LINE__);
+#define DUMPLX(A)  dev_err(DEV, #A " = %lx in %s:%d\n", (A), __FILE__, __LINE__);
+#define DUMPI(A)   dev_err(DEV, #A " = %d in %s:%d\n", (int)(A), __FILE__, __LINE__);
+
+#define DUMPST(A) DUMPLLU((unsigned long long)(A))
+
+#if 0
+#define D_DUMPP(A)   DUMPP(A)
+#define D_DUMPLU(A)  DUMPLU(A)
+#define D_DUMPLLU(A) DUMPLLU(A)
+#define D_DUMPLX(A)  DUMPLX(A)
+#define D_DUMPI(A)   DUMPI(A)
+#else
+#define D_DUMPP(A)
+#define D_DUMPLU(A)
+#define D_DUMPLLU(A)
+#define D_DUMPLX(A)
+#define D_DUMPI(A)
+#endif
+
+/* upstream kernel wants us to use dev_warn(), ...
+ * dev_printk() expects to be presented a struct device *;
+ * in older kernels, (<= 2.6.24), there is nothing suitable there.
+ * "backport" hack: redefine dev_printk.
+ * Trigger is definition of dev_to_disk macro, introduced with the
+ * commit edfaa7c36574f1bf09c65ad602412db9da5f96bf
+ *     Driver core: convert block from raw kobjects to core devices
+ */
+#if defined(dev_to_disk) && defined(disk_to_dev)
+/* to shorten dev_warn(DEV, "msg"); and relatives statements */
+#define DEV (disk_to_dev(mdev->vdisk))
+#else
+#undef dev_printk
+#define DEV mdev
+#define dev_printk(level, dev, format, arg...)  \
+	        printk(level "block drbd%u: " format , dev->minor , ## arg)
+#endif
+/* also, some older kernels do not have all of these. */
+#ifndef dev_emerg
+#define dev_emerg(dev, format, arg...)          \
+	        dev_printk(KERN_EMERG , dev , format , ## arg)
+#define dev_alert(dev, format, arg...)          \
+	        dev_printk(KERN_ALERT , dev , format , ## arg)
+#define dev_crit(dev, format, arg...)           \
+	        dev_printk(KERN_CRIT , dev , format , ## arg)
+#endif
+
+
+
+/* see kernel/printk.c:printk_ratelimit
+ * macro, so it is easy to have independent rate limits at different locations
+ * "initializer element not constant ..." with kernel 2.4 :(
+ * so I initialize toks to something large
+ */
+#define DRBD_ratelimit(ratelimit_jiffies, ratelimit_burst)	\
+({								\
+	int __ret;						\
+	static unsigned long toks = 0x80000000UL;		\
+	static unsigned long last_msg;				\
+	static int missed;					\
+	unsigned long now = jiffies;				\
+	toks += now - last_msg;					\
+	last_msg = now;						\
+	if (toks > (ratelimit_burst * ratelimit_jiffies))	\
+		toks = ratelimit_burst * ratelimit_jiffies;	\
+	if (toks >= ratelimit_jiffies) {			\
+		int lost = missed;				\
+		missed = 0;					\
+		toks -= ratelimit_jiffies;			\
+		if (lost)					\
+			dev_warn(DEV, "%d messages suppressed in %s:%d.\n", \
+				lost, __FILE__, __LINE__);	\
+		__ret = 1;					\
+	} else {						\
+		missed++;					\
+		__ret = 0;					\
+	}							\
+	__ret;							\
+})
+
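+/* Usage sketch: each textual occurrence of the macro gets its own static
+ * token bucket, so e.g.
+ *
+ *	if (DRBD_ratelimit(5*HZ, 5))
+ *		dev_err(DEV, "something is looping\n");
+ *
+ * allows a burst of up to 5 messages from this call site, then roughly one
+ * message per 5 seconds. */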
+
+#ifdef DBG_ASSERTS
+extern void drbd_assert_breakpoint(struct drbd_conf *, char *, char *, int);
+# define D_ASSERT(exp)	if (!(exp)) \
+	 drbd_assert_breakpoint(mdev, #exp, __FILE__, __LINE__)
+#else
+# define D_ASSERT(exp)	if (!(exp)) \
+	 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
+#endif
+#define ERR_IF(exp) if (({						\
+	int _b = (exp) != 0;						\
+	if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n",	\
+		__func__, #exp, __FILE__, __LINE__);			\
+	 _b;								\
+	}))
+
+/* Defines to control fault insertion */
+enum {
+	DRBD_FAULT_MD_WR = 0,	/* meta data write */
+	DRBD_FAULT_MD_RD = 1,	/*           read  */
+	DRBD_FAULT_RS_WR = 2,	/* resync          */
+	DRBD_FAULT_RS_RD = 3,
+	DRBD_FAULT_DT_WR = 4,	/* data            */
+	DRBD_FAULT_DT_RD = 5,
+	DRBD_FAULT_DT_RA = 6,	/* data read ahead */
+	DRBD_FAULT_BM_ALLOC = 7,	/* bitmap allocation */
+	DRBD_FAULT_AL_EE = 8,	/* alloc ee */
+	DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
+
+	DRBD_FAULT_MAX,
+};
+
+extern void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...);
+
+extern unsigned int
+_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
+
+static inline int
+drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
+#ifdef DRBD_ENABLE_FAULTS
+	return fault_rate &&
+		(enable_faults & (1<<type)) &&
+		_drbd_insert_fault(mdev, type);
+#else
+	return 0;
+#endif
+}
+
+/* integer division, round _UP_ to the next integer */
+#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
+/* usual integer division */
+#define div_floor(A, B) ((A)/(B))
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8)
+# define HAVE_KERNEL_SENDMSG 1
+#else
+# define HAVE_KERNEL_SENDMSG 0
+#endif
+
+/*
+ * our structs
+ *************************/
+
+#define SET_MDEV_MAGIC(x) \
+	({ typecheck(struct drbd_conf*, x); \
+	  (x)->magic = (long)(x) ^ DRBD_MAGIC; })
+#define IS_VALID_MDEV(x)  \
+	(typecheck(struct drbd_conf*, x) && \
+	  ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)) : 0))
+
+/* drbd_meta-data.c (still in drbd_main.c) */
+/* 4th incarnation of the disk layout. */
+#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
+
+extern struct drbd_conf **minor_table;
+
+/* on the wire */
+enum drbd_packets {
+	/* receiver (data socket) */
+	P_DATA		      = 0x00,
+	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
+	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
+	P_BARRIER	      = 0x03,
+	P_BITMAP	      = 0x04,
+	P_BECOME_SYNC_TARGET  = 0x05,
+	P_BECOME_SYNC_SOURCE  = 0x06,
+	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
+	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
+	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
+	P_SYNC_PARAM	      = 0x0a,
+	P_PROTOCOL	      = 0x0b,
+	P_UUIDS		      = 0x0c,
+	P_SIZES		      = 0x0d,
+	P_STATE		      = 0x0e,
+	P_SYNC_UUID	      = 0x0f,
+	P_AUTH_CHALLENGE      = 0x10,
+	P_AUTH_RESPONSE	      = 0x11,
+	P_STATE_CHG_REQ	      = 0x12,
+
+	/* asender (meta socket) */
+	P_PING		      = 0x13,
+	P_PING_ACK	      = 0x14,
+	P_RECV_ACK	      = 0x15, /* Used in protocol B */
+	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
+	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
+	P_DISCARD_ACK	      = 0x18, /* Used in proto C, two-primaries conflict detection */
+	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
+	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
+	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
+	P_BARRIER_ACK	      = 0x1c,
+	P_STATE_CHG_REPLY     = 0x1d,
+
+	/* "new" commands, no longer fitting into the ordering scheme above */
+
+	P_OV_REQUEST	      = 0x1e, /* data socket */
+	P_OV_REPLY	      = 0x1f,
+	P_OV_RESULT	      = 0x20, /* meta socket */
+	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
+	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
+	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
+	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */
+	/* P_CKPT_FENCE_REQ      = 0x25, * currently reserved for protocol D */
+	/* P_CKPT_DISABLE_REQ    = 0x26, * currently reserved for protocol D */
+	P_DELAY_PROBE         = 0x27, /* is used on BOTH sockets */
+	P_OUT_OF_SYNC         = 0x28, /* Mark as out of sync (Outrunning), data socket */
+	P_RS_CANCEL           = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
+
+	P_MAX_CMD	      = 0x2A,
+	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
+	P_MAX_OPT_CMD	      = 0x101,
+
+	/* special command ids for handshake */
+
+	P_HAND_SHAKE_M	      = 0xfff1, /* First Packet on the MetaSock */
+	P_HAND_SHAKE_S	      = 0xfff2, /* First Packet on the Socket */
+
+	P_HAND_SHAKE	      = 0xfffe	/* FIXED for the next century! */
+};
+
+static inline const char *cmdname(enum drbd_packets cmd)
+{
+	/* THINK may need to become several global tables
+	 * when we want to support more than
+	 * one PRO_VERSION */
+	static const char *cmdnames[] = {
+		[P_DATA]	        = "Data",
+		[P_DATA_REPLY]	        = "DataReply",
+		[P_RS_DATA_REPLY]	= "RSDataReply",
+		[P_BARRIER]	        = "Barrier",
+		[P_BITMAP]	        = "ReportBitMap",
+		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
+		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
+		[P_UNPLUG_REMOTE]	= "UnplugRemote",
+		[P_DATA_REQUEST]	= "DataRequest",
+		[P_RS_DATA_REQUEST]     = "RSDataRequest",
+		[P_SYNC_PARAM]	        = "SyncParam",
+		[P_SYNC_PARAM89]	= "SyncParam89",
+		[P_PROTOCOL]            = "ReportProtocol",
+		[P_UUIDS]	        = "ReportUUIDs",
+		[P_SIZES]	        = "ReportSizes",
+		[P_STATE]	        = "ReportState",
+		[P_SYNC_UUID]           = "ReportSyncUUID",
+		[P_AUTH_CHALLENGE]      = "AuthChallenge",
+		[P_AUTH_RESPONSE]	= "AuthResponse",
+		[P_PING]		= "Ping",
+		[P_PING_ACK]	        = "PingAck",
+		[P_RECV_ACK]	        = "RecvAck",
+		[P_WRITE_ACK]	        = "WriteAck",
+		[P_RS_WRITE_ACK]	= "RSWriteAck",
+		[P_DISCARD_ACK]	        = "DiscardAck",
+		[P_NEG_ACK]	        = "NegAck",
+		[P_NEG_DREPLY]	        = "NegDReply",
+		[P_NEG_RS_DREPLY]	= "NegRSDReply",
+		[P_BARRIER_ACK]	        = "BarrierAck",
+		[P_STATE_CHG_REQ]       = "StateChgRequest",
+		[P_STATE_CHG_REPLY]     = "StateChgReply",
+		[P_OV_REQUEST]          = "OVRequest",
+		[P_OV_REPLY]            = "OVReply",
+		[P_OV_RESULT]           = "OVResult",
+		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
+		[P_RS_IS_IN_SYNC]	= "CsumRSIsInSync",
+		[P_COMPRESSED_BITMAP]   = "CBitmap",
+		[P_DELAY_PROBE]         = "DelayProbe",
+		[P_OUT_OF_SYNC]		= "OutOfSync",
+		[P_MAX_CMD]	        = NULL,
+	};
+
+	if (cmd == P_HAND_SHAKE_M)
+		return "HandShakeM";
+	if (cmd == P_HAND_SHAKE_S)
+		return "HandShakeS";
+	if (cmd == P_HAND_SHAKE)
+		return "HandShake";
+	if (cmd >= P_MAX_CMD)
+		return "Unknown";
+	return cmdnames[cmd];
+}
+
+/* for sending/receiving the bitmap,
+ * possibly in some encoding scheme */
+struct bm_xfer_ctx {
+	/* "const"
+	 * stores total bits and long words
+	 * of the bitmap, so we don't need to
+	 * call the accessor functions over and again. */
+	unsigned long bm_bits;
+	unsigned long bm_words;
+	/* during xfer, current position within the bitmap */
+	unsigned long bit_offset;
+	unsigned long word_offset;
+
+	/* statistics; index: (h->command == P_BITMAP) */
+	unsigned packets[2];
+	unsigned bytes[2];
+};
+
+extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+		const char *direction, struct bm_xfer_ctx *c);
+
+static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
+{
+	/* word_offset counts "native long words" (32 or 64 bit),
+	 * aligned at 64 bit.
+	 * Encoded packet may end at an unaligned bit offset.
+	 * In case a fallback clear text packet is transmitted in
+	 * between, we adjust this offset back to the last 64bit
+	 * aligned "native long word", which makes coding and decoding
+	 * the plain text bitmap much more convenient.  */
+#if BITS_PER_LONG == 64
+	c->word_offset = c->bit_offset >> 6;
+#elif BITS_PER_LONG == 32
+	c->word_offset = c->bit_offset >> 5;
+	c->word_offset &= ~(1UL);
+#else
+# error "unsupported BITS_PER_LONG"
+#endif
+}
+
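+/* Worked example: for bit_offset == 100, a 64bit host computes
+ * word_offset == 100 >> 6 == 1, while a 32bit host computes 100 >> 5 == 3,
+ * which the & ~1UL rounds down to 2, the same 64bit aligned position
+ * counted in 32bit words. */
+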
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+/* This is the layout for a packet on the wire.
+ * The byteorder is the network byte order.
+ *     (except block_id and barrier fields.
+ *	these are pointers to local structs
+ *	and have no relevance for the partner,
+ *	which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
+ */
+struct p_header80 {
+	u32	  magic;
+	u16	  command;
+	u16	  length;	/* bytes of data after this header */
+	u8	  payload[0];
+} __packed;
+
+/* Header for big packets, used for data packets exceeding 64kB */
+struct p_header95 {
+	u16	  magic;	/* use DRBD_MAGIC_BIG here */
+	u16	  command;
+	u32	  length;	/* Use only 24 bits of that. Ignore the highest 8 bit. */
+	u8	  payload[0];
+} __packed;
+
+union p_header {
+	struct p_header80 h80;
+	struct p_header95 h95;
+};
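+
+/* A receiving-side sketch: the h95 length field carries only 24
+ * significant bits, so after byte order conversion one would mask, e.g.
+ *	data_size = be32_to_cpu(h->h95.length) & 0x00ffffff;
+ */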
+
+/*
+ * short commands, packets without payload, plain p_header:
+ *   P_PING
+ *   P_PING_ACK
+ *   P_BECOME_SYNC_TARGET
+ *   P_BECOME_SYNC_SOURCE
+ *   P_UNPLUG_REMOTE
+ */
+
+/*
+ * commands with out-of-struct payload:
+ *   P_BITMAP    (no additional fields)
+ *   P_DATA, P_DATA_REPLY (see p_data)
+ *   P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
+ */
+
+/* these defines must not be changed without changing the protocol version */
+#define DP_HARDBARRIER	      1 /* no longer used */
+#define DP_RW_SYNC	      2 /* equals REQ_SYNC    */
+#define DP_MAY_SET_IN_SYNC    4
+#define DP_UNPLUG             8 /* equals REQ_UNPLUG  */
+#define DP_FUA               16 /* equals REQ_FUA     */
+#define DP_FLUSH             32 /* equals REQ_FLUSH   */
+#define DP_DISCARD           64 /* equals REQ_DISCARD */
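+
+/* For example, a synchronous FUA write would be announced with
+ * dp_flags == (DP_RW_SYNC | DP_FUA), mirroring the REQ_* bits of the
+ * originating bio. */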
+
+struct p_data {
+	union p_header head;
+	u64	    sector;    /* 64 bits sector number */
+	u64	    block_id;  /* to identify the request in protocol B&C */
+	u32	    seq_num;
+	u32	    dp_flags;
+} __packed;
+
+/*
+ * commands which share a struct:
+ *  p_block_ack:
+ *   P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
+ *   P_DISCARD_ACK (proto C, two-primaries conflict detection)
+ *  p_block_req:
+ *   P_DATA_REQUEST, P_RS_DATA_REQUEST
+ */
+struct p_block_ack {
+	struct p_header80 head;
+	u64	    sector;
+	u64	    block_id;
+	u32	    blksize;
+	u32	    seq_num;
+} __packed;
+
+
+struct p_block_req {
+	struct p_header80 head;
+	u64 sector;
+	u64 block_id;
+	u32 blksize;
+	u32 pad;	/* pad to a multiple of 8 bytes */
+} __packed;
+
+/*
+ * commands with their own struct for additional fields:
+ *   P_HAND_SHAKE
+ *   P_BARRIER
+ *   P_BARRIER_ACK
+ *   P_SYNC_PARAM
+ *   ReportParams
+ */
+
+struct p_handshake {
+	struct p_header80 head;	/* 8 bytes */
+	u32 protocol_min;
+	u32 feature_flags;
+	u32 protocol_max;
+
+	/* should be more than enough for future enhancements
+	 * for now, feature_flags and the reserved array shall be zero.
+	 */
+
+	u32 _pad;
+	u64 reserved[7];
+} __packed;
+/* 80 bytes, FIXED for the next century */
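+/* Size check: 8 (header) + 3*4 (version fields and flags) + 4 (pad)
+ * + 7*8 (reserved) == 80 bytes. */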
+
+struct p_barrier {
+	struct p_header80 head;
+	u32 barrier;	/* barrier number _handle_ only */
+	u32 pad;	/* pad to a multiple of 8 bytes */
+} __packed;
+
+struct p_barrier_ack {
+	struct p_header80 head;
+	u32 barrier;
+	u32 set_size;
+} __packed;
+
+struct p_rs_param {
+	struct p_header80 head;
+	u32 rate;
+
+	/* since protocol version 88 */
+	char verify_alg[0];
+} __packed;
+
+struct p_rs_param_89 {
+	struct p_header80 head;
+	u32 rate;
+	/* protocol version 89: */
+	char verify_alg[SHARED_SECRET_MAX];
+	char csums_alg[SHARED_SECRET_MAX];
+} __packed;
+
+struct p_rs_param_95 {
+	struct p_header80 head;
+	u32 rate;
+	char verify_alg[SHARED_SECRET_MAX];
+	char csums_alg[SHARED_SECRET_MAX];
+	u32 c_plan_ahead;
+	u32 c_delay_target;
+	u32 c_fill_target;
+	u32 c_max_rate;
+} __packed;
+
+enum drbd_conn_flags {
+	CF_WANT_LOSE = 1,
+	CF_DRY_RUN = 2,
+};
+
+struct p_protocol {
+	struct p_header80 head;
+	u32 protocol;
+	u32 after_sb_0p;
+	u32 after_sb_1p;
+	u32 after_sb_2p;
+	u32 conn_flags;
+	u32 two_primaries;
+
+	/* since protocol version 87 */
+	char integrity_alg[0];
+
+} __packed;
+
+struct p_uuids {
+	struct p_header80 head;
+	u64 uuid[UI_EXTENDED_SIZE];
+} __packed;
+
+struct p_rs_uuid {
+	struct p_header80 head;
+	u64	    uuid;
+} __packed;
+
+struct p_sizes {
+	struct p_header80 head;
+	u64	    d_size;  /* size of disk */
+	u64	    u_size;  /* user requested size */
+	u64	    c_size;  /* current exported size */
+	u32	    max_bio_size;  /* Maximal size of a BIO */
+	u16	    queue_order_type;  /* not yet implemented in DRBD*/
+	u16	    dds_flags; /* use enum dds_flags here. */
+} __packed;
+
+struct p_state {
+	struct p_header80 head;
+	u32	    state;
+} __packed;
+
+struct p_req_state {
+	struct p_header80 head;
+	u32	    mask;
+	u32	    val;
+} __packed;
+
+struct p_req_state_reply {
+	struct p_header80 head;
+	u32	    retcode;
+} __packed;
+
+struct p_drbd06_param {
+	u64	  size;
+	u32	  state;
+	u32	  blksize;
+	u32	  protocol;
+	u32	  version;
+	u32	  gen_cnt[5];
+	u32	  bit_map_gen[5];
+} __packed;
+
+struct p_discard {
+	struct p_header80 head;
+	u64	    block_id;
+	u32	    seq_num;
+	u32	    pad;
+} __packed;
+
+struct p_block_desc {
+	struct p_header80 head;
+	u64 sector;
+	u32 blksize;
+	u32 pad;	/* pad to a multiple of 8 bytes */
+} __packed;
+
+/* Valid values for the encoding field.
+ * Bump proto version when changing this. */
+enum drbd_bitmap_code {
+	/* RLE_VLI_Bytes = 0,
+	 * and other bit variants had been defined during
+	 * algorithm evaluation. */
+	RLE_VLI_Bits = 2,
+};
+
+struct p_compressed_bm {
+	struct p_header80 head;
+	/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
+	 * (encoding & 0x80): polarity (set/unset) of first runlength
+	 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
+	 * used to pad up to head.length bytes
+	 */
+	u8 encoding;
+
+	u8 code[0];
+} __packed;
+
+struct p_delay_probe93 {
+	struct p_header80 head;
+	u32     seq_num; /* sequence number to match the two probe packets */
+	u32     offset;  /* usecs the probe got sent after the reference time point */
+} __packed;
+
+/* DCBP: Drbd Compressed Bitmap Packet ... */
+static inline enum drbd_bitmap_code
+DCBP_get_code(struct p_compressed_bm *p)
+{
+	return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static inline void
+DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
+{
+	BUG_ON(code & ~0xf);
+	p->encoding = (p->encoding & ~0xf) | code;
+}
+
+static inline int
+DCBP_get_start(struct p_compressed_bm *p)
+{
+	return (p->encoding & 0x80) != 0;
+}
+
+static inline void
+DCBP_set_start(struct p_compressed_bm *p, int set)
+{
+	p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
+
+static inline int
+DCBP_get_pad_bits(struct p_compressed_bm *p)
+{
+	return (p->encoding >> 4) & 0x7;
+}
+
+static inline void
+DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+	BUG_ON(n & ~0x7);
+	p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
+}
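+
+/* Putting the accessors together (worked example): an encoding byte for
+ * RLE_VLI_Bits with 3 pad bits and a set first run length would read
+ *	0x80 | (3 << 4) | RLE_VLI_Bits == 0xb2
+ */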
+
+/* One bitmap packet, including the p_header,
+ * should fit within one _architecture independent_ page,
+ * so we use the fixed 4 KiB page size
+ * that most architectures have used for a long time.
+ */
+#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
+#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
+#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
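+/* with the 8 byte p_header80 that is 4096 - 8 = 4088 payload bytes,
+ * i.e. 511 longs on a 64 bit arch */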
+#if (PAGE_SIZE < 4096)
+/* drbd_send_bitmap / receive_bitmap would break horribly */
+#error "PAGE_SIZE too small"
+#endif
+
+union p_polymorph {
+	union p_header           header;
+	struct p_handshake       handshake;
+	struct p_data            data;
+	struct p_block_ack       block_ack;
+	struct p_barrier         barrier;
+	struct p_barrier_ack     barrier_ack;
+	struct p_rs_param_89     rs_param_89;
+	struct p_rs_param_95     rs_param_95;
+	struct p_protocol        protocol;
+	struct p_sizes           sizes;
+	struct p_uuids           uuids;
+	struct p_state           state;
+	struct p_req_state       req_state;
+	struct p_req_state_reply req_state_reply;
+	struct p_block_req       block_req;
+	struct p_delay_probe93   delay_probe93;
+	struct p_rs_uuid         rs_uuid;
+	struct p_block_desc      block_desc;
+} __packed;
+
+/**********************************************************************/
+enum drbd_thread_state {
+	None,
+	Running,
+	Exiting,
+	Restarting
+};
+
+struct drbd_thread {
+	spinlock_t t_lock;
+	struct task_struct *task;
+	struct completion startstop;
+	enum drbd_thread_state t_state;
+	int (*function) (struct drbd_thread *);
+	struct drbd_conf *mdev;
+	int reset_cpu_mask;
+};
+
+static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
+{
+	/* THINK testing the t_state seems to be uncritical in all cases
+	 * (but thread_{start,stop}), so we can read it *without* the lock.
+	 *	--lge */
+
+	smp_rmb();
+	return thi->t_state;
+}
+
+struct drbd_work;
+typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
+struct drbd_work {
+	struct list_head list;
+	drbd_work_cb cb;
+};
+
+struct drbd_tl_epoch;
+struct drbd_request {
+	struct drbd_work w;
+	struct drbd_conf *mdev;
+
+	/* if local IO is not allowed, will be NULL.
+	 * if local IO _is_ allowed, holds the locally submitted bio clone,
+	 * or, after local IO completion, the ERR_PTR(error).
+	 * see drbd_endio_pri(). */
+	struct bio *private_bio;
+
+	struct hlist_node collision;
+	sector_t sector;
+	unsigned int size;
+	unsigned int epoch; /* barrier_nr */
+
+	/* barrier_nr: used to check on "completion" whether this req was in
+	 * the current epoch, and we therefore have to close it,
+	 * starting a new epoch...
+	 */
+
+	struct list_head tl_requests; /* ring list in the transfer log */
+	struct bio *master_bio;       /* master bio pointer */
+	unsigned long rq_state; /* see comments above _req_mod() */
+	unsigned long start_time;
+};
+
+struct drbd_tl_epoch {
+	struct drbd_work w;
+	struct list_head requests; /* requests before */
+	struct drbd_tl_epoch *next; /* pointer to the next barrier */
+	unsigned int br_number;  /* the barriers identifier. */
+	int n_writes;	/* number of requests attached before this barrier */
+};
+
+struct drbd_request;
+
+/* These Tl_epoch_entries may be on one of several lists:
+   active_ee .. data packet being written
+   sync_ee   .. syncer block being written
+   done_ee   .. block written, need to send P_WRITE_ACK
+   read_ee   .. [RS]P_DATA_REQUEST being read
+*/
+
+struct drbd_epoch {
+	struct list_head list;
+	unsigned int barrier_nr;
+	atomic_t epoch_size; /* increased on every request added. */
+	atomic_t active;     /* increased on every req. added, and dec on every finished. */
+	unsigned long flags;
+};
+
+/* drbd_epoch flag bits */
+enum {
+	DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
+	DE_BARRIER_IN_NEXT_EPOCH_DONE,
+	DE_CONTAINS_A_BARRIER,
+	DE_HAVE_BARRIER_NUMBER,
+	DE_IS_FINISHING,
+};
+
+enum epoch_event {
+	EV_PUT,
+	EV_GOT_BARRIER_NR,
+	EV_BARRIER_DONE,
+	EV_BECAME_LAST,
+	EV_TRACE_FLUSH,       /* TRACE_ are not real events, only used for tracing */
+	EV_TRACE_ADD_BARRIER, /* Doing the first write as a barrier write */
+	EV_TRACE_SETTING_BI,  /* Barrier is expressed with the first write of the next epoch */
+	EV_TRACE_ALLOC,
+	EV_TRACE_FREE,
+	EV_CLEANUP = 32, /* used as flag */
+};
+
+struct drbd_wq_barrier {
+	struct drbd_work w;
+	struct completion done;
+};
+
+struct digest_info {
+	int digest_size;
+	void *digest;
+};
+
+struct drbd_epoch_entry {
+	struct drbd_work w;
+	struct hlist_node collision;
+	struct drbd_epoch *epoch; /* for writes */
+	struct drbd_conf *mdev;
+	struct page *pages;
+	atomic_t pending_bios;
+	unsigned int size;
+	/* see comments on ee flag bits below */
+	unsigned long flags;
+	sector_t sector;
+	union {
+		u64 block_id;
+		struct digest_info *digest;
+	};
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
+enum {
+	__EE_CALL_AL_COMPLETE_IO,
+	__EE_MAY_SET_IN_SYNC,
+
+	/* This epoch entry closes an epoch using a barrier.
+	 * On successful completion, the epoch is released,
+	 * and the P_BARRIER_ACK sent. */
+	__EE_IS_BARRIER,
+
+	/* In case a barrier failed,
+	 * we need to resubmit without the barrier flag. */
+	__EE_RESUBMITTED,
+
+	/* we may have several bios per epoch entry.
+	 * if any of those fail, we set this flag atomically
+	 * from the endio callback */
+	__EE_WAS_ERROR,
+
+	/* This ee has a pointer to a digest instead of a block id */
+	__EE_HAS_DIGEST,
+};
+#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
+#define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
+#define	EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
+#define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
+#define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
+
+/* global flag bits */
+enum {
+	CREATE_BARRIER,		/* next P_DATA is preceded by a P_BARRIER */
+	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
+	SEND_PING,		/* whether asender should send a ping asap */
+
+	UNPLUG_QUEUED,		/* only relevant with kernel 2.4 */
+	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
+	MD_DIRTY,		/* current uuids and flags not yet on disk */
+	DISCARD_CONCURRENT,	/* Set on one node, cleared on the peer! */
+	USE_DEGR_WFC_T,		/* degr-wfc-timeout instead of wfc-timeout. */
+	CLUSTER_ST_CHANGE,	/* Cluster wide state change going on... */
+	CL_ST_CHG_SUCCESS,
+	CL_ST_CHG_FAIL,
+	CRASHED_PRIMARY,	/* This node was a crashed primary.
+				 * Gets cleared when the state.conn
+				 * goes into C_CONNECTED state. */
+	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
+	CONSIDER_RESYNC,
+
+	MD_NO_BARRIER,		/* meta data device does not support barriers,
+				   so don't even try */
+	SUSPEND_IO,		/* suspend application io */
+	BITMAP_IO,		/* suspend application io;
+				   once no more io in flight, start bitmap io */
+	BITMAP_IO_QUEUED,       /* Started bitmap IO */
+	GO_DISKLESS,		/* Disk is being detached, on io-error or admin request. */
+	WAS_IO_ERROR,		/* Local disk failed, returning an IO error */
+	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
+	NET_CONGESTED,		/* The data socket is congested */
+
+	CONFIG_PENDING,		/* serialization of (re)configuration requests.
+				 * if set, also prevents the device from dying */
+	DEVICE_DYING,		/* device became unconfigured,
+				 * but worker thread is still handling the cleanup.
+				 * reconfiguring (nl_disk_conf, nl_net_conf) is disallowed,
+				 * while this is set. */
+	RESIZE_PENDING,		/* Size change detected locally, waiting for the response from
+				 * the peer, if it changed there as well. */
+	CONN_DRY_RUN,		/* Expect disconnect after resync handshake. */
+	GOT_PING_ACK,		/* set when we receive a ping_ack packet, misc wait gets woken */
+	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
+	AL_SUSPENDED,		/* Activity logging is currently suspended. */
+	AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
+	STATE_SENT,		/* Do not change state/UUIDs while this is set */
+};
+
+struct drbd_bitmap; /* opaque for drbd_conf */
+
+/* definition of bits in bm_flags to be used in drbd_bm_lock
+ * and drbd_bitmap_io and friends. */
+enum bm_flag {
+	/* do we need to kfree, or vfree bm_pages? */
+	BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
+
+	/* currently locked for bulk operation */
+	BM_LOCKED_MASK = 0xf,
+
+	/* in detail, that is: */
+	BM_DONT_CLEAR = 0x1,
+	BM_DONT_SET   = 0x2,
+	BM_DONT_TEST  = 0x4,
+
+	/* so we can mark it locked for bulk operation,
+	 * and still allow all non-bulk operations */
+	BM_IS_LOCKED  = 0x8,
+
+	/* (test bit, count bit) allowed (common case) */
+	BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
+
+	/* testing bits, as well as setting new bits allowed, but clearing bits
+	 * would be unexpected.  Used during bitmap receive.  Setting new bits
+	 * requires sending of "out-of-sync" information, though. */
+	BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
+
+	/* for drbd_bm_write_copy_pages, everything is allowed,
+	 * only concurrent bulk operations are locked out. */
+	BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
+};
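+
+/* e.g. a bulk operation that must not have bits flipped under it could
+ * take
+ *	drbd_bm_lock(mdev, "some reason", BM_LOCKED_TEST_ALLOWED);
+ * allowing concurrent test/count, but neither set nor clear
+ * ("some reason" is just a placeholder string). */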
+
+struct drbd_work_queue {
+	struct list_head q;
+	struct semaphore s; /* producers up it, worker down()s it */
+	spinlock_t q_lock;  /* to protect the list. */
+};
+
+struct drbd_socket {
+	struct drbd_work_queue work;
+	struct mutex mutex;
+	struct socket    *socket;
+	/* this way we get our
+	 * send/receive buffers off the stack */
+	union p_polymorph sbuf;
+	union p_polymorph rbuf;
+};
+
+struct drbd_md {
+	u64 md_offset;		/* sector offset to 'super' block */
+
+	u64 la_size_sect;	/* last agreed size, unit sectors */
+	u64 uuid[UI_SIZE];
+	u64 device_uuid;
+	u32 flags;
+	u32 md_size_sect;
+
+	s32 al_offset;	/* signed relative sector offset to al area */
+	s32 bm_offset;	/* signed relative sector offset to bitmap */
+
+	/* u32 al_nr_extents;	   important for restoring the AL;
+	 * it is stored in sync_conf.al_extents, which in turn
+	 * gets applied to act_log->nr_elements
+	 */
+};
+
+/* for sync_conf and other types... */
+#define NL_PACKET(name, number, fields) struct name { fields };
+#define NL_INTEGER(pn,pr,member) int member;
+#define NL_INT64(pn,pr,member) __u64 member;
+#define NL_BIT(pn,pr,member)   unsigned member:1;
+#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
+#include "linux/drbd_nl.h"
+
+struct drbd_backing_dev {
+	struct block_device *backing_bdev;
+	struct block_device *md_bdev;
+	struct drbd_md md;
+	struct disk_conf dc; /* The user provided config... */
+	sector_t known_size; /* last known size of that backing device */
+};
+
+struct drbd_md_io {
+	unsigned int done;
+	int error;
+};
+
+struct bm_io_work {
+	struct drbd_work w;
+	char *why;
+	enum bm_flag flags;
+	int (*io_fn)(struct drbd_conf *mdev);
+	void (*done)(struct drbd_conf *mdev, int rv);
+};
+
+enum write_ordering_e {
+	WO_none,
+	WO_drain_io,
+	WO_bdev_flush,
+	WO_bio_barrier
+};
+
+struct fifo_buffer {
+	int *values;
+	unsigned int head_index;
+	unsigned int size;
+};
+
+struct drbd_conf {
+#ifdef PARANOIA
+	long magic;
+#endif
+	/* things that are stored as / read from meta data on disk */
+	unsigned long flags;
+
+	/* configured by drbdsetup */
+	struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
+	struct syncer_conf sync_conf;
+	struct drbd_backing_dev *ldev __protected_by(local);
+
+	sector_t p_size;     /* partner's disk size */
+	struct request_queue *rq_queue;
+	struct block_device *this_bdev;
+	struct gendisk	    *vdisk;
+
+	struct drbd_socket data; /* data/barrier/cstate/parameter packets */
+	struct drbd_socket meta; /* ping/ack (metadata) packets */
+	int agreed_pro_version;  /* actually used protocol version */
+	unsigned long last_received; /* in jiffies, either socket */
+	unsigned int ko_count;
+	struct drbd_work  resync_work,
+			  unplug_work,
+			  go_diskless,
+			  md_sync_work,
+			  start_resync_work;
+	struct timer_list resync_timer;
+	struct timer_list md_sync_timer;
+	struct timer_list start_resync_timer;
+	struct timer_list request_timer;
+#ifdef DRBD_DEBUG_MD_SYNC
+	struct {
+		unsigned int line;
+		const char* func;
+	} last_md_mark_dirty;
+#endif
+
+	/* Used after attach while negotiating new disk state. */
+	union drbd_state new_state_tmp;
+
+	union drbd_state state;
+	wait_queue_head_t misc_wait;
+	wait_queue_head_t state_wait;  /* upon each state change. */
+	wait_queue_head_t net_cnt_wait;
+	unsigned int send_cnt;
+	unsigned int recv_cnt;
+	unsigned int read_cnt;
+	unsigned int writ_cnt;
+	unsigned int al_writ_cnt;
+	unsigned int bm_writ_cnt;
+	atomic_t ap_bio_cnt;	 /* Requests we need to complete */
+	atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
+	atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
+	atomic_t unacked_cnt;	 /* Need to send replies for */
+	atomic_t local_cnt;	 /* Waiting for local completion */
+	atomic_t net_cnt;	 /* Users of net_conf */
+	spinlock_t req_lock;
+	struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
+	struct drbd_tl_epoch *newest_tle;
+	struct drbd_tl_epoch *oldest_tle;
+	struct list_head out_of_sequence_requests;
+	struct list_head barrier_acked_requests;
+	struct hlist_head *tl_hash;
+	unsigned int tl_hash_s;
+
+	/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
+	unsigned long rs_total;
+	/* number of resync blocks that failed in this run */
+	unsigned long rs_failed;
+	/* Syncer's start time [unit jiffies] */
+	unsigned long rs_start;
+	/* cumulated time in PausedSyncX state [unit jiffies] */
+	unsigned long rs_paused;
+	/* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+	unsigned long rs_same_csum;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
+	/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
+	unsigned long rs_mark_left[DRBD_SYNC_MARKS];
+	/* marks's time [unit jiffies] */
+	unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+	/* current index into rs_mark_{left,time} */
+	int rs_last_mark;
+
+	/* where does the admin want us to start? (sector) */
+	sector_t ov_start_sector;
+	/* where are we now? (sector) */
+	sector_t ov_position;
+	/* Start sector of out of sync range (to merge printk reporting). */
+	sector_t ov_last_oos_start;
+	/* size of out-of-sync range in sectors. */
+	sector_t ov_last_oos_size;
+	unsigned long ov_left; /* in bits */
+	struct crypto_hash *csums_tfm;
+	struct crypto_hash *verify_tfm;
+
+	unsigned long last_reattach_jif;
+	unsigned long last_reconnect_jif;
+	struct drbd_thread receiver;
+	struct drbd_thread worker;
+	struct drbd_thread asender;
+	struct drbd_bitmap *bitmap;
+	unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
+
+	/* Used to track operations of resync... */
+	struct lru_cache *resync;
+	/* Number of locked elements in resync LRU */
+	unsigned int resync_locked;
+	/* resync extent number waiting for application requests */
+	unsigned int resync_wenr;
+
+	int open_cnt;
+	u64 *p_uuid;
+	/* FIXME clean comments, restructure so it is more obvious which
+	 * members are protected by what */
+	struct drbd_epoch *current_epoch;
+	spinlock_t epoch_lock;
+	unsigned int epochs;
+	enum write_ordering_e write_ordering;
+	struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
+	struct list_head sync_ee;   /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
+	struct list_head done_ee;   /* send ack */
+	struct list_head read_ee;   /* IO in progress (any read) */
+	struct list_head net_ee;    /* zero-copy network send in progress */
+	struct hlist_head *ee_hash; /* is protected by req_lock! */
+	unsigned int ee_hash_s;
+
+	/* this one is protected by ee_lock, single thread */
+	struct drbd_epoch_entry *last_write_w_barrier;
+
+	int next_barrier_nr;
+	struct hlist_head *app_reads_hash; /* is protected by req_lock */
+	struct list_head resync_reads;
+	atomic_t pp_in_use;		/* allocated from page pool */
+	atomic_t pp_in_use_by_net;	/* sendpage()d, still referenced by tcp */
+	wait_queue_head_t ee_wait;
+	struct page *md_io_page;	/* one page buffer for md_io */
+	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
+	struct drbd_md_io md_io;
+	atomic_t md_io_in_use;		/* protects the md_io, md_io_page and md_io_tmpp */
+	spinlock_t al_lock;
+	wait_queue_head_t al_wait;
+	struct lru_cache *act_log;	/* activity log */
+	unsigned int al_tr_number;
+	int al_tr_cycle;
+	int al_tr_pos;   /* position of the next transaction in the journal */
+	struct crypto_hash *cram_hmac_tfm;
+	struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
+	struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
+	void *int_dig_out;
+	void *int_dig_in;
+	void *int_dig_vv;
+	wait_queue_head_t seq_wait;
+	atomic_t packet_seq;
+	unsigned int peer_seq;
+	spinlock_t peer_seq_lock;
+	unsigned int minor;
+	unsigned long comm_bm_set; /* communicated number of set bits. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30) && !defined(cpumask_bits)
+	cpumask_t cpu_mask[1];
+#else
+	cpumask_var_t cpu_mask;
+#endif
+	struct bm_io_work bm_io_work;
+	u64 ed_uuid; /* UUID of the exposed data */
+	struct mutex state_mutex;
+	char congestion_reason;  /* Why we were congested... */
+	atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+	atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+	int rs_last_sect_ev; /* counter to compare with */
+	int rs_last_events;  /* counter of read or write "events" (unit sectors)
+			      * on the lower level device when we last looked. */
+	int c_sync_rate; /* current resync rate after syncer throttle magic */
+	struct fifo_buffer rs_plan_s; /* correction values of resync planer */
+	int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+	int rs_planed;    /* resync sectors already planned */
+	atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
+	int peer_max_bio_size;
+	int local_max_bio_size;
+};
+
+static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
+{
+	struct drbd_conf *mdev;
+
+	mdev = minor < minor_count ? minor_table[minor] : NULL;
+
+	return mdev;
+}
+
+static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
+{
+	return mdev->minor;
+}
+
+/* Returns 1 if it was successful,
+ * returns 0 if there was no data socket.
+ * So wherever you are going to use the data.socket, do
+ *	if (!drbd_get_data_sock(mdev))
+ *		return 0;
+ *	CODE();
+ *	drbd_put_data_sock(mdev);
+ */
+static inline int drbd_get_data_sock(struct drbd_conf *mdev)
+{
+	mutex_lock(&mdev->data.mutex);
+	/* drbd_disconnect() could have called drbd_free_sock()
+	 * while we were waiting in down()... */
+	if (unlikely(mdev->data.socket == NULL)) {
+		mutex_unlock(&mdev->data.mutex);
+		return 0;
+	}
+	return 1;
+}
+
+static inline void drbd_put_data_sock(struct drbd_conf *mdev)
+{
+	mutex_unlock(&mdev->data.mutex);
+}
+
+/*
+ * function declarations
+ *************************/
+
+/* drbd_main.c */
+
+enum chg_state_flags {
+	CS_HARD	= 1,
+	CS_VERBOSE = 2,
+	CS_WAIT_COMPLETE = 4,
+	CS_SERIALIZE    = 8,
+	CS_ORDERED      = CS_WAIT_COMPLETE + CS_SERIALIZE,
+};
+
+enum dds_flags {
+	DDSF_FORCED    = 1,
+	DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
+};
+
+extern void drbd_init_set_defaults(struct drbd_conf *mdev);
+extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
+					    enum chg_state_flags f,
+					    union drbd_state mask,
+					    union drbd_state val);
+extern void drbd_force_state(struct drbd_conf *, union drbd_state,
+			union drbd_state);
+extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
+					      union drbd_state,
+					      union drbd_state,
+					      enum chg_state_flags);
+extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
+					   enum chg_state_flags,
+					   struct completion *done);
+extern void print_st_err(struct drbd_conf *, union drbd_state,
+			union drbd_state, int);
+extern int  drbd_thread_start(struct drbd_thread *thi);
+extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+#ifdef CONFIG_SMP
+extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
+extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
+#else
+#define drbd_thread_current_set_cpu(A) ({})
+#define drbd_calc_cpu_mask(A) ({})
+#endif
+extern void drbd_free_resources(struct drbd_conf *mdev);
+extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+		       unsigned int set_size);
+extern void tl_clear(struct drbd_conf *mdev);
+extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
+extern void drbd_free_sock(struct drbd_conf *mdev);
+extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+			void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_protocol(struct drbd_conf *mdev);
+extern int drbd_send_uuids(struct drbd_conf *mdev);
+extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
+extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
+extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
+#define drbd_send_state(m, s) drbd_send_state_(m, s, __func__ , __LINE__ )
+#define drbd_send_current_state(m) drbd_send_current_state_(m, __func__ , __LINE__ )
+extern int drbd_send_state_(struct drbd_conf *mdev,
+		union drbd_state s,
+		const char *func, unsigned int line);
+extern int drbd_send_current_state_(struct drbd_conf *mdev,
+		const char *func, unsigned int line);
+extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
+			enum drbd_packets cmd, struct p_header80 *h,
+			size_t size, unsigned msg_flags);
+#define USE_DATA_SOCKET 1
+#define USE_META_SOCKET 0
+extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
+			enum drbd_packets cmd, struct p_header80 *h,
+			size_t size);
+extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
+			char *data, size_t size);
+extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
+extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
+			u32 set_size);
+extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
+			struct drbd_epoch_entry *e);
+extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
+			struct p_block_req *rp);
+extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
+			struct p_data *dp, int data_size);
+extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+			    sector_t sector, int blksize, u64 block_id);
+extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req);
+extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
+			   struct drbd_epoch_entry *e);
+extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
+extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+			      sector_t sector, int size, u64 block_id);
+extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
+				   sector_t sector, int size,
+				   void *digest, int digest_size,
+				   enum drbd_packets cmd);
+extern int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size);
+
+extern int drbd_send_bitmap(struct drbd_conf *mdev);
+extern int _drbd_send_bitmap(struct drbd_conf *mdev);
+extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
+extern void drbd_free_bc(struct drbd_backing_dev *ldev);
+extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
+void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
+
+extern void drbd_md_sync(struct drbd_conf *mdev);
+extern int  drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
+extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
+extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+#ifndef DRBD_DEBUG_MD_SYNC
+extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
+#else
+#define drbd_md_mark_dirty(m)	drbd_md_mark_dirty_(m, __LINE__ , __func__ )
+extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
+		unsigned int line, const char *func);
+#endif
+extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
+				 int (*io_fn)(struct drbd_conf *),
+				 void (*done)(struct drbd_conf *, int),
+				 char *why, enum bm_flag flags);
+extern int drbd_bitmap_io(struct drbd_conf *mdev,
+		int (*io_fn)(struct drbd_conf *),
+		char *why, enum bm_flag flags);
+extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
+extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
+extern void drbd_go_diskless(struct drbd_conf *mdev);
+extern void drbd_ldev_destroy(struct drbd_conf *mdev);
+
+
+/* Meta data layout
+   We reserve a 128 MB block (4k aligned)
+   * either at the end of the backing device
+   * or on a separate meta data device. */
+
+#define MD_RESERVED_SECT (128LU << 11)  /* 128 MB, unit sectors */
+/* The following numbers are sectors */
+#define MD_AL_OFFSET 8	    /* 8 Sectors after start of meta area */
+#define MD_AL_MAX_SIZE 64   /* = 32 kb LOG  ~ 3776 extents ~ 14 GB Storage */
+/* Allows up to about 3.8TB */
+#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
+
+/* Since the smallest IO unit is usually 512 byte */
+#define MD_SECTOR_SHIFT	 9
+#define MD_SECTOR_SIZE	 (1<<MD_SECTOR_SHIFT)
+
+/* activity log */
+#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
+#define AL_EXTENT_SHIFT 22		 /* One extent represents 4M Storage */
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+#if BITS_PER_LONG == 32
+#define LN2_BPL 5
+#define cpu_to_lel(A) cpu_to_le32(A)
+#define lel_to_cpu(A) le32_to_cpu(A)
+#elif BITS_PER_LONG == 64
+#define LN2_BPL 6
+#define cpu_to_lel(A) cpu_to_le64(A)
+#define lel_to_cpu(A) le64_to_cpu(A)
+#else
+#error "LN2 of BITS_PER_LONG unknown!"
+#endif
+
+/* resync bitmap */
+/* 16MB sized 'bitmap extent' to track syncer usage */
+struct bm_extent {
+	int rs_left; /* number of bits set (out of sync) in this extent. */
+	int rs_failed; /* number of failed resync requests in this extent. */
+	unsigned long flags;
+	struct lc_element lce;
+};
+
+#define BME_NO_WRITES  0  /* bm_extent.flags: no more requests on this one! */
+#define BME_LOCKED     1  /* bm_extent.flags: syncer active on this one. */
+#define BME_PRIORITY   2  /* finish resync IO on this extent ASAP! App IO waiting! */
+
+/* drbd_bitmap.c */
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define SLEEP_TIME (HZ/10)
+
+#define BM_BLOCK_SHIFT  12			 /* 4k per bit */
+#define BM_BLOCK_SIZE	 (1<<BM_BLOCK_SHIFT)
+/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
+ * per sector of on disk bitmap */
+#define BM_EXT_SHIFT	 (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3)  /* = 24 */
+#define BM_EXT_SIZE	 (1<<BM_EXT_SHIFT)
+
+#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
+#error "HAVE YOU FIXED drbdmeta AS WELL??"
+#endif
+
+/* thus many _storage_ sectors are described by one bit */
+#define BM_SECT_TO_BIT(x)   ((x)>>(BM_BLOCK_SHIFT-9))
+#define BM_BIT_TO_SECT(x)   ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
+#define BM_SECT_PER_BIT     BM_BIT_TO_SECT(1)
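+/* e.g. with BM_BLOCK_SHIFT == 12, one bit covers 1 << (12 - 9) == 8
+ * sectors of 512 bytes (4 KiB); BM_SECT_TO_BIT(1024) == 128 */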
+
+/* bit to represented kilo byte conversion */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located in */
+#define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
+
+/* how much _storage_ sectors we have per bitmap sector */
+#define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
+#define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
+
+/* in one sector of the bitmap, we have this many activity_log extents. */
+#define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
+#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+
+#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
+#define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
+
+/* the extent in "PER_EXTENT" below is an activity log extent
+ * we need that many (long words/bytes) to store the bitmap
+ *		     of one AL_EXTENT_SIZE chunk of storage.
+ * we can store the bitmap for that many AL_EXTENTS within
+ * one sector of the _on_disk_ bitmap:
+ * bit	 0	  bit 37   bit 38	     bit (512*8)-1
+ *	     ...|........|........|.. // ..|........|
+ * sect. 0	 `296	  `304			   ^(512*8*8)-1
+ *
+#define BM_WORDS_PER_EXT    ( (AL_EXTENT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
+#define BM_BYTES_PER_EXT    ( (AL_EXTENT_SIZE/BM_BLOCK_SIZE) / 8 )  // 128
+#define BM_EXT_PER_SECT	    ( 512 / BM_BYTES_PER_EXT )		 //   4
+ */
+
+#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
+#define DRBD_MAX_SECTORS_BM \
+	  ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
+#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_BM
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
+#elif !defined(CONFIG_LBDAF) && !defined(CONFIG_LBD) && BITS_PER_LONG == 32
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
+#else
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_BM
+/* 16 TB in units of sectors */
+#if BITS_PER_LONG == 32
+/* adjust by one page worth of bitmap,
+ * so we won't wrap around in drbd_bm_find_next_bit.
+ * you should use 64bit OS for that much storage, anyways. */
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
+#else
+/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
+#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
+/* corresponds to (1UL << 38) bits right now. */
+#endif
+#endif
+
+/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
+ * With a value of 8, all IO within one 128K block maps to the same slot of
+ * the hash table. */
+#define HT_SHIFT 8
+#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
+#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12)       /* Works always = 4k */
+
+#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets of up to 32 KiB data */
+
+/* Number of elements in the app_reads_hash */
+#define APP_R_HSIZE 15
+
+extern int  drbd_bm_init(struct drbd_conf *mdev);
+extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
+extern void drbd_bm_cleanup(struct drbd_conf *mdev);
+extern void drbd_bm_set_all(struct drbd_conf *mdev);
+extern void drbd_bm_clear_all(struct drbd_conf *mdev);
+/* set/clear/test only a few bits at a time */
+extern int  drbd_bm_set_bits(
+		struct drbd_conf *mdev, unsigned long s, unsigned long e);
+extern int  drbd_bm_clear_bits(
+		struct drbd_conf *mdev, unsigned long s, unsigned long e);
+extern int drbd_bm_count_bits(
+	struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock,
+ * may process the whole bitmap in one go */
+extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
+		const unsigned long s, const unsigned long e);
+extern int  drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
+extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
+extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
+extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
+extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int  drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
+extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
+		unsigned long al_enr);
+extern size_t	     drbd_bm_words(struct drbd_conf *mdev);
+extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
+extern sector_t      drbd_bm_capacity(struct drbd_conf *mdev);
+
+#define DRBD_END_OF_BITMAP	(~(unsigned long)0)
+extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
+/* bm_find_next variants for use while you hold drbd_bm_lock() */
+extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
+extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
+extern int drbd_bm_rs_done(struct drbd_conf *mdev);
+/* for receive_bitmap */
+extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
+		size_t number, unsigned long *buffer);
+/* for _drbd_send_bitmap */
+extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
+		size_t number, unsigned long *buffer);
+
+extern void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags);
+extern void drbd_bm_unlock(struct drbd_conf *mdev);
+/* drbd_main.c */
+
+/* needs to be included here,
+ * because of kmem_cache_t weirdness */
+#include "drbd_wrappers.h"
+
+extern struct kmem_cache *drbd_request_cache;
+extern struct kmem_cache *drbd_ee_cache;	/* epoch entries */
+extern struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+extern mempool_t *drbd_request_mempool;
+extern mempool_t *drbd_ee_mempool;
+
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * An allocation first takes pages from the pool; only if the pool is
+ * depleted will it try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
+extern spinlock_t   drbd_pp_lock;
+extern int	    drbd_pp_vacant;
+extern wait_queue_head_t drbd_pp_wait;
+
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES	128
+extern mempool_t *drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set *drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
+
+extern rwlock_t global_state_lock;
+
+extern struct drbd_conf *drbd_new_device(unsigned int minor);
+extern void drbd_free_mdev(struct drbd_conf *mdev);
+
+extern int proc_details;
+
+/* drbd_req */
+extern MAKE_REQUEST_TYPE drbd_make_request(struct request_queue *q, struct bio *bio);
+extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
+extern int drbd_merge_bvec(struct request_queue *q,
+#ifdef HAVE_bvec_merge_data
+		struct bvec_merge_data *bvm,
+#else
+		struct bio *bvm,
+#endif
+		struct bio_vec *bvec);
+extern int is_valid_ar_handle(struct drbd_request *, sector_t);
+
+
+/* drbd_nl.c */
+extern void drbd_suspend_io(struct drbd_conf *mdev);
+extern void drbd_resume_io(struct drbd_conf *mdev);
+extern char *ppsize(char *buf, unsigned long long size);
+extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
+enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
+extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
+extern void resync_after_online_grow(struct drbd_conf *);
+extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
+extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
+					enum drbd_role new_role,
+					int force);
+extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
+extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
+
+/* drbd_worker.c */
+extern int drbd_worker(struct drbd_thread *thi);
+extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
+extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
+extern void resume_next_sg(struct drbd_conf *mdev);
+extern void suspend_other_sg(struct drbd_conf *mdev);
+extern int drbd_resync_finished(struct drbd_conf *mdev);
+/* maybe rather drbd_main.c ? */
+extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
+extern void drbd_md_put_buffer(struct drbd_conf *mdev);
+extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
+				struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+					    unsigned int *done);
+extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
+extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
+
+static inline void ov_oos_print(struct drbd_conf *mdev)
+{
+	if (mdev->ov_last_oos_size) {
+		dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
+		     (unsigned long long)mdev->ov_last_oos_start,
+		     (unsigned long)mdev->ov_last_oos_size);
+	}
+	mdev->ov_last_oos_size = 0;
+}
+
+
+extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
+/* worker callbacks */
+extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
+extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
+extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int);
+extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
+extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int);
+extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int);
+
+extern void resync_timer_fn(unsigned long data);
+extern void start_resync_timer_fn(unsigned long data);
+
+/* drbd_receiver.c */
+extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector);
+extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+		const unsigned rw, const int fault_type);
+extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
+extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
+					    u64 id,
+					    sector_t sector,
+					    unsigned int data_size,
+					    gfp_t gfp_mask) __must_hold(local);
+extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+		int is_net);
+#define drbd_free_ee(m,e)	drbd_free_some_ee(m, e, 0)
+#define drbd_free_net_ee(m,e)	drbd_free_some_ee(m, e, 1)
+extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+		struct list_head *head);
+extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+		struct list_head *head);
+extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
+extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
+extern void drbd_flush_workqueue(struct drbd_conf *mdev);
+extern void drbd_free_tl_hash(struct drbd_conf *mdev);
+
+/* Yes, there is kernel_setsockopt, but only since 2.6.18. We don't need to
+ * mess with get_fs/set_fs; we know we are KERNEL_DS always. */
+static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
+			char __user *optval, int optlen)
+{
+	int err;
+	if (level == SOL_SOCKET)
+		err = sock_setsockopt(sock, level, optname, optval, optlen);
+	else
+		err = sock->ops->setsockopt(sock, level, optname, optval,
+					    optlen);
+	return err;
+}
+
+static inline void drbd_tcp_cork(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_uncork(struct socket *sock)
+{
+	int __user val = 0;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char __user *)&val, sizeof(val));
+}
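+
+/* Typical corking pattern (sketch):
+ *	drbd_tcp_cork(sock);
+ *	... several small drbd_send() calls ...
+ *	drbd_tcp_uncork(sock);
+ * so the TCP stack may coalesce the queued data into full segments. */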
+
+static inline void drbd_tcp_nodelay(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_quickack(struct socket *sock)
+{
+	int __user val = 2;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
+			(char __user *)&val, sizeof(val));
+}
+
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
+
+/* drbd_proc.c */
+extern struct proc_dir_entry *drbd_proc;
+extern const struct file_operations drbd_proc_fops;
+extern const char *drbd_conn_str(enum drbd_conns s);
+extern const char *drbd_role_str(enum drbd_role s);
+
+/* drbd_actlog.c */
+extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
+extern int drbd_rs_del_all(struct drbd_conf *mdev);
+extern void drbd_rs_failed_io(struct drbd_conf *mdev,
+		sector_t sector, int size);
+extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
+extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go);
+extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
+		int size, const char *file, const unsigned int line);
+#define drbd_set_in_sync(mdev, sector, size) \
+	__drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
+extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
+		int size, const char *file, const unsigned int line);
+#define drbd_set_out_of_sync(mdev, sector, size) \
+	__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
+extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
+extern void drbd_al_shrink(struct drbd_conf *mdev);
+
+
+/* drbd_nl.c */
+
+void drbd_nl_cleanup(void);
+int __init drbd_nl_init(void);
+void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
+void drbd_bcast_sync_progress(struct drbd_conf *mdev);
+void drbd_bcast_ee(struct drbd_conf *mdev,
+		const char *reason, const int dgs,
+		const char* seen_hash, const char* calc_hash,
+		const struct drbd_epoch_entry* e);
+
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value that can be OR-ed onto the
+ * current state as soon as the spinlock (req_lock) has been taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock held. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * means that the network connection was established and that the peer
+ * is in the secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
+
+/* drbd state debug */
+#if DRBD_DEBUG_STATE_CHANGES
+extern void drbd_state_dbg(struct drbd_conf *mdev, const unsigned long long seq,
+		const char *func, unsigned int line,
+		const char *name, union drbd_state s);
+#define DRBD_STATE_DEBUG_INIT_VAL(s) ({ (s).seq = 0; (s).line = __LINE__; (s).func = __func__; })
+#else
+#define drbd_state_dbg(...) do { } while (0)
+#define DRBD_STATE_DEBUG_INIT_VAL(s) do { } while (0)
+#endif
+
+#define NS(T, S) \
+	({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+	({ union drbd_state val; DRBD_STATE_DEBUG_INIT_VAL(val); val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask; }), \
+	({ union drbd_state val; DRBD_STATE_DEBUG_INIT_VAL(val); val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+	({ union drbd_state val; DRBD_STATE_DEBUG_INIT_VAL(val); val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val.T3 = (S3); val; })
+
+#define _NS(D, T, S) \
+	D, ({ union drbd_state __ns; DRBD_STATE_DEBUG_INIT_VAL(__ns); __ns.i = D->state.i; __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+	D, ({ union drbd_state __ns; DRBD_STATE_DEBUG_INIT_VAL(__ns); __ns.i = D->state.i; __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+	D, ({ union drbd_state __ns; DRBD_STATE_DEBUG_INIT_VAL(__ns); __ns.i = D->state.i; __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
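+
+/* Usage sketch: the example from the DOC comment above could be
+ * requested as
+ *	drbd_request_state(mdev, NS2(conn, C_CONNECTED, peer, R_SECONDARY));
+ * NS2() expands to the mask/val argument pair expected there. */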
+
+/*
+ * inline helper functions
+ *************************/
+
+/* see also page_chain_add and friends in drbd_receiver.c */
+static inline struct page *page_chain_next(struct page *page)
+{
+	return (struct page *)page_private(page);
+}
+#define page_chain_for_each(page) \
+	for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
+			page = page_chain_next(page))
+#define page_chain_for_each_safe(page, n) \
+	for (; page && ({ n = page_chain_next(page); 1; }); page = n)
+
+static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
+{
+	struct page *page = e->pages;
+	page_chain_for_each(page) {
+		if (page_count(page) > 1)
+			return 1;
+	}
+	return 0;
+}
+
+static inline void drbd_state_lock(struct drbd_conf *mdev)
+{
+	wait_event(mdev->misc_wait,
+		   !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
+}
+
+static inline void drbd_state_unlock(struct drbd_conf *mdev)
+{
+	clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
+	wake_up(&mdev->misc_wait);
+}
+
+static inline enum drbd_state_rv
+_drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
+		enum chg_state_flags flags, struct completion *done)
+{
+	enum drbd_state_rv rv;
+
+	read_lock(&global_state_lock);
+	rv = __drbd_set_state(mdev, ns, flags, done);
+	read_unlock(&global_state_lock);
+
+	return rv;
+}
+
+/**
+ * drbd_request_state() - Request a state change
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It is
+ * quite verbose in case the state change is not possible, and all such
+ * state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_conf *mdev,
+				     union drbd_state mask,
+				     union drbd_state val)
+{
+	return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
+}
+
+#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
+static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
+{
+	switch (mdev->ldev->dc.on_io_error) {
+	case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
+		if (!forcedetach) {
+			if (DRBD_ratelimit(5*HZ, 5))
+				dev_err(DEV, "Local IO failed in %s.\n", where);
+			if (mdev->state.disk > D_INCONSISTENT)
+				_drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL);
+			break;
+		}
+		/* NOTE fall through to detach case if forcedetach set */
+	case EP_DETACH:
+	case EP_CALL_HELPER:
+		set_bit(WAS_IO_ERROR, &mdev->flags);
+		if (mdev->state.disk > D_FAILED) {
+			_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
+			dev_err(DEV,
+				"Local IO failed in %s. Detaching...\n", where);
+		}
+		break;
+	}
+}
+
+/**
+ * drbd_chk_io_error() - Handle the on_io_error setting; should be called from all IO completion handlers
+ * @mdev:	 DRBD device.
+ * @error:	 Error code passed to the IO completion callback
+ * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
+ *
+ * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
+ */
+#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
+static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
+	int error, int forcedetach, const char *where)
+{
+	if (error) {
+		unsigned long flags;
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		__drbd_chk_io_error_(mdev, forcedetach, where);
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+	}
+}
+
+
+/**
+ * drbd_md_first_sector() - Returns the first sector number of the meta data area
+ * @bdev:	Meta data block device.
+ *
+ * BTW, for internal meta data, this happens to be the maximum capacity
+ * we could agree upon with our peer node.
+ */
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->dc.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + bdev->md.bm_offset;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset;
+	}
+}
+
+/**
+ * drbd_md_last_sector() - Return the last sector number of the meta data area
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->dc.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + MD_AL_OFFSET - 1;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset + bdev->md.md_size_sect;
+	}
+}
+
+/**
+ * drbd_get_max_capacity() - Returns the capacity we announce to our peer
+ * @bdev:	Meta data block device.
+ *
+ * Returns the capacity we announce to our peer.  We clip ourselves at the
+ * various MAX_SECTORS limits, because if we don't, the current implementation
+ * will oops sooner or later.
+ */
+static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
+{
+	sector_t s;
+	switch (bdev->dc.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		s = drbd_get_capacity(bdev->backing_bdev)
+			? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+					drbd_md_first_sector(bdev))
+			: 0;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+				drbd_get_capacity(bdev->backing_bdev));
+		/* clip at maximum size the meta device can support */
+		s = min_t(sector_t, s,
+			BM_EXT_TO_SECT(bdev->md.md_size_sect
+				     - bdev->md.bm_offset));
+		break;
+	default:
+		s = min_t(sector_t, DRBD_MAX_SECTORS,
+				drbd_get_capacity(bdev->backing_bdev));
+	}
+	return s;
+}
+
+/**
+ * drbd_md_ss__() - Return the sector number of our meta data super block
+ * @mdev:	DRBD device.
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
+				    struct drbd_backing_dev *bdev)
+{
+	switch (bdev->dc.meta_dev_idx) {
+	default: /* external, some index */
+		return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
+	case DRBD_MD_INDEX_INTERNAL:
+		/* with drbd08, internal meta data is always "flexible" */
+	case DRBD_MD_INDEX_FLEX_INT:
+		/* sizeof(struct md_on_disk_07) == 4k
+		 * position: last 4k aligned block of 4k size */
+		if (!bdev->backing_bdev) {
+			if (DRBD_ratelimit(5*HZ, 5)) {
+				dev_err(DEV, "bdev->backing_bdev==NULL\n");
+				dump_stack();
+			}
+			return 0;
+		}
+		return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
+			- MD_AL_OFFSET;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		return 0;
+	}
+}
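+
+/* Worked example (editor's note, assuming MD_AL_OFFSET == 8 sectors): for
+ * flexible internal meta data on a backing device of 1000000 sectors, the
+ * superblock lands at (1000000 & ~7) - 8 = sector 999992, i.e. in the
+ * last 4k-aligned block of 4k size. */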
+
+static inline void
+drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&q->q_lock, flags);
+	list_add(&w->list, &q->q);
+	up(&q->s); /* within the spinlock,
+		      see comment near end of drbd_worker() */
+	spin_unlock_irqrestore(&q->q_lock, flags);
+}
+
+static inline void
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&q->q_lock, flags);
+	list_add_tail(&w->list, &q->q);
+	up(&q->s); /* within the spinlock,
+		      see comment near end of drbd_worker() */
+	spin_unlock_irqrestore(&q->q_lock, flags);
+}
+
+static inline void wake_asender(struct drbd_conf *mdev)
+{
+	if (test_bit(SIGNAL_ASENDER, &mdev->flags))
+		force_sig(DRBD_SIG, mdev->asender.task);
+}
+
+static inline void request_ping(struct drbd_conf *mdev)
+{
+	set_bit(SEND_PING, &mdev->flags);
+	wake_asender(mdev);
+}
+
+static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
+	enum drbd_packets cmd)
+{
+	struct p_header80 h;
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
+}
+
+static inline int drbd_send_ping(struct drbd_conf *mdev)
+{
+	struct p_header80 h;
+	return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
+}
+
+static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
+{
+	struct p_header80 h;
+	return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
+}
+
+static inline void drbd_thread_stop(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, false, true);
+}
+
+static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, false, false);
+}
+
+static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, true, false);
+}
+
+/* counts how many answer packets we expect from our peer,
+ * for either explicit application requests,
+ * or implicit barrier packets as necessary.
+ * increased:
+ *  w_send_barrier
+ *  _req_mod(req, queue_for_net_write or queue_for_net_read);
+ *    it is much easier and equally valid to count what we queue for the
+ *    worker, even before it actually was queued or sent.
+ *    (drbd_make_request_common; recovery path on read io-error)
+ * decreased:
+ *  got_BarrierAck (respective tl_clear, tl_clear_barrier)
+ *  _req_mod(req, data_received)
+ *     [from receive_DataReply]
+ *  _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
+ *     [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
+ *     FIXME
+ *     for some reason it is NOT decreased in got_NegAck,
+ *     but in the resulting cleanup code from report_params.
+ *     we should try to remember the reason for that...
+ *  _req_mod(req, send_failed or send_canceled)
+ *  _req_mod(req, connection_lost_while_pending)
+ *     [from tl_clear_barrier]
+ */
+static inline void inc_ap_pending(struct drbd_conf *mdev)
+{
+	atomic_inc(&mdev->ap_pending_cnt);
+}
+
+#define ERR_IF_CNT_IS_NEGATIVE(which)				\
+	if (atomic_read(&mdev->which) < 0)			\
+		dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n",	\
+		    __func__ , __LINE__ ,			\
+		    atomic_read(&mdev->which))
+
+#define dec_ap_pending(mdev)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	if (atomic_dec_and_test(&mdev->ap_pending_cnt))		\
+		wake_up(&mdev->misc_wait);			\
+	ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
+
+/* counts how many resync-related answers we still expect from the peer
+ *		     increase			decrease
+ * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
+ * C_SYNC_SOURCE sends P_RS_DATA_REPLY   (and expects P_WRITE_ACK with ID_SYNCER)
+ *					   (or P_NEG_ACK with ID_SYNCER)
+ */
+static inline void inc_rs_pending(struct drbd_conf *mdev)
+{
+	atomic_inc(&mdev->rs_pending_cnt);
+}
+
+#define dec_rs_pending(mdev)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	atomic_dec(&mdev->rs_pending_cnt);			\
+	ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
+
+/* counts how many answers we still need to send to the peer.
+ * increased on
+ *  receive_Data	unless protocol A;
+ *			we need to send a P_RECV_ACK (proto B)
+ *			or P_WRITE_ACK (proto C)
+ *  receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
+ *  receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
+ *  receive_Barrier_*	we need to send a P_BARRIER_ACK
+ */
+static inline void inc_unacked(struct drbd_conf *mdev)
+{
+	atomic_inc(&mdev->unacked_cnt);
+}
+
+#define dec_unacked(mdev)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	atomic_dec(&mdev->unacked_cnt);				\
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
+
+#define sub_unacked(mdev, n)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	atomic_sub(n, &mdev->unacked_cnt);			\
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
+
+
+static inline void put_net_conf(struct drbd_conf *mdev)
+{
+	if (atomic_dec_and_test(&mdev->net_cnt))
+		wake_up(&mdev->net_cnt_wait);
+}
+
+/**
+ * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
+ * @mdev:	DRBD device.
+ *
+ * You have to call put_net_conf() when finished working with mdev->net_conf.
+ */
+static inline int get_net_conf(struct drbd_conf *mdev)
+{
+	int have_net_conf;
+
+	atomic_inc(&mdev->net_cnt);
+	have_net_conf = mdev->state.conn >= C_UNCONNECTED;
+	if (!have_net_conf)
+		put_net_conf(mdev);
+	return have_net_conf;
+}
+
+/**
+ * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
+ * @M:		DRBD device.
+ *
+ * You have to call put_ldev() when finished working with mdev->ldev.
+ */
+#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT))
+#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))
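+
+/* Example (editor's sketch): the canonical get/put pairing, as used by
+ * is_valid_state() and others below:
+ *
+ *	if (get_ldev(mdev)) {
+ *		fp = mdev->ldev->dc.fencing;
+ *		put_ldev(mdev);
+ *	}
+ */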
+
+static inline void put_ldev(struct drbd_conf *mdev)
+{
+	int i = atomic_dec_return(&mdev->local_cnt);
+
+	/* This may be called from some endio handler,
+	 * so we must not sleep here. */
+
+	__release(local);
+	D_ASSERT(i >= 0);
+	if (i == 0) {
+		if (mdev->state.disk == D_DISKLESS)
+			/* even internal references gone, safe to destroy */
+			drbd_ldev_destroy(mdev);
+		if (mdev->state.disk == D_FAILED)
+			/* all application IO references gone. */
+			drbd_go_diskless(mdev);
+		wake_up(&mdev->misc_wait);
+	}
+}
+
+#ifndef __CHECKER__
+static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+{
+	int io_allowed;
+
+	/* never get a reference while D_DISKLESS */
+	if (mdev->state.disk == D_DISKLESS)
+		return 0;
+
+	atomic_inc(&mdev->local_cnt);
+	io_allowed = (mdev->state.disk >= mins);
+	if (!io_allowed)
+		put_ldev(mdev);
+	return io_allowed;
+}
+#else
+extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
+#endif
+
+/* you must have a "get_ldev" reference */
+static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
+		unsigned long *bits_left, unsigned int *per_mil_done)
+{
+	/* this is to break it at compile time when we change that, in case we
+	 * want to support more than (1<<32) bits on a 32bit arch. */
+	typecheck(unsigned long, mdev->rs_total);
+
+	/* note: both rs_total and rs_left are in bits, i.e. in
+	 * units of BM_BLOCK_SIZE.
+	 * for the percentage, we don't care. */
+
+	if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+		*bits_left = mdev->ov_left;
+	else
+		*bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+	/* >> 10 to prevent overflow,
+	 * +1 to prevent division by zero */
+	if (*bits_left > mdev->rs_total) {
+		/* doh. maybe a logic bug somewhere.
+		 * may also be just a race condition
+		 * between this and a disconnect during sync.
+		 * for now, just prevent in-kernel buffer overflow.
+		 */
+		smp_rmb();
+		dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
+				drbd_conn_str(mdev->state.conn),
+				*bits_left, mdev->rs_total, mdev->rs_failed);
+		*per_mil_done = 0;
+	} else {
+		/* Make sure the division happens in long context.
+		 * We allow up to one petabyte storage right now,
+		 * at a granularity of 4k per bit that is 2**38 bits.
+		 * After shift right and multiplication by 1000,
+		 * this should still fit easily into a 32bit long,
+		 * so we don't need a 64bit division on 32bit arch.
+		 * Note: currently we don't support such large bitmaps on 32bit
+		 * arch anyways, but no harm done to be prepared for it here.
+		 */
+		unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10;
+		unsigned long left = *bits_left >> shift;
+		unsigned long total = 1UL + (mdev->rs_total >> shift);
+		unsigned long tmp = 1000UL - left * 1000UL/total;
+		*per_mil_done = tmp;
+	}
+}
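+
+/* Worked example (editor's note): with rs_total = 2^20 bits (4 GiB at 4k
+ * per bit) and bits_left = 2^19, shift is 10, so left = 512 and
+ * total = 1025; per_mil_done = 1000 - 512*1000/1025 = 501, i.e. ~50%. */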
+
+
+/* this throttles on-the-fly application requests
+ * according to max_buffers settings;
+ * maybe re-implement using semaphores? */
+static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
+{
+	int mxb = 1000000; /* arbitrary limit on open requests */
+	if (get_net_conf(mdev)) {
+		mxb = mdev->net_conf->max_buffers;
+		put_net_conf(mdev);
+	}
+	return mxb;
+}
+
+static inline int drbd_state_is_stable(struct drbd_conf *mdev)
+{
+	union drbd_state s = mdev->state;
+
+	/* DO NOT add a default clause, we want the compiler to warn us
+	 * for any newly introduced state we may have forgotten to add here */
+
+	switch ((enum drbd_conns)s.conn) {
+	/* new io only accepted when there is no connection, ... */
+	case C_STANDALONE:
+	case C_WF_CONNECTION:
+	/* ... or there is a well established connection. */
+	case C_CONNECTED:
+	case C_SYNC_SOURCE:
+	case C_SYNC_TARGET:
+	case C_VERIFY_S:
+	case C_VERIFY_T:
+	case C_PAUSED_SYNC_S:
+	case C_PAUSED_SYNC_T:
+	case C_AHEAD:
+	case C_BEHIND:
+		/* transitional states, IO allowed */
+	case C_DISCONNECTING:
+	case C_UNCONNECTED:
+	case C_TIMEOUT:
+	case C_BROKEN_PIPE:
+	case C_NETWORK_FAILURE:
+	case C_PROTOCOL_ERROR:
+	case C_TEAR_DOWN:
+	case C_WF_REPORT_PARAMS:
+	case C_STARTING_SYNC_S:
+	case C_STARTING_SYNC_T:
+		break;
+
+		/* Allow IO in BM exchange states with new protocols */
+	case C_WF_BITMAP_S:
+		if (mdev->agreed_pro_version < 96)
+			return 0;
+		break;
+
+		/* no new io accepted in these states */
+	case C_WF_BITMAP_T:
+	case C_WF_SYNC_UUID:
+	case C_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	switch ((enum drbd_disk_state)s.disk) {
+	case D_DISKLESS:
+	case D_INCONSISTENT:
+	case D_OUTDATED:
+	case D_CONSISTENT:
+	case D_UP_TO_DATE:
+	case D_FAILED:
+		/* disk state is stable as well. */
+		break;
+
+	/* no new io accepted during transitional states */
+	case D_ATTACHING:
+	case D_NEGOTIATING:
+	case D_UNKNOWN:
+	case D_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline int is_susp(union drbd_state s)
+{
+	return s.susp || s.susp_nod || s.susp_fen;
+}
+
+static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
+{
+	int mxb = drbd_get_max_buffers(mdev);
+
+	if (is_susp(mdev->state))
+		return false;
+	if (test_bit(SUSPEND_IO, &mdev->flags))
+		return false;
+
+	/* to avoid potential deadlock or bitmap corruption,
+	 * in various places, we only allow new application io
+	 * to start during "stable" states. */
+
+	/* no new io accepted when attaching or detaching the disk */
+	if (!drbd_state_is_stable(mdev))
+		return false;
+
+	/* since some older kernels don't have atomic_add_unless,
+	 * and we are within the spinlock anyway, we have this workaround.  */
+	if (atomic_read(&mdev->ap_bio_cnt) > mxb)
+		return false;
+	if (test_bit(BITMAP_IO, &mdev->flags))
+		return false;
+	return true;
+}
+
+static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count)
+{
+	bool rv = false;
+
+	spin_lock_irq(&mdev->req_lock);
+	rv = may_inc_ap_bio(mdev);
+	if (rv)
+		atomic_add(count, &mdev->ap_bio_cnt);
+	spin_unlock_irq(&mdev->req_lock);
+
+	return rv;
+}
+
+static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
+{
+	/* we wait here
+	 *    as long as the device is suspended
+	 *    until the bitmap is no longer on the fly during connection
+	 *    handshake as long as we would exceed the max_buffer limit.
+	 *
+	 * to avoid races with the reconnect code,
+	 * we need to atomic_inc within the spinlock. */
+
+	wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count));
+}
+
+static inline void dec_ap_bio(struct drbd_conf *mdev)
+{
+	int mxb = drbd_get_max_buffers(mdev);
+	int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
+
+	D_ASSERT(ap_bio >= 0);
+	/* this currently does wake_up for every dec_ap_bio!
+	 * maybe rather introduce some type of hysteresis?
+	 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
+	if (ap_bio < mxb)
+		wake_up(&mdev->misc_wait);
+	if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+	}
+}
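+
+/* Example (editor's sketch): application I/O accounting is an inc/dec
+ * pair around the lifetime of a request:
+ *
+ *	inc_ap_bio(mdev, 1);	(may block until new I/O is allowed)
+ *	... submit the request ...
+ *	dec_ap_bio(mdev);	(from the completion path)
+ */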
+
+static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
+{
+	int changed = mdev->ed_uuid != val;
+	mdev->ed_uuid = val;
+	return changed;
+}
+
+static inline int seq_cmp(u32 a, u32 b)
+{
+	/* we assume wrap around at 32bit.
+	 * for wrap around at 24bit (old atomic_t),
+	 * we'd have to
+	 *  a <<= 8; b <<= 8;
+	 */
+	return (s32)(a) - (s32)(b);
+}
+#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
+#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
+#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
+#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
+/* CAUTION: please no side effects in arguments! */
+#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
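+
+/* Worked example (editor's note): the signed subtraction handles 32bit
+ * wrap around: seq_cmp(1, 0xffffffff) = (s32)1 - (s32)0xffffffff
+ * = 1 - (-1) = 2 > 0, so sequence number 1 correctly counts as newer
+ * than 0xffffffff. */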
+
+static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
+{
+	unsigned int m;
+	spin_lock(&mdev->peer_seq_lock);
+	m = seq_max(mdev->peer_seq, new_seq);
+	mdev->peer_seq = m;
+	spin_unlock(&mdev->peer_seq_lock);
+	if (m == new_seq)
+		wake_up(&mdev->seq_wait);
+}
+
+static inline void drbd_update_congested(struct drbd_conf *mdev)
+{
+	struct sock *sk = mdev->data.socket->sk;
+	if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+		set_bit(NET_CONGESTED, &mdev->flags);
+}
+
+static inline int drbd_queue_order_type(struct drbd_conf *mdev)
+{
+	/* sorry, we currently have no working implementation
+	 * of distributed TCQ stuff */
+#ifndef QUEUE_ORDERED_NONE
+#define QUEUE_ORDERED_NONE 0
+#endif
+	return QUEUE_ORDERED_NONE;
+}
+
+#ifdef blk_queue_plugged
+static inline void drbd_blk_run_queue(struct request_queue *q)
+{
+	if (q && q->unplug_fn)
+		q->unplug_fn(q);
+}
+
+static inline void drbd_kick_lo(struct drbd_conf *mdev)
+{
+	if (get_ldev(mdev)) {
+		drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
+		put_ldev(mdev);
+	}
+}
+#else
+static inline void drbd_blk_run_queue(struct request_queue *q)
+{
+}
+static inline void drbd_kick_lo(struct drbd_conf *mdev)
+{
+}
+#endif
+
+static inline void drbd_md_flush(struct drbd_conf *mdev)
+{
+	int r;
+
+	if (test_bit(MD_NO_BARRIER, &mdev->flags))
+		return;
+
+	r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
+	if (r) {
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+		dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
+	}
+}
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_main.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_main.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_main.c	2015-01-21 12:02:58.381823991 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_main.c	2015-01-21 12:02:58.381823991 +0300
@@ -0,0 +1,4661 @@
+/*
+   drbd.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+   from Logicworks, Inc. for making SDP replication support possible.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <asm/uaccess.h>
+#include <asm/types.h>
+#include <net/sock.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/device.h>
+#include <linux/dynamic_debug.h>
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
+#include "drbd_vli.h"
+
+#ifdef HAVE_LINUX_BYTEORDER_SWABB_H
+#include <linux/byteorder/swabb.h>
+#else
+#include <linux/swab.h>
+#endif
+
+struct after_state_chg_work {
+	struct drbd_work w;
+	union drbd_state os;
+	union drbd_state ns;
+	enum chg_state_flags flags;
+	struct completion *done;
+};
+
+int drbdd_init(struct drbd_thread *);
+int drbd_worker(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+int drbd_init(void);
+#ifdef BD_OPS_USE_FMODE
+static int drbd_open(struct block_device *bdev, fmode_t mode);
+static int drbd_release(struct gendisk *gd, fmode_t mode);
+#else
+static int drbd_open(struct inode *inode, struct file *file);
+static int drbd_release(struct inode *inode, struct file *file);
+#endif
+STATIC int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+STATIC void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags);
+STATIC int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+STATIC void md_sync_timer_fn(unsigned long data);
+STATIC int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+STATIC int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+
+DEFINE_TRACE(drbd_unplug);
+DEFINE_TRACE(drbd_uuid);
+DEFINE_TRACE(drbd_ee);
+DEFINE_TRACE(drbd_packet);
+DEFINE_TRACE(drbd_md_io);
+DEFINE_TRACE(drbd_epoch);
+DEFINE_TRACE(drbd_netlink);
+DEFINE_TRACE(drbd_actlog);
+DEFINE_TRACE(drbd_bio);
+DEFINE_TRACE(_drbd_resync);
+DEFINE_TRACE(drbd_req);
+
+MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
+	      "Lars Ellenberg <lars@linbit.com>");
+MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
+MODULE_VERSION(REL_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
+		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
+MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
+
+#include <linux/moduleparam.h>
+/* allow_open_on_secondary */
+MODULE_PARM_DESC(allow_oos, "DONT USE!");
+/* thanks to these macros, if compiled into the kernel (not-module),
+ * this becomes the boot parameter drbd.minor_count */
+module_param(minor_count, uint, 0444);
+module_param(disable_sendpage, bool, 0644);
+module_param(allow_oos, bool, 0);
+module_param(cn_idx, uint, 0444);
+module_param(proc_details, int, 0644);
+
+#ifdef DRBD_ENABLE_FAULTS
+int enable_faults;
+int fault_rate;
+static int fault_count;
+int fault_devs;
+/* bitmap of enabled faults */
+module_param(enable_faults, int, 0664);
+/* fault rate % value - applies to all enabled faults */
+module_param(fault_rate, int, 0664);
+/* count of faults inserted */
+module_param(fault_count, int, 0664);
+/* bitmap of devices to insert faults on */
+module_param(fault_devs, int, 0644);
+#endif
+
+/* module parameter, defined */
+unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
+int disable_sendpage;
+int allow_oos;
+unsigned int cn_idx = CN_IDX_DRBD;
+int proc_details;       /* Detail level in /proc/drbd */
+
+/* Module parameter for setting the user mode helper program
+ * to run. Default is /sbin/drbdadm */
+char usermode_helper[80] = "/sbin/drbdadm";
+
+module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
+
+/* in 2.6.x, our device mapping and config info contains our virtual gendisks
+ * as member "struct gendisk *vdisk;"
+ */
+struct drbd_conf **minor_table;
+
+struct kmem_cache *drbd_request_cache;
+struct kmem_cache *drbd_ee_cache;	/* epoch entries */
+struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+mempool_t *drbd_request_mempool;
+mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
+
+/* I do not use a standard mempool, because:
+   1) I want to hand out the pre-allocated objects first.
+   2) I want to be able to interrupt sleeping allocation with a signal.
+   Note: This is a singly linked list; the next pointer is the private
+	 member of struct page.
+ */
+struct page *drbd_pp_pool;
+spinlock_t   drbd_pp_lock;
+int          drbd_pp_vacant;
+wait_queue_head_t drbd_pp_wait;
+
+STATIC const struct block_device_operations drbd_ops = {
+	.owner =   THIS_MODULE,
+	.open =    drbd_open,
+	.release = drbd_release,
+};
+
+static void bio_destructor_drbd(struct bio *bio)
+{
+	bio_free(bio, drbd_md_io_bio_set);
+}
+
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	if (!drbd_md_io_bio_set)
+		return bio_alloc(gfp_mask, 1);
+
+	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+	if (!bio)
+		return NULL;
+	bio->bi_destructor = bio_destructor_drbd;
+	return bio;
+}
+
+#ifdef __CHECKER__
+/* When checking with sparse, and this is an inline function, sparse will
+   give tons of false positives. When this is a real function, sparse works.
+ */
+int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+{
+	int io_allowed;
+
+	atomic_inc(&mdev->local_cnt);
+	io_allowed = (mdev->state.disk >= mins);
+	if (!io_allowed) {
+		if (atomic_dec_and_test(&mdev->local_cnt))
+			wake_up(&mdev->misc_wait);
+	}
+	return io_allowed;
+}
+
+#endif
+
+/**
+ * DOC: The transfer log
+ *
+ * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
+ * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
+ * of the list. There is always at least one &struct drbd_tl_epoch object.
+ *
+ * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
+ * attached.
+ */
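+
+/* Illustration (editor's sketch) of the layout described above:
+ *
+ *   oldest_tle -> [epoch] -> [epoch] -> ... -> [epoch] <- newest_tle
+ *                    |          |                 |
+ *                 requests   requests          requests
+ */
+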
+STATIC int tl_init(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+
+	/* during device minor initialization, we may well use GFP_KERNEL */
+	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
+	if (!b)
+		return 0;
+	INIT_LIST_HEAD(&b->requests);
+	INIT_LIST_HEAD(&b->w.list);
+	b->next = NULL;
+	b->br_number = 4711;
+	b->n_writes = 0;
+	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
+
+	mdev->oldest_tle = b;
+	mdev->newest_tle = b;
+	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
+	INIT_LIST_HEAD(&mdev->barrier_acked_requests);
+
+	mdev->tl_hash = NULL;
+	mdev->tl_hash_s = 0;
+
+	return 1;
+}
+
+STATIC void tl_cleanup(struct drbd_conf *mdev)
+{
+	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
+	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
+	kfree(mdev->oldest_tle);
+	mdev->oldest_tle = NULL;
+	kfree(mdev->unused_spare_tle);
+	mdev->unused_spare_tle = NULL;
+	kfree(mdev->tl_hash);
+	mdev->tl_hash = NULL;
+	mdev->tl_hash_s = 0;
+}
+
+/**
+ * _tl_add_barrier() - Adds a barrier to the transfer log
+ * @mdev:	DRBD device.
+ * @new:	Barrier to be added before the current head of the TL.
+ *
+ * The caller must hold the req_lock.
+ */
+void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
+{
+	struct drbd_tl_epoch *newest_before;
+
+	INIT_LIST_HEAD(&new->requests);
+	INIT_LIST_HEAD(&new->w.list);
+	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
+	new->next = NULL;
+	new->n_writes = 0;
+
+	newest_before = mdev->newest_tle;
+	new->br_number = newest_before->br_number+1;
+	if (mdev->newest_tle != new) {
+		mdev->newest_tle->next = new;
+		mdev->newest_tle = new;
+	}
+}
+
+/**
+ * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
+ * @mdev:	DRBD device.
+ * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
+ * @set_size:	Expected number of requests before that barrier.
+ *
+ * In case the passed barrier_nr or set_size does not match the oldest
+ * &struct drbd_tl_epoch objects this function will cause a termination
+ * of the connection.
+ */
+void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+		       unsigned int set_size)
+{
+	struct drbd_tl_epoch *b, *nob; /* next old barrier */
+	struct list_head *le, *tle;
+	struct drbd_request *r;
+
+	spin_lock_irq(&mdev->req_lock);
+
+	b = mdev->oldest_tle;
+
+	/* first some paranoia code */
+	if (b == NULL) {
+		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+			barrier_nr);
+		goto bail;
+	}
+	if (b->br_number != barrier_nr) {
+		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
+			barrier_nr, b->br_number);
+		goto bail;
+	}
+	if (b->n_writes != set_size) {
+		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+			barrier_nr, set_size, b->n_writes);
+		goto bail;
+	}
+
+	/* Clean up list of requests processed during current epoch */
+	list_for_each_safe(le, tle, &b->requests) {
+		r = list_entry(le, struct drbd_request, tl_requests);
+		_req_mod(r, barrier_acked);
+	}
+	/* There could be requests on the list waiting for completion
+	   of the write to the local disk. To avoid corruptions of
+	   slab's data structures we have to remove the list's head.
+
+	   Also there could have been a barrier ack out of sequence, overtaking
+	   the write acks - which would be a bug and violate write ordering.
+	   To not deadlock in case we lose connection while such requests are
+	   still pending, we need some way to find them for the
+	   _req_mod(connection_lost_while_pending).
+
+	   These have been list_move'd to the out_of_sequence_requests list in
+	   _req_mod(, barrier_acked) above.
+	   */
+	list_splice_init(&b->requests, &mdev->barrier_acked_requests);
+
+	nob = b->next;
+	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
+		_tl_add_barrier(mdev, b);
+		if (nob)
+			mdev->oldest_tle = nob;
+		/* if nob == NULL b was the only barrier, and becomes the new
+		   barrier. Therefore mdev->oldest_tle points already to b */
+	} else {
+		D_ASSERT(nob != NULL);
+		mdev->oldest_tle = nob;
+		kfree(b);
+	}
+
+	spin_unlock_irq(&mdev->req_lock);
+	dec_ap_pending(mdev);
+
+	return;
+
+bail:
+	spin_unlock_irq(&mdev->req_lock);
+	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+}
+
+
+/**
+ * _tl_restart() - Walks the transfer log, and applies an action to all requests
+ * @mdev:	DRBD device.
+ * @what:       The action/event to perform with all request objects
+ *
+ * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
+ * restart_frozen_disk_io.
+ */
+static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
+{
+	struct drbd_tl_epoch *b, *tmp, **pn;
+	struct list_head *le, *tle, carry_reads;
+	struct drbd_request *req;
+	int rv, n_writes, n_reads;
+
+	b = mdev->oldest_tle;
+	pn = &mdev->oldest_tle;
+	while (b) {
+		n_writes = 0;
+		n_reads = 0;
+		INIT_LIST_HEAD(&carry_reads);
+		list_for_each_safe(le, tle, &b->requests) {
+			req = list_entry(le, struct drbd_request, tl_requests);
+			rv = _req_mod(req, what);
+
+			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
+			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
+		}
+		tmp = b->next;
+
+		if (n_writes) {
+			if (what == resend) {
+				b->n_writes = n_writes;
+				if (b->w.cb == NULL) {
+					b->w.cb = w_send_barrier;
+					inc_ap_pending(mdev);
+					set_bit(CREATE_BARRIER, &mdev->flags);
+				}
+
+				drbd_queue_work(&mdev->data.work, &b->w);
+			}
+			pn = &b->next;
+		} else {
+			if (n_reads)
+				list_add(&carry_reads, &b->requests);
+			/* there could still be requests on that ring list,
+			 * in case local io is still pending */
+			list_del(&b->requests);
+
+			/* dec_ap_pending corresponding to queue_barrier.
+			 * the newest barrier may not have been queued yet,
+			 * in which case w.cb is still NULL. */
+			if (b->w.cb != NULL)
+				dec_ap_pending(mdev);
+
+			if (b == mdev->newest_tle) {
+				/* recycle, but reinit! */
+				D_ASSERT(tmp == NULL);
+				INIT_LIST_HEAD(&b->requests);
+				list_splice(&carry_reads, &b->requests);
+				INIT_LIST_HEAD(&b->w.list);
+				b->w.cb = NULL;
+				b->br_number = net_random();
+				b->n_writes = 0;
+
+				*pn = b;
+				break;
+			}
+			*pn = tmp;
+			kfree(b);
+		}
+		b = tmp;
+		list_splice(&carry_reads, &b->requests);
+	}
+
+	/* Actions operating on the disk state also want to work on
+	   requests that got barrier acked. */
+	switch (what) {
+	case fail_frozen_disk_io:
+	case restart_frozen_disk_io:
+		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+			req = list_entry(le, struct drbd_request, tl_requests);
+			_req_mod(req, what);
+		}
+
+	case connection_lost_while_pending:
+	case resend:
+		break;
+	default:
+		dev_err(DEV, "what = %d in _tl_restart()\n", what);
+	}
+}
+
+
+/**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @mdev:	DRBD device.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer log gets marked as out of sync. Called from the
+ * receiver thread and the worker thread.
+ */
+void tl_clear(struct drbd_conf *mdev)
+{
+	struct list_head *le, *tle;
+	struct drbd_request *r;
+
+	spin_lock_irq(&mdev->req_lock);
+
+	_tl_restart(mdev, connection_lost_while_pending);
+
+	/* we expect this list to be empty. */
+	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
+
+	/* but just in case, clean it up anyways! */
+	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
+		r = list_entry(le, struct drbd_request, tl_requests);
+		/* It would be nice to complete outside of spinlock.
+		 * But this is easier for now. */
+		_req_mod(r, connection_lost_while_pending);
+	}
+
+	/* ensure bit indicating barrier is required is clear */
+	clear_bit(CREATE_BARRIER, &mdev->flags);
+
+	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
+
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
+{
+	spin_lock_irq(&mdev->req_lock);
+	_tl_restart(mdev, what);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
+ * @mdev:	DRBD device.
+ */
+void tl_abort_disk_io(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+	struct list_head *le, *tle;
+	struct drbd_request *req;
+
+	spin_lock_irq(&mdev->req_lock);
+	b = mdev->oldest_tle;
+	while (b) {
+		list_for_each_safe(le, tle, &b->requests) {
+			req = list_entry(le, struct drbd_request, tl_requests);
+			if (!(req->rq_state & RQ_LOCAL_PENDING))
+				continue;
+			_req_mod(req, abort_disk_io);
+		}
+		b = b->next;
+	}
+
+	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+		req = list_entry(le, struct drbd_request, tl_requests);
+		if (!(req->rq_state & RQ_LOCAL_PENDING))
+			continue;
+		_req_mod(req, abort_disk_io);
+	}
+
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
+ * cl_wide_st_chg() - true if the state change is a cluster wide one
+ * @mdev:	DRBD device.
+ * @os:		old (current) state.
+ * @ns:		new (wanted) state.
+ */
+STATIC int cl_wide_st_chg(struct drbd_conf *mdev,
+			  union drbd_state os, union drbd_state ns)
+{
+	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
+		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
+}
+
+enum drbd_state_rv
+drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
+		  union drbd_state mask, union drbd_state val)
+{
+#if DRBD_DEBUG_STATE_CHANGES
+	static unsigned long long sseq = 0x00f00000LLU;
+#endif
+
+	unsigned long flags;
+	union drbd_state os, ns;
+	enum drbd_state_rv rv;
+
+	ns = val; /* assign debug info, if any. */
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+#if DRBD_DEBUG_STATE_CHANGES
+	ns.seq = ++sseq;
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, "!os", os);
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, "!ns", ns);
+#endif
+	rv = _drbd_set_state(mdev, ns, f, NULL);
+	ns.i = mdev->state.i;
+#if DRBD_DEBUG_STATE_CHANGES
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, "=ns", ns);
+#endif
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	return rv;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ */
+void drbd_force_state(struct drbd_conf *mdev,
+	union drbd_state mask, union drbd_state val)
+{
+	drbd_change_state(mdev, CS_HARD, mask, val);
+}
+
+STATIC enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
+STATIC enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
+						    union drbd_state,
+						    union drbd_state);
+enum sanitize_state_warnings {
+	NO_WARNING,
+	ABORTED_ONLINE_VERIFY,
+	ABORTED_RESYNC,
+	CONNECTION_LOST_NEGOTIATING,
+	IMPLICITLY_UPGRADED_DISK,
+	IMPLICITLY_UPGRADED_PDSK,
+};
+STATIC union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn);
+int drbd_send_state_req(struct drbd_conf *,
+			union drbd_state, union drbd_state);
+
+STATIC enum drbd_state_rv
+_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
+	     union drbd_state val)
+{
+	union drbd_state os, ns;
+	unsigned long flags;
+	enum drbd_state_rv rv;
+
+	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
+		return SS_CW_SUCCESS;
+
+	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
+		return SS_CW_FAILED_BY_PEER;
+
+	rv = 0;
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+	ns = sanitize_state(mdev, os, ns, NULL);
+
+	if (!cl_wide_st_chg(mdev, os, ns))
+		rv = SS_CW_NO_NEED;
+	if (!rv) {
+		rv = is_valid_state(mdev, ns);
+		if (rv == SS_SUCCESS) {
+			rv = is_valid_state_transition(mdev, ns, os);
+			if (rv == SS_SUCCESS)
+				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+		}
+	}
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	return rv;
+}
+
+/**
+ * drbd_req_state() - Perform a possibly cluster-wide state change
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+STATIC enum drbd_state_rv
+drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
+	       union drbd_state val, enum chg_state_flags f)
+{
+#if DRBD_DEBUG_STATE_CHANGES
+	static unsigned long long sseq = 0;
+#endif
+
+	struct completion done;
+	unsigned long flags;
+	union drbd_state os, ns;
+	enum drbd_state_rv rv;
+
+	init_completion(&done);
+
+	if (f & CS_SERIALIZE)
+		mutex_lock(&mdev->state_mutex);
+
+	ns = val; /* assign debug info, if any */
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i; /* assign state info */
+
+#if DRBD_DEBUG_STATE_CHANGES
+	ns.seq = ++sseq;
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, "?os", os);
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, "?ns", ns);
+#endif
+	ns = sanitize_state(mdev, os, ns, NULL);
+
+#if DRBD_DEBUG_STATE_CHANGES
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, "?=>ns", ns);
+#endif
+
+	if (cl_wide_st_chg(mdev, os, ns)) {
+		rv = is_valid_state(mdev, ns);
+		if (rv == SS_SUCCESS)
+			rv = is_valid_state_transition(mdev, ns, os);
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+		if (rv < SS_SUCCESS) {
+			if (f & CS_VERBOSE)
+				print_st_err(mdev, os, ns, rv);
+			goto abort;
+		}
+
+		drbd_state_lock(mdev);
+		if (!drbd_send_state_req(mdev, mask, val)) {
+			drbd_state_unlock(mdev);
+			rv = SS_CW_FAILED_BY_PEER;
+			if (f & CS_VERBOSE)
+				print_st_err(mdev, os, ns, rv);
+			goto abort;
+		}
+
+		wait_event(mdev->state_wait,
+			(rv = _req_st_cond(mdev, mask, val)));
+
+		if (rv < SS_SUCCESS) {
+			drbd_state_unlock(mdev);
+			if (f & CS_VERBOSE)
+				print_st_err(mdev, os, ns, rv);
+			goto abort;
+		}
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		os = mdev->state;
+		ns.i = (os.i & ~mask.i) | val.i;
+		rv = _drbd_set_state(mdev, ns, f, &done);
+		drbd_state_unlock(mdev);
+	} else {
+		rv = _drbd_set_state(mdev, ns, f, &done);
+	}
+
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+		D_ASSERT(current != mdev->worker.task);
+		wait_for_completion(&done);
+	}
+
+abort:
+#if DRBD_DEBUG_STATE_CHANGES
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, ":os", os);
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, ":ns", ns);
+#endif
+
+	if (f & CS_SERIALIZE)
+		mutex_unlock(&mdev->state_mutex);
+
+	return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags
+ *
+ * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
+ * flag, or when logging of failed state change requests is not desired.
+ */
+enum drbd_state_rv
+_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
+		    union drbd_state val, enum chg_state_flags f)
+{
+	enum drbd_state_rv rv;
+
+	wait_event(mdev->state_wait,
+		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+	return rv;
+}
+
+/* pretty print of drbd internal state */
+
+#define STATE_FMT	" %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n"
+#define STATE_ARGS(tag, s)		\
+		tag,			\
+		drbd_conn_str(s.conn),	\
+		drbd_role_str(s.role),	\
+		drbd_role_str(s.peer),	\
+		drbd_disk_str(s.disk),	\
+		drbd_disk_str(s.pdsk),	\
+		is_susp(s) ? 's' : 'r',	\
+		s.aftr_isp ? 'a' : '-',	\
+		s.peer_isp ? 'p' : '-',	\
+		s.user_isp ? 'u' : '-', \
+		s.susp_fen ? 'F' : '-', \
+		s.susp_nod ? 'N' : '-'
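+
+/* Example (editor's note, hypothetical values): a line printed with
+ * STATE_FMT/STATE_ARGS looks like
+ *   state = { cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate r----- }
+ * where the six trailing flag characters are susp, aftr_isp, peer_isp,
+ * user_isp, susp_fen and susp_nod, in that order. */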
+
+#if DRBD_DEBUG_STATE_CHANGES
+void drbd_state_dbg(struct drbd_conf *mdev, const unsigned long long seq,
+		const char *func, unsigned int line,
+		const char *name, union drbd_state s)
+{
+	int i;
+	/* some paranoia code,
+	 * in case this is called with uninitialized data. */
+	if (!name || !func) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+	for (i = 0; i < 16 && name[i]; i++)
+		;
+	if (i == 16) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+	for (i = 0; i < 32 && func[i]; i++)
+		;
+	if (i == 32) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+	/* the actual debug printk */
+	dynamic_dev_dbg(DEV, " %8llx [%s] %s:%u" STATE_FMT,
+		seq, current->comm, func, line, STATE_ARGS(name, s));
+}
+#endif
+
+void print_st(struct drbd_conf *mdev, const char *tag, union drbd_state s)
+{
+	dev_err(DEV, STATE_FMT, STATE_ARGS(tag, s));
+}
+
+
+void print_st_err(struct drbd_conf *mdev, union drbd_state os,
+	          union drbd_state ns, enum drbd_state_rv err)
+{
+	if (err == SS_IN_TRANSIENT_STATE)
+		return;
+	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
+	print_st(mdev, " state", os);
+	print_st(mdev, "wanted", ns);
+}
+
+#undef STATE_FMT
+#undef STATE_ARGS
+
+/**
+ * is_valid_state() - Returns an SS_ error code if ns is not valid
+ * @mdev:	DRBD device.
+ * @ns:		State to consider.
+ */
+STATIC enum drbd_state_rv
+is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
+{
+	/* See drbd_state_sw_errors in drbd_strings.c */
+
+	enum drbd_fencing_p fp;
+	enum drbd_state_rv rv = SS_SUCCESS;
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	if (get_net_conf(mdev)) {
+		if (!mdev->net_conf->two_primaries &&
+		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
+			rv = SS_TWO_PRIMARIES;
+		put_net_conf(mdev);
+	}
+
+	if (rv <= 0)
+		/* already found a reason to abort */;
+	else if (ns.role == R_SECONDARY && mdev->open_cnt)
+		rv = SS_DEVICE_IN_USE;
+
+	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (fp >= FP_RESOURCE &&
+		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+		rv = SS_PRIMARY_NOP;
+
+	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+		rv = SS_NO_LOCAL_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+		rv = SS_NO_REMOTE_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if ((ns.conn == C_CONNECTED ||
+		  ns.conn == C_WF_BITMAP_S ||
+		  ns.conn == C_SYNC_SOURCE ||
+		  ns.conn == C_PAUSED_SYNC_S) &&
+		  ns.disk == D_OUTDATED)
+		rv = SS_CONNECTED_OUTDATES;
+
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		 (mdev->sync_conf.verify_alg[0] == 0))
+		rv = SS_NO_VERIFY_ALG;
+
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		  mdev->agreed_pro_version < 88)
+		rv = SS_NOT_SUPPORTED;
+
+	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
+		rv = SS_CONNECTED_OUTDATES;
+
+	return rv;
+}
+
+/**
+ * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
+ * @mdev:	DRBD device.
+ * @ns:		new state.
+ * @os:		old state.
+ */
+STATIC enum drbd_state_rv
+is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
+			  union drbd_state os)
+{
+	enum drbd_state_rv rv = SS_SUCCESS;
+
+	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+	    os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+		rv = SS_ALREADY_STANDALONE;
+
+	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+		rv = SS_IS_DISKLESS;
+
+	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+		rv = SS_NO_NET_CONFIG;
+
+	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+		rv = SS_LOWER_THAN_OUTDATED;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+		rv = SS_IN_TRANSIENT_STATE;
+
+	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
+		rv = SS_IN_TRANSIENT_STATE;
+
+	/* While establishing a connection only allow cstate to change.
+	   Delay/refuse role changes, detach attach etc... */
+	if (test_bit(STATE_SENT, &mdev->flags) &&
+	    !(os.conn == C_WF_REPORT_PARAMS ||
+	      (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
+		rv = SS_IN_TRANSIENT_STATE;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+	    ns.conn != os.conn && os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+	    os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
+	    && os.conn < C_WF_REPORT_PARAMS)
+		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+
+	return rv;
+}
+
+static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
+{
+	static const char *msg_table[] = {
+		[NO_WARNING] = "",
+		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+		[ABORTED_RESYNC] = "Resync aborted.",
+		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+	};
+
+	if (warn != NO_WARNING)
+		dev_warn(DEV, "%s\n", msg_table[warn]);
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @mdev:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @warn:	if not NULL, receives a warning to be printed by the caller.
+ *
+ * When we lose the connection, we have to set the state of the peer's disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+STATIC union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn)
+{
+	enum drbd_fencing_p fp;
+	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
+
+	if (warn)
+		*warn = NO_WARNING;
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	/* Disallow Network errors to configure a device's network part */
+	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
+	    os.conn <= C_DISCONNECTING)
+		ns.conn = os.conn;
+
+	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
+	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
+	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
+	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
+		ns.conn = os.conn;
+
+	/* we cannot fail (again) if we already detached */
+	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
+		ns.disk = D_DISKLESS;
+
+	/* After C_DISCONNECTING only C_STANDALONE may follow */
+	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
+		ns.conn = os.conn;
+
+	if (ns.conn < C_CONNECTED) {
+		ns.peer_isp = 0;
+		ns.peer = R_UNKNOWN;
+		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+			ns.pdsk = D_UNKNOWN;
+	}
+
+	/* Clear the aftr_isp when becoming unconfigured */
+	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+		ns.aftr_isp = 0;
+
+	/* Abort resync if a disk fails/detaches */
+	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
+	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+		if (warn)
+			*warn =	os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
+				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
+		ns.conn = C_CONNECTED;
+	}
+
+	/* Connection breaks down before we finished "Negotiating" */
+	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
+			ns.disk = mdev->new_state_tmp.disk;
+			ns.pdsk = mdev->new_state_tmp.pdsk;
+		} else {
+			if (warn)
+				*warn = CONNECTION_LOST_NEGOTIATING;
+			ns.disk = D_DISKLESS;
+			ns.pdsk = D_UNKNOWN;
+		}
+		put_ldev(mdev);
+	}
+
+	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
+	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
+		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
+			ns.disk = D_UP_TO_DATE;
+		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
+			ns.pdsk = D_UP_TO_DATE;
+	}
+
+	/* Implications of the connection state on the disk states */
+	disk_min = D_DISKLESS;
+	disk_max = D_UP_TO_DATE;
+	pdsk_min = D_INCONSISTENT;
+	pdsk_max = D_UNKNOWN;
+	switch ((enum drbd_conns)ns.conn) {
+	case C_WF_BITMAP_T:
+	case C_PAUSED_SYNC_T:
+	case C_STARTING_SYNC_T:
+	case C_WF_SYNC_UUID:
+	case C_BEHIND:
+		disk_min = D_INCONSISTENT;
+		disk_max = D_OUTDATED;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_VERIFY_S:
+	case C_VERIFY_T:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_CONNECTED:
+		disk_min = D_DISKLESS;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_DISKLESS;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_WF_BITMAP_S:
+	case C_PAUSED_SYNC_S:
+	case C_STARTING_SYNC_S:
+	case C_AHEAD:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_INCONSISTENT;
+		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
+		break;
+	case C_SYNC_TARGET:
+		disk_min = D_INCONSISTENT;
+		disk_max = D_INCONSISTENT;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_SYNC_SOURCE:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_INCONSISTENT;
+		pdsk_max = D_INCONSISTENT;
+		break;
+	case C_STANDALONE:
+	case C_DISCONNECTING:
+	case C_UNCONNECTED:
+	case C_TIMEOUT:
+	case C_BROKEN_PIPE:
+	case C_NETWORK_FAILURE:
+	case C_PROTOCOL_ERROR:
+	case C_TEAR_DOWN:
+	case C_WF_CONNECTION:
+	case C_WF_REPORT_PARAMS:
+	case C_MASK:
+		break;
+	}
+	if (ns.disk > disk_max)
+		ns.disk = disk_max;
+
+	if (ns.disk < disk_min) {
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_DISK;
+		ns.disk = disk_min;
+	}
+	if (ns.pdsk > pdsk_max)
+		ns.pdsk = pdsk_max;
+
+	if (ns.pdsk < pdsk_min) {
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_PDSK;
+		ns.pdsk = pdsk_min;
+	}
+
+	if (fp == FP_STONITH &&
+	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
+		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
+		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
+
+	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+		if (ns.conn == C_SYNC_SOURCE)
+			ns.conn = C_PAUSED_SYNC_S;
+		if (ns.conn == C_SYNC_TARGET)
+			ns.conn = C_PAUSED_SYNC_T;
+	} else {
+		if (ns.conn == C_PAUSED_SYNC_S)
+			ns.conn = C_SYNC_SOURCE;
+		if (ns.conn == C_PAUSED_SYNC_T)
+			ns.conn = C_SYNC_TARGET;
+	}
+
+	return ns;
+}
+
+/* helper for __drbd_set_state */
+static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
+{
+	if (mdev->agreed_pro_version < 90)
+		mdev->ov_start_sector = 0;
+	mdev->rs_total = drbd_bm_bits(mdev);
+	mdev->ov_position = 0;
+	if (cs == C_VERIFY_T) {
+		/* starting online verify from an arbitrary position
+		 * does not fit well into the existing protocol.
+		 * on C_VERIFY_T, we initialize ov_left and friends
+		 * implicitly in receive_DataRequest once the
+		 * first P_OV_REQUEST is received */
+		mdev->ov_start_sector = ~(sector_t)0;
+	} else {
+		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
+		if (bit >= mdev->rs_total) {
+			mdev->ov_start_sector =
+				BM_BIT_TO_SECT(mdev->rs_total - 1);
+			mdev->rs_total = 1;
+		} else
+			mdev->rs_total -= bit;
+		mdev->ov_position = mdev->ov_start_sector;
+	}
+	mdev->ov_left = mdev->rs_total;
+}
+
+static void drbd_resume_al(struct drbd_conf *mdev)
+{
+	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
+		dev_info(DEV, "Resumed AL updates\n");
+}
+
+/**
+ * __drbd_set_state() - Set a new DRBD state
+ * @mdev:	DRBD device.
+ * @ns:		new state.
+ * @flags:	Flags
+ * @done:	Optional completion; it will be completed after after_state_ch() has finished
+ *
+ * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ */
+enum drbd_state_rv
+__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
+	         enum chg_state_flags flags, struct completion *done)
+{
+#if DRBD_DEBUG_STATE_CHANGES
+	static unsigned long long sseq = 0xff000000LLU;
+#endif
+	union drbd_state os;
+	enum drbd_state_rv rv = SS_SUCCESS;
+	enum sanitize_state_warnings ssw;
+	struct after_state_chg_work *ascw;
+
+
+	os = mdev->state;
+
+#if DRBD_DEBUG_STATE_CHANGES
+	if (!ns.seq) {
+		ns.seq = ++sseq;
+		drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, "==os", os);
+		drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, "==ns", ns);
+	}
+#endif
+
+	ns = sanitize_state(mdev, os, ns, &ssw);
+
+#if DRBD_DEBUG_STATE_CHANGES
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, "=>ns", ns);
+#endif
+
+	if (ns.i == os.i)
+		return SS_NOTHING_TO_DO;
+
+	if (!(flags & CS_HARD)) {
+		/*  pre-state-change checks ; only look at ns  */
+		/* See drbd_state_sw_errors in drbd_strings.c */
+
+		rv = is_valid_state(mdev, ns);
+		if (rv < SS_SUCCESS) {
+			/* If the old state was illegal as well, then let
+			   this happen...*/
+
+			if (is_valid_state(mdev, os) == rv)
+				rv = is_valid_state_transition(mdev, ns, os);
+		} else
+			rv = is_valid_state_transition(mdev, ns, os);
+	}
+
+	if (rv < SS_SUCCESS) {
+		if (flags & CS_VERBOSE)
+			print_st_err(mdev, os, ns, rv);
+		return rv;
+	}
+
+	print_sanitize_warnings(mdev, ssw);
+
+#if DUMP_MD >= 2
+	{
+	char *pbp, pb[300];
+	pbp = pb;
+	*pbp = 0;
+	if (ns.role != os.role)
+		pbp += sprintf(pbp, "role( %s -> %s ) ",
+			       drbd_role_str(os.role),
+			       drbd_role_str(ns.role));
+	if (ns.peer != os.peer)
+		pbp += sprintf(pbp, "peer( %s -> %s ) ",
+			       drbd_role_str(os.peer),
+			       drbd_role_str(ns.peer));
+	if (ns.conn != os.conn)
+		pbp += sprintf(pbp, "conn( %s -> %s ) ",
+			       drbd_conn_str(os.conn),
+			       drbd_conn_str(ns.conn));
+	if (ns.disk != os.disk)
+		pbp += sprintf(pbp, "disk( %s -> %s ) ",
+			       drbd_disk_str(os.disk),
+			       drbd_disk_str(ns.disk));
+	if (ns.pdsk != os.pdsk)
+		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
+			       drbd_disk_str(os.pdsk),
+			       drbd_disk_str(ns.pdsk));
+	if (is_susp(ns) != is_susp(os))
+		pbp += sprintf(pbp, "susp( %d -> %d ) ",
+			       is_susp(os),
+			       is_susp(ns));
+	if (ns.aftr_isp != os.aftr_isp)
+		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
+			       os.aftr_isp,
+			       ns.aftr_isp);
+	if (ns.peer_isp != os.peer_isp)
+		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
+			       os.peer_isp,
+			       ns.peer_isp);
+	if (ns.user_isp != os.user_isp)
+		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
+			       os.user_isp,
+			       ns.user_isp);
+	dev_info(DEV, "%s\n", pb);
+	}
+#endif
+
+#if DRBD_DEBUG_STATE_CHANGES
+	drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, ":=ns", ns);
+#endif
+
+	/* solve the race between becoming unconfigured,
+	 * worker doing the cleanup, and
+	 * admin reconfiguring us:
+	 * on (re)configure, first set CONFIG_PENDING,
+	 * then wait for a potentially exiting worker,
+	 * start the worker, and schedule one no_op.
+	 * then proceed with configuration.
+	 */
+	if (ns.disk == D_DISKLESS &&
+	    ns.conn == C_STANDALONE &&
+	    ns.role == R_SECONDARY &&
+	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
+		set_bit(DEVICE_DYING, &mdev->flags);
+
+	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+	 * on the ldev here, to be sure the transition -> D_DISKLESS (and,
+	 * respectively, drbd_ldev_destroy()) won't happen before our
+	 * corresponding after_state_ch work runs, where we put_ldev again. */
+	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
+	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
+		atomic_inc(&mdev->local_cnt);
+
+	/* the assignment includes debug info about which code path
+	 * initiated this state change. */
+	mdev->state = ns;
+
+	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
+		drbd_print_uuids(mdev, "attached to UUIDs");
+
+	wake_up(&mdev->misc_wait);
+	wake_up(&mdev->state_wait);
+
+	/* aborted verify run. log the last position */
+	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+	    ns.conn < C_CONNECTED) {
+		mdev->ov_start_sector =
+			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
+		dev_info(DEV, "Online Verify reached sector %llu\n",
+			(unsigned long long)mdev->ov_start_sector);
+	}
+
+	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
+		dev_info(DEV, "Syncer continues.\n");
+		mdev->rs_paused += (long)jiffies
+				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
+		if (ns.conn == C_SYNC_TARGET)
+			mod_timer(&mdev->resync_timer, jiffies);
+	}
+
+	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
+	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+		dev_info(DEV, "Resync suspended\n");
+		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
+	}
+
+	if (os.conn == C_CONNECTED &&
+	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+		unsigned long now = jiffies;
+		int i;
+
+		set_ov_position(mdev, ns.conn);
+		mdev->rs_start = now;
+		mdev->rs_last_events = 0;
+		mdev->rs_last_sect_ev = 0;
+		mdev->ov_last_oos_size = 0;
+		mdev->ov_last_oos_start = 0;
+
+		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+			mdev->rs_mark_left[i] = mdev->ov_left;
+			mdev->rs_mark_time[i] = now;
+		}
+
+		drbd_rs_controller_reset(mdev);
+
+		if (ns.conn == C_VERIFY_S) {
+			dev_info(DEV, "Starting Online Verify from sector %llu\n",
+					(unsigned long long)mdev->ov_position);
+			mod_timer(&mdev->resync_timer, jiffies);
+		}
+	}
+
+	if (get_ldev(mdev)) {
+		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
+			mdf |= MDF_CRASHED_PRIMARY;
+		if (mdev->state.role == R_PRIMARY ||
+		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
+			mdf |= MDF_PRIMARY_IND;
+		if (mdev->state.conn > C_WF_REPORT_PARAMS)
+			mdf |= MDF_CONNECTED_IND;
+		if (mdev->state.disk > D_INCONSISTENT)
+			mdf |= MDF_CONSISTENT;
+		if (mdev->state.disk > D_OUTDATED)
+			mdf |= MDF_WAS_UP_TO_DATE;
+		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
+			mdf |= MDF_PEER_OUT_DATED;
+		if (mdf != mdev->ldev->md.flags) {
+			mdev->ldev->md.flags = mdf;
+			drbd_md_mark_dirty(mdev);
+		}
+		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
+		put_ldev(mdev);
+	}
+
+	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
+	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+		set_bit(CONSIDER_RESYNC, &mdev->flags);
+
+	/* Receiver should clean up itself */
+	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+		drbd_thread_stop_nowait(&mdev->receiver);
+
+	/* Now the receiver finished cleaning up itself, it should die */
+	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+		drbd_thread_stop_nowait(&mdev->receiver);
+
+	/* Upon network failure, we need to restart the receiver. */
+	if (os.conn > C_WF_CONNECTION &&
+	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+		drbd_thread_restart_nowait(&mdev->receiver);
+
+	/* Resume AL writing if we get a connection */
+	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
+		drbd_resume_al(mdev);
+
+	/* remember last connect and attach times so request_timer_fn() won't
+	 * kill newly established sessions while we are still trying to thaw
+	 * previously frozen IO */
+	if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
+		mdev->last_reconnect_jif = jiffies;
+	if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+	    ns.disk > D_NEGOTIATING)
+		mdev->last_reattach_jif = jiffies;
+
+	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+	if (ascw) {
+		ascw->os = os;
+		ascw->ns = ns;
+		ascw->flags = flags;
+		ascw->w.cb = w_after_state_ch;
+		ascw->done = done;
+		drbd_queue_work(&mdev->data.work, &ascw->w);
+	} else {
+		dev_warn(DEV, "Could not kmalloc an ascw\n");
+	}
+
+	return rv;
+}
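+
+/* Illustrative usage sketch, not part of this patch: callers never invoke
+ * __drbd_set_state() directly; they go through a wrapper while holding
+ * req_lock, as after_state_ch() below does for susp_fen:
+ *
+ *	spin_lock_irq(&mdev->req_lock);
+ *	_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
+ *	spin_unlock_irq(&mdev->req_lock);
+ */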
+
+STATIC int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct after_state_chg_work *ascw =
+		container_of(w, struct after_state_chg_work, w);
+	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
+	if (ascw->flags & CS_WAIT_COMPLETE) {
+		D_ASSERT(ascw->done != NULL);
+		complete(ascw->done);
+	}
+	kfree(ascw);
+
+	return 1;
+}
+
+static void abw_start_sync(struct drbd_conf *mdev, int rv)
+{
+	if (rv) {
+		dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
+		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
+		return;
+	}
+
+	switch (mdev->state.conn) {
+	case C_STARTING_SYNC_T:
+		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		break;
+	case C_STARTING_SYNC_S:
+		drbd_start_resync(mdev, C_SYNC_SOURCE);
+		break;
+	}
+}
+
+int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
+		int (*io_fn)(struct drbd_conf *),
+		char *why, enum bm_flag flags)
+{
+	int rv;
+
+	D_ASSERT(current == mdev->worker.task);
+
+	/* open coded non-blocking drbd_suspend_io(mdev); */
+	set_bit(SUSPEND_IO, &mdev->flags);
+
+	drbd_bm_lock(mdev, why, flags);
+	rv = io_fn(mdev);
+	drbd_bm_unlock(mdev);
+
+	drbd_resume_io(mdev);
+
+	return rv;
+}
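+
+/* Illustrative usage sketch, mirroring the "demote" call site in
+ * after_state_ch() below: the worker flushes the bitmap to disk while
+ * application IO is briefly suspended:
+ *
+ *	drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
+ *		"demote", BM_LOCKED_TEST_ALLOWED);
+ */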
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @mdev:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @flags:	Flags
+ */
+STATIC void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags)
+{
+	enum drbd_fencing_p fp;
+	enum drbd_req_event what = nothing;
+	union drbd_state nsm = (union drbd_state){ .i = -1 };
+
+#if DRBD_DEBUG_STATE_CHANGES
+	if (ns.seq) {
+		drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, ">>os", os);
+		drbd_state_dbg(mdev, ns.seq, ns.func, ns.line, ">>ns", ns);
+	}
+#endif
+
+	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+		clear_bit(CRASHED_PRIMARY, &mdev->flags);
+		if (mdev->p_uuid)
+			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
+	}
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	/* Inform userspace about the change... */
+	drbd_bcast_state(mdev, ns);
+
+	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+		drbd_khelper(mdev, "pri-on-incon-degr");
+
+	/* Here we have the actions that are performed after a
+	   state change. This function might sleep */
+
+	if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
+		mod_timer(&mdev->request_timer, jiffies + HZ);
+
+	nsm.i = -1;
+	if (ns.susp_nod) {
+		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
+			what = resend;
+
+		if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+		    ns.disk > D_NEGOTIATING)
+			what = restart_frozen_disk_io;
+
+		if (what != nothing)
+			nsm.susp_nod = 0;
+	}
+
+	if (ns.susp_fen) {
+		/* case1: The outdate peer handler is successful: */
+		if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
+			tl_clear(mdev);
+			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
+				drbd_uuid_new_current(mdev);
+				clear_bit(NEW_CUR_UUID, &mdev->flags);
+			}
+			spin_lock_irq(&mdev->req_lock);
+			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
+			spin_unlock_irq(&mdev->req_lock);
+		}
+		/* case2: The connection was established again: */
+		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+			clear_bit(NEW_CUR_UUID, &mdev->flags);
+			what = resend;
+			nsm.susp_fen = 0;
+		}
+	}
+
+	if (what != nothing) {
+		spin_lock_irq(&mdev->req_lock);
+		_tl_restart(mdev, what);
+		nsm.i &= mdev->state.i;
+		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
+		spin_unlock_irq(&mdev->req_lock);
+	}
+
+	/* Became sync source.  With protocol >= 96, we still need to send out
+	 * the sync uuid now. Need to do that before any drbd_send_state, or
+	 * the other side may go "paused sync" before receiving the sync uuids,
+	 * which is unexpected. */
+	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
+	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
+	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
+		drbd_gen_and_send_sync_uuid(mdev);
+		put_ldev(mdev);
+	}
+
+	/* Do not change the order of the if above and the two below... */
+	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
+		drbd_send_uuids(mdev);
+		drbd_send_state(mdev, ns);
+	}
+	/* No point in queuing send_bitmap if we don't have a connection
+	 * anymore, so check also the _current_ state, not only the new state
+	 * at the time this work was queued. */
+	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
+	    mdev->state.conn == C_WF_BITMAP_S)
+		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
+				"send_bitmap (WFBitMapS)",
+				BM_LOCKED_TEST_ALLOWED);
+
+	/* Lost contact to peer's copy of the data */
+	if ((os.pdsk >= D_INCONSISTENT &&
+	     os.pdsk != D_UNKNOWN &&
+	     os.pdsk != D_OUTDATED)
+	&&  (ns.pdsk < D_INCONSISTENT ||
+	     ns.pdsk == D_UNKNOWN ||
+	     ns.pdsk == D_OUTDATED)) {
+		if (get_ldev(mdev)) {
+			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+				if (is_susp(mdev->state)) {
+					set_bit(NEW_CUR_UUID, &mdev->flags);
+				} else {
+					drbd_uuid_new_current(mdev);
+					drbd_send_uuids(mdev);
+				}
+			}
+			put_ldev(mdev);
+		}
+	}
+
+	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
+		if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+		    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+			drbd_uuid_new_current(mdev);
+			drbd_send_uuids(mdev);
+		}
+		/* D_DISKLESS Peer becomes secondary */
+		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+			/* We may still be Primary ourselves.
+			 * No harm done if the bitmap still changes,
+			 * redirtied pages will follow later. */
+			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
+				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
+		put_ldev(mdev);
+	}
+
+	/* Write out all changed bits on demote.
+	 * Though, no need to do that just yet
+	 * if there is still a resync going on */
+	if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
+		mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
+		/* No changes to the bitmap expected this time, so assert that,
+		 * even though no harm would be done if it did change. */
+		drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
+				"demote", BM_LOCKED_TEST_ALLOWED);
+		put_ldev(mdev);
+	}
+
+	/* Last part of the attaching process ... */
+	if (ns.conn >= C_CONNECTED &&
+	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
+		drbd_send_uuids(mdev);
+		drbd_send_state(mdev, ns);
+	}
+
+	/* We want to pause/continue resync, tell peer. */
+	if (ns.conn >= C_CONNECTED &&
+	     ((os.aftr_isp != ns.aftr_isp) ||
+	      (os.user_isp != ns.user_isp)))
+		drbd_send_state(mdev, ns);
+
+	/* In case one of the isp bits got set, suspend other devices. */
+	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+		suspend_other_sg(mdev);
+
+	/* Make sure the peer gets informed about any state
+	   changes (ISP bits) that happened while we were in WFReportParams. */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+		drbd_send_state(mdev, ns);
+
+	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
+		drbd_send_state(mdev, ns);
+
+	/* We are in the process of starting a full sync... */
+	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+		/* no other bitmap changes expected during this phase */
+		drbd_queue_bitmap_io(mdev,
+			&drbd_bmio_set_n_write, &abw_start_sync,
+			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
+
+	/* We are invalidating ourselves... */
+	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
+	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
+		/* other bitmap operation expected during this phase */
+		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
+			"set_n_write from invalidate", BM_LOCKED_MASK);
+
+	/* first half of local IO error, failure to attach,
+	 * or administrative detach */
+	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
+		enum drbd_io_error_p eh = EP_PASS_ON;
+		int was_io_error = 0;
+		/* corresponding get_ldev was in __drbd_set_state, to serialize
+		 * our cleanup here with the transition to D_DISKLESS.
+		 * But it is still not safe to dereference ldev here, since
+		 * we might come from a failed Attach before ldev was set. */
+		if (mdev->ldev) {
+			eh = mdev->ldev->dc.on_io_error;
+			was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+			/* Immediately allow completion of all application IO
+			   that waits for completion from the local disk. */
+			tl_abort_disk_io(mdev);
+
+			/* current state still has to be D_FAILED,
+			 * there is only one way out: to D_DISKLESS,
+			 * and that may only happen after our put_ldev below. */
+			if (mdev->state.disk != D_FAILED)
+				dev_err(DEV,
+					"ASSERT FAILED: disk is %s during detach\n",
+					drbd_disk_str(mdev->state.disk));
+
+			if (ns.conn >= C_CONNECTED)
+				drbd_send_state(mdev, ns);
+
+			drbd_rs_cancel_all(mdev);
+
+			/* In case we want to get something to stable storage still,
+			 * this may be the last chance.
+			 * Following put_ldev may transition to D_DISKLESS. */
+			drbd_md_sync(mdev);
+		}
+		put_ldev(mdev);
+
+		if (was_io_error && eh == EP_CALL_HELPER)
+			drbd_khelper(mdev, "local-io-error");
+	}
+
+	/* second half of local IO error, failure to attach,
+	 * or administrative detach,
+	 * after local_cnt references have reached zero again */
+	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
+		/* We must still be diskless,
+		 * re-attach has to be serialized with this! */
+		if (mdev->state.disk != D_DISKLESS)
+			dev_err(DEV,
+				"ASSERT FAILED: disk is %s while going diskless\n",
+				drbd_disk_str(mdev->state.disk));
+
+		mdev->rs_total = 0;
+		mdev->rs_failed = 0;
+		atomic_set(&mdev->rs_pending_cnt, 0);
+
+		if (ns.conn >= C_CONNECTED)
+			drbd_send_state(mdev, ns);
+
+		/* corresponding get_ldev in __drbd_set_state
+		 * this may finally trigger drbd_ldev_destroy. */
+		put_ldev(mdev);
+	}
+
+	/* Notify peer that I had a local IO error and did not detach. */
+	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+		drbd_send_state(mdev, ns);
+
+	/* Disks got bigger while they were detached */
+	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
+		if (ns.conn == C_CONNECTED)
+			resync_after_online_grow(mdev);
+	}
+
+	/* A resync finished or aborted, wake paused devices... */
+	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+	    (os.peer_isp && !ns.peer_isp) ||
+	    (os.user_isp && !ns.user_isp))
+		resume_next_sg(mdev);
+
+	/* sync target done with resync. Explicitly notify peer, even though
+	 * it should (at least for non-empty resyncs) already know by itself. */
+	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+		drbd_send_state(mdev, ns);
+
+	/* Wake up role changes that were delayed while the connection was being established */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
+		clear_bit(STATE_SENT, &mdev->flags);
+		wake_up(&mdev->state_wait);
+	}
+
+	/* This triggers bitmap writeout of potentially still unwritten pages
+	 * if the resync finished cleanly, or aborted because of peer disk
+	 * failure, or because of connection loss.
+	 * For resync aborted because of local disk failure, we cannot do
+	 * any bitmap writeout anymore.
+	 * No harm done if some bits change during this phase.
+	 */
+	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
+		drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
+			"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
+		put_ldev(mdev);
+	}
+
+	/* free tl_hash if we got thawed and are C_STANDALONE */
+	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
+		drbd_free_tl_hash(mdev);
+
+	/* Upon network connection, we need to start the receiver */
+	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
+		drbd_thread_start(&mdev->receiver);
+
+	/* Terminate worker thread if we are unconfigured - it will be
+	   restarted as needed... */
+	if (ns.disk == D_DISKLESS &&
+	    ns.conn == C_STANDALONE &&
+	    ns.role == R_SECONDARY) {
+		if (os.aftr_isp != ns.aftr_isp)
+			resume_next_sg(mdev);
+		/* set in __drbd_set_state, unless CONFIG_PENDING was set */
+		if (test_bit(DEVICE_DYING, &mdev->flags))
+			drbd_thread_stop_nowait(&mdev->worker);
+	}
+
+	drbd_md_sync(mdev);
+}
+
+STATIC int drbd_thread_setup(void *arg)
+{
+	struct drbd_thread *thi = (struct drbd_thread *) arg;
+	struct drbd_conf *mdev = thi->mdev;
+	unsigned long flags;
+	long timeout;
+	int retval;
+	const char *me =
+		thi == &mdev->receiver ? "receiver" :
+		thi == &mdev->asender  ? "asender"  :
+		thi == &mdev->worker   ? "worker"   : "NONSENSE";
+
+	daemonize("drbd_thread");
+	D_ASSERT(get_t_state(thi) == Running);
+	D_ASSERT(thi->task == NULL);
+	/* state engine takes this lock (in drbd_thread_stop_nowait)
+	 * while holding the req_lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+	thi->task = current;
+	smp_mb();
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	complete(&thi->startstop); /* notify: thi->task is set. */
+	timeout = schedule_timeout(10*HZ);
+	D_ASSERT(timeout != 0);
+
+restart:
+	retval = thi->function(thi);
+
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	/* if the receiver has been "Exiting", the last thing it did
+	 * was set the conn state to "StandAlone",
+	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
+	 * and receiver thread will be "started".
+	 * drbd_thread_start needs to set "Restarting" in that case.
+	 * t_state check and assignment needs to be within the same spinlock,
+	 * so either thread_start sees Exiting, and can remap to Restarting,
+	 * or thread_start see None, and can proceed as normal.
+	 */
+
+	if (thi->t_state == Restarting) {
+		dev_info(DEV, "Restarting %s thread\n", me);
+		thi->t_state = Running;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		goto restart;
+	}
+
+	thi->task = NULL;
+	thi->t_state = None;
+	smp_mb();
+
+	/* THINK maybe two different completions? */
+	complete(&thi->startstop); /* notify: thi->task unset. */
+	dev_info(DEV, "Terminating %s thread\n", me);
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	/* Release mod reference taken when thread was started */
+	module_put(THIS_MODULE);
+	return retval;
+}
+
+STATIC void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
+		      int (*func) (struct drbd_thread *))
+{
+	spin_lock_init(&thi->t_lock);
+	thi->task    = NULL;
+	thi->t_state = None;
+	thi->function = func;
+	thi->mdev = mdev;
+}
+
+int drbd_thread_start(struct drbd_thread *thi)
+{
+	int pid;
+	struct drbd_conf *mdev = thi->mdev;
+	unsigned long flags;
+	const char *me =
+		thi == &mdev->receiver ? "receiver" :
+		thi == &mdev->asender  ? "asender"  :
+		thi == &mdev->worker   ? "worker"   : "NONSENSE";
+
+	/* is used from state engine doing drbd_thread_stop_nowait,
+	 * while holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	switch (thi->t_state) {
+	case None:
+		dev_info(DEV, "Starting %s thread (from %s [%d])\n",
+				me, current->comm, current->pid);
+
+		/* Get ref on module for thread - this is released when thread exits */
+		if (!try_module_get(THIS_MODULE)) {
+			dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return false;
+		}
+
+		init_completion(&thi->startstop);
+		D_ASSERT(thi->task == NULL);
+		thi->reset_cpu_mask = 1;
+		thi->t_state = Running;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
+
+		pid = kernel_thread(drbd_thread_setup, (void *) thi, CLONE_FS);
+		if (pid < 0) {
+			dev_err(DEV, "Couldn't start thread (%d)\n", pid);
+
+			module_put(THIS_MODULE);
+			return false;
+		}
+		/* waits until thi->task is set */
+		wait_for_completion(&thi->startstop);
+		if (thi->t_state != Running)
+			dev_err(DEV, "ASSERT FAILED: %s t_state == %d expected %d.\n",
+					me, thi->t_state, Running);
+		if (thi->task)
+			wake_up_process(thi->task);
+		else
+			dev_err(DEV, "ASSERT FAILED thi->task is NULL where it should be set!?\n");
+		break;
+	case Exiting:
+		thi->t_state = Restarting;
+		dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
+				me, current->comm, current->pid);
+		/* fall through */
+	case Running:
+	case Restarting:
+	default:
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		break;
+	}
+
+	return true;
+}
+
+void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	unsigned long flags;
+	enum drbd_thread_state ns = restart ? Restarting : Exiting;
+	const char *me =
+		thi == &mdev->receiver ? "receiver" :
+		thi == &mdev->asender  ? "asender"  :
+		thi == &mdev->worker   ? "worker"   : "NONSENSE";
+
+	/* may be called from state engine, holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	/* dev_info(DEV, "drbd_thread_stop: %s [%d]: %s %d -> %d; %d\n",
+	     current->comm, current->pid,
+	     thi->task ? thi->task->comm : "NULL", thi->t_state, ns, wait); */
+
+	if (thi->t_state == None) {
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		if (restart)
+			drbd_thread_start(thi);
+		return;
+	}
+
+	if (thi->t_state != ns) {
+		if (thi->task == NULL) {
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return;
+		}
+
+		thi->t_state = ns;
+		smp_mb();
+		init_completion(&thi->startstop);
+		if (thi->task != current)
+			force_sig(DRBD_SIGKILL, thi->task);
+		else
+			D_ASSERT(!wait);
+	}
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	if (wait) {
+		D_ASSERT(thi->task != current);
+		wait_for_completion(&thi->startstop);
+		spin_lock_irqsave(&thi->t_lock, flags);
+		D_ASSERT(thi->task == NULL);
+		if (thi->t_state != None)
+			dev_err(DEV, "ASSERT FAILED: %s t_state == %d expected %d.\n",
+					me, thi->t_state, None);
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+	}
+}
+
+#ifdef CONFIG_SMP
+/**
+ * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
+ * @mdev:	DRBD device.
+ *
+ * Forces all threads of a device onto the same CPU. This is beneficial for
+ * DRBD's performance. May be overwritten by user's configuration.
+ */
+void drbd_calc_cpu_mask(struct drbd_conf *mdev)
+{
+	int ord, cpu;
+
+	/* user override. */
+	if (cpumask_weight(mdev->cpu_mask))
+		return;
+
+	ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
+	for_each_online_cpu(cpu) {
+		if (ord-- == 0) {
+			cpumask_set_cpu(cpu, mdev->cpu_mask);
+			return;
+		}
+	}
+	/* should not be reached */
+	cpumask_setall(mdev->cpu_mask);
+}
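+
+/* Worked example (illustrative): with 4 online CPUs, device minors
+ * 0,1,2,3,4,5,... yield ord = minor % 4 = 0,1,2,3,0,1,..., so all threads
+ * of minor 5 end up pinned to the second online CPU. */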
+
+/**
+ * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
+ * @mdev:	DRBD device.
+ *
+ * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
+ * prematurely.
+ */
+void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
+{
+	struct task_struct *p = current;
+	struct drbd_thread *thi =
+		p == mdev->asender.task  ? &mdev->asender  :
+		p == mdev->receiver.task ? &mdev->receiver :
+		p == mdev->worker.task   ? &mdev->worker   :
+		NULL;
+	ERR_IF(thi == NULL)
+		return;
+	if (!thi->reset_cpu_mask)
+		return;
+	thi->reset_cpu_mask = 0;
+	set_cpus_allowed_ptr(p, mdev->cpu_mask);
+}
+#endif
+
+/* the appropriate socket mutex must be held already */
+int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
+			  enum drbd_packets cmd, struct p_header80 *h,
+			  size_t size, unsigned msg_flags)
+{
+	int sent, ok;
+
+	ERR_IF(!h) return false;
+	ERR_IF(!size) return false;
+
+	h->magic   = BE_DRBD_MAGIC;
+	h->command = cpu_to_be16(cmd);
+	h->length  = cpu_to_be16(size-sizeof(struct p_header80));
+
+	trace_drbd_packet(mdev, sock, 0, (void *)h, __FILE__, __LINE__);
+	sent = drbd_send(mdev, sock, h, size, msg_flags);
+
+	ok = (sent == size);
+	if (!ok && !signal_pending(current))
+		dev_warn(DEV, "short sent %s size=%d sent=%d\n",
+		    cmdname(cmd), (int)size, sent);
+	return ok;
+}
+
+/* don't pass the socket. we may only look at it
+ * when we hold the appropriate socket mutex.
+ */
+int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
+		  enum drbd_packets cmd, struct p_header80 *h, size_t size)
+{
+	int ok = 0;
+	struct socket *sock;
+
+	if (use_data_socket) {
+		mutex_lock(&mdev->data.mutex);
+		sock = mdev->data.socket;
+	} else {
+		mutex_lock(&mdev->meta.mutex);
+		sock = mdev->meta.socket;
+	}
+
+	/* drbd_disconnect() could have called drbd_free_sock()
+	 * while we were waiting in down()... */
+	if (likely(sock != NULL))
+		ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
+
+	if (use_data_socket)
+		mutex_unlock(&mdev->data.mutex);
+	else
+		mutex_unlock(&mdev->meta.mutex);
+	return ok;
+}
+
+int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
+		   size_t size)
+{
+	struct p_header80 h;
+	int ok;
+
+	h.magic   = BE_DRBD_MAGIC;
+	h.command = cpu_to_be16(cmd);
+	h.length  = cpu_to_be16(size);
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+
+	trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&h, __FILE__, __LINE__);
+
+	ok = (sizeof(h) ==
+		drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
+	ok = ok && (size ==
+		drbd_send(mdev, mdev->data.socket, data, size, 0));
+
+	drbd_put_data_sock(mdev);
+
+	return ok;
+}
+
+int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
+{
+	struct p_rs_param_95 *p;
+	struct socket *sock;
+	int size, rv;
+	const int apv = mdev->agreed_pro_version;
+
+	size = apv <= 87 ? sizeof(struct p_rs_param)
+		: apv == 88 ? sizeof(struct p_rs_param)
+			+ strlen(mdev->sync_conf.verify_alg) + 1
+		: apv <= 94 ? sizeof(struct p_rs_param_89)
+		: /* apv >= 95 */ sizeof(struct p_rs_param_95);
+
+	/* used from admin command context and receiver/worker context.
+	 * to avoid kmalloc, grab the socket right here,
+	 * then use the pre-allocated sbuf there */
+	mutex_lock(&mdev->data.mutex);
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
+
+		p = &mdev->data.sbuf.rs_param_95;
+
+		/* initialize verify_alg and csums_alg */
+		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+		p->rate = cpu_to_be32(sc->rate);
+		p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
+		p->c_delay_target = cpu_to_be32(sc->c_delay_target);
+		p->c_fill_target = cpu_to_be32(sc->c_fill_target);
+		p->c_max_rate = cpu_to_be32(sc->c_max_rate);
+
+		if (apv >= 88)
+			strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
+		if (apv >= 89)
+			strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
+
+		rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
+	} else
+		rv = 0; /* not ok */
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return rv;
+}
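+
+/* Worked example (illustrative): an apv 88 peer receives p_rs_param plus the
+ * NUL-terminated verify_alg name; apv 89..94 peers receive the fixed-size
+ * p_rs_param_89; apv >= 95 peers additionally receive the c_* resync
+ * controller settings in the p_rs_param_95 layout. */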
+
+int drbd_send_protocol(struct drbd_conf *mdev)
+{
+	struct p_protocol *p;
+	int size, cf, rv;
+
+	size = sizeof(struct p_protocol);
+
+	if (mdev->agreed_pro_version >= 87)
+		size += strlen(mdev->net_conf->integrity_alg) + 1;
+
+	/* we must not recurse into our own queue,
+	 * as that is blocked during handshake */
+	p = kmalloc(size, GFP_NOIO);
+	if (p == NULL)
+		return 0;
+
+	p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
+	p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
+	p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
+	p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
+	p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
+
+	cf = 0;
+	if (mdev->net_conf->want_lose)
+		cf |= CF_WANT_LOSE;
+	if (mdev->net_conf->dry_run) {
+		if (mdev->agreed_pro_version >= 92)
+			cf |= CF_DRY_RUN;
+		else {
+			dev_err(DEV, "--dry-run is not supported by peer");
+			kfree(p);
+			return -1;
+		}
+	}
+	p->conn_flags    = cpu_to_be32(cf);
+
+	if (mdev->agreed_pro_version >= 87)
+		strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
+
+	rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
+			   (struct p_header80 *)p, size);
+	kfree(p);
+	return rv;
+}
+
+int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
+{
+	struct p_uuids p;
+	int i;
+
+	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
+		return 1;
+
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
+
+	mdev->comm_bm_set = drbd_bm_total_weight(mdev);
+	p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
+	uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
+	uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
+	uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
+	p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+
+	put_ldev(mdev);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
+			     (struct p_header80 *)&p, sizeof(p));
+}
+
+int drbd_send_uuids(struct drbd_conf *mdev)
+{
+	return _drbd_send_uuids(mdev, 0);
+}
+
+int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
+{
+	return _drbd_send_uuids(mdev, 8);
+}
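+
+/* For reference (illustrative summary of the flag bits assembled in
+ * _drbd_send_uuids() above): 1 = want_lose, 2 = crashed primary,
+ * 4 = negotiated disk state is D_INCONSISTENT, 8 = skip initial sync. */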
+
+void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
+{
+	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		u64 *uuid = mdev->ldev->md.uuid;
+		dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
+		     text,
+		     (unsigned long long)uuid[UI_CURRENT],
+		     (unsigned long long)uuid[UI_BITMAP],
+		     (unsigned long long)uuid[UI_HISTORY_START],
+		     (unsigned long long)uuid[UI_HISTORY_END]);
+		put_ldev(mdev);
+	} else {
+		dev_info(DEV, "%s effective data uuid: %016llX\n",
+				text,
+				(unsigned long long)mdev->ed_uuid);
+	}
+}
+
+int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
+{
+	struct p_rs_uuid p;
+	u64 uuid;
+
+	D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
+
+	uuid = mdev->ldev->md.uuid[UI_BITMAP];
+	if (uuid && uuid != UUID_JUST_CREATED)
+		uuid = uuid + UUID_NEW_BM_OFFSET;
+	else
+		get_random_bytes(&uuid, sizeof(u64));
+	drbd_uuid_set(mdev, UI_BITMAP, uuid);
+	drbd_print_uuids(mdev, "updated sync UUID");
+	drbd_md_sync(mdev);
+	p.uuid = cpu_to_be64(uuid);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
+			     (struct p_header80 *)&p, sizeof(p));
+}
+
+int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
+{
+	struct p_sizes p;
+	sector_t d_size, u_size;
+	int q_order_type, max_bio_size;
+	int ok;
+
+	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		D_ASSERT(mdev->ldev->backing_bdev);
+		d_size = drbd_get_max_capacity(mdev->ldev);
+		u_size = mdev->ldev->dc.disk_size;
+		q_order_type = drbd_queue_order_type(mdev);
+		max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
+		max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
+		put_ldev(mdev);
+	} else {
+		d_size = 0;
+		u_size = 0;
+		q_order_type = QUEUE_ORDERED_NONE;
+		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
+	}
+
+	/* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
+	if (mdev->agreed_pro_version <= 94)
+		max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+
+	p.d_size = cpu_to_be64(d_size);
+	p.u_size = cpu_to_be64(u_size);
+	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
+	p.max_bio_size = cpu_to_be32(max_bio_size);
+	p.queue_order_type = cpu_to_be16(q_order_type);
+	p.dds_flags = cpu_to_be16(flags);
+
+	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
+			   (struct p_header80 *)&p, sizeof(p));
+	return ok;
+}
+
+/**
+ * drbd_send_current_state() - Sends the drbd state to the peer
+ * @mdev:	DRBD device.
+ */
+int drbd_send_current_state_(struct drbd_conf *mdev, const char *func, unsigned int line)
+{
+	struct socket *sock;
+	struct p_state p;
+	int ok = 0;
+
+	/* Grab the state lock so we won't send state if we're in the middle
+	 * of a cluster wide state change on another thread */
+	drbd_state_lock(mdev);
+
+	mutex_lock(&mdev->data.mutex);
+
+	p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		drbd_state_dbg(mdev, mdev->state.seq, func, line, "send-current", mdev->state);
+		ok = _drbd_send_cmd(mdev, sock, P_STATE,
+				    (struct p_header80 *)&p, sizeof(p), 0);
+	}
+
+	mutex_unlock(&mdev->data.mutex);
+
+	drbd_state_unlock(mdev);
+	return ok;
+}
+
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @mdev:	DRBD device.
+ * @state:	the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state_(struct drbd_conf *mdev, union drbd_state state, const char *func, unsigned int line)
+{
+	struct socket *sock;
+	struct p_state p;
+	int ok = 0;
+
+	mutex_lock(&mdev->data.mutex);
+
+	p.state = cpu_to_be32(state.i);
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		drbd_state_dbg(mdev, state.seq, func, line, "send", state);
+		ok = _drbd_send_cmd(mdev, sock, P_STATE,
+				    (struct p_header80 *)&p, sizeof(p), 0);
+	}
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return ok;
+}
+
+int drbd_send_state_req(struct drbd_conf *mdev,
+	union drbd_state mask, union drbd_state val)
+{
+	struct p_req_state p;
+
+	p.mask    = cpu_to_be32(mask.i);
+	p.val     = cpu_to_be32(val.i);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
+			     (struct p_header80 *)&p, sizeof(p));
+}
+
+int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
+{
+	struct p_req_state_reply p;
+
+	p.retcode    = cpu_to_be32(retcode);
+
+	return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
+			     (struct p_header80 *)&p, sizeof(p));
+}
+
+int fill_bitmap_rle_bits(struct drbd_conf *mdev,
+	struct p_compressed_bm *p,
+	struct bm_xfer_ctx *c)
+{
+	struct bitstream bs;
+	unsigned long plain_bits;
+	unsigned long tmp;
+	unsigned long rl;
+	unsigned len;
+	unsigned toggle;
+	int bits;
+
+	/* may we use this feature? */
+	if ((mdev->sync_conf.use_rle == 0) ||
+	    (mdev->agreed_pro_version < 90))
+		return 0;
+
+	if (c->bit_offset >= c->bm_bits)
+		return 0; /* nothing to do. */
+
+	/* use at most thus many bytes */
+	bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
+	memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
+	/* plain bits covered in this code string */
+	plain_bits = 0;
+
+	/* p->encoding & 0x80 stores whether the first run length is set.
+	 * bit offset is implicit.
+	 * start with toggle == 2 to be able to tell the first iteration */
+	toggle = 2;
+
+	/* see how many plain bits we can stuff into one packet
+	 * using RLE and VLI. */
+	do {
+		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
+				    : _drbd_bm_find_next(mdev, c->bit_offset);
+		if (tmp == -1UL)
+			tmp = c->bm_bits;
+		rl = tmp - c->bit_offset;
+
+		if (toggle == 2) { /* first iteration */
+			if (rl == 0) {
+				/* the first checked bit was set,
+				 * store start value, */
+				DCBP_set_start(p, 1);
+				/* but skip encoding of zero run length */
+				toggle = !toggle;
+				continue;
+			}
+			DCBP_set_start(p, 0);
+		}
+
+		/* paranoia: catch zero runlength.
+		 * can only happen if bitmap is modified while we scan it. */
+		if (rl == 0) {
+			dev_err(DEV, "unexpected zero runlength while encoding bitmap "
+			    "t:%u bo:%lu\n", toggle, c->bit_offset);
+			return -1;
+		}
+
+		bits = vli_encode_bits(&bs, rl);
+		if (bits == -ENOBUFS) /* buffer full */
+			break;
+		if (bits <= 0) {
+			dev_err(DEV, "error while encoding bitmap: %d\n", bits);
+			return 0;
+		}
+
+		toggle = !toggle;
+		plain_bits += rl;
+		c->bit_offset = tmp;
+	} while (c->bit_offset < c->bm_bits);
+
+	len = bs.cur.b - p->code + !!bs.cur.bit;
+
+	if (plain_bits < (len << 3)) {
+		/* incompressible with this method.
+		 * we need to rewind both word and bit position. */
+		c->bit_offset -= plain_bits;
+		bm_xfer_ctx_bit_to_word_offset(c);
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+		return 0;
+	}
+
+	/* RLE + VLI was able to compress it just fine.
+	 * update c->word_offset. */
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	/* store pad_bits */
+	DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+
+	return len;
+}
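+
+/* A minimal user-space sketch of the run-length pass above (illustrative
+ * only: plain uint64_t words instead of the DRBD bitmap, no VLI encoding;
+ * the name example_rle is made up). It alternates between runs of zeros
+ * and ones just like the toggle between _drbd_bm_find_next_zero() and
+ * _drbd_bm_find_next(). */
+#if 0
+#include <stdint.h>
+#include <stdio.h>
+
+static void example_rle(const uint64_t *bm, unsigned int nbits)
+{
+	unsigned int bit = 0, toggle = 0; /* start with a (possibly empty) zero run */
+
+	while (bit < nbits) {
+		unsigned int start = bit;
+
+		/* extend the current run while the bit value matches toggle */
+		while (bit < nbits &&
+		       ((bm[bit / 64] >> (bit % 64)) & 1) == toggle)
+			bit++;
+		printf("run of %u-bits, length %u\n", toggle, bit - start);
+		toggle = !toggle;
+	}
+}
+#endif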
+
+/**
+ * send_bitmap_rle_or_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+STATIC int
+send_bitmap_rle_or_plain(struct drbd_conf *mdev,
+			 struct p_header80 *h, struct bm_xfer_ctx *c)
+{
+	struct p_compressed_bm *p = (void*)h;
+	unsigned long num_words;
+	int len;
+	int ok;
+
+	len = fill_bitmap_rle_bits(mdev, p, c);
+
+	if (len < 0)
+		return -EIO;
+
+	if (len) {
+		DCBP_set_code(p, RLE_VLI_Bits);
+		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
+			sizeof(*p) + len, 0);
+
+		c->packets[0]++;
+		c->bytes[0] += sizeof(*p) + len;
+
+		if (c->bit_offset >= c->bm_bits)
+			len = 0; /* DONE */
+	} else {
+		/* was not compressible.
+		 * send a buffer full of plain text bits instead. */
+		num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
+		len = num_words * sizeof(long);
+		if (len)
+			drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
+		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
+				   h, sizeof(struct p_header80) + len, 0);
+		c->word_offset += num_words;
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+
+		c->packets[1]++;
+		c->bytes[1] += sizeof(struct p_header80) + len;
+
+		if (c->bit_offset > c->bm_bits)
+			c->bit_offset = c->bm_bits;
+	}
+	if (ok) {
+		if (len == 0) {
+			INFO_bm_xfer_stats(mdev, "send", c);
+			return 0;
+		} else
+			return 1;
+	}
+	return -EIO;
+}
+
+/* See the comment at receive_bitmap() */
+int _drbd_send_bitmap(struct drbd_conf *mdev)
+{
+	struct bm_xfer_ctx c;
+	struct p_header80 *p;
+	int err;
+
+	ERR_IF(!mdev->bitmap) return false;
+
+	/* maybe we should use some per thread scratch page,
+	 * and allocate that during initial device creation? */
+	p = (struct p_header80 *) __get_free_page(GFP_NOIO);
+	if (!p) {
+		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+		return false;
+	}
+
+	if (get_ldev(mdev)) {
+		if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+			dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
+			drbd_bm_set_all(mdev);
+			if (drbd_bm_write(mdev)) {
+				/* write_bm did fail! Leave full sync flag set in meta data,
+				 * but otherwise process as per normal - need to tell other
+				 * side that a full resync is required! */
+				dev_err(DEV, "Failed to write bitmap to disk!\n");
+			} else {
+				drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
+				drbd_md_sync(mdev);
+			}
+		}
+		put_ldev(mdev);
+	}
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(mdev),
+		.bm_words = drbd_bm_words(mdev),
+	};
+
+	do {
+		err = send_bitmap_rle_or_plain(mdev, p, &c);
+	} while (err > 0);
+
+	free_page((unsigned long) p);
+	return err == 0;
+}
+
+int drbd_send_bitmap(struct drbd_conf *mdev)
+{
+	int err;
+
+	if (!drbd_get_data_sock(mdev))
+		return -1;
+	err = !_drbd_send_bitmap(mdev);
+	drbd_put_data_sock(mdev);
+	return err;
+}
+
+int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
+{
+	int ok;
+	struct p_barrier_ack p;
+
+	p.barrier  = barrier_nr;
+	p.set_size = cpu_to_be32(set_size);
+
+	if (mdev->state.conn < C_CONNECTED)
+		return false;
+	ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
+			(struct p_header80 *)&p, sizeof(p));
+	return ok;
+}
+
+/**
+ * _drbd_send_ack() - Sends an ack packet
+ * @mdev:	DRBD device.
+ * @cmd:	Packet command code.
+ * @sector:	sector, needs to be in big endian byte order
+ * @blksize:	size in byte, needs to be in big endian byte order
+ * @block_id:	Id, big endian byte order
+ */
+STATIC int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
+			  u64 sector,
+			  u32 blksize,
+			  u64 block_id)
+{
+	int ok;
+	struct p_block_ack p;
+
+	p.sector   = sector;
+	p.block_id = block_id;
+	p.blksize  = blksize;
+	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+
+	if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
+		return false;
+	ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
+				(struct p_header80 *)&p, sizeof(p));
+	return ok;
+}
+
+/* dp->sector and dp->block_id already/still in network byte order,
+ * data_size is payload size according to dp->head,
+ * and may need to be corrected for digest size. */
+int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
+		     struct p_data *dp, int data_size)
+{
+	data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+	return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
+			      dp->block_id);
+}
+
+int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
+		     struct p_block_req *rp)
+{
+	return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
+}
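+
+/* Note the contrast (illustrative): drbd_send_ack_rp() passes rp->sector and
+ * friends through untouched, since the request packet already carries them in
+ * big endian, while drbd_send_ack_ex() below converts with cpu_to_be64(). */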
+
+/**
+ * drbd_send_ack() - Sends an ack packet
+ * @mdev:	DRBD device.
+ * @cmd:	Packet command code.
+ * @e:		Epoch entry.
+ */
+int drbd_send_ack(struct drbd_conf *mdev,
+	enum drbd_packets cmd, struct drbd_epoch_entry *e)
+{
+	return _drbd_send_ack(mdev, cmd,
+			      cpu_to_be64(e->sector),
+			      cpu_to_be32(e->size),
+			      e->block_id);
+}
+
+/* This function misuses the block_id field to signal if the blocks
+ * are in sync or not. */
+int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+		     sector_t sector, int blksize, u64 block_id)
+{
+	return _drbd_send_ack(mdev, cmd,
+			      cpu_to_be64(sector),
+			      cpu_to_be32(blksize),
+			      cpu_to_be64(block_id));
+}
+
+int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+		       sector_t sector, int size, u64 block_id)
+{
+	int ok;
+	struct p_block_req p;
+
+	p.sector   = cpu_to_be64(sector);
+	p.block_id = block_id;
+	p.blksize  = cpu_to_be32(size);
+
+	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
+				(struct p_header80 *)&p, sizeof(p));
+	return ok;
+}
+
+int drbd_send_drequest_csum(struct drbd_conf *mdev,
+			    sector_t sector, int size,
+			    void *digest, int digest_size,
+			    enum drbd_packets cmd)
+{
+	int ok;
+	struct p_block_req p;
+
+	p.sector   = cpu_to_be64(sector);
+	p.block_id = BE_DRBD_MAGIC + 0xbeef;
+	p.blksize  = cpu_to_be32(size);
+
+	p.head.magic   = BE_DRBD_MAGIC;
+	p.head.command = cpu_to_be16(cmd);
+	p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
+
+	mutex_lock(&mdev->data.mutex);
+
+	ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
+	ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return ok;
+}
+
+int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	int ok;
+	struct p_block_req p;
+
+	p.sector   = cpu_to_be64(sector);
+	p.block_id = BE_DRBD_MAGIC + 0xbabe;
+	p.blksize  = cpu_to_be32(size);
+
+	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
+			   (struct p_header80 *)&p, sizeof(p));
+	return ok;
+}
+
+/* called on sndtimeo
+ * returns false if we should retry,
+ * true if we think the connection is dead
+ */
+STATIC int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
+{
+	int drop_it;
+	/* long elapsed = (long)(jiffies - mdev->last_received); */
+	/* DUMPLU(elapsed); // elapsed ignored for now. */
+
+	drop_it =   mdev->meta.socket == sock
+		|| !mdev->asender.task
+		|| get_t_state(&mdev->asender) != Running
+		|| mdev->state.conn < C_CONNECTED;
+
+	if (drop_it)
+		return true;
+
+	drop_it = !--mdev->ko_count;
+	if (!drop_it) {
+		dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+		       current->comm, current->pid, mdev->ko_count);
+		request_ping(mdev);
+	}
+
+	return drop_it; /* && (mdev->state == R_PRIMARY) */;
+}
+
+/* The idea of sendpage seems to be to put some kind of reference
+ * to the page into the skb, and to hand it over to the NIC. In
+ * this process get_page() gets called.
+ *
+ * As soon as the page was really sent over the network put_page()
+ * gets called by some part of the network layer. [ NIC driver? ]
+ *
+ * [ get_page() / put_page() increment/decrement the count. If count
+ *   reaches 0 the page will be freed. ]
+ *
+ * This works nicely with pages from FSs.
+ * But this means that in protocol A we might signal IO completion too early!
+ *
+ * In order not to corrupt data during a resync we must make sure
+ * that we do not reuse our own buffer pages (EEs) too early, therefore
+ * we have the net_ee list.
+ *
+ * XFS still seems to have problems: it submits pages with page_count == 0!
+ * As a workaround, we disable sendpage on pages
+ * with page_count == 0 or PageSlab.
+ */
+STATIC int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
+		   int offset, size_t size, unsigned msg_flags)
+{
+	int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
+	kunmap(page);
+	if (sent == size)
+		mdev->send_cnt += size>>9;
+	return sent == size;
+}
+
+STATIC int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
+		    int offset, size_t size, unsigned msg_flags)
+{
+	mm_segment_t oldfs = get_fs();
+	int sent, ok;
+	int len = size;
+
+	/* e.g. XFS meta- & log-data is in slab pages, which have a
+	 * page_count of 0 and/or have PageSlab() set.
+	 * we cannot use send_page for those, as that does get_page();
+	 * put_page(); and would cause either a VM_BUG directly, or
+	 * __page_cache_release a page that would actually still be referenced
+	 * by someone, leading to some obscure delayed Oops somewhere else. */
+	if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
+		return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
+
+	msg_flags |= MSG_NOSIGNAL;
+	drbd_update_congested(mdev);
+	set_fs(KERNEL_DS);
+	do {
+		sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
+							offset, len,
+							msg_flags);
+		if (sent == -EAGAIN) {
+			if (we_should_drop_the_connection(mdev,
+							  mdev->data.socket))
+				break;
+			else
+				continue;
+		}
+		if (sent <= 0) {
+			dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
+			     __func__, (int)size, len, sent);
+			break;
+		}
+		len    -= sent;
+		offset += sent;
+	} while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
+	set_fs(oldfs);
+	clear_bit(NET_CONGESTED, &mdev->flags);
+
+	ok = (len == 0);
+	if (likely(ok))
+		mdev->send_cnt += size>>9;
+	return ok;
+}
+
+static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+	/* hint all but last page with MSG_MORE */
+	bio_for_each_segment(bvec, bio, i) {
+		if (!_drbd_no_send_page(mdev, bvec->bv_page,
+				     bvec->bv_offset, bvec->bv_len,
+				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
+			return 0;
+	}
+	return 1;
+}
+
+static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+	/* hint all but last page with MSG_MORE */
+	bio_for_each_segment(bvec, bio, i) {
+		if (!_drbd_send_page(mdev, bvec->bv_page,
+				     bvec->bv_offset, bvec->bv_len,
+				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
+			return 0;
+	}
+	return 1;
+}
+
+static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+	struct page *page = e->pages;
+	unsigned len = e->size;
+	/* hint all but last page with MSG_MORE */
+	page_chain_for_each(page) {
+		unsigned l = min_t(unsigned, len, PAGE_SIZE);
+		if (!_drbd_send_page(mdev, page, 0, l,
+				page_chain_next(page) ? MSG_MORE : 0))
+			return 0;
+		len -= l;
+	}
+	return 1;
+}
+
+/* see also wire_flags_to_bio()
+ * We use the DRBD_REQ_* names because we need to semantically map the flags
+ * to data packet flags and back. We may replicate to other kernel versions. */
+static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
+{
+	if (mdev->agreed_pro_version >= 95)
+		return  (bi_rw & DRBD_REQ_SYNC ? DP_RW_SYNC : 0) |
+			(bi_rw & DRBD_REQ_UNPLUG ? DP_UNPLUG : 0) |
+			(bi_rw & DRBD_REQ_FUA ? DP_FUA : 0) |
+			(bi_rw & DRBD_REQ_FLUSH ? DP_FLUSH : 0) |
+			(bi_rw & DRBD_REQ_DISCARD ? DP_DISCARD : 0);
+
+	/* else: we used to communicate one bit only in older DRBD */
+	return bi_rw & (DRBD_REQ_SYNC | DRBD_REQ_UNPLUG) ? DP_RW_SYNC : 0;
+}
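+
+/* Worked example (illustrative): with agreed_pro_version >= 95, a bio
+ * submitted with DRBD_REQ_FUA | DRBD_REQ_SYNC goes out as DP_FUA | DP_RW_SYNC;
+ * against an older peer the same bio maps to just DP_RW_SYNC. */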
+
+/* Used to send write requests
+ * R_PRIMARY -> Peer	(P_DATA)
+ */
+int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
+{
+	int ok = 1;
+	struct p_data p;
+	unsigned int dp_flags = 0;
+	void *dgb;
+	int dgs;
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+
+	if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
+		p.head.h80.magic   = BE_DRBD_MAGIC;
+		p.head.h80.command = cpu_to_be16(P_DATA);
+		p.head.h80.length  =
+			cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
+	} else {
+		p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
+		p.head.h95.command = cpu_to_be16(P_DATA);
+		p.head.h95.length  =
+			cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
+	}
+
+	p.sector   = cpu_to_be64(req->sector);
+	p.block_id = (unsigned long)req;
+	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+
+	dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
+
+	if (mdev->state.conn >= C_SYNC_SOURCE &&
+	    mdev->state.conn <= C_PAUSED_SYNC_T)
+		dp_flags |= DP_MAY_SET_IN_SYNC;
+
+	p.dp_flags = cpu_to_be32(dp_flags);
+	trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__);
+	ok = (sizeof(p) ==
+		drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
+	if (ok && dgs) {
+		dgb = mdev->int_dig_out;
+		drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
+		ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
+	}
+	if (ok) {
+		/* For protocol A, we have to memcpy the payload into
+		 * socket buffers, as we may signal completion as soon
+		 * as we have handed it over to tcp, at which point the data
+		 * pages may become invalid.
+		 *
+		 * For data-integrity enabled, we copy it as well, so we can be
+		 * sure that even if the bio pages may still be modified, it
+		 * won't change the data on the wire, thus if the digest checks
+		 * out ok after sending on this side, but does not fit on the
+		 * receiving side, we sure have detected corruption elsewhere.
+		 */
+		if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
+			ok = _drbd_send_bio(mdev, req->master_bio);
+		else
+			ok = _drbd_send_zc_bio(mdev, req->master_bio);
+
+		/* double check digest, sometimes buffers have been modified in flight. */
+		if (dgs > 0 && dgs <= 64) {
+			/* 64 byte, 512 bit, is the largest digest size
+			 * currently supported in kernel crypto. */
+			unsigned char digest[64];
+			drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
+			if (memcmp(mdev->int_dig_out, digest, dgs)) {
+				dev_warn(DEV,
+					"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
+					(unsigned long long)req->sector, req->size);
+			}
+		} /* else if (dgs > 64) {
+		     ... Be noisy about digest too large ...
+		} */
+	}
+
+	drbd_put_data_sock(mdev);
+
+	return ok;
+}
+
+/* answer packet, used to send data back for read requests:
+ *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
+ *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
+ */
+int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
+		    struct drbd_epoch_entry *e)
+{
+	int ok;
+	struct p_data p;
+	void *dgb;
+	int dgs;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+
+	if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
+		p.head.h80.magic   = BE_DRBD_MAGIC;
+		p.head.h80.command = cpu_to_be16(cmd);
+		p.head.h80.length  =
+			cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
+	} else {
+		p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
+		p.head.h95.command = cpu_to_be16(cmd);
+		p.head.h95.length  =
+			cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
+	}
+
+	p.sector   = cpu_to_be64(e->sector);
+	p.block_id = e->block_id;
+	/* p.seq_num  = 0;    No sequence numbers here.. */
+
+	/* Only called by our kernel thread.
+	 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
+	 * in response to admin command or module unload.
+	 */
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+
+	trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__);
+	ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
+	if (ok && dgs) {
+		dgb = mdev->int_dig_out;
+		drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
+		ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
+	}
+	if (ok)
+		ok = _drbd_send_zc_ee(mdev, e);
+
+	drbd_put_data_sock(mdev);
+
+	return ok;
+}
+
+int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
+{
+	struct p_block_desc p;
+
+	p.sector  = cpu_to_be64(req->sector);
+	p.blksize = cpu_to_be32(req->size);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
+}
+
+/*
+  drbd_send distinguishes two cases:
+
+  Packets sent via the data socket "sock"
+  and packets sent via the meta data socket "msock"
+
+		    sock                      msock
+  -----------------+-------------------------+------------------------------
+  timeout           conf.timeout / 2          conf.timeout / 2
+  timeout action    send a ping via msock     Abort communication
+					      and close all sockets
+*/
+
+/*
+ * you must have down()ed the appropriate [m]sock_mutex elsewhere!
+ */
+int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+	      void *buf, size_t size, unsigned msg_flags)
+{
+#if !HAVE_KERNEL_SENDMSG
+	mm_segment_t oldfs;
+	struct iovec iov;
+#else
+	struct kvec iov;
+#endif
+	struct msghdr msg;
+	int rv, sent = 0;
+
+	if (!sock)
+		return -1000;
+
+	/* THINK  if (signal_pending) return ... ? */
+
+	iov.iov_base = buf;
+	iov.iov_len  = size;
+
+	msg.msg_name       = NULL;
+	msg.msg_namelen    = 0;
+#if !HAVE_KERNEL_SENDMSG
+	msg.msg_iov        = &iov;
+	msg.msg_iovlen     = 1;
+#endif
+	msg.msg_control    = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
+
+#if !HAVE_KERNEL_SENDMSG
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+#endif
+
+	if (sock == mdev->data.socket) {
+		mdev->ko_count = mdev->net_conf->ko_count;
+		drbd_update_congested(mdev);
+	}
+	do {
+		/* STRANGE
+		 * tcp_sendmsg does _not_ use its size parameter at all ?
+		 *
+		 * -EAGAIN on timeout, -EINTR on signal.
+		 */
+/* THINK
+ * do we need to block DRBD_SIG if sock == &meta.socket ??
+ * otherwise wake_asender() might interrupt some send_*Ack !
+ */
+#if !HAVE_KERNEL_SENDMSG
+		rv = sock_sendmsg(sock, &msg, iov.iov_len);
+#else
+		rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
+#endif
+		if (rv == -EAGAIN) {
+			if (we_should_drop_the_connection(mdev, sock))
+				break;
+			else
+				continue;
+		}
+		D_ASSERT(rv != 0);
+		if (rv == -EINTR) {
+#if 0
+			/* FIXME this happens all the time.
+			 * we don't care for now!
+			 * eventually this should be sorted out by the proper
+			 * use of the SIGNAL_ASENDER bit... */
+			if (DRBD_ratelimit(5*HZ, 5)) {
+				dev_dbg(DEV, "Got a signal in drbd_send(,%c,)!\n",
+				    sock == mdev->meta.socket ? 'm' : 's');
+				/* dump_stack(); */
+			}
+#endif
+			flush_signals(current);
+			rv = 0;
+		}
+		if (rv < 0)
+			break;
+		sent += rv;
+		iov.iov_base += rv;
+		iov.iov_len  -= rv;
+	} while (sent < size);
+
+	if (sock == mdev->data.socket)
+		clear_bit(NET_CONGESTED, &mdev->flags);
+
+#if !HAVE_KERNEL_SENDMSG
+	set_fs(oldfs);
+#endif
+
+	if (rv <= 0) {
+		if (rv != -EAGAIN) {
+			dev_err(DEV, "%s_sendmsg returned %d\n",
+			    sock == mdev->meta.socket ? "msock" : "sock",
+			    rv);
+			drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+		} else
+			drbd_force_state(mdev, NS(conn, C_TIMEOUT));
+	}
+
+	return sent;
+}
+
+#ifdef BD_OPS_USE_FMODE
+static int drbd_open(struct block_device *bdev, fmode_t mode)
+#else
+static int drbd_open(struct inode *inode, struct file *file)
+#endif
+{
+#ifdef BD_OPS_USE_FMODE
+	struct drbd_conf *mdev = bdev->bd_disk->private_data;
+#else
+	int mode = file->f_mode;
+	struct drbd_conf *mdev = inode->i_bdev->bd_disk->private_data;
+#endif
+	unsigned long flags;
+	int rv = 0;
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	/* to have a stable mdev->state.role
+	 * and no race with updating open_cnt */
+
+	if (mdev->state.role != R_PRIMARY) {
+		if (mode & FMODE_WRITE)
+			rv = -EROFS;
+		else if (!allow_oos)
+			rv = -EMEDIUMTYPE;
+	}
+
+	if (!rv)
+		mdev->open_cnt++;
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	return rv;
+}
+
+#ifdef BD_OPS_USE_FMODE
+static int drbd_release(struct gendisk *gd, fmode_t mode)
+{
+	struct drbd_conf *mdev = gd->private_data;
+	mdev->open_cnt--;
+	return 0;
+}
+#else
+static int drbd_release(struct inode *inode, struct file *file)
+{
+	struct drbd_conf *mdev = inode->i_bdev->bd_disk->private_data;
+	mdev->open_cnt--;
+	return 0;
+}
+#endif
+
+#ifdef blk_queue_plugged
+STATIC void drbd_unplug_fn(struct request_queue *q)
+{
+	struct drbd_conf *mdev = q->queuedata;
+
+	trace_drbd_unplug(mdev, "got unplugged");
+
+	/* unplug FIRST */
+	spin_lock_irq(q->queue_lock);
+	blk_remove_plug(q);
+	spin_unlock_irq(q->queue_lock);
+
+	/* only if connected */
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
+		D_ASSERT(mdev->state.role == R_PRIMARY);
+		if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
+			/* add to the data.work queue,
+			 * unless already queued.
+			 * XXX this might be a good addition to drbd_queue_work
+			 * anyways, to detect "double queuing" ... */
+			if (list_empty(&mdev->unplug_work.list))
+				drbd_queue_work(&mdev->data.work,
+						&mdev->unplug_work);
+		}
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (mdev->state.disk >= D_INCONSISTENT)
+		drbd_kick_lo(mdev);
+}
+#endif
+
+STATIC void drbd_set_defaults(struct drbd_conf *mdev)
+{
+	/* This way we get a compile error when sync_conf grows,
+	   and we forget to initialize it here */
+	mdev->sync_conf = (struct syncer_conf) {
+		/* .rate = */		DRBD_RATE_DEF,
+		/* .after = */		DRBD_AFTER_DEF,
+		/* .al_extents = */	DRBD_AL_EXTENTS_DEF,
+		/* .verify_alg = */	{}, 0,
+		/* .cpu_mask = */	{}, 0,
+		/* .csums_alg = */	{}, 0,
+		/* .use_rle = */	0,
+		/* .on_no_data = */	DRBD_ON_NO_DATA_DEF,
+		/* .c_plan_ahead = */	DRBD_C_PLAN_AHEAD_DEF,
+		/* .c_delay_target = */	DRBD_C_DELAY_TARGET_DEF,
+		/* .c_fill_target = */	DRBD_C_FILL_TARGET_DEF,
+		/* .c_max_rate = */	DRBD_C_MAX_RATE_DEF,
+		/* .c_min_rate = */	DRBD_C_MIN_RATE_DEF
+	};
+
+	/* Have to initialize it this way, because the bitfield layout
+	   differs between big endian and little endian */
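+	/* (A positional initializer would fill the wrong bitfield members
+	    on one of the two layouts; the designated initializers below
+	    are layout independent.) */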
+	mdev->state = (union drbd_state) {
+		{ .role = R_SECONDARY,
+		  .peer = R_UNKNOWN,
+		  .conn = C_STANDALONE,
+		  .disk = D_DISKLESS,
+		  .pdsk = D_UNKNOWN,
+		  .susp = 0,
+		  .susp_nod = 0,
+		  .susp_fen = 0
+		} };
+}
+
+void drbd_init_set_defaults(struct drbd_conf *mdev)
+{
+	/* the memset(,0,) did most of this.
+	 * note: only assignments, no allocation in here */
+
+#ifdef PARANOIA
+	SET_MDEV_MAGIC(mdev);
+#endif
+
+	drbd_set_defaults(mdev);
+
+	/* for now, we do NOT yet support it,
+	 * even though we start some framework
+	 * to eventually support barriers */
+	set_bit(NO_BARRIER_SUPP, &mdev->flags);
+
+	atomic_set(&mdev->ap_bio_cnt, 0);
+	atomic_set(&mdev->ap_pending_cnt, 0);
+	atomic_set(&mdev->rs_pending_cnt, 0);
+	atomic_set(&mdev->unacked_cnt, 0);
+	atomic_set(&mdev->local_cnt, 0);
+	atomic_set(&mdev->net_cnt, 0);
+	atomic_set(&mdev->packet_seq, 0);
+	atomic_set(&mdev->pp_in_use, 0);
+	atomic_set(&mdev->pp_in_use_by_net, 0);
+	atomic_set(&mdev->rs_sect_in, 0);
+	atomic_set(&mdev->rs_sect_ev, 0);
+	atomic_set(&mdev->ap_in_flight, 0);
+	atomic_set(&mdev->md_io_in_use, 0);
+
+	mutex_init(&mdev->data.mutex);
+	mutex_init(&mdev->meta.mutex);
+	sema_init(&mdev->data.work.s, 0);
+	sema_init(&mdev->meta.work.s, 0);
+	mutex_init(&mdev->state_mutex);
+
+	spin_lock_init(&mdev->data.work.q_lock);
+	spin_lock_init(&mdev->meta.work.q_lock);
+
+	spin_lock_init(&mdev->al_lock);
+	spin_lock_init(&mdev->req_lock);
+	spin_lock_init(&mdev->peer_seq_lock);
+	spin_lock_init(&mdev->epoch_lock);
+
+	INIT_LIST_HEAD(&mdev->active_ee);
+	INIT_LIST_HEAD(&mdev->sync_ee);
+	INIT_LIST_HEAD(&mdev->done_ee);
+	INIT_LIST_HEAD(&mdev->read_ee);
+	INIT_LIST_HEAD(&mdev->net_ee);
+	INIT_LIST_HEAD(&mdev->resync_reads);
+	INIT_LIST_HEAD(&mdev->data.work.q);
+	INIT_LIST_HEAD(&mdev->meta.work.q);
+	INIT_LIST_HEAD(&mdev->resync_work.list);
+	INIT_LIST_HEAD(&mdev->unplug_work.list);
+	INIT_LIST_HEAD(&mdev->go_diskless.list);
+	INIT_LIST_HEAD(&mdev->md_sync_work.list);
+	INIT_LIST_HEAD(&mdev->start_resync_work.list);
+	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
+
+	mdev->resync_work.cb  = w_resync_timer;
+	mdev->unplug_work.cb  = w_send_write_hint;
+	mdev->go_diskless.cb  = w_go_diskless;
+	mdev->md_sync_work.cb = w_md_sync;
+	mdev->bm_io_work.w.cb = w_bitmap_io;
+	mdev->start_resync_work.cb = w_start_resync;
+	init_timer(&mdev->resync_timer);
+	init_timer(&mdev->md_sync_timer);
+	init_timer(&mdev->start_resync_timer);
+	init_timer(&mdev->request_timer);
+	mdev->resync_timer.function = resync_timer_fn;
+	mdev->resync_timer.data = (unsigned long) mdev;
+	mdev->md_sync_timer.function = md_sync_timer_fn;
+	mdev->md_sync_timer.data = (unsigned long) mdev;
+	mdev->start_resync_timer.function = start_resync_timer_fn;
+	mdev->start_resync_timer.data = (unsigned long) mdev;
+	mdev->request_timer.function = request_timer_fn;
+	mdev->request_timer.data = (unsigned long) mdev;
+
+	init_waitqueue_head(&mdev->misc_wait);
+	init_waitqueue_head(&mdev->state_wait);
+	init_waitqueue_head(&mdev->net_cnt_wait);
+	init_waitqueue_head(&mdev->ee_wait);
+	init_waitqueue_head(&mdev->al_wait);
+	init_waitqueue_head(&mdev->seq_wait);
+
+	drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
+	drbd_thread_init(mdev, &mdev->worker, drbd_worker);
+	drbd_thread_init(mdev, &mdev->asender, drbd_asender);
+
+	mdev->agreed_pro_version = PRO_VERSION_MAX;
+	mdev->write_ordering = WO_bio_barrier;
+	mdev->resync_wenr = LC_FREE;
+	mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+	mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+}
+
+void drbd_mdev_cleanup(struct drbd_conf *mdev)
+{
+	int i;
+	if (mdev->receiver.t_state != None)
+		dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
+				mdev->receiver.t_state);
+
+	/* no need to lock it, I'm the only thread alive */
+	if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
+		dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
+	mdev->al_writ_cnt  =
+	mdev->bm_writ_cnt  =
+	mdev->read_cnt     =
+	mdev->recv_cnt     =
+	mdev->send_cnt     =
+	mdev->writ_cnt     =
+	mdev->p_size       =
+	mdev->rs_start     =
+	mdev->rs_total     =
+	mdev->rs_failed    = 0;
+	mdev->rs_last_events = 0;
+	mdev->rs_last_sect_ev = 0;
+	for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+		mdev->rs_mark_left[i] = 0;
+		mdev->rs_mark_time[i] = 0;
+	}
+	D_ASSERT(mdev->net_conf == NULL);
+
+	drbd_set_my_capacity(mdev, 0);
+	if (mdev->bitmap) {
+		/* maybe never allocated. */
+		drbd_bm_resize(mdev, 0, 1);
+		drbd_bm_cleanup(mdev);
+	}
+
+	drbd_free_resources(mdev);
+	clear_bit(AL_SUSPENDED, &mdev->flags);
+
+	/*
+	 * currently we call drbd_init_ee only on module load, so
+	 * we may call drbd_release_ee only on module unload!
+	 */
+	D_ASSERT(list_empty(&mdev->active_ee));
+	D_ASSERT(list_empty(&mdev->sync_ee));
+	D_ASSERT(list_empty(&mdev->done_ee));
+	D_ASSERT(list_empty(&mdev->read_ee));
+	D_ASSERT(list_empty(&mdev->net_ee));
+	D_ASSERT(list_empty(&mdev->resync_reads));
+	D_ASSERT(list_empty(&mdev->data.work.q));
+	D_ASSERT(list_empty(&mdev->meta.work.q));
+	D_ASSERT(list_empty(&mdev->resync_work.list));
+	D_ASSERT(list_empty(&mdev->unplug_work.list));
+	D_ASSERT(list_empty(&mdev->go_diskless.list));
+
+	drbd_set_defaults(mdev);
+}
+
+#if !defined(COMPAT_HAVE_MEMPOOL_CREATE_PAGE_POOL) || LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+/* sles10 (2.6.16 +patches) has it,
+ * but does not EXPORT_SYMBOL() the helpers :( */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+	int order = (int)(long)pool_data;
+	return alloc_pages(gfp_mask, order);
+}
+void mempool_free_pages(void *element, void *pool_data)
+{
+	int order = (int)(long)pool_data;
+	__free_pages(element, order);
+}
+#endif
+#if !defined(COMPAT_HAVE_MEMPOOL_CREATE_PAGE_POOL)
+/* Introduced with 6e0678f3, 2.6.17 */
+static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
+{
+	return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
+			      (void *)(long)order);
+}
+#endif
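+
+/* With the fallbacks above in place, drbd_create_mempools() below can use
+ * mempool_create_page_pool() unconditionally; on kernels that already
+ * provide these helpers, the compat definitions are not compiled in. */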
+
+
+STATIC void drbd_destroy_mempools(void)
+{
+	struct page *page;
+
+	while (drbd_pp_pool) {
+		page = drbd_pp_pool;
+		drbd_pp_pool = (struct page *)page_private(page);
+		__free_page(page);
+		drbd_pp_vacant--;
+	}
+
+	/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
+
+	if (drbd_md_io_bio_set)
+		bioset_free(drbd_md_io_bio_set);
+	if (drbd_md_io_page_pool)
+		mempool_destroy(drbd_md_io_page_pool);
+	if (drbd_ee_mempool)
+		mempool_destroy(drbd_ee_mempool);
+	if (drbd_request_mempool)
+		mempool_destroy(drbd_request_mempool);
+	if (drbd_ee_cache)
+		kmem_cache_destroy(drbd_ee_cache);
+	if (drbd_request_cache)
+		kmem_cache_destroy(drbd_request_cache);
+	if (drbd_bm_ext_cache)
+		kmem_cache_destroy(drbd_bm_ext_cache);
+	if (drbd_al_ext_cache)
+		kmem_cache_destroy(drbd_al_ext_cache);
+
+	drbd_md_io_bio_set   = NULL;
+	drbd_md_io_page_pool = NULL;
+	drbd_ee_mempool      = NULL;
+	drbd_request_mempool = NULL;
+	drbd_ee_cache        = NULL;
+	drbd_request_cache   = NULL;
+	drbd_bm_ext_cache    = NULL;
+	drbd_al_ext_cache    = NULL;
+
+	return;
+}
+
+STATIC int drbd_create_mempools(void)
+{
+	struct page *page;
+	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
+	int i;
+
+	/* prepare our caches and mempools */
+	drbd_request_mempool = NULL;
+	drbd_ee_cache        = NULL;
+	drbd_request_cache   = NULL;
+	drbd_bm_ext_cache    = NULL;
+	drbd_al_ext_cache    = NULL;
+	drbd_pp_pool         = NULL;
+	drbd_md_io_page_pool = NULL;
+	drbd_md_io_bio_set   = NULL;
+
+	/* caches */
+	drbd_request_cache = kmem_cache_create(
+		"drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
+	if (drbd_request_cache == NULL)
+		goto Enomem;
+
+	drbd_ee_cache = kmem_cache_create(
+		"drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
+	if (drbd_ee_cache == NULL)
+		goto Enomem;
+
+	drbd_bm_ext_cache = kmem_cache_create(
+		"drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
+	if (drbd_bm_ext_cache == NULL)
+		goto Enomem;
+
+	drbd_al_ext_cache = kmem_cache_create(
+		"drbd_al", sizeof(struct lc_element), 0, 0, NULL);
+	if (drbd_al_ext_cache == NULL)
+		goto Enomem;
+
+	/* mempools */
+#ifdef COMPAT_HAVE_BIOSET_CREATE
+	drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_bio_set == NULL)
+		goto Enomem;
+#endif
+
+	drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_page_pool == NULL)
+		goto Enomem;
+
+	drbd_request_mempool = mempool_create(number,
+		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
+	if (drbd_request_mempool == NULL)
+		goto Enomem;
+
+	drbd_ee_mempool = mempool_create(number,
+		mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
+	if (drbd_ee_mempool == NULL)
+		goto Enomem;
+
+	/* drbd's page pool */
+	spin_lock_init(&drbd_pp_lock);
+
+	for (i = 0; i < number; i++) {
+		page = alloc_page(GFP_HIGHUSER);
+		if (!page)
+			goto Enomem;
+		set_page_private(page, (unsigned long)drbd_pp_pool);
+		drbd_pp_pool = page;
+	}
+	drbd_pp_vacant = number;
+
+	return 0;
+
+Enomem:
+	drbd_destroy_mempools(); /* in case we allocated some */
+	return -ENOMEM;
+}
+
+STATIC int drbd_notify_sys(struct notifier_block *this, unsigned long code,
+	void *unused)
+{
+	/* just so we have it.  you never know what interesting things we
+	 * might want to do here some day...
+	 */
+
+	return NOTIFY_DONE;
+}
+
+STATIC struct notifier_block drbd_notifier = {
+	.notifier_call = drbd_notify_sys,
+};
+
+static void drbd_release_ee_lists(struct drbd_conf *mdev)
+{
+	int rr;
+
+	rr = drbd_release_ee(mdev, &mdev->active_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in active list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->sync_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in sync list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->read_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in read list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->done_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in done list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->net_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in net list found!\n", rr);
+}
+
+/* caution. no locking.
+ * currently only used from module cleanup code. */
+static void drbd_delete_device(unsigned int minor)
+{
+	struct drbd_conf *mdev = minor_to_mdev(minor);
+
+	if (!mdev)
+		return;
+
+	del_timer_sync(&mdev->request_timer);
+
+	/* paranoia asserts */
+	if (mdev->open_cnt != 0)
+		dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
+				__FILE__ , __LINE__);
+
+	ERR_IF (!list_empty(&mdev->data.work.q)) {
+		struct list_head *lp;
+		list_for_each(lp, &mdev->data.work.q) {
+			DUMPP(lp);
+		}
+	};
+	/* end paranoia asserts */
+
+	del_gendisk(mdev->vdisk);
+
+	/* cleanup stuff that may have been allocated during
+	 * device (re-)configuration or state changes */
+
+	if (mdev->this_bdev)
+		bdput(mdev->this_bdev);
+
+	drbd_free_resources(mdev);
+
+	drbd_release_ee_lists(mdev);
+
+	/* should be freed on disconnect? */
+	kfree(mdev->ee_hash);
+	/*
+	mdev->ee_hash_s = 0;
+	mdev->ee_hash = NULL;
+	*/
+
+	lc_destroy(mdev->act_log);
+	lc_destroy(mdev->resync);
+
+	kfree(mdev->p_uuid);
+	/* mdev->p_uuid = NULL; */
+
+	kfree(mdev->int_dig_out);
+	kfree(mdev->int_dig_in);
+	kfree(mdev->int_dig_vv);
+
+	/* cleanup the rest that has been
+	 * allocated from drbd_new_device
+	 * and actually free the mdev itself */
+	drbd_free_mdev(mdev);
+}
+
+STATIC void drbd_cleanup(void)
+{
+	unsigned int i;
+
+	unregister_reboot_notifier(&drbd_notifier);
+
+	/* first remove proc,
+	 * drbdsetup uses its presence to detect
+	 * whether DRBD is loaded.
+	 * If we got stuck in proc removal,
+	 * but have netlink already deregistered,
+	 * some drbdsetup commands may wait forever
+	 * for an answer.
+	 */
+	if (drbd_proc)
+		remove_proc_entry("drbd", NULL);
+
+	drbd_nl_cleanup();
+
+	if (minor_table) {
+		i = minor_count;
+		while (i--)
+			drbd_delete_device(i);
+		drbd_destroy_mempools();
+	}
+
+	kfree(minor_table);
+
+	drbd_unregister_blkdev(DRBD_MAJOR, "drbd");
+
+	printk(KERN_INFO "drbd: module cleanup done.\n");
+}
+
+/**
+ * drbd_congested() - Callback for pdflush
+ * @congested_data:	User data
+ * @bdi_bits:		Bits pdflush is currently interested in
+ *
+ * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ */
+static int drbd_congested(void *congested_data, int bdi_bits)
+{
+	struct drbd_conf *mdev = congested_data;
+	struct request_queue *q;
+	char reason = '-';
+	int r = 0;
+
+	if (!may_inc_ap_bio(mdev)) {
+		/* DRBD has frozen IO */
+		r = bdi_bits;
+		reason = 'd';
+		goto out;
+	}
+
+	if (get_ldev(mdev)) {
+		q = bdev_get_queue(mdev->ldev->backing_bdev);
+		r = bdi_congested(&q->backing_dev_info, bdi_bits);
+		put_ldev(mdev);
+		if (r)
+			reason = 'b';
+	}
+
+	if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
+		r |= (1 << BDI_async_congested);
+		reason = reason == 'b' ? 'a' : 'n';
+	}
+
+out:
+	mdev->congestion_reason = reason;
+	return r;
+}
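+
+/* congestion_reason decodes as: '-' not congested, 'd' IO frozen by
+ * DRBD itself, 'b' backing device congested, 'n' network send path
+ * congested (NET_CONGESTED), 'a' backing device and network both. */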
+
+struct drbd_conf *drbd_new_device(unsigned int minor)
+{
+	struct drbd_conf *mdev;
+	struct gendisk *disk;
+	struct request_queue *q;
+
+	/* GFP_KERNEL, we are outside of all write-out paths */
+	mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
+	if (!mdev)
+		return NULL;
+	if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
+		goto out_no_cpumask;
+
+	mdev->minor = minor;
+
+	drbd_init_set_defaults(mdev);
+
+	q = blk_alloc_queue(GFP_KERNEL);
+	if (!q)
+		goto out_no_q;
+	mdev->rq_queue = q;
+	q->queuedata   = mdev;
+
+	disk = alloc_disk(1);
+	if (!disk)
+		goto out_no_disk;
+	mdev->vdisk = disk;
+
+	set_disk_ro(disk, true);
+
+	disk->queue = q;
+	disk->major = DRBD_MAJOR;
+	disk->first_minor = minor;
+	disk->fops = &drbd_ops;
+	sprintf(disk->disk_name, "drbd%d", minor);
+	disk->private_data = mdev;
+
+	mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
+	/* we have no partitions. we contain only ourselves. */
+	mdev->this_bdev->bd_contains = mdev->this_bdev;
+
+	q->backing_dev_info.congested_fn = drbd_congested;
+	q->backing_dev_info.congested_data = mdev;
+
+	blk_queue_make_request(q, drbd_make_request);
+	/* Setting the max_hw_sectors to an odd value of 8 KiB here.
+	   This triggers a max_bio_size message upon first attach or connect. */
+	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+	blk_queue_merge_bvec(q, drbd_merge_bvec);
+	q->queue_lock = &mdev->req_lock; /* needed since we use */
+#ifdef blk_queue_plugged
+		/* plugging on a queue that actually has no requests! */
+	q->unplug_fn = drbd_unplug_fn;
+#endif
+
+	mdev->md_io_page = alloc_page(GFP_KERNEL);
+	if (!mdev->md_io_page)
+		goto out_no_io_page;
+
+	if (drbd_bm_init(mdev))
+		goto out_no_bitmap;
+	/* no need to lock access, we are still initializing this minor device. */
+	if (!tl_init(mdev))
+		goto out_no_tl;
+
+	mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
+	if (!mdev->app_reads_hash)
+		goto out_no_app_reads;
+
+	mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+	if (!mdev->current_epoch)
+		goto out_no_epoch;
+
+	INIT_LIST_HEAD(&mdev->current_epoch->list);
+	mdev->epochs = 1;
+
+	return mdev;
+
+/* out_whatever_else:
+	kfree(mdev->current_epoch); */
+out_no_epoch:
+	kfree(mdev->app_reads_hash);
+out_no_app_reads:
+	tl_cleanup(mdev);
+out_no_tl:
+	drbd_bm_cleanup(mdev);
+out_no_bitmap:
+	__free_page(mdev->md_io_page);
+out_no_io_page:
+	put_disk(disk);
+out_no_disk:
+	blk_cleanup_queue(q);
+out_no_q:
+	free_cpumask_var(mdev->cpu_mask);
+out_no_cpumask:
+	kfree(mdev);
+	return NULL;
+}
+
+/* counterpart of drbd_new_device.
+ * last part of drbd_delete_device. */
+void drbd_free_mdev(struct drbd_conf *mdev)
+{
+	kfree(mdev->current_epoch);
+	kfree(mdev->app_reads_hash);
+	tl_cleanup(mdev);
+	if (mdev->bitmap) /* should no longer be there. */
+		drbd_bm_cleanup(mdev);
+	__free_page(mdev->md_io_page);
+	put_disk(mdev->vdisk);
+	blk_cleanup_queue(mdev->rq_queue);
+	free_cpumask_var(mdev->cpu_mask);
+	drbd_free_tl_hash(mdev);
+	kfree(mdev);
+}
+
+
+int __init drbd_init(void)
+{
+	int err;
+
+	if (sizeof(struct p_handshake) != 80) {
+		printk(KERN_ERR
+		       "drbd: never change the size or layout "
+		       "of the HandShake packet.\n");
+		return -EINVAL;
+	}
+
+	if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
+		printk(KERN_ERR
+			"drbd: invalid minor_count (%d)\n", minor_count);
+#ifdef MODULE
+		return -EINVAL;
+#else
+		minor_count = 8;
+#endif
+	}
+
+	err = drbd_nl_init();
+	if (err)
+		return err;
+
+	err = register_blkdev(DRBD_MAJOR, "drbd");
+	if (err) {
+		printk(KERN_ERR
+		       "drbd: unable to register block device major %d\n",
+		       DRBD_MAJOR);
+		return err;
+	}
+
+	register_reboot_notifier(&drbd_notifier);
+
+	/*
+	 * allocate all necessary structs
+	 */
+	err = -ENOMEM;
+
+	init_waitqueue_head(&drbd_pp_wait);
+
+	drbd_proc = NULL; /* play safe for drbd_cleanup */
+	minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
+				GFP_KERNEL);
+	if (!minor_table)
+		goto Enomem;
+
+	err = drbd_create_mempools();
+	if (err)
+		goto Enomem;
+
+	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
+	if (!drbd_proc)	{
+		printk(KERN_ERR "drbd: unable to register proc file\n");
+		goto Enomem;
+	}
+
+	rwlock_init(&global_state_lock);
+
+	printk(KERN_INFO "drbd: initialized. "
+	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
+	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
+	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
+	printk(KERN_INFO "drbd: registered as block device major %d\n",
+		DRBD_MAJOR);
+	printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
+
+	return 0; /* Success! */
+
+Enomem:
+	drbd_cleanup();
+	if (err == -ENOMEM)
+		/* currently always the case */
+		printk(KERN_ERR "drbd: ran out of memory\n");
+	else
+		printk(KERN_ERR "drbd: initialization failure\n");
+	return err;
+}
+
+void drbd_free_bc(struct drbd_backing_dev *ldev)
+{
+	if (ldev == NULL)
+		return;
+
+	blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+	blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+
+	kfree(ldev);
+}
+
+
+void drbd_free_sock(struct drbd_conf *mdev)
+{
+	if (mdev->data.socket) {
+		mutex_lock(&mdev->data.mutex);
+		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
+		sock_release(mdev->data.socket);
+		mdev->data.socket = NULL;
+		mutex_unlock(&mdev->data.mutex);
+	}
+	if (mdev->meta.socket) {
+		mutex_lock(&mdev->meta.mutex);
+		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
+		sock_release(mdev->meta.socket);
+		mdev->meta.socket = NULL;
+		mutex_unlock(&mdev->meta.mutex);
+	}
+}
+
+
+void drbd_free_resources(struct drbd_conf *mdev)
+{
+	crypto_free_hash(mdev->csums_tfm);
+	mdev->csums_tfm = NULL;
+	crypto_free_hash(mdev->verify_tfm);
+	mdev->verify_tfm = NULL;
+	crypto_free_hash(mdev->cram_hmac_tfm);
+	mdev->cram_hmac_tfm = NULL;
+	crypto_free_hash(mdev->integrity_w_tfm);
+	mdev->integrity_w_tfm = NULL;
+	crypto_free_hash(mdev->integrity_r_tfm);
+	mdev->integrity_r_tfm = NULL;
+
+	drbd_free_sock(mdev);
+
+	__no_warn(local,
+		  drbd_free_bc(mdev->ldev);
+		  mdev->ldev = NULL;);
+}
+
+/* meta data management */
+
+struct meta_data_on_disk {
+	u64 la_size;           /* last agreed size. */
+	u64 uuid[UI_SIZE];   /* UUIDs. */
+	u64 device_uuid;
+	u64 reserved_u64_1;
+	u32 flags;             /* MDF */
+	u32 magic;
+	u32 md_size_sect;
+	u32 al_offset;         /* offset to this block */
+	u32 al_nr_extents;     /* important for restoring the AL */
+	      /* `-- act_log->nr_elements <-- sync_conf.al_extents */
+	u32 bm_offset;         /* offset to the bitmap, from here */
+	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
+	u32 la_peer_max_bio_size;   /* last peer max_bio_size */
+	u32 reserved_u32[3];
+
+} __packed;
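+
+/* All multi-byte members above are kept big-endian on disk:
+ * drbd_md_sync() fills them via cpu_to_be32()/cpu_to_be64(), and
+ * drbd_md_read() converts back with be32_to_cpu()/be64_to_cpu().
+ * __packed keeps the layout independent of compiler padding. */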
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @mdev:	DRBD device.
+ */
+void drbd_md_sync(struct drbd_conf *mdev)
+{
+	struct meta_data_on_disk *buffer;
+	sector_t sector;
+	int i;
+
+	del_timer(&mdev->md_sync_timer);
+	/* timer may be rearmed by drbd_md_mark_dirty() now. */
+	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
+		return;
+
+	/* We use D_FAILED here, and not D_ATTACHING, because we try to write
+	 * metadata even if we detach due to a disk failure! */
+	if (!get_ldev_if_state(mdev, D_FAILED))
+		return;
+
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		goto out;
+
+	memset(buffer, 0, 512);
+
+	trace_drbd_md_io(mdev, WRITE, mdev->ldev);
+	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
+	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
+	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
+
+	buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
+	buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
+	buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
+	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
+	buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
+
+	buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
+	buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
+
+	D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
+	sector = mdev->ldev->md.md_offset;
+
+	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+		/* this was a try anyways ... */
+		dev_err(DEV, "meta data update failed!\n");
+		drbd_chk_io_error(mdev, 1, true);
+	}
+
+	/* Update mdev->ldev->md.la_size_sect,
+	 * since we updated it on metadata. */
+	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
+
+	drbd_md_put_buffer(mdev);
+out:
+	put_ldev(mdev);
+}
+
+/**
+ * drbd_md_read() - Reads in the meta data super block
+ * @mdev:	DRBD device.
+ * @bdev:	Device from which the meta data should be read in.
+ *
+ * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
+ * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
+ */
+int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+	struct meta_data_on_disk *buffer;
+	int i, rv = NO_ERROR;
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING))
+		return ERR_IO_MD_DISK;
+
+	trace_drbd_md_io(mdev, READ, bdev);
+
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		goto out;
+
+	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
+		/* NOTE: can't do normal error processing here as this is
+		   called BEFORE disk is attached */
+		dev_err(DEV, "Error while reading metadata.\n");
+		rv = ERR_IO_MD_DISK;
+		goto err;
+	}
+
+	if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
+		dev_err(DEV, "Error while reading metadata, magic not found.\n");
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+	if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
+		dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
+		    be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+		dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
+		    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+		dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
+		    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+
+	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+		dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+		    be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+
+	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+	bdev->md.flags = be32_to_cpu(buffer->flags);
+	mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
+	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.conn < C_CONNECTED) {
+		int peer;
+		peer = be32_to_cpu(buffer->la_peer_max_bio_size);
+		peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
+		mdev->peer_max_bio_size = peer;
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (mdev->sync_conf.al_extents < 7)
+		mdev->sync_conf.al_extents = 127;
+
+ err:
+	drbd_md_put_buffer(mdev);
+ out:
+	put_ldev(mdev);
+
+	return rv;
+}
+
+/**
+ * drbd_md_mark_dirty() - Mark meta data super block as dirty
+ * @mdev:	DRBD device.
+ *
+ * Call this function if you change anything that should be written to
+ * the meta-data super block. This function sets MD_DIRTY, and starts a
+ * timer that ensures drbd_md_sync() gets called within five seconds.
+ */
+#ifdef DRBD_DEBUG_MD_SYNC
+void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
+{
+	if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
+		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
+		mdev->last_md_mark_dirty.line = line;
+		mdev->last_md_mark_dirty.func = func;
+	}
+}
+#else
+void drbd_md_mark_dirty(struct drbd_conf *mdev)
+{
+	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
+		mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
+}
+#endif
+
+static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
+{
+	int i;
+
+	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
+}
+
+void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+{
+	if (idx == UI_CURRENT) {
+		if (mdev->state.role == R_PRIMARY)
+			val |= 1;
+		else
+			val &= ~((u64)1);
+
+		drbd_set_ed_uuid(mdev, val);
+	}
+
+	mdev->ldev->md.uuid[idx] = val;
+	drbd_md_mark_dirty(mdev);
+}
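+
+/* The low bit of the current UUID thus encodes the role: set while
+ * R_PRIMARY, cleared otherwise; drbd_set_role() flips the same bit on
+ * UI_CURRENT when switching between Primary and Secondary. */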
+
+
+void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+{
+	if (mdev->ldev->md.uuid[idx]) {
+		drbd_uuid_move_history(mdev);
+		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
+	}
+	_drbd_uuid_set(mdev, idx, val);
+}
+
+/**
+ * drbd_uuid_new_current() - Creates a new current UUID
+ * @mdev:	DRBD device.
+ *
+ * Creates a new current UUID, and rotates the old current UUID into
+ * the bitmap slot. Causes an incremental resync upon next connect.
+ */
+void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
+{
+	u64 val;
+	unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
+
+	if (bm_uuid)
+		dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
+
+	mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
+
+	get_random_bytes(&val, sizeof(u64));
+	_drbd_uuid_set(mdev, UI_CURRENT, val);
+	drbd_print_uuids(mdev, "new current UUID");
+	/* get it to stable storage _now_ */
+	drbd_md_sync(mdev);
+}
+
+void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
+{
+	if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
+		return;
+
+	if (val == 0) {
+		drbd_uuid_move_history(mdev);
+		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
+		mdev->ldev->md.uuid[UI_BITMAP] = 0;
+	} else {
+		unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
+		if (bm_uuid)
+			dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
+
+		mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
+	}
+	drbd_md_mark_dirty(mdev);
+}
+
+/**
+ * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @mdev:	DRBD device.
+ *
+ * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_set_n_write(struct drbd_conf *mdev)
+{
+	int rv = -EIO;
+
+	if (get_ldev_if_state(mdev, D_ATTACHING)) {
+		drbd_md_set_flag(mdev, MDF_FULL_SYNC);
+		drbd_md_sync(mdev);
+		drbd_bm_set_all(mdev);
+
+		rv = drbd_bm_write(mdev);
+
+		if (!rv) {
+			drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
+			drbd_md_sync(mdev);
+		}
+
+		put_ldev(mdev);
+	}
+
+	return rv;
+}
+
+/**
+ * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @mdev:	DRBD device.
+ *
+ * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
+{
+	int rv = -EIO;
+
+	drbd_resume_al(mdev);
+	if (get_ldev_if_state(mdev, D_ATTACHING)) {
+		drbd_bm_clear_all(mdev);
+		rv = drbd_bm_write(mdev);
+		put_ldev(mdev);
+	}
+
+	return rv;
+}
+
+STATIC int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct bm_io_work *work = container_of(w, struct bm_io_work, w);
+	int rv = -EIO;
+
+	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
+
+	if (get_ldev(mdev)) {
+		drbd_bm_lock(mdev, work->why, work->flags);
+		rv = work->io_fn(mdev);
+		drbd_bm_unlock(mdev);
+		put_ldev(mdev);
+	}
+
+	clear_bit(BITMAP_IO, &mdev->flags);
+	smp_mb__after_clear_bit();
+	wake_up(&mdev->misc_wait);
+
+	if (work->done)
+		work->done(mdev, rv);
+
+	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
+	work->why = NULL;
+	work->flags = 0;
+
+	return 1;
+}
+
+void drbd_ldev_destroy(struct drbd_conf *mdev)
+{
+	lc_destroy(mdev->resync);
+	mdev->resync = NULL;
+	lc_destroy(mdev->act_log);
+	mdev->act_log = NULL;
+	__no_warn(local,
+		drbd_free_bc(mdev->ldev);
+		mdev->ldev = NULL;);
+
+	if (mdev->md_io_tmpp) {
+		__free_page(mdev->md_io_tmpp);
+		mdev->md_io_tmpp = NULL;
+	}
+	clear_bit(GO_DISKLESS, &mdev->flags);
+}
+
+STATIC int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	D_ASSERT(mdev->state.disk == D_FAILED);
+	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+	 * the protected members anymore, though, so once put_ldev reaches zero
+	 * again, it will be safe to free them. */
+	drbd_force_state(mdev, NS(disk, D_DISKLESS));
+	return 1;
+}
+
+void drbd_go_diskless(struct drbd_conf *mdev)
+{
+	D_ASSERT(mdev->state.disk == D_FAILED);
+	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+		drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
+}
+
+/**
+ * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
+ * @mdev:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @done:	callback to be called after the bitmap IO was performed
+ * @why:	Descriptive text of the reason for doing the IO
+ *
+ * While IO on the bitmap happens we freeze application IO, thus ensuring
+ * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
+ * called from worker context. It MUST NOT be used while a previous such
+ * work is still pending!
+ */
+void drbd_queue_bitmap_io(struct drbd_conf *mdev,
+			  int (*io_fn)(struct drbd_conf *),
+			  void (*done)(struct drbd_conf *, int),
+			  char *why, enum bm_flag flags)
+{
+	D_ASSERT(current == mdev->worker.task);
+
+	D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
+	D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
+	D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
+	if (mdev->bm_io_work.why)
+		dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
+			why, mdev->bm_io_work.why);
+
+	mdev->bm_io_work.io_fn = io_fn;
+	mdev->bm_io_work.done = done;
+	mdev->bm_io_work.why = why;
+	mdev->bm_io_work.flags = flags;
+
+	spin_lock_irq(&mdev->req_lock);
+	set_bit(BITMAP_IO, &mdev->flags);
+	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
+ * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
+ * @mdev:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @why:	Descriptive text of the reason for doing the IO
+ *
+ * Freezes application IO while the actual IO operation runs. This
+ * function MAY NOT be called from worker context.
+ */
+int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
+		char *why, enum bm_flag flags)
+{
+	int rv;
+
+	D_ASSERT(current != mdev->worker.task);
+
+	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+		drbd_suspend_io(mdev);
+
+	drbd_bm_lock(mdev, why, flags);
+	rv = io_fn(mdev);
+	drbd_bm_unlock(mdev);
+
+	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+		drbd_resume_io(mdev);
+
+	return rv;
+}
+
+void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+{
+	if ((mdev->ldev->md.flags & flag) != flag) {
+		drbd_md_mark_dirty(mdev);
+		mdev->ldev->md.flags |= flag;
+	}
+}
+
+void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+{
+	if ((mdev->ldev->md.flags & flag) != 0) {
+		drbd_md_mark_dirty(mdev);
+		mdev->ldev->md.flags &= ~flag;
+	}
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+	return (bdev->md.flags & flag) != 0;
+}
+
+STATIC void md_sync_timer_fn(unsigned long data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
+}
+
+STATIC int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+#ifdef DRBD_DEBUG_MD_SYNC
+	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
+		mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
+#endif
+	drbd_md_sync(mdev);
+	return 1;
+}
+
+#ifdef DRBD_ENABLE_FAULTS
+/* Fault insertion support including random number generator shamelessly
+ * stolen from kernel/rcutorture.c */
+struct fault_random_state {
+	unsigned long state;
+	unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801  /* prime */
+#define FAULT_RANDOM_ADD	479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+STATIC unsigned long
+_drbd_fault_random(struct fault_random_state *rsp)
+{
+	long refresh;
+
+	if (!rsp->count--) {
+		get_random_bytes(&refresh, sizeof(refresh));
+		rsp->state += refresh;
+		rsp->count = FAULT_RANDOM_REFRESH;
+	}
+	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+	return swahw32(rsp->state);
+}
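+
+/* The recurrence above is a plain linear congruential generator,
+ *	state = state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD  (mod 2^BITS_PER_LONG)
+ * with prime multiplier and increment; swahw32() swaps the 16 bit
+ * half-words so the statistically weaker low-order LCG bits do not end
+ * up as the low bits of the returned value. */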
+
+STATIC char *
+_drbd_fault_str(unsigned int type) {
+	static char *_faults[] = {
+		[DRBD_FAULT_MD_WR] = "Meta-data write",
+		[DRBD_FAULT_MD_RD] = "Meta-data read",
+		[DRBD_FAULT_RS_WR] = "Resync write",
+		[DRBD_FAULT_RS_RD] = "Resync read",
+		[DRBD_FAULT_DT_WR] = "Data write",
+		[DRBD_FAULT_DT_RD] = "Data read",
+		[DRBD_FAULT_DT_RA] = "Data read ahead",
+		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
+		[DRBD_FAULT_AL_EE] = "EE allocation",
+		[DRBD_FAULT_RECEIVE] = "receive data corruption",
+	};
+
+	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
+}
+
+unsigned int
+_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
+{
+	static struct fault_random_state rrs = {0, 0};
+
+	unsigned int ret = (
+		(fault_devs == 0 ||
+			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
+		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
+
+	if (ret) {
+		fault_count++;
+
+		if (DRBD_ratelimit(5*HZ, 5))
+			dev_warn(DEV, "***Simulating %s failure\n",
+				_drbd_fault_str(type));
+	}
+
+	return ret;
+}
+#endif
+
+module_init(drbd_init)
+module_exit(drbd_cleanup)
+
+/* For drbd_tracing: */
+EXPORT_SYMBOL(drbd_conn_str);
+EXPORT_SYMBOL(drbd_role_str);
+EXPORT_SYMBOL(drbd_disk_str);
+EXPORT_SYMBOL(drbd_set_st_err_str);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_nl.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_nl.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_nl.c	2015-01-21 12:02:58.382823964 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_nl.c	2015-01-21 12:02:58.382823964 +0300
@@ -0,0 +1,2774 @@
+/*
+   drbd_nl.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/connector.h>
+#include <linux/blkpg.h>
+#include <linux/cpumask.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_wrappers.h"
+#include <asm/unaligned.h>
+#include <linux/drbd_tag_magic.h>
+#include <linux/drbd_limits.h>
+#include <linux/compiler.h>
+#include <linux/kthread.h>
+
+static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
+static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
+static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
+
+/* see get_sb_bdev and bd_claim */
+static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
+
+/* Generate the tag_list to struct functions */
+#define NL_PACKET(name, number, fields) \
+STATIC int name ## _from_tags(struct drbd_conf *mdev, \
+	unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
+STATIC int name ## _from_tags(struct drbd_conf *mdev, \
+	unsigned short *tags, struct name *arg) \
+{ \
+	int tag; \
+	int dlen; \
+	\
+	while ((tag = get_unaligned(tags++)) != TT_END) {	\
+		dlen = get_unaligned(tags++);			\
+		switch (tag_number(tag)) { \
+		fields \
+		default: \
+			if (tag & T_MANDATORY) { \
+				dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
+				return 0; \
+			} \
+		} \
+		tags = (unsigned short *)((char *)tags + dlen); \
+	} \
+	return 1; \
+}
+#define NL_INTEGER(pn, pr, member) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
+		arg->member = get_unaligned((int *)(tags));	\
+		break;
+#define NL_INT64(pn, pr, member) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
+		arg->member = get_unaligned((u64 *)(tags));	\
+		break;
+#define NL_BIT(pn, pr, member) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
+		arg->member = *(char *)(tags) ? 1 : 0; \
+		break;
+#define NL_STRING(pn, pr, member, len) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
+		if (dlen > len) { \
+			dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
+				#member, dlen, (unsigned int)len); \
+			return 0; \
+		} \
+		 arg->member ## _len = dlen; \
+		 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
+		 break;
+#include "linux/drbd_nl.h"
+
+/* Generate the struct to tag_list functions */
+#define NL_PACKET(name, number, fields) \
+STATIC unsigned short* \
+name ## _to_tags(struct drbd_conf *mdev, \
+	struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
+STATIC unsigned short* \
+name ## _to_tags(struct drbd_conf *mdev, \
+	struct name *arg, unsigned short *tags) \
+{ \
+	fields \
+	return tags; \
+}
+
+#define NL_INTEGER(pn, pr, member) \
+	put_unaligned(pn | pr | TT_INTEGER, tags++);	\
+	put_unaligned(sizeof(int), tags++);		\
+	put_unaligned(arg->member, (int *)tags);	\
+	tags = (unsigned short *)((char *)tags+sizeof(int));
+#define NL_INT64(pn, pr, member) \
+	put_unaligned(pn | pr | TT_INT64, tags++);	\
+	put_unaligned(sizeof(u64), tags++);		\
+	put_unaligned(arg->member, (u64 *)tags);	\
+	tags = (unsigned short *)((char *)tags+sizeof(u64));
+#define NL_BIT(pn, pr, member) \
+	put_unaligned(pn | pr | TT_BIT, tags++);	\
+	put_unaligned(sizeof(char), tags++);		\
+	*(char *)tags = arg->member; \
+	tags = (unsigned short *)((char *)tags+sizeof(char));
+#define NL_STRING(pn, pr, member, len) \
+	put_unaligned(pn | pr | TT_STRING, tags++);	\
+	put_unaligned(arg->member ## _len, tags++);	\
+	memcpy(tags, arg->member, arg->member ## _len); \
+	tags = (unsigned short *)((char *)tags + arg->member ## _len);
+#include "linux/drbd_nl.h"
+
+void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
+void drbd_nl_send_reply(struct cn_msg *, int);
+
+int drbd_khelper(struct drbd_conf *mdev, char *cmd)
+{
+	char *envp[] = { "HOME=/",
+			"TERM=linux",
+			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+			NULL, /* Will be set to address family */
+			NULL, /* Will be set to address */
+			NULL };
+
+	char mb[12], af[20], ad[60], *afs;
+	char *argv[] = {usermode_helper, cmd, mb, NULL };
+	int ret;
+
+	snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
+
+	if (get_net_conf(mdev)) {
+		switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
+		case AF_INET6:
+			afs = "ipv6";
+			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
+				 &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
+			break;
+		case AF_INET:
+			afs = "ipv4";
+			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
+				 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
+			break;
+		default:
+			afs = "ssocks";
+			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
+				 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
+		}
+		snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
+		envp[3]=af;
+		envp[4]=ad;
+		put_net_conf(mdev);
+	}
+
+	/* The helper may take some time.
+	 * write out any unsynced meta data changes now */
+	drbd_md_sync(mdev);
+
+	dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
+
+	drbd_bcast_ev_helper(mdev, cmd);
+	ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+	if (ret)
+		dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+	else
+		dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+
+	if (ret < 0) /* Ignore any ERRNOs we got. */
+		ret = 0;
+
+	return ret;
+}
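+
+/* Note: call_usermodehelper() reports the child's wait status, hence the
+ * (ret >> 8) & 0xff above to extract the helper's exit code; callers
+ * such as drbd_try_outdate_peer() below switch on exactly that value. */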
+
+enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
+{
+	char *ex_to_string;
+	int r;
+	enum drbd_disk_state nps;
+	enum drbd_fencing_p fp;
+
+	D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+
+	if (get_ldev_if_state(mdev, D_CONSISTENT)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	} else {
+		dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
+		nps = mdev->state.pdsk;
+		goto out;
+	}
+
+	r = drbd_khelper(mdev, "fence-peer");
+
+	switch ((r>>8) & 0xff) {
+	case 3: /* peer is inconsistent */
+		ex_to_string = "peer is inconsistent or worse";
+		nps = D_INCONSISTENT;
+		break;
+	case 4: /* peer got outdated, or was already outdated */
+		ex_to_string = "peer was fenced";
+		nps = D_OUTDATED;
+		break;
+	case 5: /* peer was down */
+		if (mdev->state.disk == D_UP_TO_DATE) {
+			/* we will(have) create(d) a new UUID anyways... */
+			ex_to_string = "peer is unreachable, assumed to be dead";
+			nps = D_OUTDATED;
+		} else {
+			ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
+			nps = mdev->state.pdsk;
+		}
+		break;
+	case 6: /* Peer is primary, voluntarily outdate myself.
+		 * This is useful when an unconnected R_SECONDARY is asked to
+		 * become R_PRIMARY, but finds the other peer being active. */
+		ex_to_string = "peer is active";
+		dev_warn(DEV, "Peer is primary, outdating myself.\n");
+		nps = D_UNKNOWN;
+		_drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
+		break;
+	case 7:
+		/* THINK: do we need to handle this
+		 * like case 4, or more like case 5? */
+		if (fp != FP_STONITH)
+			dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
+		ex_to_string = "peer was stonithed";
+		nps = D_OUTDATED;
+		break;
+	default:
+		/* The script is broken ... */
+		nps = D_UNKNOWN;
+		dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+		return nps;
+	}
+
+	dev_info(DEV, "fence-peer helper returned %d (%s)\n",
+			(r>>8) & 0xff, ex_to_string);
+
+out:
+	if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
+		/* The handler was not successful... unfreeze here, the
+		   state engine can not unfreeze... */
+		_drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
+	}
+
+	return nps;
+}
+
+static int _try_outdate_peer_async(void *data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *)data;
+	enum drbd_disk_state nps;
+	union drbd_state ns;
+
+	nps = drbd_try_outdate_peer(mdev);
+
+	/* Not using
+	   drbd_request_state(mdev, NS(pdsk, nps));
+	   here, because we might have been able to re-establish the
+	   connection in the meantime. This can only partially be solved
+	   by the state engine's is_valid_state() and
+	   is_valid_state_transition() functions.
+
+	   nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN.
+	   pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid,
+	   therefore we have to have the pre state change check here.
+	*/
+	spin_lock_irq(&mdev->req_lock);
+	ns = mdev->state;
+	if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) {
+		ns.pdsk = nps;
+		_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	return 0;
+}
+
+void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
+{
+	struct task_struct *opa;
+
+	opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
+	if (IS_ERR(opa))
+		dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
+}
+
+enum drbd_state_rv
+drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
+{
+	const int max_tries = 4;
+	enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
+	int try = 0;
+	int forced = 0;
+	union drbd_state mask, val;
+	enum drbd_disk_state nps;
+
+	if (new_role == R_PRIMARY)
+		request_ping(mdev); /* Detect a dead peer ASAP */
+
+	mutex_lock(&mdev->state_mutex);
+
+	mask.i = 0; mask.role = R_MASK;
+	val.i  = 0; val.role  = new_role;
+
+	while (try++ < max_tries) {
+		DRBD_STATE_DEBUG_INIT_VAL(val);
+		rv = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
+
+		/* in case we first succeeded in outdating the peer,
+		 * but now suddenly can establish a connection */
+		if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+			val.pdsk = 0;
+			mask.pdsk = 0;
+			continue;
+		}
+
+		if (rv == SS_NO_UP_TO_DATE_DISK && force &&
+		    (mdev->state.disk < D_UP_TO_DATE &&
+		     mdev->state.disk >= D_INCONSISTENT)) {
+			mask.disk = D_MASK;
+			val.disk  = D_UP_TO_DATE;
+			forced = 1;
+			continue;
+		}
+
+		if (rv == SS_NO_UP_TO_DATE_DISK &&
+		    mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
+			D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+			nps = drbd_try_outdate_peer(mdev);
+
+			if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
+				val.disk = D_UP_TO_DATE;
+				mask.disk = D_MASK;
+			}
+
+			val.pdsk = nps;
+			mask.pdsk = D_MASK;
+
+			continue;
+		}
+
+		if (rv == SS_NOTHING_TO_DO)
+			goto fail;
+		if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
+			nps = drbd_try_outdate_peer(mdev);
+
+			if (force && nps > D_OUTDATED) {
+				dev_warn(DEV, "Forced into split brain situation!\n");
+				nps = D_OUTDATED;
+			}
+
+			mask.pdsk = D_MASK;
+			val.pdsk  = nps;
+
+			continue;
+		}
+		if (rv == SS_TWO_PRIMARIES) {
+			/* Maybe the peer is detected as dead very soon...
+			   retry at most once more in this case. */
+			schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10);
+			if (try < max_tries)
+				try = max_tries - 1;
+			continue;
+		}
+		if (rv < SS_SUCCESS) {
+			DRBD_STATE_DEBUG_INIT_VAL(val);
+			rv = _drbd_request_state(mdev, mask, val,
+						CS_VERBOSE + CS_WAIT_COMPLETE);
+			if (rv < SS_SUCCESS)
+				goto fail;
+		}
+		break;
+	}
+
+	if (rv < SS_SUCCESS)
+		goto fail;
+
+	if (forced)
+		dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
+
+	/* Wait until nothing is on the fly :) */
+	wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
+
+	if (new_role == R_SECONDARY) {
+		set_disk_ro(mdev->vdisk, true);
+		if (get_ldev(mdev)) {
+			mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+			put_ldev(mdev);
+		}
+	} else {
+		if (get_net_conf(mdev)) {
+			mdev->net_conf->want_lose = 0;
+			put_net_conf(mdev);
+		}
+		set_disk_ro(mdev->vdisk, false);
+		if (get_ldev(mdev)) {
+			if (((mdev->state.conn < C_CONNECTED ||
+			       mdev->state.pdsk <= D_FAILED)
+			      && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
+				drbd_uuid_new_current(mdev);
+
+			mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
+			put_ldev(mdev);
+		}
+	}
+
+	/* writeout of activity-log-covered areas of the bitmap
+	 * to stable storage has already been done in the after-state-change handler */
+
+	if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
+		/* if this was forced, we should consider sync */
+		if (forced)
+			drbd_send_uuids(mdev);
+		drbd_send_current_state(mdev);
+	}
+
+	drbd_md_sync(mdev);
+
+	drbd_kobject_uevent(mdev);
+ fail:
+	mutex_unlock(&mdev->state_mutex);
+	return rv;
+}
+
+STATIC struct drbd_conf *ensure_mdev(int minor, int create)
+{
+	struct drbd_conf *mdev;
+
+	if (minor >= minor_count)
+		return NULL;
+
+	mdev = minor_to_mdev(minor);
+
+	if (!mdev && create) {
+		struct gendisk *disk = NULL;
+		mdev = drbd_new_device(minor);
+
+		spin_lock_irq(&drbd_pp_lock);
+		if (minor_table[minor] == NULL) {
+			minor_table[minor] = mdev;
+			disk = mdev->vdisk;
+			mdev = NULL;
+		} /* else: we lost the race */
+		spin_unlock_irq(&drbd_pp_lock);
+
+		if (disk) /* we won the race above */
+			/* in case we ever add a drbd_delete_device(),
+			 * don't forget the del_gendisk! */
+			add_disk(disk);
+		else /* we lost the race above */
+			drbd_free_mdev(mdev);
+
+		mdev = minor_to_mdev(minor);
+	}
+
+	return mdev;
+}
+
+STATIC int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	struct primary primary_args;
+
+	memset(&primary_args, 0, sizeof(struct primary));
+	if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		return 0;
+	}
+
+	reply->ret_code =
+		drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force);
+
+	return 0;
+}
+
+STATIC int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
+
+	return 0;
+}
+
+/* initializes the md.*_offset members, so we are able to find
+ * the on disk meta data */
+STATIC void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
+				       struct drbd_backing_dev *bdev)
+{
+	sector_t md_size_sect = 0;
+	switch (bdev->dc.meta_dev_idx) {
+	default:
+		/* v07 style fixed size indexed meta data */
+		bdev->md.md_size_sect = MD_RESERVED_SECT;
+		bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
+		bdev->md.al_offset = MD_AL_OFFSET;
+		bdev->md.bm_offset = MD_BM_OFFSET;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		/* just occupy the full device; unit: sectors */
+		bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
+		bdev->md.md_offset = 0;
+		bdev->md.al_offset = MD_AL_OFFSET;
+		bdev->md.bm_offset = MD_BM_OFFSET;
+		break;
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
+		/* al size is still fixed */
+		bdev->md.al_offset = -MD_AL_MAX_SIZE;
+		/* we need (slightly less than) about this many bitmap sectors: */
+		md_size_sect = drbd_get_capacity(bdev->backing_bdev);
+		md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
+		md_size_sect = BM_SECT_TO_EXT(md_size_sect);
+		md_size_sect = ALIGN(md_size_sect, 8);
+
+		/* plus the "drbd meta data super block",
+		 * and the activity log; */
+		md_size_sect += MD_BM_OFFSET;
+
+		bdev->md.md_size_sect = md_size_sect;
+		/* bitmap offset is adjusted by 'super' block size */
+		bdev->md.bm_offset   = -md_size_sect + MD_AL_OFFSET;
+		break;
+	}
+}
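+
+/* For the internal variants the offsets computed above are negative,
+ * relative to md_offset near the end of the device, i.e. the area grows
+ * backwards: [ bitmap | activity log | "super" block ], with the super
+ * block at md_offset, the activity log MD_AL_MAX_SIZE sectors in front
+ * of it, and the bitmap sized (extent- and 8-sector-aligned) to cover
+ * the whole backing device. */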
+
+/* input size is expected to be in KB */
+char *ppsize(char *buf, unsigned long long size)
+{
+	/* Needs 9 bytes at max including trailing NUL:
+	 * -1ULL ==> "16384 EB" */
+	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
+	int base = 0;
+	while (size >= 10000 && base < sizeof(units)-1) {
+		/* shift + round */
+		size = (size >> 10) + !!(size & (1<<9));
+		base++;
+	}
+	sprintf(buf, "%u %cB", (unsigned)size, units[base]);
+
+	return buf;
+}
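+
+/* Illustrative worked example for the shift+round above (comment only):
+ * 10000 KB enters the loop once; 10000 >> 10 == 9, and bit 9 (== 512) is
+ * set in 10000, so the half-unit rounds up and the result is "10 MB".
+ * 1048576 KB has bit 9 clear and prints as exactly "1024 MB". */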
+
+/* there is still a theoretical deadlock when called from receiver
+ * on a D_INCONSISTENT R_PRIMARY:
+ *  remote READ does inc_ap_bio, receiver would need to receive answer
+ *  packet from remote to dec_ap_bio again.
+ *  receiver receive_sizes(), comes here,
+ *  waits for ap_bio_cnt == 0. -> deadlock.
+ * but this cannot happen, actually, because:
+ *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
+ *  (not connected, or bad/no disk on peer):
+ *  see drbd_fail_request_early, ap_bio_cnt is zero.
+ *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
+ *  peer may not initiate a resize.
+ */
+void drbd_suspend_io(struct drbd_conf *mdev)
+{
+	set_bit(SUSPEND_IO, &mdev->flags);
+	if (is_susp(mdev->state))
+		return;
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+}
+
+void drbd_resume_io(struct drbd_conf *mdev)
+{
+	clear_bit(SUSPEND_IO, &mdev->flags);
+	wake_up(&mdev->misc_wait);
+}
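+
+/* Illustrative usage sketch (not compiled): the two helpers above are meant
+ * to bracket operations that must not race with application IO, as
+ * drbd_determine_dev_size() below does: */
+#if 0
+	drbd_suspend_io(mdev);	/* waits until ap_bio_cnt drains to zero */
+	/* ... resize the bitmap, move the meta data, ... */
+	drbd_resume_io(mdev);	/* wakes anyone sleeping on misc_wait */
+#endif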
+
+/**
+ * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
+ * @mdev:	DRBD device.
+ *
+ * Returns an enum determine_dev_size: dev_size_error on error, otherwise
+ * unchanged, grew or shrunk. You should call drbd_md_sync() after calling
+ * this function.
+ */
+enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
+{
+	sector_t prev_first_sect, prev_size; /* previous meta location */
+	sector_t la_size;
+	sector_t size;
+	char ppb[10];
+
+	int md_moved, la_size_changed;
+	enum determine_dev_size rv = unchanged;
+
+	/* race:
+	 * application request passes inc_ap_bio,
+	 * but then cannot get an AL-reference.
+	 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+	 *
+	 * to avoid that:
+	 * Suspend IO right here.
+	 * still lock the act_log to not trigger ASSERTs there.
+	 */
+	drbd_suspend_io(mdev);
+
+	/* no wait necessary anymore, actually we could assert that */
+	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+	prev_first_sect = drbd_md_first_sector(mdev->ldev);
+	prev_size = mdev->ldev->md.md_size_sect;
+	la_size = mdev->ldev->md.la_size_sect;
+
+	/* TODO: should only be some assert here, not (re)init... */
+	drbd_md_set_sector_offsets(mdev, mdev->ldev);
+
+	size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
+
+	if (drbd_get_capacity(mdev->this_bdev) != size ||
+	    drbd_bm_capacity(mdev) != size) {
+		int err;
+		err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
+		if (unlikely(err)) {
+			/* currently there is only one error: ENOMEM! */
+			size = drbd_bm_capacity(mdev)>>1;
+			if (size == 0) {
+				dev_err(DEV, "OUT OF MEMORY! "
+				    "Could not allocate bitmap!\n");
+			} else {
+				dev_err(DEV, "BM resizing failed. "
+				    "Leaving size unchanged at size = %lu KB\n",
+				    (unsigned long)size);
+			}
+			rv = dev_size_error;
+		}
+		/* racy, see comments above. */
+		drbd_set_my_capacity(mdev, size);
+		mdev->ldev->md.la_size_sect = size;
+		dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
+		     (unsigned long long)size>>1);
+	}
+	if (rv == dev_size_error)
+		goto out;
+
+	la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
+
+	md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
+		|| prev_size	   != mdev->ldev->md.md_size_sect;
+
+	if (la_size_changed || md_moved) {
+		int err;
+
+		drbd_al_shrink(mdev); /* All extents inactive. */
+		dev_info(DEV, "Writing the whole bitmap, %s\n",
+			 la_size_changed && md_moved ? "size changed and md moved" :
+			 la_size_changed ? "size changed" : "md moved");
+		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
+		err = drbd_bitmap_io(mdev, &drbd_bm_write,
+				"size changed", BM_LOCKED_MASK);
+		if (err) {
+			rv = dev_size_error;
+			goto out;
+		}
+		drbd_md_mark_dirty(mdev);
+	}
+
+	if (size > la_size)
+		rv = grew;
+	if (size < la_size)
+		rv = shrunk;
+out:
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+	drbd_resume_io(mdev);
+
+	return rv;
+}
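+
+/* Illustrative caller sketch (not compiled), mirroring drbd_nl_resize()
+ * below: hold a local disk reference across the call and persist the meta
+ * data afterwards, as the kernel-doc above requires. */
+#if 0
+	if (get_ldev(mdev)) {
+		enum determine_dev_size dd = drbd_determine_dev_size(mdev, 0);
+		drbd_md_sync(mdev);
+		put_ldev(mdev);
+		if (dd == dev_size_error)
+			retcode = ERR_NOMEM_BITMAP; /* bitmap allocation failed */
+	}
+#endif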
+
+sector_t
+drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space)
+{
+	sector_t p_size = mdev->p_size;   /* partner's disk size. */
+	sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
+	sector_t m_size; /* my size */
+	sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
+	sector_t size = 0;
+
+	m_size = drbd_get_max_capacity(bdev);
+
+	if (mdev->state.conn < C_CONNECTED && assume_peer_has_space) {
+		dev_warn(DEV, "Resize while not connected was forced by the user!\n");
+		p_size = m_size;
+	}
+
+	if (p_size && m_size) {
+		size = min_t(sector_t, p_size, m_size);
+	} else {
+		if (la_size) {
+			size = la_size;
+			if (m_size && m_size < size)
+				size = m_size;
+			if (p_size && p_size < size)
+				size = p_size;
+		} else {
+			if (m_size)
+				size = m_size;
+			if (p_size)
+				size = p_size;
+		}
+	}
+
+	if (size == 0)
+		dev_err(DEV, "Both nodes diskless!\n");
+
+	if (u_size) {
+		if (u_size > size)
+			dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
+			    (unsigned long)u_size>>1, (unsigned long)size>>1);
+		else
+			size = u_size;
+	}
+
+	return size;
+}
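+
+/* Summary of the decision above (comment only): with both disk sizes known
+ * the result is simply min(p_size, m_size); with one side unknown we fall
+ * back to the last agreed size, clipped by whichever size is known; only if
+ * there never was an agreed size do we take whatever single size we have.
+ * A user-requested u_size wins if it fits, and is refused with an error
+ * message if it exceeds the size computed above. */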
+
+/**
+ * drbd_check_al_size() - Ensures that the AL is of the right size
+ * @mdev:	DRBD device.
+ *
+ * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
+ * failed, and 0 on success. You should call drbd_md_sync() after calling
+ * this function.
+ */
+STATIC int drbd_check_al_size(struct drbd_conf *mdev)
+{
+	struct lru_cache *n, *t;
+	struct lc_element *e;
+	unsigned int in_use;
+	int i;
+
+	ERR_IF(mdev->sync_conf.al_extents < 7)
+		mdev->sync_conf.al_extents = 127;
+
+	if (mdev->act_log &&
+	    mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
+		return 0;
+
+	in_use = 0;
+	t = mdev->act_log;
+	n = lc_create("act_log", drbd_al_ext_cache,
+		mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
+
+	if (n == NULL) {
+		dev_err(DEV, "Cannot allocate act_log lru!\n");
+		return -ENOMEM;
+	}
+	spin_lock_irq(&mdev->al_lock);
+	if (t) {
+		for (i = 0; i < t->nr_elements; i++) {
+			e = lc_element_by_index(t, i);
+			if (e->refcnt)
+				dev_err(DEV, "refcnt(%d)==%d\n",
+				    e->lc_number, e->refcnt);
+			in_use += e->refcnt;
+		}
+	}
+	if (!in_use)
+		mdev->act_log = n;
+	spin_unlock_irq(&mdev->al_lock);
+	if (in_use) {
+		dev_err(DEV, "Activity log still in use!\n");
+		lc_destroy(n);
+		return -EBUSY;
+	} else {
+		if (t)
+			lc_destroy(t);
+	}
+	drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elements */
+	return 0;
+}
+
+static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
+{
+	struct request_queue * const q = mdev->rq_queue;
+	int max_hw_sectors = max_bio_size >> 9;
+	int max_segments = 0;
+
+	if (get_ldev_if_state(mdev, D_ATTACHING)) {
+		struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
+
+		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
+		max_segments = mdev->ldev->dc.max_bio_bvecs;
+		put_ldev(mdev);
+	}
+
+	blk_queue_logical_block_size(q, 512);
+	blk_queue_max_hw_sectors(q, max_hw_sectors);
+	/* This is the workaround for "bio would need to, but cannot, be split" */
+	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
+	blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
+
+	if (get_ldev_if_state(mdev, D_ATTACHING)) {
+		struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
+
+		blk_queue_stack_limits(q, b);
+
+		if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
+			dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
+				 q->backing_dev_info.ra_pages,
+				 b->backing_dev_info.ra_pages);
+			q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+		}
+		put_ldev(mdev);
+	}
+}
+
+void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
+{
+	int now, new, local, peer;
+
+	now = queue_max_hw_sectors(mdev->rq_queue) << 9;
+	local = mdev->local_max_bio_size; /* possibly stale last known value, from volatile memory */
+	peer = mdev->peer_max_bio_size; /* possibly stale last known value, from meta data */
+
+	if (get_ldev_if_state(mdev, D_ATTACHING)) {
+		local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
+		mdev->local_max_bio_size = local;
+		put_ldev(mdev);
+	}
+
+	/* We may ignore peer limits if the peer is modern enough:
+	   from 8.3.8 onwards the peer can use multiple BIOs for
+	   a single peer_request. */
+	if (mdev->state.conn >= C_CONNECTED) {
+		if (mdev->agreed_pro_version < 94) {
+			peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+		} else if (mdev->agreed_pro_version == 94)
+			peer = DRBD_MAX_SIZE_H80_PACKET;
+		else /* drbd 8.3.8 onwards */
+			peer = DRBD_MAX_BIO_SIZE;
+	}
+
+	new = min_t(int, local, peer);
+
+	if (mdev->state.role == R_PRIMARY && new < now)
+		dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now);
+
+	if (new != now)
+		dev_info(DEV, "max BIO size = %u\n", new);
+
+	drbd_setup_queue_param(mdev, new);
+}
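+
+/* Peer limit chosen above, by agreed protocol version (comment only):
+ *   < 94:  min(last known peer limit, DRBD_MAX_SIZE_H80_PACKET)
+ *   == 94: DRBD_MAX_SIZE_H80_PACKET (the 32KiB limit of the old header)
+ *   > 94:  DRBD_MAX_BIO_SIZE, since 8.3.8 the peer can split a single
+ *          peer_request into multiple BIOs. */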
+
+/* serialize deconfig (worker exiting, doing cleanup)
+ * and reconfig (drbdsetup disk, drbdsetup net)
+ *
+ * Wait for a potentially exiting worker, then restart it,
+ * or start a new one.  Flush any pending work, there may still be an
+ * after_state_change queued.
+ */
+static void drbd_reconfig_start(struct drbd_conf *mdev)
+{
+	wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
+	wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
+	drbd_thread_start(&mdev->worker);
+	drbd_flush_workqueue(mdev);
+}
+
+/* if still unconfigured, stops worker again.
+ * if configured now, clears CONFIG_PENDING.
+ * wakes potential waiters */
+static void drbd_reconfig_done(struct drbd_conf *mdev)
+{
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.disk == D_DISKLESS &&
+	    mdev->state.conn == C_STANDALONE &&
+	    mdev->state.role == R_SECONDARY) {
+		set_bit(DEVICE_DYING, &mdev->flags);
+		drbd_thread_stop_nowait(&mdev->worker);
+	} else
+		clear_bit(CONFIG_PENDING, &mdev->flags);
+	spin_unlock_irq(&mdev->req_lock);
+	wake_up(&mdev->state_wait);
+}
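+
+/* Illustrative usage sketch (not compiled): every configuration handler
+ * brackets its work with this pair, as drbd_nl_disk_conf() and
+ * drbd_nl_net_conf() below do: */
+#if 0
+	drbd_reconfig_start(mdev);	/* serializes against deconfig */
+	/* ... validate and apply the new disk or net configuration ... */
+	drbd_reconfig_done(mdev);	/* stops the worker again if still unconfigured */
+#endif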
+
+/* Make sure IO is suspended before calling this function. */
+static void drbd_suspend_al(struct drbd_conf *mdev)
+{
+	int s = 0;
+
+	if (lc_try_lock(mdev->act_log)) {
+		drbd_al_shrink(mdev);
+		lc_unlock(mdev->act_log);
+	} else {
+		dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
+		return;
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.conn < C_CONNECTED)
+		s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
+
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (s)
+		dev_info(DEV, "Suspended AL updates\n");
+}
+
+/* always returns 0;
+ * the interesting return code is in reply->ret_code */
+STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	enum drbd_ret_code retcode;
+	enum determine_dev_size dd;
+	sector_t max_possible_sectors;
+	sector_t min_md_device_sectors;
+	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+	struct block_device *bdev;
+	struct lru_cache *resync_lru = NULL;
+	union drbd_state ns, os;
+	enum drbd_state_rv rv;
+	int cp_discovered = 0;
+	int logical_block_size;
+
+	drbd_reconfig_start(mdev);
+
+	/* if you want to reconfigure, please tear down first */
+	if (mdev->state.disk > D_DISKLESS) {
+		retcode = ERR_DISK_CONFIGURED;
+		goto fail;
+	}
+	/* It may just now have detached because of IO error.  Make sure
+	 * drbd_ldev_destroy is done already; we may end up here very fast,
+	 * e.g. if someone calls attach from the on-io-error handler,
+	 * to realize a "hot spare" feature (not that I'd recommend that) */
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+
+	/* allocation not in the IO path, cqueue thread context */
+	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
+	if (!nbc) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	nbc->dc.disk_size     = DRBD_DISK_SIZE_SECT_DEF;
+	nbc->dc.on_io_error   = DRBD_ON_IO_ERROR_DEF;
+	nbc->dc.fencing       = DRBD_FENCING_DEF;
+	nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
+
+	if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto fail;
+	}
+
+	if (get_net_conf(mdev)) {
+		int prot = mdev->net_conf->wire_protocol;
+		put_net_conf(mdev);
+		if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
+			retcode = ERR_STONITH_AND_PROT_A;
+			goto fail;
+		}
+	}
+
+	bdev = blkdev_get_by_path(nbc->dc.backing_dev,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
+	if (IS_ERR(bdev)) {
+		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
+			PTR_ERR(bdev));
+		retcode = ERR_OPEN_DISK;
+		goto fail;
+	}
+	nbc->backing_bdev = bdev;
+
+	/*
+	 * meta_dev_idx >= 0: external fixed size, possibly multiple
+	 * drbd sharing one meta device.  TODO in that case, paranoia
+	 * check that [md_bdev, meta_dev_idx] is not yet used by some
+	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
+	 * should check it for you already; but if you don't, or
+	 * someone fooled it, we need to double check here)
+	 */
+	bdev = blkdev_get_by_path(nbc->dc.meta_dev,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+				  (nbc->dc.meta_dev_idx < 0) ?
+				  (void *)mdev : (void *)drbd_m_holder);
+	if (IS_ERR(bdev)) {
+		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
+			PTR_ERR(bdev));
+		retcode = ERR_OPEN_MD_DISK;
+		goto fail;
+	}
+	nbc->md_bdev = bdev;
+
+	if ((nbc->backing_bdev == nbc->md_bdev) !=
+	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto fail;
+	}
+
+	resync_lru = lc_create("resync", drbd_bm_ext_cache,
+			61, sizeof(struct bm_extent),
+			offsetof(struct bm_extent, lce));
+	if (!resync_lru) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
+	drbd_md_set_sector_offsets(mdev, nbc);
+
+	if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
+		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
+			(unsigned long long) drbd_get_max_capacity(nbc),
+			(unsigned long long) nbc->dc.disk_size);
+		retcode = ERR_DISK_TOO_SMALL;
+		goto fail;
+	}
+
+	if (nbc->dc.meta_dev_idx < 0) {
+		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
+		/* at least one MB, otherwise it does not make sense */
+		min_md_device_sectors = (2<<10);
+	} else {
+		max_possible_sectors = DRBD_MAX_SECTORS;
+		min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+	}
+
+	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
+		retcode = ERR_MD_DISK_TOO_SMALL;
+		dev_warn(DEV, "refusing attach: md-device too small, "
+		     "at least %llu sectors needed for this meta-disk type\n",
+		     (unsigned long long) min_md_device_sectors);
+		goto fail;
+	}
+
+	/* Make sure the new disk is big enough
+	 * (we may currently be R_PRIMARY with no local disk...) */
+	if (drbd_get_max_capacity(nbc) <
+	    drbd_get_capacity(mdev->this_bdev)) {
+		retcode = ERR_DISK_TOO_SMALL;
+		goto fail;
+	}
+
+	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
+
+	if (nbc->known_size > max_possible_sectors) {
+		dev_warn(DEV, "==> truncating very big lower level device "
+			"to currently maximum possible %llu sectors <==\n",
+			(unsigned long long) max_possible_sectors);
+		if (nbc->dc.meta_dev_idx >= 0)
+			dev_warn(DEV, "==>> using internal or flexible "
+				      "meta data may help <<==\n");
+	}
+
+	drbd_suspend_io(mdev);
+	/* also wait for the last barrier ack. */
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
+	/* and for any other previously queued work */
+	drbd_flush_workqueue(mdev);
+
+	rv = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
+	retcode = rv;  /* FIXME: Type mismatch. */
+	drbd_resume_io(mdev);
+	if (rv < SS_SUCCESS)
+		goto fail;
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING))
+		goto force_diskless;
+
+	drbd_md_set_sector_offsets(mdev, nbc);
+
+	/* allocate a second IO page if logical_block_size != 512 */
+	logical_block_size = bdev_logical_block_size(nbc->md_bdev);
+	if (logical_block_size == 0)
+		logical_block_size = MD_SECTOR_SIZE;
+
+	if (logical_block_size != MD_SECTOR_SIZE) {
+		if (!mdev->md_io_tmpp) {
+			struct page *page = alloc_page(GFP_NOIO);
+			if (!page)
+				goto force_diskless_dec;
+
+			dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
+			     logical_block_size, MD_SECTOR_SIZE);
+			dev_warn(DEV, "Workaround engaged (has performance impact).\n");
+
+			mdev->md_io_tmpp = page;
+		}
+	}
+
+	if (!mdev->bitmap) {
+		if (drbd_bm_init(mdev)) {
+			retcode = ERR_NOMEM;
+			goto force_diskless_dec;
+		}
+	}
+
+	retcode = drbd_md_read(mdev, nbc);
+	if (retcode != NO_ERROR)
+		goto force_diskless_dec;
+
+	if (mdev->state.conn < C_CONNECTED &&
+	    mdev->state.role == R_PRIMARY &&
+	    (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
+		dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
+		    (unsigned long long)mdev->ed_uuid);
+		retcode = ERR_DATA_NOT_CURRENT;
+		goto force_diskless_dec;
+	}
+
+	/* Since we are diskless, fix the activity log first... */
+	if (drbd_check_al_size(mdev)) {
+		retcode = ERR_NOMEM;
+		goto force_diskless_dec;
+	}
+
+	/* Prevent shrinking of consistent devices! */
+	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
+	    drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
+		dev_warn(DEV, "refusing to truncate a consistent device\n");
+		retcode = ERR_DISK_TOO_SMALL;
+		goto force_diskless_dec;
+	}
+
+	if (!drbd_al_read_log(mdev, nbc)) {
+		retcode = ERR_IO_MD_DISK;
+		goto force_diskless_dec;
+	}
+
+	/* Reset the "barriers don't work" bits here, then force meta data to
+	 * be written, to ensure we determine if barriers are supported. */
+	if (nbc->dc.no_md_flush)
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+	else
+		clear_bit(MD_NO_BARRIER, &mdev->flags);
+
+	/* Point of no return reached.
+	 * Devices and memory are no longer released by error cleanup below.
+	 * now mdev takes over responsibility, and the state engine should
+	 * clean it up somewhere.  */
+	D_ASSERT(mdev->ldev == NULL);
+	mdev->ldev = nbc;
+	mdev->resync = resync_lru;
+	nbc = NULL;
+	resync_lru = NULL;
+
+	mdev->write_ordering = WO_bio_barrier;
+	drbd_bump_write_ordering(mdev, WO_bio_barrier);
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
+		set_bit(CRASHED_PRIMARY, &mdev->flags);
+	else
+		clear_bit(CRASHED_PRIMARY, &mdev->flags);
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
+	    !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
+		set_bit(CRASHED_PRIMARY, &mdev->flags);
+		cp_discovered = 1;
+	}
+
+	mdev->send_cnt = 0;
+	mdev->recv_cnt = 0;
+	mdev->read_cnt = 0;
+	mdev->writ_cnt = 0;
+
+	drbd_reconsider_max_bio_size(mdev);
+
+	/* If I am currently not R_PRIMARY,
+	 * but meta data primary indicator is set,
+	 * I just now recover from a hard crash,
+	 * and have been R_PRIMARY before that crash.
+	 *
+	 * Now, if I had no connection before that crash
+	 * (have been degraded R_PRIMARY), chances are that
+	 * I won't find my peer now either.
+	 *
+	 * In that case, and _only_ in that case,
+	 * we use the degr-wfc-timeout instead of the default,
+	 * so we can automatically recover from a crash of a
+	 * degraded but active "cluster" after a certain timeout.
+	 */
+	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+	if (mdev->state.role != R_PRIMARY &&
+	     drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
+	    !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
+		set_bit(USE_DEGR_WFC_T, &mdev->flags);
+
+	dd = drbd_determine_dev_size(mdev, 0);
+	if (dd == dev_size_error) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto force_diskless_dec;
+	} else if (dd == grew)
+		set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+		dev_info(DEV, "Assuming that all blocks are out of sync "
+		     "(aka FullSync)\n");
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
+			"set_n_write from attaching", BM_LOCKED_MASK)) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	} else {
+		if (drbd_bitmap_io(mdev, &drbd_bm_read,
+			"read from attaching", BM_LOCKED_MASK) < 0) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	}
+
+	if (cp_discovered) {
+		drbd_al_apply_to_bm(mdev);
+		if (drbd_bitmap_io(mdev, &drbd_bm_write,
+			"crashed primary apply AL", BM_LOCKED_MASK)) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	}
+
+	if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
+		drbd_suspend_al(mdev); /* IO is still suspended here... */
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+	ns.i = os.i;
+	/* If MDF_CONSISTENT is not set go into D_INCONSISTENT state,
+	   otherwise investigate MDF_WAS_UP_TO_DATE...
+	   If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
+	   otherwise into D_CONSISTENT state.
+	*/
+	if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
+		if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
+			ns.disk = D_CONSISTENT;
+		else
+			ns.disk = D_OUTDATED;
+	} else {
+		ns.disk = D_INCONSISTENT;
+	}
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
+		ns.pdsk = D_OUTDATED;
+
+	if ( ns.disk == D_CONSISTENT &&
+	    (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
+		ns.disk = D_UP_TO_DATE;
+
+	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
+	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
+	   this point, because drbd_request_state() modifies these
+	   flags. */
+
+	/* In case we are C_CONNECTED postpone any decision on the new disk
+	   state after the negotiation phase. */
+	if (mdev->state.conn == C_CONNECTED) {
+		mdev->new_state_tmp.i = ns.i;
+		ns.i = os.i;
+		ns.disk = D_NEGOTIATING;
+
+		/* We expect to receive up-to-date UUIDs soon.
+		   To avoid a race in receive_state, free p_uuid while
+		   holding req_lock. I.e. atomic with the state change */
+		kfree(mdev->p_uuid);
+		mdev->p_uuid = NULL;
+	}
+
+	DRBD_STATE_DEBUG_INIT_VAL(ns);
+	rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	ns.i = mdev->state.i;
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS)
+		goto force_diskless_dec;
+
+	if (mdev->state.role == R_PRIMARY)
+		mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
+	else
+		mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+
+	drbd_md_mark_dirty(mdev);
+	drbd_md_sync(mdev);
+
+	drbd_kobject_uevent(mdev);
+	put_ldev(mdev);
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+
+ force_diskless_dec:
+	put_ldev(mdev);
+ force_diskless:
+	drbd_force_state(mdev, NS(disk, D_FAILED));
+	drbd_md_sync(mdev);
+ fail:
+	if (nbc) {
+		if (nbc->backing_bdev)
+			blkdev_put(nbc->backing_bdev,
+				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+		if (nbc->md_bdev)
+			blkdev_put(nbc->md_bdev,
+				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+		kfree(nbc);
+	}
+	lc_destroy(resync_lru);
+
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+}
+
+/* Detaching the disk is a process in multiple stages.  First we need to lock
+ * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+ * internal references as well.
+ * Only then we have finally detached. */
+STATIC int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			  struct drbd_nl_cfg_reply *reply)
+{
+	enum drbd_ret_code retcode;
+	int ret;
+	struct detach dt = {};
+
+	if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		goto out;
+	}
+
+	if (dt.detach_force) {
+		drbd_force_state(mdev, NS(disk, D_FAILED));
+		reply->ret_code = SS_SUCCESS;
+		goto out;
+	}
+
+	drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
+	drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */
+	retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
+	drbd_md_put_buffer(mdev);
+	/* D_FAILED will transition to DISKLESS. */
+	ret = wait_event_interruptible(mdev->misc_wait,
+			mdev->state.disk != D_FAILED);
+	drbd_resume_io(mdev);
+
+	if ((int)retcode == (int)SS_IS_DISKLESS)
+		retcode = SS_NOTHING_TO_DO;
+	if (ret)
+		retcode = ERR_INTR;
+	reply->ret_code = retcode;
+out:
+	return 0;
+}
+
+STATIC int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			    struct drbd_nl_cfg_reply *reply)
+{
+	int i, ns;
+	enum drbd_ret_code retcode;
+	struct net_conf *new_conf = NULL;
+	struct crypto_hash *tfm = NULL;
+	struct crypto_hash *integrity_w_tfm = NULL;
+	struct crypto_hash *integrity_r_tfm = NULL;
+	struct hlist_head *new_tl_hash = NULL;
+	struct hlist_head *new_ee_hash = NULL;
+	struct drbd_conf *odev;
+	char hmac_name[CRYPTO_MAX_ALG_NAME];
+	void *int_dig_out = NULL;
+	void *int_dig_in = NULL;
+	void *int_dig_vv = NULL;
+	struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+
+	drbd_reconfig_start(mdev);
+
+	if (mdev->state.conn > C_STANDALONE) {
+		retcode = ERR_NET_CONFIGURED;
+		goto fail;
+	}
+
+	/* allocation not in the IO path, cqueue thread context */
+	new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+	if (!new_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	new_conf->timeout	   = DRBD_TIMEOUT_DEF;
+	new_conf->try_connect_int  = DRBD_CONNECT_INT_DEF;
+	new_conf->ping_int	   = DRBD_PING_INT_DEF;
+	new_conf->max_epoch_size   = DRBD_MAX_EPOCH_SIZE_DEF;
+	new_conf->max_buffers	   = DRBD_MAX_BUFFERS_DEF;
+	new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
+	new_conf->sndbuf_size	   = DRBD_SNDBUF_SIZE_DEF;
+	new_conf->rcvbuf_size	   = DRBD_RCVBUF_SIZE_DEF;
+	new_conf->ko_count	   = DRBD_KO_COUNT_DEF;
+	new_conf->after_sb_0p	   = DRBD_AFTER_SB_0P_DEF;
+	new_conf->after_sb_1p	   = DRBD_AFTER_SB_1P_DEF;
+	new_conf->after_sb_2p	   = DRBD_AFTER_SB_2P_DEF;
+	new_conf->want_lose	   = 0;
+	new_conf->two_primaries    = 0;
+	new_conf->wire_protocol    = DRBD_PROT_C;
+	new_conf->ping_timeo	   = DRBD_PING_TIMEO_DEF;
+	new_conf->rr_conflict	   = DRBD_RR_CONFLICT_DEF;
+	new_conf->on_congestion    = DRBD_ON_CONGESTION_DEF;
+	new_conf->cong_extents     = DRBD_CONG_EXTENTS_DEF;
+
+	if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (new_conf->two_primaries
+	    && (new_conf->wire_protocol != DRBD_PROT_C)) {
+		retcode = ERR_NOT_PROTO_C;
+		goto fail;
+	}
+
+	if (get_ldev(mdev)) {
+		enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+		if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
+			retcode = ERR_STONITH_AND_PROT_A;
+			goto fail;
+		}
+	}
+
+	if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
+		retcode = ERR_CONG_NOT_PROTO_A;
+		goto fail;
+	}
+
+	if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
+		retcode = ERR_DISCARD;
+		goto fail;
+	}
+
+	retcode = NO_ERROR;
+
+	new_my_addr = (struct sockaddr *)&new_conf->my_addr;
+	new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
+	for (i = 0; i < minor_count; i++) {
+		odev = minor_to_mdev(i);
+		if (!odev || odev == mdev)
+			continue;
+		if (get_net_conf(odev)) {
+			taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
+			if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
+			    !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
+				retcode = ERR_LOCAL_ADDR;
+
+			taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
+			if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
+			    !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
+				retcode = ERR_PEER_ADDR;
+
+			put_net_conf(odev);
+			if (retcode != NO_ERROR)
+				goto fail;
+		}
+	}
+
+	if (new_conf->cram_hmac_alg[0] != 0) {
+		snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+			new_conf->cram_hmac_alg);
+		tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(tfm)) {
+			tfm = NULL;
+			retcode = ERR_AUTH_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
+			retcode = ERR_AUTH_ALG_ND;
+			goto fail;
+		}
+	}
+
+	if (new_conf->integrity_alg[0]) {
+		integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(integrity_w_tfm)) {
+			integrity_w_tfm = NULL;
+			retcode = ERR_INTEGRITY_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
+			retcode = ERR_INTEGRITY_ALG_ND;
+			goto fail;
+		}
+
+		integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(integrity_r_tfm)) {
+			integrity_r_tfm = NULL;
+			retcode = ERR_INTEGRITY_ALG;
+			goto fail;
+		}
+	}
+
+	ns = new_conf->max_epoch_size/8;
+	if (mdev->tl_hash_s != ns) {
+		new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
+		if (!new_tl_hash) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	ns = new_conf->max_buffers/8;
+	if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
+		new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
+		if (!new_ee_hash) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+
+#if 0
+	/* for the connection loss logic in drbd_recv
+	 * I _need_ the resulting timeo in jiffies to be
+	 * non-zero and different
+	 *
+	 * XXX maybe rather store the value scaled to jiffies?
+	 * Note: MAX_SCHEDULE_TIMEOUT/HZ*HZ != MAX_SCHEDULE_TIMEOUT
+	 *	 and HZ > 10; which is unlikely to change...
+	 *	 Thus, if interrupted by a signal,
+	 *	 sock_{send,recv}msg returns -EINTR,
+	 *	 if the timeout expires, -EAGAIN.
+	 */
+	/* unlikely: someone disabled the timeouts ...
+	 * just put some huge values in there. */
+	if (!new_conf->ping_int)
+		new_conf->ping_int = MAX_SCHEDULE_TIMEOUT/HZ;
+	if (!new_conf->timeout)
+		new_conf->timeout = MAX_SCHEDULE_TIMEOUT/HZ*10;
+	if (new_conf->ping_int*10 < new_conf->timeout)
+		new_conf->timeout = new_conf->ping_int*10/6;
+	if (new_conf->ping_int*10 == new_conf->timeout)
+		new_conf->ping_int = new_conf->ping_int+1;
+#endif
+
+	/* allocation not in the IO path, cqueue thread context */
+	if (integrity_w_tfm) {
+		i = crypto_hash_digestsize(integrity_w_tfm);
+		int_dig_out = kmalloc(i, GFP_KERNEL);
+		if (!int_dig_out) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+		int_dig_in = kmalloc(i, GFP_KERNEL);
+		if (!int_dig_in) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+		int_dig_vv = kmalloc(i, GFP_KERNEL);
+		if (!int_dig_vv) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	if (!mdev->bitmap) {
+		if (drbd_bm_init(mdev)) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	drbd_flush_workqueue(mdev);
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->net_conf != NULL) {
+		retcode = ERR_NET_CONFIGURED;
+		spin_unlock_irq(&mdev->req_lock);
+		goto fail;
+	}
+	mdev->net_conf = new_conf;
+
+	mdev->send_cnt = 0;
+	mdev->recv_cnt = 0;
+
+	if (new_tl_hash) {
+		kfree(mdev->tl_hash);
+		mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
+		mdev->tl_hash = new_tl_hash;
+	}
+
+	if (new_ee_hash) {
+		kfree(mdev->ee_hash);
+		mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
+		mdev->ee_hash = new_ee_hash;
+	}
+
+	crypto_free_hash(mdev->cram_hmac_tfm);
+	mdev->cram_hmac_tfm = tfm;
+
+	crypto_free_hash(mdev->integrity_w_tfm);
+	mdev->integrity_w_tfm = integrity_w_tfm;
+
+	crypto_free_hash(mdev->integrity_r_tfm);
+	mdev->integrity_r_tfm = integrity_r_tfm;
+
+	kfree(mdev->int_dig_out);
+	kfree(mdev->int_dig_in);
+	kfree(mdev->int_dig_vv);
+	mdev->int_dig_out = int_dig_out;
+	mdev->int_dig_in = int_dig_in;
+	mdev->int_dig_vv = int_dig_vv;
+	retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
+	spin_unlock_irq(&mdev->req_lock);
+
+	drbd_kobject_uevent(mdev);
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+
+fail:
+	kfree(int_dig_out);
+	kfree(int_dig_in);
+	kfree(int_dig_vv);
+	crypto_free_hash(tfm);
+	crypto_free_hash(integrity_w_tfm);
+	crypto_free_hash(integrity_r_tfm);
+	kfree(new_tl_hash);
+	kfree(new_ee_hash);
+	kfree(new_conf);
+
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+}
+
+STATIC int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode;
+	struct disconnect dc;
+
+	memset(&dc, 0, sizeof(struct disconnect));
+	if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (dc.force) {
+		spin_lock_irq(&mdev->req_lock);
+		if (mdev->state.conn >= C_WF_CONNECTION)
+			_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL);
+		spin_unlock_irq(&mdev->req_lock);
+		goto done;
+	}
+
+	retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
+
+	if (retcode == SS_NOTHING_TO_DO)
+		goto done;
+	else if (retcode == SS_ALREADY_STANDALONE)
+		goto done;
+	else if (retcode == SS_PRIMARY_NOP) {
+		/* Our state checking code wants to see the peer outdated. */
+		retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
+						      pdsk, D_OUTDATED));
+	} else if (retcode == SS_CW_FAILED_BY_PEER) {
+		/* The peer probably wants to see us outdated. */
+		retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
+							disk, D_OUTDATED),
+					      CS_ORDERED);
+		if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			retcode = SS_SUCCESS;
+		}
+	}
+
+	if (retcode < SS_SUCCESS)
+		goto fail;
+
+	if (wait_event_interruptible(mdev->state_wait,
+				     mdev->state.conn != C_DISCONNECTING)) {
+		/* Do not test for mdev->state.conn == C_STANDALONE, since
+		   someone else might connect us in the meantime! */
+		retcode = ERR_INTR;
+		goto fail;
+	}
+
+ done:
+	retcode = NO_ERROR;
+ fail:
+	drbd_md_sync(mdev);
+	reply->ret_code = retcode;
+	return 0;
+}
+
+void resync_after_online_grow(struct drbd_conf *mdev)
+{
+	int iass; /* I am sync source */
+
+	dev_info(DEV, "Resync of new storage after online grow\n");
+	if (mdev->state.role != mdev->state.peer)
+		iass = (mdev->state.role == R_PRIMARY);
+	else
+		iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+
+	if (iass)
+		drbd_start_resync(mdev, C_SYNC_SOURCE);
+	else
+		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+}
+
+STATIC int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			  struct drbd_nl_cfg_reply *reply)
+{
+	struct resize rs;
+	int retcode = NO_ERROR;
+	enum determine_dev_size dd;
+	enum dds_flags ddsf;
+
+	memset(&rs, 0, sizeof(struct resize));
+	if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (mdev->state.conn > C_CONNECTED) {
+		retcode = ERR_RESIZE_RESYNC;
+		goto fail;
+	}
+
+	if (mdev->state.role == R_SECONDARY &&
+	    mdev->state.peer == R_SECONDARY) {
+		retcode = ERR_NO_PRIMARY;
+		goto fail;
+	}
+
+	if (!get_ldev(mdev)) {
+		retcode = ERR_NO_DISK;
+		goto fail;
+	}
+
+	if (rs.no_resync && mdev->agreed_pro_version < 93) {
+		retcode = ERR_NEED_APV_93;
+		goto fail_ldev;
+	}
+
+	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
+		mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+
+	mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
+	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+	dd = drbd_determine_dev_size(mdev, ddsf);
+	drbd_md_sync(mdev);
+	put_ldev(mdev);
+	if (dd == dev_size_error) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto fail;
+	}
+
+	if (mdev->state.conn == C_CONNECTED) {
+		if (dd == grew)
+			set_bit(RESIZE_PENDING, &mdev->flags);
+
+		drbd_send_uuids(mdev);
+		drbd_send_sizes(mdev, 1, ddsf);
+	}
+
+ fail:
+	reply->ret_code = retcode;
+	return 0;
+
+ fail_ldev:
+	put_ldev(mdev);
+	goto fail;
+}
+
+STATIC int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			       struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+	int err;
+	int ovr; /* online verify running */
+	int rsr; /* re-sync running */
+	struct crypto_hash *verify_tfm = NULL;
+	struct crypto_hash *csums_tfm = NULL;
+	struct syncer_conf sc;
+	cpumask_var_t new_cpu_mask;
+	int *rs_plan_s = NULL;
+	int fifo_size;
+
+	if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
+		memset(&sc, 0, sizeof(struct syncer_conf));
+		sc.rate       = DRBD_RATE_DEF;
+		sc.after      = DRBD_AFTER_DEF;
+		sc.al_extents = DRBD_AL_EXTENTS_DEF;
+		sc.on_no_data  = DRBD_ON_NO_DATA_DEF;
+		sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
+		sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
+		sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
+		sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
+		sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
+	} else
+		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
+
+	if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	/* re-sync running */
+	rsr = (mdev->state.conn == C_SYNC_SOURCE ||
+	       mdev->state.conn == C_SYNC_TARGET ||
+	       mdev->state.conn == C_PAUSED_SYNC_S ||
+	       mdev->state.conn == C_PAUSED_SYNC_T);
+
+	if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
+		retcode = ERR_CSUMS_RESYNC_RUNNING;
+		goto fail;
+	}
+
+	if (!rsr && sc.csums_alg[0]) {
+		csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(csums_tfm)) {
+			csums_tfm = NULL;
+			retcode = ERR_CSUMS_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
+			retcode = ERR_CSUMS_ALG_ND;
+			goto fail;
+		}
+	}
+
+	/* online verify running */
+	ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
+
+	if (ovr) {
+		if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
+			retcode = ERR_VERIFY_RUNNING;
+			goto fail;
+		}
+	}
+
+	if (!ovr && sc.verify_alg[0]) {
+		verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(verify_tfm)) {
+			verify_tfm = NULL;
+			retcode = ERR_VERIFY_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
+			retcode = ERR_VERIFY_ALG_ND;
+			goto fail;
+		}
+	}
+
+	/* silently ignore cpu mask on UP kernel */
+	if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
+		err = __bitmap_parse(sc.cpu_mask, 32, 0,
+				cpumask_bits(new_cpu_mask), nr_cpu_ids);
+		if (err) {
+			dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
+			retcode = ERR_CPU_MASK_PARSE;
+			goto fail;
+		}
+	}
+
+	ERR_IF (sc.rate < 1) sc.rate = 1;
+	ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
+#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
+	if (sc.al_extents > AL_MAX) {
+		dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
+		sc.al_extents = AL_MAX;
+	}
+#undef AL_MAX
+
+	/* to avoid spurious errors when configuring minors before configuring
+	 * the minors they depend on: if necessary, first create the minor we
+	 * depend on */
+	if (sc.after >= 0)
+		ensure_mdev(sc.after, 1);
+
+	/* most sanity checks done, try to assign the new sync-after
+	 * dependency.  need to hold the global lock in there,
+	 * to avoid a race in the dependency loop check. */
+	retcode = drbd_alter_sa(mdev, sc.after);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+	if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
+		rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
+		if (!rs_plan_s) {
+			dev_err(DEV, "kmalloc of fifo_buffer failed");
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	/* ok, assign the rest of it as well.
+	 * lock against receive_SyncParam() */
+	spin_lock(&mdev->peer_seq_lock);
+	mdev->sync_conf = sc;
+
+	if (!rsr) {
+		crypto_free_hash(mdev->csums_tfm);
+		mdev->csums_tfm = csums_tfm;
+		csums_tfm = NULL;
+	}
+
+	if (!ovr) {
+		crypto_free_hash(mdev->verify_tfm);
+		mdev->verify_tfm = verify_tfm;
+		verify_tfm = NULL;
+	}
+
+	if (fifo_size != mdev->rs_plan_s.size) {
+		kfree(mdev->rs_plan_s.values);
+		mdev->rs_plan_s.values = rs_plan_s;
+		mdev->rs_plan_s.size   = fifo_size;
+		mdev->rs_planed = 0;
+		rs_plan_s = NULL;
+	}
+
+	spin_unlock(&mdev->peer_seq_lock);
+
+	if (get_ldev(mdev)) {
+		wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+		drbd_al_shrink(mdev);
+		err = drbd_check_al_size(mdev);
+		lc_unlock(mdev->act_log);
+		wake_up(&mdev->al_wait);
+
+		put_ldev(mdev);
+		drbd_md_sync(mdev);
+
+		if (err) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	if (mdev->state.conn >= C_CONNECTED)
+		drbd_send_sync_param(mdev, &sc);
+
+	if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
+		cpumask_copy(mdev->cpu_mask, new_cpu_mask);
+		drbd_calc_cpu_mask(mdev);
+		mdev->receiver.reset_cpu_mask = 1;
+		mdev->asender.reset_cpu_mask = 1;
+		mdev->worker.reset_cpu_mask = 1;
+	}
+
+	drbd_kobject_uevent(mdev);
+fail:
+	kfree(rs_plan_s);
+	free_cpumask_var(new_cpu_mask);
+	crypto_free_hash(csums_tfm);
+	crypto_free_hash(verify_tfm);
+	reply->ret_code = retcode;
+	return 0;
+}
+
+STATIC int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode;
+
+	/* If there is still bitmap IO pending, probably because a previous
+	 * resync has just finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
+	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+
+	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
+
+	if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
+		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
+
+	while (retcode == SS_NEED_CONNECTION) {
+		spin_lock_irq(&mdev->req_lock);
+		if (mdev->state.conn < C_CONNECTED)
+			retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
+		spin_unlock_irq(&mdev->req_lock);
+
+		if (retcode != SS_NEED_CONNECTION)
+			break;
+
+		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
+	}
+	drbd_resume_io(mdev);
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+STATIC int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
+{
+	int rv;
+
+	rv = drbd_bmio_set_n_write(mdev);
+	drbd_suspend_al(mdev);
+	return rv;
+}
+
+STATIC int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+				   struct drbd_nl_cfg_reply *reply)
+{
+	int retcode;
+
+	/* If there is still bitmap IO pending, probably because a previous
+	 * resync has just finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
+	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+
+	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
+
+	if (retcode < SS_SUCCESS) {
+		if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
+			/* The peer will get a resync upon connect anyway. Just make that
+			   into a full resync. */
+			retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
+			if (retcode >= SS_SUCCESS) {
+				if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
+					"set_n_write from invalidate_peer",
+					BM_LOCKED_SET_ALLOWED))
+					retcode = ERR_IO_MD_DISK;
+			}
+		} else
+			retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+	}
+	drbd_resume_io(mdev);
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+STATIC int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+
+	if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+		retcode = ERR_PAUSE_IS_SET;
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+STATIC int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			       struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+	union drbd_state s;
+
+	if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+		s = mdev->state;
+		if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
+			retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
+				  s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
+		} else {
+			retcode = ERR_PAUSE_IS_CLEAR;
+		}
+	}
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+STATIC int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
+
+	return 0;
+}
+
+STATIC int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
+		drbd_uuid_new_current(mdev);
+		clear_bit(NEW_CUR_UUID, &mdev->flags);
+	}
+	drbd_suspend_io(mdev);
+	reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+	if (reply->ret_code == SS_SUCCESS) {
+		if (mdev->state.conn < C_CONNECTED)
+			tl_clear(mdev);
+		if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
+			tl_restart(mdev, fail_frozen_disk_io);
+	}
+	drbd_resume_io(mdev);
+
+	return 0;
+}
+
+STATIC int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
+	return 0;
+}
+
+STATIC int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl;
+
+	tl = reply->tag_list;
+
+	if (get_ldev(mdev)) {
+		tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
+		put_ldev(mdev);
+	}
+
+	if (get_net_conf(mdev)) {
+		tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
+		put_net_conf(mdev);
+	}
+	tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
+
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+STATIC int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl = reply->tag_list;
+	union drbd_state s = mdev->state;
+	unsigned long rs_left;
+	unsigned int res;
+
+	tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+
+	/* no local ref, no bitmap, no syncer progress. */
+	if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
+		if (get_ldev(mdev)) {
+			drbd_get_syncer_progress(mdev, &rs_left, &res);
+			tl = tl_add_int(tl, T_sync_progress, &res);
+			put_ldev(mdev);
+		}
+	}
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+STATIC int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl;
+
+	tl = reply->tag_list;
+
+	if (get_ldev(mdev)) {
+		tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
+		tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
+		put_ldev(mdev);
+	}
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+/**
+ * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
+ * @mdev:	DRBD device.
+ * @nlp:	Netlink/connector packet from drbdsetup
+ * @reply:	Reply packet for drbdsetup
+ */
+STATIC int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+				    struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl;
+	char rv;
+
+	tl = reply->tag_list;
+
+	rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+	     test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
+
+	tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+STATIC int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+				    struct drbd_nl_cfg_reply *reply)
+{
+	/* default to resume from last known position, if possible */
+	struct start_ov args =
+		{ .start_sector = mdev->ov_start_sector };
+
+	if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		return 0;
+	}
+
+	/* If there is still bitmap IO pending, e.g. from a previous resync or
+	 * verify that has just finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
+	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+
+	/* w_make_ov_request expects position to be aligned */
+	mdev->ov_start_sector = args.start_sector & ~(sector_t)(BM_SECT_PER_BIT-1);
+	reply->ret_code = drbd_request_state(mdev, NS(conn, C_VERIFY_S));
+	drbd_resume_io(mdev);
+	return 0;
+}
+
+
+STATIC int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+	int skip_initial_sync = 0;
+	int err;
+
+	struct new_c_uuid args;
+
+	memset(&args, 0, sizeof(struct new_c_uuid));
+	if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		return 0;
+	}
+
+	mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
+
+	if (!get_ldev(mdev)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	/* this is "skip initial sync", assume to be clean */
+	if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
+	    mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
+		dev_info(DEV, "Preparing to skip initial sync\n");
+		skip_initial_sync = 1;
+	} else if (mdev->state.conn != C_STANDALONE) {
+		retcode = ERR_CONNECTED;
+		goto out_dec;
+	}
+
+	drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
+	drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
+
+	if (args.clear_bm) {
+		err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
+			"clear_n_write from new_c_uuid", BM_LOCKED_MASK);
+		if (err) {
+			dev_err(DEV, "Writing bitmap failed with %d\n",err);
+			retcode = ERR_IO_MD_DISK;
+		}
+		if (skip_initial_sync) {
+			drbd_send_uuids_skip_initial_sync(mdev);
+			_drbd_uuid_set(mdev, UI_BITMAP, 0);
+			drbd_print_uuids(mdev, "cleared bitmap UUID");
+			spin_lock_irq(&mdev->req_lock);
+			_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			spin_unlock_irq(&mdev->req_lock);
+		}
+	}
+
+	drbd_md_sync(mdev);
+out_dec:
+	put_ldev(mdev);
+out:
+	mutex_unlock(&mdev->state_mutex);
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+struct cn_handler_struct {
+	int (*function)(struct drbd_conf *,
+			 struct drbd_nl_cfg_req *,
+			 struct drbd_nl_cfg_reply *);
+	int reply_body_size;
+};
+
+static struct cn_handler_struct cnd_table[] = {
+	[ P_primary ]		= { &drbd_nl_primary,		0 },
+	[ P_secondary ]		= { &drbd_nl_secondary,		0 },
+	[ P_disk_conf ]		= { &drbd_nl_disk_conf,		0 },
+	[ P_detach ]		= { &drbd_nl_detach,		0 },
+	[ P_net_conf ]		= { &drbd_nl_net_conf,		0 },
+	[ P_disconnect ]	= { &drbd_nl_disconnect,	0 },
+	[ P_resize ]		= { &drbd_nl_resize,		0 },
+	[ P_syncer_conf ]	= { &drbd_nl_syncer_conf,	0 },
+	[ P_invalidate ]	= { &drbd_nl_invalidate,	0 },
+	[ P_invalidate_peer ]	= { &drbd_nl_invalidate_peer,	0 },
+	[ P_pause_sync ]	= { &drbd_nl_pause_sync,	0 },
+	[ P_resume_sync ]	= { &drbd_nl_resume_sync,	0 },
+	[ P_suspend_io ]	= { &drbd_nl_suspend_io,	0 },
+	[ P_resume_io ]		= { &drbd_nl_resume_io,		0 },
+	[ P_outdate ]		= { &drbd_nl_outdate,		0 },
+	[ P_get_config ]	= { &drbd_nl_get_config,
+				    sizeof(struct syncer_conf_tag_len_struct) +
+				    sizeof(struct disk_conf_tag_len_struct) +
+				    sizeof(struct net_conf_tag_len_struct) },
+	[ P_get_state ]		= { &drbd_nl_get_state,
+				    sizeof(struct get_state_tag_len_struct) +
+				    sizeof(struct sync_progress_tag_len_struct)	},
+	[ P_get_uuids ]		= { &drbd_nl_get_uuids,
+				    sizeof(struct get_uuids_tag_len_struct) },
+	[ P_get_timeout_flag ]	= { &drbd_nl_get_timeout_flag,
+				    sizeof(struct get_timeout_flag_tag_len_struct)},
+	[ P_start_ov ]		= { &drbd_nl_start_ov,		0 },
+	[ P_new_c_uuid ]	= { &drbd_nl_new_c_uuid,	0 },
+};
+
+#ifdef KERNEL_HAS_CN_SKB_PARMS
+STATIC void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
+{
+#else
+STATIC void drbd_connector_callback(void *data)
+{
+	struct cn_msg *req = data;
+#endif
+	struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
+	struct cn_handler_struct *cm;
+	struct cn_msg *cn_reply;
+	struct drbd_nl_cfg_reply *reply;
+	struct drbd_conf *mdev;
+	int retcode, rr;
+	int reply_size = sizeof(struct cn_msg)
+		+ sizeof(struct drbd_nl_cfg_reply)
+		+ sizeof(short int);
+
+	if (!try_module_get(THIS_MODULE)) {
+		printk(KERN_ERR "drbd: try_module_get() failed!\n");
+		return;
+	}
+
+#ifdef KERNEL_HAS_CN_SKB_PARMS
+# ifdef HAVE_NL_SKB_EFF_CAP
+	if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) {
+		retcode = ERR_PERM;
+		goto fail;
+	}
+# else
+	if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) {
+		retcode = ERR_PERM;
+		goto fail;
+	}
+# endif
+#endif
+
+	mdev = ensure_mdev(nlp->drbd_minor,
+			(nlp->flags & DRBD_NL_CREATE_DEVICE));
+	if (!mdev) {
+		retcode = ERR_MINOR_INVALID;
+		goto fail;
+	}
+
+	trace_drbd_netlink(req, 1);
+
+	if (nlp->packet_type >= P_nl_after_last_packet ||
+	    nlp->packet_type == P_return_code_only) {
+		retcode = ERR_PACKET_NR;
+		goto fail;
+	}
+
+	cm = cnd_table + nlp->packet_type;
+
+	/* This may happen if packet number is 0: */
+	if (cm->function == NULL) {
+		retcode = ERR_PACKET_NR;
+		goto fail;
+	}
+
+	reply_size += cm->reply_body_size;
+
+	/* allocation not in the IO path, cqueue thread context */
+	cn_reply = kzalloc(reply_size, GFP_KERNEL);
+	if (!cn_reply) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+	reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
+
+	reply->packet_type =
+		cm->reply_body_size ? nlp->packet_type : P_return_code_only;
+	reply->minor = nlp->drbd_minor;
+	reply->ret_code = NO_ERROR; /* Might be modified by cm->function. */
+	/* reply->tag_list; might be modified by cm->function. */
+
+	rr = cm->function(mdev, nlp, reply);
+
+	cn_reply->id = req->id;
+	cn_reply->seq = req->seq;
+	cn_reply->ack = req->ack  + 1;
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
+	cn_reply->flags = 0;
+
+	trace_drbd_netlink(cn_reply, 0);
+	rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
+	if (rr && rr != -ESRCH)
+		printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+
+	kfree(cn_reply);
+	module_put(THIS_MODULE);
+	return;
+ fail:
+	drbd_nl_send_reply(req, retcode);
+	module_put(THIS_MODULE);
+}
+
+static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
+
+static unsigned short *
+__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
+	unsigned short len, int nul_terminated)
+{
+	unsigned short l = tag_descriptions[tag_number(tag)].max_len;
+	len = (len < l) ? len : l;
+	put_unaligned(tag, tl++);
+	put_unaligned(len, tl++);
+	memcpy(tl, data, len);
+	tl = (unsigned short*)((char*)tl + len);
+	if (nul_terminated)
+		*((char*)tl - 1) = 0;
+	return tl;
+}
+
+static unsigned short *
+tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
+{
+	return __tl_add_blob(tl, tag, data, len, 0);
+}
+
+static unsigned short *
+tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
+{
+	return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
+}
+
+static unsigned short *
+tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
+{
+	put_unaligned(tag, tl++);
+	switch (tag_type(tag)) {
+	case TT_INTEGER:
+		put_unaligned(sizeof(int), tl++);
+		put_unaligned(*(int *)val, (int *)tl);
+		tl = (unsigned short*)((char*)tl+sizeof(int));
+		break;
+	case TT_INT64:
+		put_unaligned(sizeof(u64), tl++);
+		put_unaligned(*(u64 *)val, (u64 *)tl);
+		tl = (unsigned short*)((char*)tl+sizeof(u64));
+		break;
+	default:
+		/* someone did something stupid. */
+		;
+	}
+	return tl;
+}
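+
+/* On-the-wire shape of a tag list as built by the helpers above
+ * (comment only): a flat array of 16-bit words,
+ *
+ *	[tag][len][len bytes of payload] ... [TT_END]
+ *
+ * with payloads stored unaligned in host byte order and the list closed
+ * by a single TT_END word, as in every drbd_nl_get_*() above. */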
+
+void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct get_state_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+
+	/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
+
+	tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
+
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_get_state;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	trace_drbd_netlink(cn_reply, 0);
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct call_helper_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+
+	/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
+
+	tl = tl_add_str(tl, T_helper, helper_name);
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_call_helper;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	trace_drbd_netlink(cn_reply, 0);
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+void drbd_bcast_ee(struct drbd_conf *mdev,
+		const char *reason, const int dgs,
+		const char* seen_hash, const char* calc_hash,
+		const struct drbd_epoch_entry* e)
+{
+	struct cn_msg *cn_reply;
+	struct drbd_nl_cfg_reply *reply;
+	unsigned short *tl;
+	struct page *page;
+	unsigned len;
+
+	if (!e)
+		return;
+	if (!reason || !reason[0])
+		return;
+
+	/* apparently we have to memcpy twice, first to prepare the data for the
+	 * struct cn_msg, then within cn_netlink_send from the cn_msg to the
+	 * netlink skb. */
+	/* receiver thread context, which is not in the writeout path (of this node),
+	 * but may be in the writeout path of the _other_ node.
+	 * GFP_NOIO to avoid potential "distributed deadlock". */
+	cn_reply = kzalloc(
+		sizeof(struct cn_msg)+
+		sizeof(struct drbd_nl_cfg_reply)+
+		sizeof(struct dump_ee_tag_len_struct)+
+		sizeof(short int),
+		GFP_NOIO);
+
+	if (!cn_reply) {
+		dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
+				(unsigned long long)e->sector, e->size);
+		return;
+	}
+
+	reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
+	tl = reply->tag_list;
+
+	tl = tl_add_str(tl, T_dump_ee_reason, reason);
+	tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
+	tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
+	tl = tl_add_int(tl, T_ee_sector, &e->sector);
+	tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
+
+	/* dump the first 32k */
+	len = min_t(unsigned, e->size, 32 << 10);
+	put_unaligned(T_ee_data, tl++);
+	put_unaligned(len, tl++);
+
+	page = e->pages;
+	page_chain_for_each(page) {
+		void *d = kmap_atomic(page, KM_USER0);
+		unsigned l = min_t(unsigned, len, PAGE_SIZE);
+		memcpy(tl, d, l);
+		kunmap_atomic(d, KM_USER0);
+		tl = (unsigned short*)((char*)tl + l);
+		len -= l;
+		if (len == 0)
+			break;
+	}
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_dump_ee;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	trace_drbd_netlink(cn_reply, 0);
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+	kfree(cn_reply);
+}
+
+void drbd_bcast_sync_progress(struct drbd_conf *mdev)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct sync_progress_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+	unsigned long rs_left;
+	unsigned int res;
+
+	/* no local ref, no bitmap, no syncer progress, no broadcast. */
+	if (!get_ldev(mdev))
+		return;
+	drbd_get_syncer_progress(mdev, &rs_left, &res);
+	put_ldev(mdev);
+
+	tl = tl_add_int(tl, T_sync_progress, &res);
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_sync_progress;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	trace_drbd_netlink(cn_reply, 0);
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+#ifdef NETLINK_ROUTE6
+int __init cn_init(void);
+void __exit cn_fini(void);
+#endif
+
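+/* The signature of cn_add_callback() changed several times between
+ * kernel versions; these typedefs enumerate the variants we know of,
+ * so the BUILD_BUG_ON()s below can verify at compile time which one
+ * this kernel provides. */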
+typedef int (*cn_add_callback_req_nsp_fn)(struct cb_id *, char *,
+	void (*cb)(struct cn_msg *req, struct netlink_skb_parms *nsp));
+typedef int (*cn_add_callback_const_name_req_nsp_fn)(
+		struct cb_id *id, const char *name,
+	void (*callback)(struct cn_msg *, struct netlink_skb_parms *));
+typedef int (*cn_add_callback_req_fn)(struct cb_id *, char *,
+	void (*cb)(struct cn_msg *req));
+typedef int (*cn_add_callback_void_fn)(struct cb_id *, char *,
+	void (*cb)(void *data));
+#ifndef __same_type
+# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
+#endif
+
+int __init drbd_nl_init(void)
+{
+	static struct cb_id cn_id_drbd;
+	int err, try = 10;
+
+#ifdef NETLINK_ROUTE6
+	/* pre 2.6.16 */
+	err = cn_init();
+	if (err)
+		return err;
+#endif
+	cn_id_drbd.val = CN_VAL_DRBD;
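+	/* if the configured connector index is already taken,
+	 * step to the next one and retry a limited number of times */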
+	do {
+		cn_id_drbd.idx = cn_idx;
+		/* Try to catch incompatible callbacks at compile time,
+		 * otherwise it will just be a compiler _warning_,
+		 * but then BUG at runtime. */
+#ifdef KERNEL_HAS_CN_SKB_PARMS
+		BUILD_BUG_ON(!(
+			__same_type(&cn_add_callback, cn_add_callback_req_nsp_fn) ||
+			__same_type(&cn_add_callback, cn_add_callback_const_name_req_nsp_fn)));
+#else
+		BUILD_BUG_ON(!(
+			__same_type(&cn_add_callback, cn_add_callback_req_fn) ||
+			__same_type(&cn_add_callback, cn_add_callback_void_fn)));
+#endif
+		err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
+		if (!err)
+			break;
+		cn_idx += CN_IDX_STEP;
+	} while (try--);
+
+	if (err) {
+		printk(KERN_ERR "drbd: cn_drbd failed to register\n");
+		return err;
+	}
+
+	return 0;
+}
+
+void drbd_nl_cleanup(void)
+{
+	static struct cb_id cn_id_drbd;
+
+	cn_id_drbd.idx = cn_idx;
+	cn_id_drbd.val = CN_VAL_DRBD;
+
+	cn_del_callback(&cn_id_drbd);
+
+#ifdef NETLINK_ROUTE6
+	/* pre 2.6.16 */
+	cn_fini();
+#endif
+}
+
+void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
+{
+	char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	int rr;
+
+	memset(buffer, 0, sizeof(buffer));
+	cn_reply->id = req->id;
+
+	cn_reply->seq = req->seq;
+	cn_reply->ack = req->ack + 1;
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_return_code_only;
+	reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
+	reply->ret_code = ret_code;
+
+	trace_drbd_netlink(cn_reply, 0);
+	rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+	if (rr && rr != -ESRCH)
+		printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+}
+
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_proc.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_proc.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_proc.c	2015-01-21 12:02:58.383823937 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_proc.c	2015-01-21 12:02:58.383823937 +0300
@@ -0,0 +1,323 @@
+/*
+   drbd_proc.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+STATIC int drbd_proc_open(struct inode *inode, struct file *file);
+STATIC int drbd_proc_release(struct inode *inode, struct file *file);
+
+
+struct proc_dir_entry *drbd_proc;
+const struct file_operations drbd_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= drbd_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= drbd_proc_release,
+};
+
+void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
+{
+	/* v is in kB/sec. We don't expect TiByte/sec yet. */
+	if (unlikely(v >= 1000000)) {
+		/* cool: > GiByte/s */
+		seq_printf(seq, "%ld,", v / 1000000);
+		v %= 1000000;
+		seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
+	} else if (likely(v >= 1000))
+		seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
+	else
+		seq_printf(seq, "%ld", v);
+}
+
+/*
+ * progress bars shamelessly adapted from drivers/md/md.c
+ * output looks like
+ *	[=====>..............] 33.5% (23456/123456)
+ *	finish: 2:20:20 speed: 6,345 (6,456) K/sec
+ */
+STATIC void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
+{
+	unsigned long db, dt, dbdt, rt, rs_left;
+	unsigned int res;
+	int i, x, y;
+	int stalled = 0;
+
+	drbd_get_syncer_progress(mdev, &rs_left, &res);
+
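+	/* res is resync progress in per mille (0..1000, see the
+	 * "%3u.%u%%" output below), so res/50 maps it onto the
+	 * 20 character progress bar */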
+	x = res/50;
+	y = 20-x;
+	seq_printf(seq, "\t[");
+	for (i = 1; i < x; i++)
+		seq_printf(seq, "=");
+	seq_printf(seq, ">");
+	for (i = 0; i < y; i++)
+		seq_printf(seq, ".");
+	seq_printf(seq, "] ");
+
+	if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+		seq_printf(seq, "verified:");
+	else
+		seq_printf(seq, "sync'ed:");
+	seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
+
+	/* if more than a few GB, display in MB */
+	if (mdev->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+		seq_printf(seq, "(%lu/%lu)M",
+			    (unsigned long) Bit2KB(rs_left >> 10),
+			    (unsigned long) Bit2KB(mdev->rs_total >> 10));
+	else
+		seq_printf(seq, "(%lu/%lu)K",
+			    (unsigned long) Bit2KB(rs_left),
+			    (unsigned long) Bit2KB(mdev->rs_total));
+
+	seq_printf(seq, "\n\t");
+
+	/* see drivers/md/md.c
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
+	 *
+	 * dt: time from mark until now
+	 * db: blocks written from mark until now
+	 * rt: remaining time
+	 */
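+	/* Worked example with made-up numbers: dt = 60 s, rs_left = 1200000
+	 * bits, db = 600000 bits written since the mark:
+	 * rt = (60 * (1200000 / (600000/100 + 1))) / 100
+	 *    = (60 * (1200000 / 6001)) / 100 = (60 * 199) / 100 = 119 s. */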
+	/* Rolling marks. last_mark+1 may just now be modified.  last_mark+2 is
+	 * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
+	 * least DRBD_SYNC_MARK_STEP time before it will be modified. */
+	/* ------------------------ ~18s average ------------------------ */
+	i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
+	dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
+	if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
+		stalled = 1;
+
+	if (!dt)
+		dt++;
+	db = mdev->rs_mark_left[i] - rs_left;
+	rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
+
+	seq_printf(seq, "finish: %lu:%02lu:%02lu",
+		rt / 3600, (rt % 3600) / 60, rt % 60);
+
+	dbdt = Bit2KB(db/dt);
+	seq_printf(seq, " speed: ");
+	seq_printf_with_thousands_grouping(seq, dbdt);
+	seq_printf(seq, " (");
+	/* ------------------------- ~3s average ------------------------ */
+	if (proc_details >= 1) {
+		/* this is what drbd_rs_should_slow_down() uses */
+		i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+		dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
+		if (!dt)
+			dt++;
+		db = mdev->rs_mark_left[i] - rs_left;
+		dbdt = Bit2KB(db/dt);
+		seq_printf_with_thousands_grouping(seq, dbdt);
+		seq_printf(seq, " -- ");
+	}
+
+	/* --------------------- long term average ---------------------- */
+	/* mean speed since syncer started
+	 * we do account for PausedSync periods */
+	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+	if (dt == 0)
+		dt = 1;
+	db = mdev->rs_total - rs_left;
+	dbdt = Bit2KB(db/dt);
+	seq_printf_with_thousands_grouping(seq, dbdt);
+	seq_printf(seq, ")");
+
+	if (mdev->state.conn == C_SYNC_TARGET ||
+	    mdev->state.conn == C_VERIFY_S) {
+		seq_printf(seq, " want: ");
+		seq_printf_with_thousands_grouping(seq, mdev->c_sync_rate);
+	}
+	seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
+
+	if (proc_details >= 1) {
+		/* 64 bit:
+		 * we convert to sectors in the display below. */
+		unsigned long bm_bits = drbd_bm_bits(mdev);
+		unsigned long bit_pos;
+		if (mdev->state.conn == C_VERIFY_S ||
+		    mdev->state.conn == C_VERIFY_T)
+			bit_pos = bm_bits - mdev->ov_left;
+		else
+			bit_pos = mdev->bm_resync_fo;
+		/* Total sectors may be slightly off for oddly
+		 * sized devices. So what. */
+		seq_printf(seq,
+			"\t%3d%% sector pos: %llu/%llu\n",
+			(int)(bit_pos / (bm_bits/100+1)),
+			(unsigned long long)bit_pos * BM_SECT_PER_BIT,
+			(unsigned long long)bm_bits * BM_SECT_PER_BIT);
+	}
+}
+
+STATIC void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
+{
+	struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+	seq_printf(seq, "%5d %s %s\n", bme->rs_left,
+		   bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
+		   bme->flags & BME_LOCKED ? "LOCKED" : "------"
+		   );
+}
+
+STATIC int drbd_seq_show(struct seq_file *seq, void *v)
+{
+	int i, hole = 0;
+	const char *sn;
+	struct drbd_conf *mdev;
+
+	static char write_ordering_chars[] = {
+		[WO_none] = 'n',
+		[WO_drain_io] = 'd',
+		[WO_bdev_flush] = 'f',
+		[WO_bio_barrier] = 'b',
+	};
+
+	seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
+		   API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
+
+	/*
+	  cs .. connection state
+	  ro .. node role (local/remote)
+	  ds .. disk state (local/remote)
+	     protocol
+	     various flags
+	  ns .. network send
+	  nr .. network receive
+	  dw .. disk write
+	  dr .. disk read
+	  al .. activity log write count
+	  bm .. bitmap update write count
+	  lo .. reference count on the local device
+	  pe .. pending (waiting for ack or data reply)
+	  ua .. unack'd (still need to send ack or data reply)
+	  ap .. application requests accepted, but not yet completed
+	  ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
+	  wo .. write ordering mode currently in use
+	 oos .. known out-of-sync kB
+	*/
+
+	for (i = 0; i < minor_count; i++) {
+		mdev = minor_to_mdev(i);
+		if (!mdev) {
+			hole = 1;
+			continue;
+		}
+		if (hole) {
+			hole = 0;
+			seq_printf(seq, "\n");
+		}
+
+		sn = drbd_conn_str(mdev->state.conn);
+
+		if (mdev->state.conn == C_STANDALONE &&
+		    mdev->state.disk == D_DISKLESS &&
+		    mdev->state.role == R_SECONDARY) {
+			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
+		} else {
+			seq_printf(seq,
+			   "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
+			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
+			   "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
+			   i, sn,
+			   drbd_role_str(mdev->state.role),
+			   drbd_role_str(mdev->state.peer),
+			   drbd_disk_str(mdev->state.disk),
+			   drbd_disk_str(mdev->state.pdsk),
+			   (mdev->net_conf == NULL ? ' ' :
+			    (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
+			   is_susp(mdev->state) ? 's' : 'r',
+			   mdev->state.aftr_isp ? 'a' : '-',
+			   mdev->state.peer_isp ? 'p' : '-',
+			   mdev->state.user_isp ? 'u' : '-',
+			   mdev->congestion_reason ?: '-',
+			   test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
+			   mdev->send_cnt/2,
+			   mdev->recv_cnt/2,
+			   mdev->writ_cnt/2,
+			   mdev->read_cnt/2,
+			   mdev->al_writ_cnt,
+			   mdev->bm_writ_cnt,
+			   atomic_read(&mdev->local_cnt),
+			   atomic_read(&mdev->ap_pending_cnt) +
+			   atomic_read(&mdev->rs_pending_cnt),
+			   atomic_read(&mdev->unacked_cnt),
+			   atomic_read(&mdev->ap_bio_cnt),
+			   mdev->epochs,
+			   write_ordering_chars[mdev->write_ordering]
+			);
+			seq_printf(seq, " oos:%llu\n",
+				   Bit2KB((unsigned long long)
+					   drbd_bm_total_weight(mdev)));
+		}
+		if (mdev->state.conn == C_SYNC_SOURCE ||
+		    mdev->state.conn == C_SYNC_TARGET ||
+		    mdev->state.conn == C_VERIFY_S ||
+		    mdev->state.conn == C_VERIFY_T)
+			drbd_syncer_progress(mdev, seq);
+
+		if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
+			lc_seq_printf_stats(seq, mdev->resync);
+			lc_seq_printf_stats(seq, mdev->act_log);
+			put_ldev(mdev);
+		}
+
+		if (proc_details >= 2) {
+			if (mdev->resync) {
+				lc_seq_dump_details(seq, mdev->resync, "rs_left",
+					resync_dump_detail);
+			}
+		}
+	}
+
+	return 0;
+}
+
+STATIC int drbd_proc_open(struct inode *inode, struct file *file)
+{
+	if (try_module_get(THIS_MODULE))
+		return single_open(file, drbd_seq_show, PDE(inode)->data);
+	return -ENODEV;
+}
+
+STATIC int drbd_proc_release(struct inode *inode, struct file *file)
+{
+	module_put(THIS_MODULE);
+	return single_release(inode, file);
+}
+
+/* PROC FS stuff end */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_receiver.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_receiver.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_receiver.c	2015-01-21 12:02:58.385823885 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_receiver.c	2015-01-21 12:02:58.385823885 +0300
@@ -0,0 +1,5005 @@
+/*
+   drbd_receiver.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/drbd.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_req.h"
+#include "drbd_vli.h"
+#ifdef HAVE_LINUX_SCATTERLIST_H
+/* 2.6.11 (suse 9.3, fc4) does not include requisites
+ * from linux/scatterlist.h :( */
+#include <asm/scatterlist.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#endif
+
+struct flush_work {
+	struct drbd_work w;
+	struct drbd_epoch *epoch;
+};
+
+enum finish_epoch {
+	FE_STILL_LIVE,
+	FE_DESTROYED,
+	FE_RECYCLED,
+};
+
+STATIC int drbd_do_handshake(struct drbd_conf *mdev);
+STATIC int drbd_do_auth(struct drbd_conf *mdev);
+
+STATIC enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
+STATIC int e_end_block(struct drbd_conf *, struct drbd_work *, int);
+
+static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+{
+	struct drbd_epoch *prev;
+	spin_lock(&mdev->epoch_lock);
+	prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
+	if (prev == epoch || prev == mdev->current_epoch)
+		prev = NULL;
+	spin_unlock(&mdev->epoch_lock);
+	return prev;
+}
+
+#ifdef DBG_ASSERTS
+void drbd_assert_breakpoint(struct drbd_conf *mdev, char *exp,
+			    char *file, int line)
+{
+	dev_err(DEV, "ASSERT( %s ) in %s:%d\n", exp, file, line);
+}
+#endif
+
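+/* opportunistic allocation: highmem is fine, and failure is expected
+ * and handled by the callers, so don't warn */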
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+/*
+ * some helper functions to deal with single linked page lists,
+ * page->private being our "next" pointer.
+ */
+
+/* If at least n pages are linked at head, get n pages off.
+ * Otherwise, don't modify head, and return NULL.
+ * Locking is the responsibility of the caller.
+ */
+static struct page *page_chain_del(struct page **head, int n)
+{
+	struct page *page;
+	struct page *tmp;
+
+	BUG_ON(!n);
+	BUG_ON(!head);
+
+	page = *head;
+
+	if (!page)
+		return NULL;
+
+	while (page) {
+		tmp = page_chain_next(page);
+		if (--n == 0)
+			break; /* found sufficient pages */
+		if (tmp == NULL)
+			/* insufficient pages, don't use any of them. */
+			return NULL;
+		page = tmp;
+	}
+
+	/* add end of list marker for the returned list */
+	set_page_private(page, 0);
+	/* actual return value, and adjustment of head */
+	page = *head;
+	*head = tmp;
+	return page;
+}
+
+/* may be used outside of locks to find the tail of a (usually short)
+ * "private" page chain, before adding it back to a global chain head
+ * with page_chain_add() under a spinlock. */
+static struct page *page_chain_tail(struct page *page, int *len)
+{
+	struct page *tmp;
+	int i = 1;
+	while ((tmp = page_chain_next(page)))
+		++i, page = tmp;
+	if (len)
+		*len = i;
+	return page;
+}
+
+static int page_chain_free(struct page *page)
+{
+	struct page *tmp;
+	int i = 0;
+	page_chain_for_each_safe(page, tmp) {
+		put_page(page);
+		++i;
+	}
+	return i;
+}
+
+static void page_chain_add(struct page **head,
+		struct page *chain_first, struct page *chain_last)
+{
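+	/* paranoia: verify that chain_last really is the tail of chain_first */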
+#if 1
+	struct page *tmp;
+	tmp = page_chain_tail(chain_first, NULL);
+	BUG_ON(tmp != chain_last);
+#endif
+
+	/* add chain to head */
+	set_page_private(chain_last, (unsigned long)*head);
+	*head = chain_first;
+}
+
+static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
+{
+	struct page *page = NULL;
+	struct page *tmp = NULL;
+	int i = 0;
+
+	/* Yes, testing drbd_pp_vacant outside the lock is racy.
+	 * So what. It saves a spin_lock. */
+	if (drbd_pp_vacant >= number) {
+		spin_lock(&drbd_pp_lock);
+		page = page_chain_del(&drbd_pp_pool, number);
+		if (page)
+			drbd_pp_vacant -= number;
+		spin_unlock(&drbd_pp_lock);
+		if (page)
+			return page;
+	}
+
+	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	for (i = 0; i < number; i++) {
+		tmp = alloc_page(GFP_TRY);
+		if (!tmp)
+			break;
+		set_page_private(tmp, (unsigned long)page);
+		page = tmp;
+	}
+
+	if (i == number)
+		return page;
+
+	/* Not enough pages immediately available this time.
+	 * No need to jump around here, drbd_pp_alloc will retry this
+	 * function "soon". */
+	if (page) {
+		tmp = page_chain_tail(page, NULL);
+		spin_lock(&drbd_pp_lock);
+		page_chain_add(&drbd_pp_pool, page, tmp);
+		drbd_pp_vacant += i;
+		spin_unlock(&drbd_pp_lock);
+	}
+	return NULL;
+}
+
+/* kick lower level device, if we have more than (arbitrary number)
+ * reference counts on it, which typically are locally submitted IO
+ * requests.  Don't use unacked_cnt, so we speed up proto A and B, too. */
+static void maybe_kick_lo(struct drbd_conf *mdev)
+{
+	if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
+		drbd_kick_lo(mdev);
+}
+
+static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
+{
+	struct drbd_epoch_entry *e;
+	struct list_head *le, *tle;
+
+	/* The EEs are always appended to the end of the list. Since
+	   they are sent in order over the wire, they have to finish
+	   in order. As soon as we see the first unfinished one, we can
+	   stop examining the list... */
+
+	list_for_each_safe(le, tle, &mdev->net_ee) {
+		e = list_entry(le, struct drbd_epoch_entry, w.list);
+		if (drbd_ee_has_active_page(e))
+			break;
+		list_move(le, to_be_freed);
+	}
+}
+
+static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
+{
+	LIST_HEAD(reclaimed);
+	struct drbd_epoch_entry *e, *t;
+
+	maybe_kick_lo(mdev);
+	spin_lock_irq(&mdev->req_lock);
+	reclaim_net_ee(mdev, &reclaimed);
+	spin_unlock_irq(&mdev->req_lock);
+
+	list_for_each_entry_safe(e, t, &reclaimed, w.list)
+		drbd_free_net_ee(mdev, e);
+}
+
+/**
+ * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
+ * @mdev:	DRBD device.
+ * @number:	number of pages requested
+ * @retry:	whether to retry, if not enough pages are available right now
+ *
+ * Tries to allocate @number pages, first from our own page pool, then from
+ * the kernel, unless this allocation would exceed the max_buffers setting.
+ * Possibly retry until DRBD frees sufficient pages somewhere else.
+ *
+ * Returns a page chain linked via page->private.
+ */
+STATIC struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
+{
+	struct page *page = NULL;
+	DEFINE_WAIT(wait);
+
+	/* Yes, we may run up to @number over max_buffers. If we
+	 * follow it strictly, the admin will get it wrong anyways. */
+	if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
+		page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+
+	while (page == NULL) {
+		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
+
+		drbd_kick_lo_and_reclaim_net(mdev);
+
+		if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
+			page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+			if (page)
+				break;
+		}
+
+		if (!retry)
+			break;
+
+		if (signal_pending(current)) {
+			dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
+			break;
+		}
+
+		schedule();
+	}
+	finish_wait(&drbd_pp_wait, &wait);
+
+	if (page)
+		atomic_add(number, &mdev->pp_in_use);
+	return page;
+}
+
+/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
+ * Is also used from inside another spin_lock_irq(&mdev->req_lock);
+ * Either links the page chain back to the global pool,
+ * or returns all pages to the system. */
+STATIC void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
+{
+	atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
+	int i;
+
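+	/* keep the global pool bounded: above the vacancy watermark the
+	 * pages go back to the system, otherwise they are re-linked into
+	 * the global drbd_pp_pool */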
+	if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
+		i = page_chain_free(page);
+	else {
+		struct page *tmp;
+		tmp = page_chain_tail(page, &i);
+		spin_lock(&drbd_pp_lock);
+		page_chain_add(&drbd_pp_pool, page, tmp);
+		drbd_pp_vacant += i;
+		spin_unlock(&drbd_pp_lock);
+	}
+	i = atomic_sub_return(i, a);
+	if (i < 0)
+		dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
+			is_net ? "pp_in_use_by_net" : "pp_in_use", i);
+	wake_up(&drbd_pp_wait);
+}
+
+/*
+You need to hold the req_lock:
+ _drbd_wait_ee_list_empty()
+
+You must not have the req_lock:
+ drbd_free_ee()
+ drbd_alloc_ee()
+ drbd_init_ee()
+ drbd_release_ee()
+ drbd_ee_fix_bhs()
+ drbd_process_done_ee()
+ drbd_clear_done_ee()
+ drbd_wait_ee_list_empty()
+*/
+
+struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
+				     u64 id,
+				     sector_t sector,
+				     unsigned int data_size,
+				     gfp_t gfp_mask) __must_hold(local)
+{
+	struct drbd_epoch_entry *e;
+	struct page *page;
+	unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
+
+	if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
+		return NULL;
+
+	e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+	if (!e) {
+		if (!(gfp_mask & __GFP_NOWARN))
+			dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
+		return NULL;
+	}
+
+	page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
+	if (!page)
+		goto fail;
+
+	INIT_HLIST_NODE(&e->collision);
+	e->epoch = NULL;
+	e->mdev = mdev;
+	e->pages = page;
+	atomic_set(&e->pending_bios, 0);
+	e->size = data_size;
+	e->flags = 0;
+	e->sector = sector;
+	e->block_id = id;
+
+	trace_drbd_ee(mdev, e, "allocated");
+	return e;
+
+ fail:
+	mempool_free(e, drbd_ee_mempool);
+	return NULL;
+}
+
+void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
+{
+	trace_drbd_ee(mdev, e, "freed");
+	if (e->flags & EE_HAS_DIGEST)
+		kfree(e->digest);
+	drbd_pp_free(mdev, e->pages, is_net);
+	D_ASSERT(atomic_read(&e->pending_bios) == 0);
+	D_ASSERT(hlist_unhashed(&e->collision));
+	mempool_free(e, drbd_ee_mempool);
+}
+
+int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
+{
+	LIST_HEAD(work_list);
+	struct drbd_epoch_entry *e, *t;
+	int count = 0;
+	int is_net = list == &mdev->net_ee;
+
+	spin_lock_irq(&mdev->req_lock);
+	list_splice_init(list, &work_list);
+	spin_unlock_irq(&mdev->req_lock);
+
+	list_for_each_entry_safe(e, t, &work_list, w.list) {
+		drbd_free_some_ee(mdev, e, is_net);
+		count++;
+	}
+	return count;
+}
+
+
+/*
+ * This function is called from _asender only_
+ * but see also comments in _req_mod(,barrier_acked)
+ * and receive_Barrier.
+ *
+ * Move entries from net_ee to done_ee, if ready.
+ * Grab done_ee, call all callbacks, free the entries.
+ * The callbacks typically send out ACKs.
+ */
+STATIC int drbd_process_done_ee(struct drbd_conf *mdev)
+{
+	LIST_HEAD(work_list);
+	LIST_HEAD(reclaimed);
+	struct drbd_epoch_entry *e, *t;
+	int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
+
+	spin_lock_irq(&mdev->req_lock);
+	reclaim_net_ee(mdev, &reclaimed);
+	list_splice_init(&mdev->done_ee, &work_list);
+	spin_unlock_irq(&mdev->req_lock);
+
+	list_for_each_entry_safe(e, t, &reclaimed, w.list)
+		drbd_free_net_ee(mdev, e);
+
+	/* possible callbacks here:
+	 * e_end_block, and e_end_resync_block, e_send_discard_ack.
+	 * all ignore the last argument.
+	 */
+	list_for_each_entry_safe(e, t, &work_list, w.list) {
+		trace_drbd_ee(mdev, e, "process_done_ee");
+		/* list_del not necessary, next/prev members not touched */
+		ok = e->w.cb(mdev, &e->w, !ok) && ok;
+		drbd_free_ee(mdev, e);
+	}
+	wake_up(&mdev->ee_wait);
+
+	return ok;
+}
+
+void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+{
+	DEFINE_WAIT(wait);
+
+	/* avoids spin_lock/unlock
+	 * and calling prepare_to_wait in the fast path */
+	while (!list_empty(head)) {
+		prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&mdev->req_lock);
+		drbd_kick_lo(mdev);
+		schedule();
+		finish_wait(&mdev->ee_wait, &wait);
+		spin_lock_irq(&mdev->req_lock);
+	}
+}
+
+void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+{
+	spin_lock_irq(&mdev->req_lock);
+	_drbd_wait_ee_list_empty(mdev, head);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+#ifdef DEFINE_SOCK_CREATE_KERN
+/* if there is no sock_create_kern,
+ * there is also sock_create_lite missing */
+int sock_create_lite(int family, int type, int protocol, struct socket **res)
+{
+	int err = 0;
+	struct socket *sock = NULL;
+
+	sock = sock_alloc();
+	if (!sock)
+		err = -ENOMEM;
+	else
+		sock->type = type;
+
+	*res = sock;
+	return err;
+}
+#endif
+
+/* see also kernel_accept, which is only present since 2.6.18.
+ * also, we want to log exactly which part of it failed */
+STATIC int drbd_accept(struct drbd_conf *mdev, const char **what,
+		struct socket *sock, struct socket **newsock)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	*what = "listen";
+	err = sock->ops->listen(sock, 5);
+	if (err < 0)
+		goto out;
+
+	*what = "sock_create_lite";
+	err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
+			       newsock);
+	if (err < 0)
+		goto out;
+
+	*what = "accept";
+	err = sock->ops->accept(sock, *newsock, 0);
+	if (err < 0) {
+		sock_release(*newsock);
+		*newsock = NULL;
+		goto out;
+	}
+	(*newsock)->ops  = sock->ops;
+	__module_get((*newsock)->ops->owner);
+
+out:
+	return err;
+}
+
+STATIC int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
+		    void *buf, size_t size, int flags)
+{
+	mm_segment_t oldfs;
+	struct kvec iov = {
+		.iov_base = buf,
+		.iov_len = size,
+	};
+	struct msghdr msg = {
+		.msg_iovlen = 1,
+		.msg_iov = (struct iovec *)&iov,
+		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
+	};
+	int rv;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
+	set_fs(oldfs);
+
+	return rv;
+}
+
+STATIC int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
+{
+	mm_segment_t oldfs;
+	struct kvec iov = {
+		.iov_base = buf,
+		.iov_len = size,
+	};
+	struct msghdr msg = {
+		.msg_iovlen = 1,
+		.msg_iov = (struct iovec *)&iov,
+		.msg_flags = MSG_WAITALL | MSG_NOSIGNAL
+	};
+	int rv;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	for (;;) {
+		rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
+		if (rv == size)
+			break;
+
+		/* Note:
+		 * ECONNRESET	other side closed the connection
+		 * ERESTARTSYS	(on  sock) we got a signal
+		 */
+
+		if (rv < 0) {
+			if (rv == -ECONNRESET)
+				dev_info(DEV, "sock was reset by peer\n");
+			else if (rv != -ERESTARTSYS)
+				dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+			break;
+		} else if (rv == 0) {
+			dev_info(DEV, "sock was shut down by peer\n");
+			break;
+		} else	{
+			/* signal came in, or peer/link went down,
+			 * after we read a partial message
+			 */
+			/* D_ASSERT(signal_pending(current)); */
+			break;
+		}
+	}
+
+	set_fs(oldfs);
+
+	if (rv != size)
+		drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+
+	return rv;
+}
+
+/* quoting tcp(7):
+ *   On individual connections, the socket buffer size must be set prior to the
+ *   listen(2) or connect(2) calls in order to have it take effect.
+ * This is our wrapper to do so.
+ */
+static void drbd_setbufsize(struct socket *sock, unsigned int snd,
+		unsigned int rcv)
+{
+	/* open coded SO_SNDBUF, SO_RCVBUF */
+	if (snd) {
+		sock->sk->sk_sndbuf = snd;
+		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+	if (rcv) {
+		sock->sk->sk_rcvbuf = rcv;
+		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+}
+
+STATIC struct socket *drbd_try_connect(struct drbd_conf *mdev)
+{
+	const char *what;
+	struct socket *sock;
+	struct sockaddr_in6 src_in6;
+	int err;
+	int disconnect_on_error = 1;
+
+	if (!get_net_conf(mdev))
+		return NULL;
+
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+		SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		sock = NULL;
+		goto out;
+	}
+
+	sock->sk->sk_rcvtimeo =
+	sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
+	drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
+			mdev->net_conf->rcvbuf_size);
+
+	/* explicitly bind to the configured IP as source IP
+	 * for the outgoing connections.
+	 * This is needed for multihomed hosts and to be
+	 * able to use lo: interfaces for drbd.
+	 * Make sure to use 0 as port number, so linux selects
+	 * a free one dynamically.
+	 */
+	memcpy(&src_in6, mdev->net_conf->my_addr,
+	       min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
+	if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
+		src_in6.sin6_port = 0;
+	else
+		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+	what = "bind before connect";
+	err = sock->ops->bind(sock,
+			      (struct sockaddr *) &src_in6,
+			      mdev->net_conf->my_addr_len);
+	if (err < 0)
+		goto out;
+
+	/* connect may fail, peer not yet available.
+	 * stay C_WF_CONNECTION, don't go Disconnecting! */
+	disconnect_on_error = 0;
+	what = "connect";
+	err = sock->ops->connect(sock,
+				 (struct sockaddr *)mdev->net_conf->peer_addr,
+				 mdev->net_conf->peer_addr_len, 0);
+
+out:
+	if (err < 0) {
+		if (sock) {
+			sock_release(sock);
+			sock = NULL;
+		}
+		switch (-err) {
+			/* timeout, busy, signal pending */
+		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
+		case EINTR: case ERESTARTSYS:
+			/* peer not (yet) available, network problem */
+		case ECONNREFUSED: case ENETUNREACH:
+		case EHOSTDOWN:    case EHOSTUNREACH:
+			disconnect_on_error = 0;
+			break;
+		default:
+			dev_err(DEV, "%s failed, err = %d\n", what, err);
+		}
+		if (disconnect_on_error)
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	}
+	put_net_conf(mdev);
+	return sock;
+}
+
+STATIC struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
+{
+	int timeo, err;
+	struct socket *s_estab = NULL, *s_listen;
+	const char *what;
+
+	if (!get_net_conf(mdev))
+		return NULL;
+
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+		SOCK_STREAM, IPPROTO_TCP, &s_listen);
+	if (err) {
+		s_listen = NULL;
+		goto out;
+	}
+
+	timeo = mdev->net_conf->try_connect_int * HZ;
+	timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
+
+	s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
+	s_listen->sk->sk_rcvtimeo = timeo;
+	s_listen->sk->sk_sndtimeo = timeo;
+	drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
+			mdev->net_conf->rcvbuf_size);
+
+	what = "bind before listen";
+	err = s_listen->ops->bind(s_listen,
+			      (struct sockaddr *) mdev->net_conf->my_addr,
+			      mdev->net_conf->my_addr_len);
+	if (err < 0)
+		goto out;
+
+	err = drbd_accept(mdev, &what, s_listen, &s_estab);
+
+out:
+	if (s_listen)
+		sock_release(s_listen);
+	if (err < 0) {
+		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+			dev_err(DEV, "%s failed, err = %d\n", what, err);
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		}
+	}
+	put_net_conf(mdev);
+
+	return s_estab;
+}
+
+STATIC int drbd_send_fp(struct drbd_conf *mdev,
+	struct socket *sock, enum drbd_packets cmd)
+{
+	struct p_header80 *h = &mdev->data.sbuf.header.h80;
+
+	return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
+}
+
+STATIC enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
+{
+	struct p_header80 *h = &mdev->data.rbuf.header.h80;
+	int rr;
+
+	rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
+
+	if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
+		return be16_to_cpu(h->command);
+
+	return 0xffff;
+}
+
+/**
+ * drbd_socket_okay() - Free the socket if its connection is not okay
+ * @mdev:	DRBD device.
+ * @sock:	pointer to the pointer to the socket.
+ */
+static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
+{
+	int rr;
+	char tb[4];
+
+	if (!*sock)
+		return false;
+
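+	/* non-blocking peek: pending data or -EAGAIN means the connection
+	 * is still usable, anything else means it is gone */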
+	rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+
+	if (rr > 0 || rr == -EAGAIN) {
+		return true;
+	} else {
+		sock_release(*sock);
+		*sock = NULL;
+		return false;
+	}
+}
+
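+/* DRBD uses two TCP connections per peer: a "data" socket for the bulk
+ * replication stream and a "meta" socket for ACKs and pings.  Both sides
+ * connect and accept concurrently; the P_HAND_SHAKE_S / P_HAND_SHAKE_M
+ * packets below decide which established socket takes which role. */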
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ *  -2 We do not have a network config...
+ */
+STATIC int drbd_connect(struct drbd_conf *mdev)
+{
+	struct socket *s, *sock, *msock;
+	int try, h, ok;
+	enum drbd_state_rv rv;
+
+	D_ASSERT(!mdev->data.socket);
+
+	if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
+		return -2;
+
+	clear_bit(DISCARD_CONCURRENT, &mdev->flags);
+
+	sock  = NULL;
+	msock = NULL;
+
+	do {
+		for (try = 0;;) {
+			/* 3 tries, this should take less than a second! */
+			s = drbd_try_connect(mdev);
+			if (s || ++try >= 3)
+				break;
+			/* give the other side time to call bind() & listen() */
+			schedule_timeout_interruptible(HZ / 10);
+		}
+
+		if (s) {
+			if (!sock) {
+				drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
+				sock = s;
+				s = NULL;
+			} else if (!msock) {
+				drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
+				msock = s;
+				s = NULL;
+			} else {
+				dev_err(DEV, "Logic error in drbd_connect()\n");
+				goto out_release_sockets;
+			}
+		}
+
+		if (sock && msock) {
+			schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10);
+			ok = drbd_socket_okay(mdev, &sock);
+			ok = drbd_socket_okay(mdev, &msock) && ok;
+			if (ok)
+				break;
+		}
+
+retry:
+		s = drbd_wait_for_connect(mdev);
+		if (s) {
+			try = drbd_recv_fp(mdev, s);
+			drbd_socket_okay(mdev, &sock);
+			drbd_socket_okay(mdev, &msock);
+			switch (try) {
+			case P_HAND_SHAKE_S:
+				if (sock) {
+					dev_warn(DEV, "initial packet S crossed\n");
+					sock_release(sock);
+				}
+				sock = s;
+				break;
+			case P_HAND_SHAKE_M:
+				if (msock) {
+					dev_warn(DEV, "initial packet M crossed\n");
+					sock_release(msock);
+				}
+				msock = s;
+				set_bit(DISCARD_CONCURRENT, &mdev->flags);
+				break;
+			default:
+				dev_warn(DEV, "Error receiving initial packet\n");
+				sock_release(s);
+				if (random32() & 1)
+					goto retry;
+			}
+		}
+
+		if (mdev->state.conn <= C_DISCONNECTING)
+			goto out_release_sockets;
+		if (signal_pending(current)) {
+			flush_signals(current);
+			smp_rmb();
+			if (get_t_state(&mdev->receiver) == Exiting)
+				goto out_release_sockets;
+		}
+
+		if (sock && msock) {
+			ok = drbd_socket_okay(mdev, &sock);
+			ok = drbd_socket_okay(mdev, &msock) && ok;
+			if (ok)
+				break;
+		}
+	} while (1);
+
+	msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+	sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+
+	sock->sk->sk_allocation = GFP_NOIO;
+	msock->sk->sk_allocation = GFP_NOIO;
+
+	sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+	msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+
+	/* NOT YET ...
+	 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+	 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+	 * first set it to the P_HAND_SHAKE timeout,
+	 * which we set to 4x the configured ping_timeout. */
+	sock->sk->sk_sndtimeo =
+	sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+
+	msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+	msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+
+	/* we don't want delays.
+	 * we use TCP_CORK where appropriate, though */
+	drbd_tcp_nodelay(sock);
+	drbd_tcp_nodelay(msock);
+
+	mdev->data.socket = sock;
+	mdev->meta.socket = msock;
+	mdev->last_received = jiffies;
+
+	D_ASSERT(mdev->asender.task == NULL);
+
+	h = drbd_do_handshake(mdev);
+	if (h <= 0)
+		return h;
+
+	if (mdev->cram_hmac_tfm) {
+		/* drbd_request_state(mdev, NS(conn, WFAuth)); */
+		switch (drbd_do_auth(mdev)) {
+		case -1:
+			dev_err(DEV, "Authentication of peer failed\n");
+			return -1;
+		case 0:
+			dev_err(DEV, "Authentication of peer failed, trying again.\n");
+			return 0;
+		}
+	}
+
+	sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+	sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+
+	atomic_set(&mdev->packet_seq, 0);
+	mdev->peer_seq = 0;
+
+	if (drbd_send_protocol(mdev) == -1)
+		return -1;
+	set_bit(STATE_SENT, &mdev->flags);
+	drbd_send_sync_param(mdev, &mdev->sync_conf);
+	drbd_send_sizes(mdev, 0, 0);
+	drbd_send_uuids(mdev);
+	drbd_send_current_state(mdev);
+	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+	clear_bit(RESIZE_PENDING, &mdev->flags);
+
+	spin_lock_irq(&mdev->req_lock);
+	rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
+	if (mdev->state.conn != C_WF_REPORT_PARAMS)
+		clear_bit(STATE_SENT, &mdev->flags);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS)
+		return 0;
+
+	drbd_thread_start(&mdev->asender);
+	mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
+
+	return 1;
+
+out_release_sockets:
+	if (sock)
+		sock_release(sock);
+	if (msock)
+		sock_release(msock);
+	return -1;
+}
+
+STATIC int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
+{
+	union p_header *h = &mdev->data.rbuf.header;
+	int r;
+
+	r = drbd_recv(mdev, h, sizeof(*h));
+	if (unlikely(r != sizeof(*h))) {
+		if (!signal_pending(current))
+			dev_warn(DEV, "short read expecting header on sock: r=%d\n", r);
+		return false;
+	}
+
+	if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
+		*cmd = be16_to_cpu(h->h80.command);
+		*packet_size = be16_to_cpu(h->h80.length);
+	} else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
+		*cmd = be16_to_cpu(h->h95.command);
+		*packet_size = be32_to_cpu(h->h95.length);
+	} else {
+		dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
+		    be32_to_cpu(h->h80.magic),
+		    be16_to_cpu(h->h80.command),
+		    be16_to_cpu(h->h80.length));
+		return false;
+	}
+	mdev->last_received = jiffies;
+
+	return true;
+}
+
+STATIC enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+{
+	int rv;
+
+	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
+		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
+					NULL);
+		if (rv) {
+			dev_info(DEV, "local disk flush failed with status %d\n", rv);
+			/* would rather check on EOPNOTSUPP, but that is not reliable.
+			 * don't try again for ANY return value != 0
+			 * if (rv == -EOPNOTSUPP) */
+			drbd_bump_write_ordering(mdev, WO_drain_io);
+		}
+		put_ldev(mdev);
+	}
+
+	return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
+}
+
+STATIC int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct flush_work *fw = (struct flush_work *)w;
+	struct drbd_epoch *epoch = fw->epoch;
+
+	kfree(w);
+
+	if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
+		drbd_flush_after_epoch(mdev, epoch);
+
+	drbd_may_finish_epoch(mdev, epoch, EV_PUT |
+			      (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
+
+	return 1;
+}
+
+/**
+ * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, possibly finishes it.
+ * @mdev:	DRBD device.
+ * @epoch:	Epoch object.
+ * @ev:		Epoch event.
+ */
+STATIC enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
+					       struct drbd_epoch *epoch,
+					       enum epoch_event ev)
+{
+	int finish, epoch_size;
+	struct drbd_epoch *next_epoch;
+	int schedule_flush = 0;
+	enum finish_epoch rv = FE_STILL_LIVE;
+
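+	/* apply the event to this epoch; if that finishes and destroys it,
+	 * re-check the next epoch in the list, which may now be finishable
+	 * as well */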
+	spin_lock(&mdev->epoch_lock);
+	do {
+		next_epoch = NULL;
+		finish = 0;
+
+		epoch_size = atomic_read(&epoch->epoch_size);
+
+		switch (ev & ~EV_CLEANUP) {
+		case EV_PUT:
+			atomic_dec(&epoch->active);
+			break;
+		case EV_GOT_BARRIER_NR:
+			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+
+			/* Special case: If we just switched from WO_bio_barrier to
+			   WO_bdev_flush we should not finish the current epoch */
+			if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
+			    mdev->write_ordering != WO_bio_barrier &&
+			    epoch == mdev->current_epoch)
+				clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
+			break;
+		case EV_BARRIER_DONE:
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
+			break;
+		case EV_BECAME_LAST:
+			/* nothing to do */
+			break;
+		}
+
+		trace_drbd_epoch(mdev, epoch, ev);
+
+		if (epoch_size != 0 &&
+		    atomic_read(&epoch->active) == 0 &&
+		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP) &&
+		    epoch->list.prev == &mdev->current_epoch->list &&
+		    !test_bit(DE_IS_FINISHING, &epoch->flags)) {
+			/* Nearly all conditions are met to finish that epoch... */
+			if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
+			    mdev->write_ordering == WO_none ||
+			    (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
+			    ev & EV_CLEANUP) {
+				finish = 1;
+				set_bit(DE_IS_FINISHING, &epoch->flags);
+			} else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
+				 mdev->write_ordering == WO_bio_barrier) {
+				atomic_inc(&epoch->active);
+				schedule_flush = 1;
+			}
+		}
+		if (finish) {
+			if (!(ev & EV_CLEANUP)) {
+				spin_unlock(&mdev->epoch_lock);
+				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
+				spin_lock(&mdev->epoch_lock);
+			}
+			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+				dec_unacked(mdev);
+
+			if (mdev->current_epoch != epoch) {
+				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
+				list_del(&epoch->list);
+				ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
+				mdev->epochs--;
+				trace_drbd_epoch(mdev, epoch, EV_TRACE_FREE);
+				kfree(epoch);
+
+				if (rv == FE_STILL_LIVE)
+					rv = FE_DESTROYED;
+			} else {
+				epoch->flags = 0;
+				atomic_set(&epoch->epoch_size, 0);
+				/* atomic_set(&epoch->active, 0); is already zero */
+				if (rv == FE_STILL_LIVE)
+					rv = FE_RECYCLED;
+			}
+		}
+
+		if (!next_epoch)
+			break;
+
+		epoch = next_epoch;
+	} while (1);
+
+	spin_unlock(&mdev->epoch_lock);
+
+	if (schedule_flush) {
+		struct flush_work *fw;
+		fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
+		if (fw) {
+			trace_drbd_epoch(mdev, epoch, EV_TRACE_FLUSH);
+			fw->w.cb = w_flush;
+			fw->epoch = epoch;
+			drbd_queue_work(&mdev->data.work, &fw->w);
+		} else {
+			dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+			/* That is not a recursion, only one level */
+			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
+			drbd_may_finish_epoch(mdev, epoch, EV_PUT);
+		}
+	}
+
+	return rv;
+}
+
+/**
+ * drbd_bump_write_ordering() - Fall back to another write ordering method
+ * @mdev:	DRBD device.
+ * @wo:		Write ordering method to try.
+ */
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
+{
+	enum write_ordering_e pwo;
+	static char *write_ordering_str[] = {
+		[WO_none] = "none",
+		[WO_drain_io] = "drain",
+		[WO_bdev_flush] = "flush",
+		[WO_bio_barrier] = "barrier",
+	};
+
+	pwo = mdev->write_ordering;
+	wo = min(pwo, wo);
+	if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
+		wo = WO_bdev_flush;
+	if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
+		wo = WO_drain_io;
+	if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
+		wo = WO_none;
+	mdev->write_ordering = wo;
+	if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
+		dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
+}
+
+/**
+ * drbd_submit_ee()
+ * @mdev:	DRBD device.
+ * @e:		epoch entry
+ * @rw:		flag field, see bio->bi_rw
+ *
+ * May spread the pages to multiple bios,
+ * depending on bio_add_page restrictions.
+ *
+ * Returns 0 if all bios have been submitted,
+ * -ENOMEM if we could not allocate enough bios,
+ * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
+ *  single page to an empty bio (which should never happen and likely indicates
+ *  that the lower level IO stack is in some way broken). This has been observed
+ *  on certain Xen deployments.
+ */
+/* TODO allocate from our own bio_set. */
+int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+		const unsigned rw, const int fault_type)
+{
+	struct bio *bios = NULL;
+	struct bio *bio;
+	struct page *page = e->pages;
+	sector_t sector = e->sector;
+	unsigned ds = e->size;
+	unsigned n_bios = 0;
+	unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
+	int err = -ENOMEM;
+
+	/* In most cases, we will only need one bio.  But in case the lower
+	 * level restrictions happen to be different at this offset on this
+	 * side than those of the sending peer, we may need to submit the
+	 * request in more than one bio.
+	 *
+	 * Plain bio_alloc is good enough here, this is no DRBD internally
+	 * generated bio, but a bio allocated on behalf of the peer.
+	 */
+next_bio:
+	bio = bio_alloc(GFP_NOIO, nr_pages);
+	if (!bio) {
+		dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
+		goto fail;
+	}
+	/* > e->sector, unless this is the first bio */
+	bio->bi_sector = sector;
+	bio->bi_bdev = mdev->ldev->backing_bdev;
+	/* we special case some flags in the multi-bio case, see below
+	 * (REQ_UNPLUG, REQ_FLUSH, or BIO_RW_BARRIER in older kernels) */
+	bio->bi_rw = rw;
+	bio->bi_private = e;
+	bio->bi_end_io = drbd_endio_sec;
+
+	bio->bi_next = bios;
+	bios = bio;
+	++n_bios;
+
+	page_chain_for_each(page) {
+		unsigned len = min_t(unsigned, ds, PAGE_SIZE);
+		if (!bio_add_page(bio, page, len, 0)) {
+			/* A single page must always be possible!
+			 * But in case it fails anyways,
+			 * we deal with it, and complain (below). */
+			if (bio->bi_vcnt == 0) {
+				dev_err(DEV,
+					"bio_add_page failed for len=%u, "
+					"bi_vcnt=0 (bi_sector=%llu)\n",
+					len, (unsigned long long)bio->bi_sector);
+				err = -ENOSPC;
+				goto fail;
+			}
+			goto next_bio;
+		}
+		ds -= len;
+		sector += len >> 9;
+		--nr_pages;
+	}
+	D_ASSERT(page == NULL);
+	D_ASSERT(ds == 0);
+
+	atomic_set(&e->pending_bios, n_bios);
+	trace_drbd_ee(mdev, e, "submitted");
+	do {
+		bio = bios;
+		bios = bios->bi_next;
+		bio->bi_next = NULL;
+
+		/* strip off REQ_UNPLUG unless it is the last bio */
+		if (bios)
+			bio->bi_rw &= ~DRBD_REQ_UNPLUG;
+		trace_drbd_bio(mdev, "Sec", bio, 0, NULL);
+		drbd_generic_make_request(mdev, fault_type, bio);
+
+		/* strip off REQ_FLUSH,
+		 * unless it is the first or last bio */
+		if (bios && bios->bi_next)
+			bios->bi_rw &= ~DRBD_REQ_FLUSH;
+	} while (bios);
+	maybe_kick_lo(mdev);
+	return 0;
+
+fail:
+	while (bios) {
+		bio = bios;
+		bios = bios->bi_next;
+		bio_put(bio);
+	}
+	return err;
+}
+
+/**
+ * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways (unused in this callback)
+ */
+int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	int err;
+	/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
+	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
+	   so that we can finish that epoch in drbd_may_finish_epoch().
+	   That is necessary if we already have a long chain of Epochs, before
+	   we realize that BARRIER is actually not supported */
+
+	/* As long as the -ENOTSUPP on the barrier is reported immediately,
+	   this will never trigger. If it is reported late, we will just
+	   print that warning and continue correctly for all future requests
+	   with WO_bdev_flush */
+	if (previous_epoch(mdev, e->epoch))
+		dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
+
+	/* we still have a local reference,
+	 * get_ldev was done in receive_Data. */
+
+	e->w.cb = e_end_block;
+	err = drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR);
+	switch (err) {
+	case -ENOMEM:
+		e->w.cb = w_e_reissue;
+		drbd_queue_work(&mdev->data.work, &e->w);
+		/* retry later; fall through */
+	case 0:
+		/* keep worker happy and connection up */
+		return 1;
+
+	case -ENOSPC:
+		/* no other error expected, but anyways: */
+	default:
+		/* forget the object,
+		 * and cause a "Network failure" */
+		spin_lock_irq(&mdev->req_lock);
+		list_del(&e->w.list);
+		hlist_del_init(&e->collision);
+		spin_unlock_irq(&mdev->req_lock);
+		if (e->flags & EE_CALL_AL_COMPLETE_IO)
+			drbd_al_complete_io(mdev, e->sector);
+		drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
+		drbd_free_ee(mdev, e);
+		dev_err(DEV, "submit failed, triggering re-connect\n");
+		return 0;
+	}
+}
+
+STATIC int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	int rv, issue_flush;
+	struct p_barrier *p = &mdev->data.rbuf.barrier;
+	struct drbd_epoch *epoch;
+
+	inc_unacked(mdev);
+
+	if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
+		drbd_kick_lo(mdev);
+
+	mdev->current_epoch->barrier_nr = p->barrier;
+	rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
+
+	/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
+	 * the activity log, which means it would not be resynced in case the
+	 * R_PRIMARY crashes now.
+	 * Therefore we must send the barrier_ack after the barrier request was
+	 * completed. */
+	switch (mdev->write_ordering) {
+	case WO_bio_barrier:
+	case WO_none:
+		if (rv == FE_RECYCLED)
+			return true;
+		break;
+
+	case WO_bdev_flush:
+	case WO_drain_io:
+		if (rv == FE_STILL_LIVE) {
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
+			drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+			rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+		}
+		if (rv == FE_RECYCLED)
+			return true;
+
+		/* The asender will send all the ACKs and barrier ACKs out, since
+		   all EEs moved from the active_ee to the done_ee. We need to
+		   provide a new epoch object for the EEs that come in soon */
+		break;
+	}
+
+	/* receiver context, in the writeout path of the other node.
+	 * avoid potential distributed deadlock */
+	epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+	if (!epoch) {
+		dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+		issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
+		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+		if (issue_flush) {
+			rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+			if (rv == FE_RECYCLED)
+				return true;
+		}
+
+		drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
+
+		return true;
+	}
+
+	epoch->flags = 0;
+	atomic_set(&epoch->epoch_size, 0);
+	atomic_set(&epoch->active, 0);
+
+	spin_lock(&mdev->epoch_lock);
+	if (atomic_read(&mdev->current_epoch->epoch_size)) {
+		list_add(&epoch->list, &mdev->current_epoch->list);
+		mdev->current_epoch = epoch;
+		mdev->epochs++;
+		trace_drbd_epoch(mdev, epoch, EV_TRACE_ALLOC);
+	} else {
+		/* The current_epoch got recycled while we allocated this one... */
+		kfree(epoch);
+	}
+	spin_unlock(&mdev->epoch_lock);
+
+	return true;
+}
+
+/* used from receive_RSDataReply (recv_resync_read)
+ * and from receive_Data */
+STATIC struct drbd_epoch_entry *
+read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
+{
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+	struct drbd_epoch_entry *e;
+	struct page *page;
+	int dgs, ds, rr;
+	void *dig_in = mdev->int_dig_in;
+	void *dig_vv = mdev->int_dig_vv;
+	unsigned long *data;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+
+	if (dgs) {
+		rr = drbd_recv(mdev, dig_in, dgs);
+		if (rr != dgs) {
+			if (!signal_pending(current))
+				dev_warn(DEV,
+					"short read receiving data digest: read %d expected %d\n",
+					rr, dgs);
+			return NULL;
+		}
+	}
+
+	data_size -= dgs;
+
+	ERR_IF(data_size == 0) return NULL;
+	ERR_IF(data_size &  0x1ff) return NULL;
+	ERR_IF(data_size >  DRBD_MAX_BIO_SIZE) return NULL;
+
+	/* even though we trust our peer,
+	 * we sometimes have to double check. */
+	if (sector + (data_size>>9) > capacity) {
+		dev_err(DEV, "request from peer beyond end of local disk: "
+			"capacity: %llus < sector: %llus + size: %u\n",
+			(unsigned long long)capacity,
+			(unsigned long long)sector, data_size);
+		return NULL;
+	}
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
+	if (!e)
+		return NULL;
+
+	ds = data_size;
+	page = e->pages;
+	page_chain_for_each(page) {
+		unsigned len = min_t(int, ds, PAGE_SIZE);
+		data = kmap(page);
+		rr = drbd_recv(mdev, data, len);
+		if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
+			dev_err(DEV, "Fault injection: Corrupting data on receive\n");
+			data[0] = data[0] ^ (unsigned long)-1;
+		}
+		kunmap(page);
+		if (rr != len) {
+			drbd_free_ee(mdev, e);
+			if (!signal_pending(current))
+				dev_warn(DEV, "short read receiving data: read %d expected %d\n",
+				rr, len);
+			return NULL;
+		}
+		ds -= rr;
+	}
+
+	if (dgs) {
+		drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
+		if (memcmp(dig_in, dig_vv, dgs)) {
+			dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
+				(unsigned long long)sector, data_size);
+			drbd_bcast_ee(mdev, "digest failed",
+					dgs, dig_in, dig_vv, e);
+			drbd_free_ee(mdev, e);
+			return NULL;
+		}
+	}
+	mdev->recv_cnt += data_size>>9;
+	return e;
+}
+
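+/* Worked example for read_in_block(), illustrative and assuming the
+ * configured integrity alg is md5 (16 byte digest): for an on-wire
+ * data_size of 4112, dgs == 16 is received first, the remaining 4096
+ * bytes (8 sectors) go into the page chain, and the locally recomputed
+ * digest must match the received one, otherwise the ee is dropped. */
+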
+/* drbd_drain_block() just takes a data block
+ * out of the socket input buffer, and discards it.
+ */
+STATIC int drbd_drain_block(struct drbd_conf *mdev, int data_size)
+{
+	struct page *page;
+	int rr, rv = 1;
+	void *data;
+
+	if (!data_size)
+		return true;
+
+	page = drbd_pp_alloc(mdev, 1, 1);
+
+	data = kmap(page);
+	while (data_size) {
+		rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
+		if (rr != min_t(int, data_size, PAGE_SIZE)) {
+			rv = 0;
+			if (!signal_pending(current))
+				dev_warn(DEV,
+					"short read receiving data: read %d expected %d\n",
+					rr, min_t(int, data_size, PAGE_SIZE));
+			break;
+		}
+		data_size -= rr;
+	}
+	kunmap(page);
+	drbd_pp_free(mdev, page, 0);
+	return rv;
+}
+
+STATIC int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
+			   sector_t sector, int data_size)
+{
+	struct bio_vec *bvec;
+	struct bio *bio;
+	int dgs, rr, i, expect;
+	void *dig_in = mdev->int_dig_in;
+	void *dig_vv = mdev->int_dig_vv;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+
+	if (dgs) {
+		rr = drbd_recv(mdev, dig_in, dgs);
+		if (rr != dgs) {
+			if (!signal_pending(current))
+				dev_warn(DEV,
+					"short read receiving data reply digest: read %d expected %d\n",
+					rr, dgs);
+			return 0;
+		}
+	}
+
+	data_size -= dgs;
+
+	/* optimistically update recv_cnt.  if receiving fails below,
+	 * we disconnect anyways, and counters will be reset. */
+	mdev->recv_cnt += data_size>>9;
+
+	bio = req->master_bio;
+	D_ASSERT(sector == bio->bi_sector);
+
+	bio_for_each_segment(bvec, bio, i) {
+		expect = min_t(int, data_size, bvec->bv_len);
+		rr = drbd_recv(mdev,
+			     kmap(bvec->bv_page)+bvec->bv_offset,
+			     expect);
+		kunmap(bvec->bv_page);
+		if (rr != expect) {
+			if (!signal_pending(current))
+				dev_warn(DEV, "short read receiving data reply: "
+					"read %d expected %d\n",
+					rr, expect);
+			return 0;
+		}
+		data_size -= rr;
+	}
+
+	if (dgs) {
+		drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+		if (memcmp(dig_in, dig_vv, dgs)) {
+			dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
+			return 0;
+		}
+	}
+
+	D_ASSERT(data_size == 0);
+	return 1;
+}
+
+/* e_end_resync_block() is called via
+ * drbd_process_done_ee() by asender only */
+STATIC int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	sector_t sector = e->sector;
+	int ok;
+
+	D_ASSERT(hlist_unhashed(&e->collision));
+
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+		drbd_set_in_sync(mdev, sector, e->size);
+		ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
+	} else {
+		/* Record failure to sync */
+		drbd_rs_failed_io(mdev, sector, e->size);
+
+		ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
+	}
+	dec_unacked(mdev);
+
+	return ok;
+}
+
+STATIC int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
+{
+	struct drbd_epoch_entry *e;
+
+	e = read_in_block(mdev, ID_SYNCER, sector, data_size);
+	if (!e)
+		goto fail;
+
+	dec_rs_pending(mdev);
+
+	inc_unacked(mdev);
+	/* corresponding dec_unacked() in e_end_resync_block()
+	 * respective _drbd_clear_done_ee */
+
+	e->w.cb = e_end_resync_block;
+
+	spin_lock_irq(&mdev->req_lock);
+	list_add(&e->w.list, &mdev->sync_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	atomic_add(data_size >> 9, &mdev->rs_sect_ev);
+	if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
+		return true;
+
+	/* don't care for the reason here */
+	dev_err(DEV, "submit failed, triggering re-connect\n");
+	spin_lock_irq(&mdev->req_lock);
+	list_del(&e->w.list);
+	spin_unlock_irq(&mdev->req_lock);
+
+	drbd_free_ee(mdev, e);
+fail:
+	put_ldev(mdev);
+	return false;
+}
+
+STATIC int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct drbd_request *req;
+	sector_t sector;
+	int ok;
+	struct p_data *p = &mdev->data.rbuf.data;
+
+	sector = be64_to_cpu(p->sector);
+
+	spin_lock_irq(&mdev->req_lock);
+	req = _ar_id_to_req(mdev, p->block_id, sector);
+	spin_unlock_irq(&mdev->req_lock);
+	if (unlikely(!req)) {
+		dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
+		return false;
+	}
+
+	/* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
+	 * special casing it there for the various failure cases.
+	 * still no race with drbd_fail_pending_reads */
+	ok = recv_dless_read(mdev, req, sector, data_size);
+
+	if (ok)
+		req_mod(req, data_received);
+	/* else: nothing. handled from drbd_disconnect...
+	 * I don't think we may complete this just yet
+	 * in case we are "on-disconnect: freeze" */
+
+	return ok;
+}
+
+STATIC int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	sector_t sector;
+	int ok;
+	struct p_data *p = &mdev->data.rbuf.data;
+
+	sector = be64_to_cpu(p->sector);
+	D_ASSERT(p->block_id == ID_SYNCER);
+
+	if (get_ldev(mdev)) {
+		/* data is submitted to disk within recv_resync_read.
+		 * corresponding put_ldev done below on error,
+		 * or in drbd_endio_write_sec. */
+		ok = recv_resync_read(mdev, sector, data_size);
+	} else {
+		if (DRBD_ratelimit(5*HZ, 5))
+			dev_err(DEV, "Can not write resync data to local disk.\n");
+
+		ok = drbd_drain_block(mdev, data_size);
+
+		drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
+	}
+
+	atomic_add(data_size >> 9, &mdev->rs_sect_in);
+
+	return ok;
+}
+
+/* e_end_block() is called via drbd_process_done_ee().
+ * this means this function only runs in the asender thread
+ */
+STATIC int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	sector_t sector = e->sector;
+	struct drbd_epoch *epoch;
+	int ok = 1, pcmd;
+
+	if (e->flags & EE_IS_BARRIER) {
+		epoch = previous_epoch(mdev, e->epoch);
+		if (epoch)
+			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
+	}
+
+	if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
+		if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+			pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
+				mdev->state.conn <= C_PAUSED_SYNC_T &&
+				e->flags & EE_MAY_SET_IN_SYNC) ?
+				P_RS_WRITE_ACK : P_WRITE_ACK;
+			ok &= drbd_send_ack(mdev, pcmd, e);
+			if (pcmd == P_RS_WRITE_ACK)
+				drbd_set_in_sync(mdev, sector, e->size);
+		} else {
+			ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
+			/* we expect it to be marked out of sync anyways...
+			 * maybe assert this?  */
+		}
+		dec_unacked(mdev);
+	}
+	/* we delete from the conflict detection hash _after_ we sent out the
+	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
+	if (mdev->net_conf->two_primaries) {
+		spin_lock_irq(&mdev->req_lock);
+		D_ASSERT(!hlist_unhashed(&e->collision));
+		hlist_del_init(&e->collision);
+		spin_unlock_irq(&mdev->req_lock);
+	} else {
+		D_ASSERT(hlist_unhashed(&e->collision));
+	}
+
+	drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+
+	return ok;
+}
+
+STATIC int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	int ok = 1;
+
+	D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+	ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
+
+	spin_lock_irq(&mdev->req_lock);
+	D_ASSERT(!hlist_unhashed(&e->collision));
+	hlist_del_init(&e->collision);
+	spin_unlock_irq(&mdev->req_lock);
+
+	dec_unacked(mdev);
+
+	return ok;
+}
+
+static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
+{
+	struct drbd_epoch_entry *rs_e;
+	bool rv = false;
+
+	spin_lock_irq(&mdev->req_lock);
+	list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
+		if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
+			rv = true;
+			break;
+		}
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	return rv;
+}
+
+/* Called from receive_Data.
+ * Synchronize packets on sock with packets on msock.
+ *
+ * This is here so even when a P_DATA packet traveling via sock overtook an Ack
+ * packet traveling on msock, they are still processed in the order they have
+ * been sent.
+ *
+ * Note: we don't care for Ack packets overtaking P_DATA packets.
+ *
+ * In case packet_seq is larger than mdev->peer_seq, there are
+ * outstanding packets on the msock. We wait for them to arrive.
+ * In case we are the logically next packet, we update mdev->peer_seq
+ * ourselves. Correctly handles 32bit wrap around.
+ *
+ * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
+ * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
+ * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
+ * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
+ *
+ * returns 0 if we may process the packet,
+ * -ERESTARTSYS if we were interrupted (by disconnect signal). */
+static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
+{
+	DEFINE_WAIT(wait);
+	unsigned int p_seq;
+	long timeout;
+	int ret = 0;
+	spin_lock(&mdev->peer_seq_lock);
+	for (;;) {
+		prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
+		if (seq_le(packet_seq, mdev->peer_seq+1))
+			break;
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		p_seq = mdev->peer_seq;
+		spin_unlock(&mdev->peer_seq_lock);
+		timeout = schedule_timeout(30*HZ);
+		spin_lock(&mdev->peer_seq_lock);
+		if (timeout == 0 && p_seq == mdev->peer_seq) {
+			ret = -ETIMEDOUT;
+			dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
+			break;
+		}
+	}
+	finish_wait(&mdev->seq_wait, &wait);
+	if (mdev->peer_seq+1 == packet_seq)
+		mdev->peer_seq++;
+	spin_unlock(&mdev->peer_seq_lock);
+	return ret;
+}
+
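+/* Illustrative only, assuming seq_le(a, b) is the usual wrap-safe
+ * comparison ((s32)(a - b) <= 0): with mdev->peer_seq == 0xffffffff, an
+ * incoming packet_seq of 0 satisfies seq_le(0, peer_seq + 1) and is the
+ * logically next packet, so it proceeds at once; packet_seq == 1 in the
+ * same situation makes us sleep until the missing packet has arrived. */
+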
+/* see also bio_flags_to_wire().
+ * We use DRBD_REQ_* here because we need to semantically map the bio flags
+ * to data packet flags and back. We may replicate to peers running other
+ * kernel versions. */
+static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
+{
+	if (mdev->agreed_pro_version >= 95)
+		return  (dpf & DP_RW_SYNC ? DRBD_REQ_SYNC : 0) |
+			(dpf & DP_UNPLUG ? DRBD_REQ_UNPLUG : 0) |
+			(dpf & DP_FUA ? DRBD_REQ_FUA : 0) |
+			(dpf & DP_FLUSH ? DRBD_REQ_FLUSH : 0) |
+			(dpf & DP_DISCARD ? DRBD_REQ_DISCARD : 0);
+
+	/* else: we used to communicate one bit only in older DRBD */
+	return dpf & DP_RW_SYNC ? (DRBD_REQ_SYNC | DRBD_REQ_UNPLUG) : 0;
+}
+
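+/* Worked example (illustrative): a peer on protocol >= 95 sending
+ * dp_flags == (DP_FUA | DP_FLUSH) yields DRBD_REQ_FUA | DRBD_REQ_FLUSH
+ * here, while a pre-95 peer can only communicate DP_RW_SYNC, which
+ * expands to DRBD_REQ_SYNC | DRBD_REQ_UNPLUG. */
+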
+/* mirrored write */
+STATIC int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	sector_t sector;
+	struct drbd_epoch_entry *e;
+	struct p_data *p = &mdev->data.rbuf.data;
+	int rw = WRITE;
+	u32 dp_flags;
+
+	if (!get_ldev(mdev)) {
+		spin_lock(&mdev->peer_seq_lock);
+		if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
+			mdev->peer_seq++;
+		spin_unlock(&mdev->peer_seq_lock);
+
+		drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
+		atomic_inc(&mdev->current_epoch->epoch_size);
+		return drbd_drain_block(mdev, data_size);
+	}
+
+	/* get_ldev(mdev) successful.
+	 * Corresponding put_ldev done either below (on various errors),
+	 * or in drbd_endio_write_sec, if we successfully submit the data at
+	 * the end of this function. */
+
+	sector = be64_to_cpu(p->sector);
+	e = read_in_block(mdev, p->block_id, sector, data_size);
+	if (!e) {
+		put_ldev(mdev);
+		return false;
+	}
+
+	e->w.cb = e_end_block;
+
+	dp_flags = be32_to_cpu(p->dp_flags);
+	rw |= wire_flags_to_bio(mdev, dp_flags);
+
+	if (dp_flags & DP_MAY_SET_IN_SYNC)
+		e->flags |= EE_MAY_SET_IN_SYNC;
+
+	/* last "fixes" to rw flags.
+	 * Strip off BIO_RW_BARRIER unconditionally,
+	 * it is not supposed to be here anyways.
+	 * (Was FUA or FLUSH on the peer,
+	 * and got translated to BARRIER on this side).
+	 * Note that the epoch handling code below
+	 * may add it again, though.
+	 */
+	rw &= ~DRBD_REQ_HARDBARRIER;
+
+	spin_lock(&mdev->epoch_lock);
+	e->epoch = mdev->current_epoch;
+	atomic_inc(&e->epoch->epoch_size);
+	atomic_inc(&e->epoch->active);
+
+	if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
+		struct drbd_epoch *epoch;
+		/* Issue a barrier if we start a new epoch, and the previous epoch
+		   was not an epoch containing a single request which already was
+		   a barrier. */
+		epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
+		if (epoch == e->epoch) {
+			set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+			trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER);
+			rw |= DRBD_REQ_FLUSH | DRBD_REQ_FUA;
+			e->flags |= EE_IS_BARRIER;
+		} else {
+			if (atomic_read(&epoch->epoch_size) > 1 ||
+			    !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
+				set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+				trace_drbd_epoch(mdev, epoch, EV_TRACE_SETTING_BI);
+				set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+				trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER);
+				rw |= DRBD_REQ_FLUSH | DRBD_REQ_FUA;
+				e->flags |= EE_IS_BARRIER;
+			}
+		}
+	}
+	spin_unlock(&mdev->epoch_lock);
+
+	/* I'm the receiver, I do hold a net_cnt reference. */
+	if (!mdev->net_conf->two_primaries) {
+		spin_lock_irq(&mdev->req_lock);
+	} else {
+		/* don't get the req_lock yet,
+		 * we may sleep in drbd_wait_peer_seq */
+		const int size = e->size;
+		const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+		DEFINE_WAIT(wait);
+		struct drbd_request *i;
+		struct hlist_node *n;
+		struct hlist_head *slot;
+		int first;
+
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		BUG_ON(mdev->ee_hash == NULL);
+		BUG_ON(mdev->tl_hash == NULL);
+
+		/* conflict detection and handling:
+		 * 1. wait on the sequence number,
+		 *    in case this data packet overtook ACK packets.
+		 * 2. check our hash tables for conflicting requests.
+		 *    we only need to walk the tl_hash, since an ee can not
+		 *    have a conflict with an other ee: on the submitting
+		 *    node, the corresponding req had already been conflicting,
+		 *    and a conflicting req is never sent.
+		 *
+		 * Note: for two_primaries, we are protocol C,
+		 * so there cannot be any request that is DONE
+		 * but still on the transfer log.
+		 *
+		 * unconditionally add to the ee_hash.
+		 *
+		 * if no conflicting request is found:
+		 *    submit.
+		 *
+		 * if any conflicting request is found
+		 * that has not yet been acked,
+		 * AND I have the "discard concurrent writes" flag:
+		 *	 queue (via done_ee) the P_DISCARD_ACK; OUT.
+		 *
+		 * if any conflicting request is found:
+		 *	 block the receiver, waiting on misc_wait
+		 *	 until no more conflicting requests are there,
+		 *	 or we get interrupted (disconnect).
+		 *
+		 *	 we do not just write after local io completion of those
+		 *	 requests, but only after req is done completely, i.e.
+		 *	 we wait for the P_DISCARD_ACK to arrive!
+		 *
+		 *	 then proceed normally, i.e. submit.
+		 */
+		if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
+			goto out_interrupted;
+
+		spin_lock_irq(&mdev->req_lock);
+
+		hlist_add_head(&e->collision, ee_hash_slot(mdev, sector));
+
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+		slot = tl_hash_slot(mdev, sector);
+		first = 1;
+		for (;;) {
+			int have_unacked = 0;
+			int have_conflict = 0;
+			prepare_to_wait(&mdev->misc_wait, &wait,
+				TASK_INTERRUPTIBLE);
+			hlist_for_each_entry(i, n, slot, collision) {
+				if (OVERLAPS) {
+					/* only ALERT on first iteration,
+					 * we may be woken up early... */
+					if (first)
+						dev_alert(DEV, "%s[%u] Concurrent local write detected!"
+						      "	new: %llus +%u; pending: %llus +%u\n",
+						      current->comm, current->pid,
+						      (unsigned long long)sector, size,
+						      (unsigned long long)i->sector, i->size);
+					if (i->rq_state & RQ_NET_PENDING)
+						++have_unacked;
+					++have_conflict;
+				}
+			}
+#undef OVERLAPS
+			if (!have_conflict)
+				break;
+
+			/* Discard Ack only for the _first_ iteration */
+			if (first && discard && have_unacked) {
+				dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
+				     (unsigned long long)sector);
+				inc_unacked(mdev);
+				e->w.cb = e_send_discard_ack;
+				list_add_tail(&e->w.list, &mdev->done_ee);
+
+				spin_unlock_irq(&mdev->req_lock);
+
+				/* we could probably send that P_DISCARD_ACK ourselves,
+				 * but I don't like the receiver using the msock */
+
+				put_ldev(mdev);
+				wake_asender(mdev);
+				finish_wait(&mdev->misc_wait, &wait);
+				return true;
+			}
+
+			if (signal_pending(current)) {
+				hlist_del_init(&e->collision);
+
+				spin_unlock_irq(&mdev->req_lock);
+
+				finish_wait(&mdev->misc_wait, &wait);
+				goto out_interrupted;
+			}
+
+			spin_unlock_irq(&mdev->req_lock);
+			if (first) {
+				first = 0;
+				dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
+				     "sec=%llus\n", (unsigned long long)sector);
+			} else if (discard) {
+				/* we had none on the first iteration.
+				 * there must be none now. */
+				D_ASSERT(have_unacked == 0);
+			}
+			schedule();
+			spin_lock_irq(&mdev->req_lock);
+		}
+		finish_wait(&mdev->misc_wait, &wait);
+	}
+
+	list_add(&e->w.list, &mdev->active_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (mdev->state.conn == C_SYNC_TARGET)
+		wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
+
+	switch (mdev->net_conf->wire_protocol) {
+	case DRBD_PROT_C:
+		inc_unacked(mdev);
+		/* corresponding dec_unacked() in e_end_block()
+		 * respective _drbd_clear_done_ee */
+		break;
+	case DRBD_PROT_B:
+		/* I really don't like it that the receiver thread
+		 * sends on the msock, but anyways */
+		drbd_send_ack(mdev, P_RECV_ACK, e);
+		break;
+	case DRBD_PROT_A:
+		/* nothing to do */
+		break;
+	}
+
+	if (mdev->state.pdsk < D_INCONSISTENT) {
+		/* In case we have the only disk of the cluster, mark the
+		 * blocks out of sync for the peer and track them in the AL. */
+		drbd_set_out_of_sync(mdev, e->sector, e->size);
+		e->flags |= EE_CALL_AL_COMPLETE_IO;
+		e->flags &= ~EE_MAY_SET_IN_SYNC;
+		drbd_al_begin_io(mdev, e->sector);
+	}
+
+	if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
+		return true;
+
+	/* don't care for the reason here */
+	dev_err(DEV, "submit failed, triggering re-connect\n");
+	spin_lock_irq(&mdev->req_lock);
+	list_del(&e->w.list);
+	hlist_del_init(&e->collision);
+	spin_unlock_irq(&mdev->req_lock);
+	if (e->flags & EE_CALL_AL_COMPLETE_IO)
+		drbd_al_complete_io(mdev, e->sector);
+
+out_interrupted:
+	drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
+	put_ldev(mdev);
+	drbd_free_ee(mdev, e);
+	return false;
+}
+
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID's is_mddev_idle(): if the partition stats reveal "significant"
+ * activity (more than 64 sectors) that we cannot account for with our own
+ * resync activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned long db, dt, dbdt;
+	struct lc_element *tmp;
+	int curr_events;
+	int throttle = 0;
+
+	/* feature disabled? */
+	if (mdev->sync_conf.c_min_rate == 0)
+		return 0;
+
+	spin_lock_irq(&mdev->al_lock);
+	tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
+	if (tmp) {
+		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
+			spin_unlock_irq(&mdev->al_lock);
+			return 0;
+		}
+		/* Do not slow down if app IO is already waiting for this extent */
+	}
+	spin_unlock_irq(&mdev->al_lock);
+
+	curr_events = drbd_backing_bdev_events(mdev)
+		    - atomic_read(&mdev->rs_sect_ev);
+
+	if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
+		unsigned long rs_left;
+		int i;
+
+		mdev->rs_last_events = curr_events;
+
+		/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+		 * approx. */
+		i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+
+		if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+			rs_left = mdev->ov_left;
+		else
+			rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+
+		dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
+		if (!dt)
+			dt++;
+		db = mdev->rs_mark_left[i] - rs_left;
+		dbdt = Bit2KB(db/dt);
+
+		if (dbdt > mdev->sync_conf.c_min_rate)
+			throttle = 1;
+	}
+	return throttle;
+}
+
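+/* Back-of-the-envelope example (illustrative, all values assumed): with
+ * the two most recent sync marks 3 seconds apart (dt == 3) and 7680
+ * bitmap bits (4KiB each) synced in between, db/dt == 2560 bits/s and
+ * dbdt == Bit2KB(2560) == 10240 KiB/s; a configured c_min_rate of
+ * 4096 KiB/s would then ask to throttle, provided the backing device
+ * also showed more than 64 sectors of unaccounted activity. */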
+
+STATIC int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
+{
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+	struct drbd_epoch_entry *e;
+	struct digest_info *di = NULL;
+	int size, verb;
+	unsigned int fault_type;
+	struct p_block_req *p =	&mdev->data.rbuf.block_req;
+
+	sector = be64_to_cpu(p->sector);
+	size   = be32_to_cpu(p->blksize);
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+		dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return false;
+	}
+	if (sector + (size>>9) > capacity) {
+		dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return false;
+	}
+
+	if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
+		verb = 1;
+		switch (cmd) {
+		case P_DATA_REQUEST:
+			drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
+			break;
+		case P_RS_DATA_REQUEST:
+		case P_CSUM_RS_REQUEST:
+		case P_OV_REQUEST:
+			drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
+			break;
+		case P_OV_REPLY:
+			verb = 0;
+			dec_rs_pending(mdev);
+			drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
+			break;
+		default:
+			dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+				cmdname(cmd));
+		}
+		if (verb && DRBD_ratelimit(5*HZ, 5))
+			dev_err(DEV, "Can not satisfy peer's read request, "
+			    "no local data.\n");
+
+		/* drain the payload, if any */
+		return drbd_drain_block(mdev, digest_size);
+	}
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
+	if (!e) {
+		put_ldev(mdev);
+		return false;
+	}
+
+	switch (cmd) {
+	case P_DATA_REQUEST:
+		e->w.cb = w_e_end_data_req;
+		fault_type = DRBD_FAULT_DT_RD;
+		/* application IO, don't drbd_rs_begin_io */
+		goto submit;
+
+	case P_RS_DATA_REQUEST:
+		e->w.cb = w_e_end_rsdata_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		/* used in the sector offset progress display */
+		mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+		break;
+
+	case P_OV_REPLY:
+	case P_CSUM_RS_REQUEST:
+		fault_type = DRBD_FAULT_RS_RD;
+		di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
+		if (!di)
+			goto out_free_e;
+
+		di->digest_size = digest_size;
+		di->digest = (((char *)di)+sizeof(struct digest_info));
+
+		e->digest = di;
+		e->flags |= EE_HAS_DIGEST;
+
+		if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
+			goto out_free_e;
+
+		if (cmd == P_CSUM_RS_REQUEST) {
+			D_ASSERT(mdev->agreed_pro_version >= 89);
+			e->w.cb = w_e_end_csum_rs_req;
+			/* used in the sector offset progress display */
+			mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+		} else if (cmd == P_OV_REPLY) {
+			/* track progress, we may need to throttle */
+			atomic_add(size >> 9, &mdev->rs_sect_in);
+			e->w.cb = w_e_end_ov_reply;
+			dec_rs_pending(mdev);
+			/* drbd_rs_begin_io done when we sent this request,
+			 * but accounting still needs to be done. */
+			goto submit_for_resync;
+		}
+		break;
+
+	case P_OV_REQUEST:
+		if (mdev->ov_start_sector == ~(sector_t)0 &&
+		    mdev->agreed_pro_version >= 90) {
+			unsigned long now = jiffies;
+			int i;
+			mdev->ov_start_sector = sector;
+			mdev->ov_position = sector;
+			mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
+			mdev->rs_total = mdev->ov_left;
+			for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+				mdev->rs_mark_left[i] = mdev->ov_left;
+				mdev->rs_mark_time[i] = now;
+			}
+			dev_info(DEV, "Online Verify start sector: %llu\n",
+					(unsigned long long)sector);
+		}
+		e->w.cb = w_e_end_ov_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		break;
+
+	default:
+		dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+		    cmdname(cmd));
+		fault_type = DRBD_FAULT_MAX;
+		goto out_free_e;
+	}
+
+	/* Throttle, drbd_rs_begin_io and submit should become asynchronous
+	 * wrt the receiver, but it is not as straightforward as it may seem.
+	 * Various places in the resync start and stop logic assume resync
+	 * requests are processed in order, requeuing this on the worker thread
+	 * introduces a bunch of new code for synchronization between threads.
+	 *
+	 * Unlimited throttling before drbd_rs_begin_io may stall the resync
+	 * "forever", throttling after drbd_rs_begin_io will lock that extent
+	 * for application writes for the same time.  For now, just throttle
+	 * here, where the rest of the code expects the receiver to sleep for
+	 * a while, anyways.
+	 */
+
+	/* Throttle before drbd_rs_begin_io, as that locks out application IO;
+	 * this defers syncer requests for some time, before letting at least
+	 * one request through.  The resync controller on the receiving side
+	 * will adapt to the incoming rate accordingly.
+	 *
+	 * We cannot throttle here if remote is Primary/SyncTarget:
+	 * we would also throttle its application reads.
+	 * In that case, throttling is done on the SyncTarget only.
+	 */
+	if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
+		schedule_timeout_uninterruptible(HZ/10);
+	if (drbd_rs_begin_io(mdev, sector))
+		goto out_free_e;
+
+submit_for_resync:
+	atomic_add(size >> 9, &mdev->rs_sect_ev);
+
+submit:
+	inc_unacked(mdev);
+	spin_lock_irq(&mdev->req_lock);
+	list_add_tail(&e->w.list, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
+		return true;
+
+	/* don't care for the reason here */
+	dev_err(DEV, "submit failed, triggering re-connect\n");
+	spin_lock_irq(&mdev->req_lock);
+	list_del(&e->w.list);
+	spin_unlock_irq(&mdev->req_lock);
+	/* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
+out_free_e:
+	put_ldev(mdev);
+	drbd_free_ee(mdev, e);
+	return false;
+}
+
+STATIC int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
+{
+	int self, peer, rv = -100;
+	unsigned long ch_self, ch_peer;
+
+	self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+	peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+	ch_peer = mdev->p_uuid[UI_SIZE];
+	ch_self = mdev->comm_bm_set;
+
+	switch (mdev->net_conf->after_sb_0p) {
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+	case ASB_CALL_HELPER:
+		dev_err(DEV, "Configuration error.\n");
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_DISCARD_YOUNGER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = -1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv =  1;
+			break;
+		}
+		/* Else fall through to one of the other strategies... */
+	case ASB_DISCARD_OLDER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = 1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv = -1;
+			break;
+		}
+		/* Else fall through to one of the other strategies... */
+		dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
+		     "Using discard-least-changes instead\n");
+	case ASB_DISCARD_ZERO_CHG:
+		if (ch_peer == 0 && ch_self == 0) {
+			rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+				? -1 : 1;
+			break;
+		} else {
+			if (ch_peer == 0) { rv =  1; break; }
+			if (ch_self == 0) { rv = -1; break; }
+		}
+		if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
+			break;
+	case ASB_DISCARD_LEAST_CHG:
+		if	(ch_self < ch_peer)
+			rv = -1;
+		else if (ch_self > ch_peer)
+			rv =  1;
+		else /* ( ch_self == ch_peer ) */
+		     /* Well, then use something else. */
+			rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+				? -1 : 1;
+		break;
+	case ASB_DISCARD_LOCAL:
+		rv = -1;
+		break;
+	case ASB_DISCARD_REMOTE:
+		rv =  1;
+	}
+
+	return rv;
+}
+
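+/* Example (illustrative): with after-sb-0pri set to discard-least-changes,
+ * a node that set 100 bits in its bitmap (ch_self) against a peer
+ * reporting 2000 (ch_peer) returns -1: the local modifications are
+ * discarded and this node becomes sync target. */
+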
+STATIC int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
+{
+	int hg, rv = -100;
+
+	switch (mdev->net_conf->after_sb_1p) {
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+		dev_err(DEV, "Configuration error.\n");
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_CONSENSUS:
+		hg = drbd_asb_recover_0p(mdev);
+		if (hg == -1 && mdev->state.role == R_SECONDARY)
+			rv = hg;
+		if (hg == 1  && mdev->state.role == R_PRIMARY)
+			rv = hg;
+		break;
+	case ASB_VIOLENTLY:
+		rv = drbd_asb_recover_0p(mdev);
+		break;
+	case ASB_DISCARD_SECONDARY:
+		return mdev->state.role == R_PRIMARY ? 1 : -1;
+	case ASB_CALL_HELPER:
+		hg = drbd_asb_recover_0p(mdev);
+		if (hg == -1 && mdev->state.role == R_PRIMARY) {
+			enum drbd_state_rv rv2;
+
+			drbd_set_role(mdev, R_SECONDARY, 0);
+			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we do not need to wait for the after state change work either. */
+			rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (rv2 != SS_SUCCESS) {
+				drbd_khelper(mdev, "pri-lost-after-sb");
+			} else {
+				dev_warn(DEV, "Successfully gave up primary role.\n");
+				rv = hg;
+			}
+		} else
+			rv = hg;
+	}
+
+	return rv;
+}
+
+STATIC int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
+{
+	int hg, rv = -100;
+
+	switch (mdev->net_conf->after_sb_2p) {
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+		dev_err(DEV, "Configuration error.\n");
+		break;
+	case ASB_VIOLENTLY:
+		rv = drbd_asb_recover_0p(mdev);
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_CALL_HELPER:
+		hg = drbd_asb_recover_0p(mdev);
+		if (hg == -1) {
+			enum drbd_state_rv rv2;
+
+			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we do not need to wait for the after state change work either. */
+			rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (rv2 != SS_SUCCESS) {
+				drbd_khelper(mdev, "pri-lost-after-sb");
+			} else {
+				dev_warn(DEV, "Successfully gave up primary role.\n");
+				rv = hg;
+			}
+		} else
+			rv = hg;
+	}
+
+	return rv;
+}
+
+STATIC void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
+			   u64 bits, u64 flags)
+{
+	if (!uuid) {
+		dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
+		return;
+	}
+	dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+	     text,
+	     (unsigned long long)uuid[UI_CURRENT],
+	     (unsigned long long)uuid[UI_BITMAP],
+	     (unsigned long long)uuid[UI_HISTORY_START],
+	     (unsigned long long)uuid[UI_HISTORY_END],
+	     (unsigned long long)bits,
+	     (unsigned long long)flags);
+}
+
+/*
+  100	after split brain try auto recover
+    2	C_SYNC_SOURCE set BitMap
+    1	C_SYNC_SOURCE use BitMap
+    0	no Sync
+   -1	C_SYNC_TARGET use BitMap
+   -2	C_SYNC_TARGET set BitMap
+ -100	after split brain, disconnect
+-1000	unrelated data
+-1091   requires proto 91
+-1096   requires proto 96
+ */
+STATIC int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
+{
+	u64 self, peer;
+	int i, j;
+
+	self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+	peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+
+	*rule_nr = 10;
+	if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
+		return 0;
+
+	*rule_nr = 20;
+	if ((self == UUID_JUST_CREATED || self == (u64)0) &&
+	     peer != UUID_JUST_CREATED)
+		return -2;
+
+	*rule_nr = 30;
+	if (self != UUID_JUST_CREATED &&
+	    (peer == UUID_JUST_CREATED || peer == (u64)0))
+		return 2;
+
+	if (self == peer) {
+		int rct, dc; /* roles at crash time */
+
+		if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+
+			if (mdev->agreed_pro_version < 91)
+				return -1091;
+
+			if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
+			    (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
+				dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
+				drbd_uuid_set_bm(mdev, 0UL);
+
+				drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
+					       mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+				*rule_nr = 34;
+			} else {
+				dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
+				*rule_nr = 36;
+			}
+
+			return 1;
+		}
+
+		if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
+
+			if (mdev->agreed_pro_version < 91)
+				return -1091;
+
+			if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
+			    (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
+				dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+
+				mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
+				mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
+				mdev->p_uuid[UI_BITMAP] = 0UL;
+
+				drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+				*rule_nr = 35;
+			} else {
+				dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
+				*rule_nr = 37;
+			}
+
+			return -1;
+		}
+
+		/* Common power [off|failure] */
+		rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
+			(mdev->p_uuid[UI_FLAGS] & 2);
+		/* lowest bit is set when we were primary,
+		 * next bit (weight 2) is set when peer was primary */
+		*rule_nr = 40;
+
+		switch (rct) {
+		case 0: /* !self_pri && !peer_pri */ return 0;
+		case 1: /*  self_pri && !peer_pri */ return 1;
+		case 2: /* !self_pri &&  peer_pri */ return -1;
+		case 3: /*  self_pri &&  peer_pri */
+			dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+			return dc ? -1 : 1;
+		}
+	}
+
+	*rule_nr = 50;
+	peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer)
+		return -1;
+
+	*rule_nr = 51;
+	peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		if (mdev->agreed_pro_version < 96 ?
+		    (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
+		    (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
+		    peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
+			/* The last P_SYNC_UUID did not get through. Undo the
+			   UUID modifications the peer made when it last started
+			   a resync as sync source. */
+
+			if (mdev->agreed_pro_version < 91)
+				return -1091;
+
+			mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
+			mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
+
+			dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
+			drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+
+			return -1;
+		}
+	}
+
+	*rule_nr = 60;
+	self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		peer = mdev->p_uuid[i] & ~((u64)1);
+		if (self == peer)
+			return -2;
+	}
+
+	*rule_nr = 70;
+	self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+	if (self == peer)
+		return 1;
+
+	*rule_nr = 71;
+	self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		if (mdev->agreed_pro_version < 96 ?
+		    (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
+		    (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
+		    self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
+			/* The last P_SYNC_UUID did not get through. Undo the
+			   UUID modifications we made when we last started a
+			   resync as sync source. */
+
+			if (mdev->agreed_pro_version < 91)
+				return -1091;
+
+			_drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
+			_drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
+
+			dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
+			drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
+				       mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+
+			return 1;
+		}
+	}
+
+
+	*rule_nr = 80;
+	peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = mdev->ldev->md.uuid[i] & ~((u64)1);
+		if (self == peer)
+			return 2;
+	}
+
+	*rule_nr = 90;
+	self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer && self != ((u64)0))
+		return 100;
+
+	*rule_nr = 100;
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = mdev->ldev->md.uuid[i] & ~((u64)1);
+		for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
+			peer = mdev->p_uuid[j] & ~((u64)1);
+			if (self == peer)
+				return -100;
+		}
+	}
+
+	return -1000;
+}
+
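+/* Example for rule 40 (illustrative): after a common power failure with
+ * identical current UUIDs, rct adds "I was primary" (CRASHED_PRIMARY,
+ * weight 1) and "peer was primary" (UI_FLAGS bit, weight 2): rct == 1
+ * makes this node sync source, rct == 2 sync target, and rct == 3 (both
+ * were primary) lets the DISCARD_CONCURRENT tie-breaker decide. */
+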
+/* drbd_sync_handshake() returns the new conn state on success, or
+   C_MASK (-1) on failure.
+ */
+STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
+					   enum drbd_disk_state peer_disk) __must_hold(local)
+{
+	int hg, rule_nr;
+	enum drbd_conns rv = C_MASK;
+	enum drbd_disk_state mydisk;
+
+	mydisk = mdev->state.disk;
+	if (mydisk == D_NEGOTIATING)
+		mydisk = mdev->new_state_tmp.disk;
+
+	dev_info(DEV, "drbd_sync_handshake:\n");
+	drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
+	drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
+		       mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+
+	hg = drbd_uuid_compare(mdev, &rule_nr);
+
+	dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+
+	if (hg == -1000) {
+		dev_alert(DEV, "Unrelated data, aborting!\n");
+		return C_MASK;
+	}
+	if (hg < -1000) {
+		dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
+		return C_MASK;
+	}
+
+	if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
+	    (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
+		int f = (hg == -100) || abs(hg) == 2;
+		hg = mydisk > D_INCONSISTENT ? 1 : -1;
+		if (f)
+			hg = hg*2;
+		dev_info(DEV, "Becoming sync %s due to disk states.\n",
+		     hg > 0 ? "source" : "target");
+	}
+
+	if (abs(hg) == 100)
+		drbd_khelper(mdev, "initial-split-brain");
+
+	if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
+		int pcount = (mdev->state.role == R_PRIMARY)
+			   + (peer_role == R_PRIMARY);
+		int forced = (hg == -100);
+
+		switch (pcount) {
+		case 0:
+			hg = drbd_asb_recover_0p(mdev);
+			break;
+		case 1:
+			hg = drbd_asb_recover_1p(mdev);
+			break;
+		case 2:
+			hg = drbd_asb_recover_2p(mdev);
+			break;
+		}
+		if (abs(hg) < 100) {
+			dev_warn(DEV, "Split-Brain detected, %d primaries, "
+			     "automatically solved. Sync from %s node\n",
+			     pcount, (hg < 0) ? "peer" : "this");
+			if (forced) {
+				dev_warn(DEV, "Doing a full sync, since"
+				     " UUIDs where ambiguous.\n");
+				hg = hg*2;
+			}
+		}
+	}
+
+	if (hg == -100) {
+		if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
+			hg = -1;
+		if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
+			hg = 1;
+
+		if (abs(hg) < 100)
+			dev_warn(DEV, "Split-Brain detected, manually solved. "
+			     "Sync from %s node\n",
+			     (hg < 0) ? "peer" : "this");
+	}
+
+	if (hg == -100) {
+		/* FIXME this log message is not correct if we end up here
+		 * after an attempted attach on a diskless node.
+		 * We just refuse to attach -- well, we drop the "connection"
+		 * to that disk, in a way... */
+		dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
+		drbd_khelper(mdev, "split-brain");
+		return C_MASK;
+	}
+
+	if (hg > 0 && mydisk <= D_INCONSISTENT) {
+		dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
+		return C_MASK;
+	}
+
+	if (hg < 0 && /* by intention we do not use mydisk here. */
+	    mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
+		switch (mdev->net_conf->rr_conflict) {
+		case ASB_CALL_HELPER:
+			drbd_khelper(mdev, "pri-lost");
+			/* fall through */
+		case ASB_DISCONNECT:
+			dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
+			return C_MASK;
+		case ASB_VIOLENTLY:
+			dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
+			     "assumption\n");
+		}
+	}
+
+	if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) {
+		if (hg == 0)
+			dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
+		else
+			dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
+				 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
+				 abs(hg) >= 2 ? "full" : "bit-map based");
+		return C_MASK;
+	}
+
+	if (abs(hg) >= 2) {
+		dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
+					BM_LOCKED_SET_ALLOWED))
+			return C_MASK;
+	}
+
+	if (hg > 0) { /* become sync source. */
+		rv = C_WF_BITMAP_S;
+	} else if (hg < 0) { /* become sync target */
+		rv = C_WF_BITMAP_T;
+	} else {
+		rv = C_CONNECTED;
+		if (drbd_bm_total_weight(mdev)) {
+			dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
+			     drbd_bm_total_weight(mdev));
+		}
+	}
+
+	return rv;
+}
+
+/* returns 1 if invalid */
+STATIC int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
+{
+	/* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
+	if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
+	    (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
+		return 0;
+
+	/* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
+	if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
+	    self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
+		return 1;
+
+	/* everything else is valid if they are equal on both sides. */
+	if (peer == self)
+		return 0;
+
+	/* everything else is invalid. */
+	return 1;
+}
+
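+/* Truth-table sketch (illustrative): the asymmetric pair discard-remote
+ * on one side and discard-local on the other is the only accepted
+ * combination involving those two policies; every other use of them is
+ * rejected, and the remaining policies are valid only when both sides
+ * agree. */
+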
+STATIC int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct p_protocol *p = &mdev->data.rbuf.protocol;
+	int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+	int p_want_lose, p_two_primaries, cf;
+	char p_integrity_alg[SHARED_SECRET_MAX] = "";
+
+	p_proto		= be32_to_cpu(p->protocol);
+	p_after_sb_0p	= be32_to_cpu(p->after_sb_0p);
+	p_after_sb_1p	= be32_to_cpu(p->after_sb_1p);
+	p_after_sb_2p	= be32_to_cpu(p->after_sb_2p);
+	p_two_primaries = be32_to_cpu(p->two_primaries);
+	cf		= be32_to_cpu(p->conn_flags);
+	p_want_lose = cf & CF_WANT_LOSE;
+
+	clear_bit(CONN_DRY_RUN, &mdev->flags);
+
+	if (cf & CF_DRY_RUN)
+		set_bit(CONN_DRY_RUN, &mdev->flags);
+
+	if (p_proto != mdev->net_conf->wire_protocol) {
+		dev_err(DEV, "incompatible communication protocols\n");
+		goto disconnect;
+	}
+
+	if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
+		dev_err(DEV, "incompatible after-sb-0pri settings\n");
+		goto disconnect;
+	}
+
+	if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
+		dev_err(DEV, "incompatible after-sb-1pri settings\n");
+		goto disconnect;
+	}
+
+	if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
+		dev_err(DEV, "incompatible after-sb-2pri settings\n");
+		goto disconnect;
+	}
+
+	if (p_want_lose && mdev->net_conf->want_lose) {
+		dev_err(DEV, "both sides have the 'want_lose' flag set\n");
+		goto disconnect;
+	}
+
+	if (p_two_primaries != mdev->net_conf->two_primaries) {
+		dev_err(DEV, "incompatible setting of the two-primaries options\n");
+		goto disconnect;
+	}
+
+	if (mdev->agreed_pro_version >= 87) {
+		unsigned char *my_alg = mdev->net_conf->integrity_alg;
+
+		if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
+			return false;
+
+		p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
+		if (strcmp(p_integrity_alg, my_alg)) {
+			dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
+			goto disconnect;
+		}
+		dev_info(DEV, "data-integrity-alg: %s\n",
+		     my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
+	}
+
+	return true;
+
+disconnect:
+	drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	return false;
+}
+
+/* helper function
+ * input: alg name, feature name
+ * return: NULL (alg name was "")
+ *         ERR_PTR(error) if something goes wrong
+ *         or the crypto hash ptr, if it worked out ok. */
+struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
+		const char *alg, const char *name)
+{
+	struct crypto_hash *tfm;
+
+	if (!alg[0])
+		return NULL;
+
+	tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm)) {
+		dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
+			alg, name, PTR_ERR(tfm));
+		return tfm;
+	}
+	if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
+		crypto_free_hash(tfm);
+		dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
+		return ERR_PTR(-EINVAL);
+	}
+	return tfm;
+}
+
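+/* Typical call site (illustrative, "sha1" is just an example name):
+ * tfm = drbd_crypto_alloc_digest_safe(mdev, "sha1", "verify-alg") hands
+ * back a usable hash transform, NULL when the configured name is empty,
+ * or an ERR_PTR; callers therefore check IS_ERR() and treat NULL as
+ * "feature disabled". */
+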
+STATIC int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
+{
+	int ok = true;
+	struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
+	unsigned int header_size, data_size, exp_max_sz;
+	struct crypto_hash *verify_tfm = NULL;
+	struct crypto_hash *csums_tfm = NULL;
+	const int apv = mdev->agreed_pro_version;
+	int *rs_plan_s = NULL;
+	int fifo_size = 0;
+
+	exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
+		    : apv == 88 ? sizeof(struct p_rs_param)
+					+ SHARED_SECRET_MAX
+		    : apv <= 94 ? sizeof(struct p_rs_param_89)
+		    : /* apv >= 95 */ sizeof(struct p_rs_param_95);
+
+	if (packet_size > exp_max_sz) {
+		dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
+		    packet_size, exp_max_sz);
+		return false;
+	}
+
+	if (apv <= 88) {
+		header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
+		data_size   = packet_size  - header_size;
+	} else if (apv <= 94) {
+		header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
+		data_size   = packet_size  - header_size;
+		D_ASSERT(data_size == 0);
+	} else {
+		header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
+		data_size   = packet_size  - header_size;
+		D_ASSERT(data_size == 0);
+	}
+
+	/* initialize verify_alg and csums_alg */
+	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+	if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
+		return false;
+
+	mdev->sync_conf.rate	  = be32_to_cpu(p->rate);
+
+	if (apv >= 88) {
+		if (apv == 88) {
+			if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+				dev_err(DEV, "verify-alg of wrong size, "
+					"peer wants %u, accepting only up to %u byte\n",
+					data_size, SHARED_SECRET_MAX);
+				return false;
+			}
+
+			if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
+				return false;
+
+			/* we expect NUL terminated string */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(p->verify_alg[data_size-1] == 0);
+			p->verify_alg[data_size-1] = 0;
+
+		} else /* apv >= 89 */ {
+			/* we still expect NUL terminated strings */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+			D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+			p->verify_alg[SHARED_SECRET_MAX-1] = 0;
+			p->csums_alg[SHARED_SECRET_MAX-1] = 0;
+		}
+
+		if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
+			if (mdev->state.conn == C_WF_REPORT_PARAMS) {
+				dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    mdev->sync_conf.verify_alg, p->verify_alg);
+				goto disconnect;
+			}
+			verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
+					p->verify_alg, "verify-alg");
+			if (IS_ERR(verify_tfm)) {
+				verify_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+		if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
+			if (mdev->state.conn == C_WF_REPORT_PARAMS) {
+				dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    mdev->sync_conf.csums_alg, p->csums_alg);
+				goto disconnect;
+			}
+			csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
+					p->csums_alg, "csums-alg");
+			if (IS_ERR(csums_tfm)) {
+				csums_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+		if (apv > 94) {
+			mdev->sync_conf.rate	  = be32_to_cpu(p->rate);
+			mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+			mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
+			mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
+			mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
+
+			fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+			if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
+				rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
+				if (!rs_plan_s) {
+					dev_err(DEV, "kmalloc of fifo_buffer failed");
+					goto disconnect;
+				}
+			}
+		}
+
+		spin_lock(&mdev->peer_seq_lock);
+		/* lock against drbd_nl_syncer_conf() */
+		if (verify_tfm) {
+			strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
+			mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
+			crypto_free_hash(mdev->verify_tfm);
+			mdev->verify_tfm = verify_tfm;
+			dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
+		}
+		if (csums_tfm) {
+			strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
+			mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
+			crypto_free_hash(mdev->csums_tfm);
+			mdev->csums_tfm = csums_tfm;
+			dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
+		}
+		if (fifo_size != mdev->rs_plan_s.size) {
+			kfree(mdev->rs_plan_s.values);
+			mdev->rs_plan_s.values = rs_plan_s;
+			mdev->rs_plan_s.size   = fifo_size;
+			mdev->rs_planed = 0;
+		}
+		spin_unlock(&mdev->peer_seq_lock);
+	}
+
+	return ok;
+disconnect:
+	/* just for completeness: actually not needed,
+	 * as this is not reached if csums_tfm was ok. */
+	crypto_free_hash(csums_tfm);
+	/* but free the verify_tfm again, if csums_tfm did not work out */
+	crypto_free_hash(verify_tfm);
+	drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	return false;
+}
+
+STATIC void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
+{
+	/* sorry, we currently have no working implementation
+	 * of distributed TCQ */
+}
+
+/* warn if the arguments differ by more than 12.5% */
+static void warn_if_differ_considerably(struct drbd_conf *mdev,
+	const char *s, sector_t a, sector_t b)
+{
+	sector_t d;
+	if (a == 0 || b == 0)
+		return;
+	d = (a > b) ? (a - b) : (b - a);
+	if (d > (a>>3) || d > (b>>3))
+		dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
+		     (unsigned long long)a, (unsigned long long)b);
+}
+
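+/* Example (illustrative): a == 1000 and b == 800 sectors differ by
+ * d == 200, which exceeds b>>3 == 100 (12.5%), so the warning fires;
+ * 1000 vs. 900 (d == 100, thresholds 125 and 112) stays quiet. */
+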
+STATIC int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct p_sizes *p = &mdev->data.rbuf.sizes;
+	enum determine_dev_size dd = unchanged;
+	sector_t p_size, p_usize, my_usize;
+	int ldsc = 0; /* local disk size changed */
+	enum dds_flags ddsf;
+
+	p_size = be64_to_cpu(p->d_size);
+	p_usize = be64_to_cpu(p->u_size);
+
+	if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
+		dev_err(DEV, "some backing storage is needed\n");
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		return false;
+	}
+
+	/* just store the peer's disk size for now.
+	 * we still need to figure out whether we accept that. */
+	mdev->p_size = p_size;
+
+	if (get_ldev(mdev)) {
+		warn_if_differ_considerably(mdev, "lower level device sizes",
+			   p_size, drbd_get_max_capacity(mdev->ldev));
+		warn_if_differ_considerably(mdev, "user requested size",
+					    p_usize, mdev->ldev->dc.disk_size);
+
+		/* if this is the first connect, or an otherwise expected
+		 * param exchange, choose the minimum */
+		if (mdev->state.conn == C_WF_REPORT_PARAMS)
+			p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
+					     p_usize);
+
+		my_usize = mdev->ldev->dc.disk_size;
+
+		if (mdev->ldev->dc.disk_size != p_usize) {
+			mdev->ldev->dc.disk_size = p_usize;
+			dev_info(DEV, "Peer sets u_size to %lu sectors\n",
+			     (unsigned long)mdev->ldev->dc.disk_size);
+		}
+
+		/* Never shrink a device with usable data during connect.
+		   But allow online shrinking if we are connected. */
+		if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
+		   drbd_get_capacity(mdev->this_bdev) &&
+		   mdev->state.disk >= D_OUTDATED &&
+		   mdev->state.conn < C_CONNECTED) {
+			dev_err(DEV, "The peer's disk size is too small!\n");
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			mdev->ldev->dc.disk_size = my_usize;
+			put_ldev(mdev);
+			return false;
+		}
+		put_ldev(mdev);
+	}
+
+	ddsf = be16_to_cpu(p->dds_flags);
+	if (get_ldev(mdev)) {
+		dd = drbd_determine_dev_size(mdev, ddsf);
+		put_ldev(mdev);
+		if (dd == dev_size_error)
+			return false;
+		drbd_md_sync(mdev);
+	} else {
+		/* I am diskless, need to accept the peer's size. */
+		drbd_set_my_capacity(mdev, p_size);
+	}
+
+	mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
+	drbd_reconsider_max_bio_size(mdev);
+
+	if (get_ldev(mdev)) {
+		if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
+			mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+			ldsc = 1;
+		}
+
+		put_ldev(mdev);
+	}
+
+	if (mdev->state.conn > C_WF_REPORT_PARAMS) {
+		if (be64_to_cpu(p->c_size) !=
+		    drbd_get_capacity(mdev->this_bdev) || ldsc) {
+			/* we have different sizes, probably the peer
+			 * needs to know my new size... */
+			drbd_send_sizes(mdev, 0, ddsf);
+		}
+		if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
+		    (dd == grew && mdev->state.conn == C_CONNECTED)) {
+			if (mdev->state.pdsk >= D_INCONSISTENT &&
+			    mdev->state.disk >= D_INCONSISTENT) {
+				if (ddsf & DDSF_NO_RESYNC)
+					dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
+				else
+					resync_after_online_grow(mdev);
+			} else
+				set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+		}
+	}
+
+	return true;
+}
+
+STATIC int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct p_uuids *p = &mdev->data.rbuf.uuids;
+	u64 *p_uuid;
+	int i, updated_uuids = 0;
+
+	p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
+	if (!p_uuid) {
+		dev_err(DEV, "kmalloc of p_uuid failed\n");
+		return false;
+	}
+
+	for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
+		p_uuid[i] = be64_to_cpu(p->uuid[i]);
+
+	kfree(mdev->p_uuid);
+	mdev->p_uuid = p_uuid;
+
+	if (mdev->state.conn < C_CONNECTED &&
+	    mdev->state.disk < D_INCONSISTENT &&
+	    mdev->state.role == R_PRIMARY &&
+	    (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+		dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
+		    (unsigned long long)mdev->ed_uuid);
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		return false;
+	}
+
+	if (get_ldev(mdev)) {
+		int skip_initial_sync =
+			mdev->state.conn == C_CONNECTED &&
+			mdev->agreed_pro_version >= 90 &&
+			mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
+			(p_uuid[UI_FLAGS] & 8);
+		if (skip_initial_sync) {
+			dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
+			drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
+					"clear_n_write from receive_uuids",
+					BM_LOCKED_TEST_ALLOWED);
+			_drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
+			_drbd_uuid_set(mdev, UI_BITMAP, 0);
+			_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			drbd_md_sync(mdev);
+			updated_uuids = 1;
+		}
+		put_ldev(mdev);
+	} else if (mdev->state.disk < D_INCONSISTENT &&
+		   mdev->state.role == R_PRIMARY) {
+		/* I am a diskless primary, the peer just created a new current UUID
+		   for me. */
+		updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
+	}
+
+	/* Before we test for the disk state, we should wait until a possibly
+	   ongoing cluster-wide state change has finished. That is important if
+	   we are primary and are detaching from our disk. We need to see the
+	   new disk state... */
+	wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
+	if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
+		updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
+
+	if (updated_uuids)
+		drbd_print_uuids(mdev, "receiver updated UUIDs to");
+
+	return true;
+}
+
+/**
+ * convert_state() - Converts the peer's view of the cluster state to our point of view
+ * @ps:		The state as seen by the peer.
+ */
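+/* Illustration (not from the wire): if the peer reports
+ * { role=Primary, peer=Secondary, disk=UpToDate, pdsk=Inconsistent },
+ * our view of the very same situation is
+ * { role=Secondary, peer=Primary, disk=Inconsistent, pdsk=UpToDate },
+ * i.e. role/peer and disk/pdsk are swapped, and asymmetric connection
+ * states are mapped to their counterpart via c_tab below. */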
+STATIC union drbd_state convert_state(union drbd_state ps)
+{
+	union drbd_state ms;
+
+	static enum drbd_conns c_tab[] = {
+		[C_CONNECTED] = C_CONNECTED,
+
+		[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
+		[C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
+		[C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
+		[C_VERIFY_S]       = C_VERIFY_T,
+		[C_MASK]   = C_MASK,
+	};
+
+	ms.i = ps.i;
+
+	ms.conn = c_tab[ps.conn];
+	ms.peer = ps.role;
+	ms.role = ps.peer;
+	ms.pdsk = ps.disk;
+	ms.disk = ps.pdsk;
+	ms.peer_isp = (ps.aftr_isp | ps.user_isp);
+
+	return ms;
+}
+
+STATIC int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct p_req_state *p = &mdev->data.rbuf.req_state;
+	union drbd_state mask, val;
+	enum drbd_state_rv rv;
+
+	mask.i = be32_to_cpu(p->mask);
+	val.i = be32_to_cpu(p->val);
+
+	if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
+	    test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
+		drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
+		return true;
+	}
+
+	mask = convert_state(mask);
+	val = convert_state(val);
+
+	DRBD_STATE_DEBUG_INIT_VAL(val);
+	rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
+
+	drbd_send_sr_reply(mdev, rv);
+	drbd_md_sync(mdev);
+
+	return true;
+}
+
+STATIC int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct p_state *p = &mdev->data.rbuf.state;
+	union drbd_state os, ns, peer_state;
+	enum drbd_disk_state real_peer_disk;
+	enum chg_state_flags cs_flags;
+	int rv;
+
+	peer_state.i = be32_to_cpu(p->state);
+
+	/* maybe we should always send a state sequence number with the state
+	 * packet, so we can more easily correlate with the sending side? */
+	drbd_state_dbg(mdev, 0, __func__, __LINE__, "recv", peer_state);
+
+	real_peer_disk = peer_state.disk;
+	if (peer_state.disk == D_NEGOTIATING) {
+		real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
+		dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+ retry:
+	os = ns = mdev->state;
+	spin_unlock_irq(&mdev->req_lock);
+
+	/* If some other part of the code (asender thread, timeout)
+	 * already decided to close the connection again,
+	 * we must not "re-establish" it here. */
+	if (os.conn <= C_TEAR_DOWN)
+		return false;
+
+	/* If this is the "end of sync" confirmation, usually the peer disk
+	 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+	 * set) resync started in PausedSyncT, or if the timing of pause-/
+	 * unpause-sync events has been "just right", the peer disk may
+	 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+	 */
+	if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+	    real_peer_disk == D_UP_TO_DATE &&
+	    os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+		/* If we are (becoming) SyncSource, but peer is still in sync
+		 * preparation, ignore its uptodate-ness to avoid flapping, it
+		 * will change to inconsistent once the peer reaches active
+		 * syncing states.
+		 * It may have changed syncer-paused flags, however, so we
+		 * cannot ignore this completely. */
+		if (peer_state.conn > C_CONNECTED &&
+		    peer_state.conn < C_SYNC_SOURCE)
+			real_peer_disk = D_INCONSISTENT;
+
+		/* if peer_state changes to connected at the same time,
+		 * it explicitly notifies us that it finished resync.
+		 * Maybe we should finish it up, too? */
+		else if (os.conn >= C_SYNC_SOURCE &&
+			 peer_state.conn == C_CONNECTED) {
+			if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
+				drbd_resync_finished(mdev);
+			return true;
+		}
+	}
+
+	/* peer says its disk is inconsistent, while we think it is uptodate,
+	 * and this happens while the peer still thinks we have a sync going on,
+	 * but we think we are already done with the sync.
+	 * We ignore this to avoid flapping pdsk.
+	 * This should not happen if the peer is a recent version of drbd. */
+	if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+	    os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+		real_peer_disk = D_UP_TO_DATE;
+
+	if (ns.conn == C_WF_REPORT_PARAMS)
+		ns.conn = C_CONNECTED;
+
+	if (peer_state.conn == C_AHEAD)
+		ns.conn = C_BEHIND;
+
+	if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
+	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		int cr; /* consider resync */
+
+		/* if we established a new connection */
+		cr  = (os.conn < C_CONNECTED);
+		/* if we had an established connection
+		 * and one of the nodes newly attaches a disk */
+		cr |= (os.conn == C_CONNECTED &&
+		       (peer_state.disk == D_NEGOTIATING ||
+			os.disk == D_NEGOTIATING));
+		/* if we have both been inconsistent, and the peer has been
+		 * forced to be UpToDate with --overwrite-data */
+		cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
+		/* if we had been plain connected, and the admin requested to
+		 * start a sync by "invalidate" or "invalidate-remote" */
+		cr |= (os.conn == C_CONNECTED &&
+				(peer_state.conn >= C_STARTING_SYNC_S &&
+				 peer_state.conn <= C_WF_BITMAP_T));
+
+		if (cr)
+			ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+
+		put_ldev(mdev);
+		if (ns.conn == C_MASK) {
+			ns.conn = C_CONNECTED;
+			if (mdev->state.disk == D_NEGOTIATING) {
+				drbd_force_state(mdev, NS(disk, D_FAILED));
+			} else if (peer_state.disk == D_NEGOTIATING) {
+				dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
+				peer_state.disk = D_DISKLESS;
+				real_peer_disk = D_DISKLESS;
+			} else {
+				if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
+					return false;
+				D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
+				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+				return false;
+			}
+		}
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.i != os.i)
+		goto retry;
+	clear_bit(CONSIDER_RESYNC, &mdev->flags);
+	ns.peer = peer_state.role;
+	ns.pdsk = real_peer_disk;
+	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
+	if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+		ns.disk = mdev->new_state_tmp.disk;
+	cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+	if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+	    test_bit(NEW_CUR_UUID, &mdev->flags)) {
+		/* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
+		   for temporary network outages! */
+		spin_unlock_irq(&mdev->req_lock);
+		dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+		tl_clear(mdev);
+		drbd_uuid_new_current(mdev);
+		clear_bit(NEW_CUR_UUID, &mdev->flags);
+		drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
+		return false;
+	}
+	DRBD_STATE_DEBUG_INIT_VAL(ns);
+	rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
+	ns.i = mdev->state.i;
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS) {
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		return false;
+	}
+
+	if (os.conn > C_WF_REPORT_PARAMS) {
+		if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+		    peer_state.disk != D_NEGOTIATING) {
+			/* we want resync, peer has not yet decided to sync... */
+			/* Nowadays only used when forcing a node into primary role and
+			   setting its disk to UpToDate with that */
+			drbd_send_uuids(mdev);
+			drbd_send_current_state(mdev);
+		}
+	}
+
+	mdev->net_conf->want_lose = 0;
+
+	drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
+
+	return true;
+}
+
+STATIC int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
+
+	wait_event(mdev->misc_wait,
+		   mdev->state.conn == C_WF_SYNC_UUID ||
+		   mdev->state.conn == C_BEHIND ||
+		   mdev->state.conn < C_CONNECTED ||
+		   mdev->state.disk < D_NEGOTIATING);
+
+	/* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
+
+	/* Here the _drbd_uuid_ functions are right, current should
+	   _not_ be rotated into the history */
+	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		_drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
+		_drbd_uuid_set(mdev, UI_BITMAP, 0UL);
+
+		drbd_print_uuids(mdev, "updated sync uuid");
+		drbd_start_resync(mdev, C_SYNC_TARGET);
+
+		put_ldev(mdev);
+	} else
+		dev_err(DEV, "Ignoring SyncUUID packet!\n");
+
+	return true;
+}
+
+/**
+ * receive_bitmap_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
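+/* The plain bitmap is streamed in consecutive P_BITMAP packets of at most
+ * BM_PACKET_WORDS longs each; c->word_offset tracks how far we got.
+ * (Illustration only: with bm_words == 2.5 * BM_PACKET_WORDS we would see
+ * two full packets followed by one short one.) */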
+static int
+receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
+		     unsigned long *buffer, struct bm_xfer_ctx *c)
+{
+	unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
+	unsigned want = num_words * sizeof(long);
+	int err;
+
+	if (want != data_size) {
+		dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
+		return -EIO;
+	}
+	if (want == 0)
+		return 0;
+	err = drbd_recv(mdev, buffer, want);
+	if (err != want) {
+		if (err >= 0)
+			err = -EIO;
+		return err;
+	}
+
+	drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
+
+	c->word_offset += num_words;
+	c->bit_offset = c->word_offset * BITS_PER_LONG;
+	if (c->bit_offset > c->bm_bits)
+		c->bit_offset = c->bm_bits;
+
+	return 1;
+}
+
+/**
+ * recv_bm_rle_bits
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
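+/* The payload is a bitstream of VLI-encoded run lengths. Runs alternate
+ * between clear and set bits, starting with DCBP_get_start(p).
+ * (Rough illustration, not actual wire bits: start=0 and run lengths
+ * 5, 3, 2 describe the bitmap 00000 111 00.) */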
+static int
+recv_bm_rle_bits(struct drbd_conf *mdev,
+		struct p_compressed_bm *p,
+		struct bm_xfer_ctx *c)
+{
+	struct bitstream bs;
+	u64 look_ahead;
+	u64 rl;
+	u64 tmp;
+	unsigned long s = c->bit_offset;
+	unsigned long e;
+	int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
+	int toggle = DCBP_get_start(p);
+	int have;
+	int bits;
+
+	bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
+
+	bits = bitstream_get_bits(&bs, &look_ahead, 64);
+	if (bits < 0)
+		return -EIO;
+
+	for (have = bits; have > 0; s += rl, toggle = !toggle) {
+		bits = vli_decode_bits(&rl, look_ahead);
+		if (bits <= 0)
+			return -EIO;
+
+		if (toggle) {
+			e = s + rl -1;
+			if (e >= c->bm_bits) {
+				dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+				return -EIO;
+			}
+			_drbd_bm_set_bits(mdev, s, e);
+		}
+
+		if (have < bits) {
+			dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+				have, bits, look_ahead,
+				(unsigned int)(bs.cur.b - p->code),
+				(unsigned int)bs.buf_len);
+			return -EIO;
+		}
+		look_ahead >>= bits;
+		have -= bits;
+
+		bits = bitstream_get_bits(&bs, &tmp, 64 - have);
+		if (bits < 0)
+			return -EIO;
+		look_ahead |= tmp << have;
+		have += bits;
+	}
+
+	c->bit_offset = s;
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	return (s != c->bm_bits);
+}
+
+/**
+ * decode_bitmap_c
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+decode_bitmap_c(struct drbd_conf *mdev,
+		struct p_compressed_bm *p,
+		struct bm_xfer_ctx *c)
+{
+	if (DCBP_get_code(p) == RLE_VLI_Bits)
+		return recv_bm_rle_bits(mdev, p, c);
+
+	/* other variants had been implemented for evaluation,
+	 * but have been dropped as this one turned out to be "best"
+	 * during all our tests. */
+
+	dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+	return -EIO;
+}
+
+void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+		const char *direction, struct bm_xfer_ctx *c)
+{
+	/* what would it take to transfer it "plaintext" */
+	unsigned plain = sizeof(struct p_header80) *
+		((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
+		+ c->bm_words * sizeof(long);
+	unsigned total = c->bytes[0] + c->bytes[1];
+	unsigned r;
+
+	/* total cannot be zero, but just in case: */
+	if (total == 0)
+		return;
+
+	/* don't report if not compressed */
+	if (total >= plain)
+		return;
+
+	/* total < plain. check for overflow, still */
+	r = (total > UINT_MAX/1000) ? (total / (plain/1000))
+		                    : (1000 * total / plain);
+
+	if (r > 1000)
+		r = 1000;
+
+	r = 1000 - r;
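+	/* e.g. plain=100000, total=25000: r = 250 permille transferred,
+	 * reported below as "compression: 75.0%" */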
+	dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+	     "total %u; compression: %u.%u%%\n",
+			direction,
+			c->bytes[1], c->packets[1],
+			c->bytes[0], c->packets[0],
+			total, r/10, r % 10);
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+   it does not matter whether we process it in 32 bit or 64 bit chunks,
+   as long as it is little endian. (Understand it as a byte stream,
+   beginning with the lowest byte...) If we used big endian, we would
+   have to process it from the highest address to the lowest in order
+   to be agnostic to the 32 vs 64 bit issue.
+
+   returns 0 on failure, 1 if we successfully received it. */
+STATIC int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct bm_xfer_ctx c;
+	void *buffer;
+	int err;
+	int ok = false;
+	struct p_header80 *h = &mdev->data.rbuf.header.h80;
+
+	drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
+	/* you are supposed to send additional out-of-sync information
+	 * if you actually set bits during this phase */
+
+	/* maybe we should use some per thread scratch page,
+	 * and allocate that during initial device creation? */
+	buffer	 = (unsigned long *) __get_free_page(GFP_NOIO);
+	if (!buffer) {
+		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+		goto out;
+	}
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(mdev),
+		.bm_words = drbd_bm_words(mdev),
+	};
+
+	for (;;) {
+		if (cmd == P_BITMAP) {
+			err = receive_bitmap_plain(mdev, data_size, buffer, &c);
+		} else if (cmd == P_COMPRESSED_BITMAP) {
+			/* MAYBE: sanity check that we speak proto >= 90,
+			 * and the feature is enabled! */
+			struct p_compressed_bm *p;
+
+			if (data_size > BM_PACKET_PAYLOAD_BYTES) {
+				dev_err(DEV, "ReportCBitmap packet too large\n");
+				goto out;
+			}
+			/* use the page buffer */
+			p = buffer;
+			memcpy(p, h, sizeof(*h));
+			if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
+				goto out;
+			if (data_size <= (sizeof(*p) - sizeof(p->head))) {
+				dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
+				goto out;
+			}
+			err = decode_bitmap_c(mdev, p, &c);
+		} else {
+			dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)\n", cmd);
+			goto out;
+		}
+
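+		/* cmd == P_BITMAP evaluates to 0 or 1: index 1 accumulates the
+		 * plain-bitmap stats, index 0 the compressed ones (matching
+		 * how INFO_bm_xfer_stats above reports them) */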
+		c.packets[cmd == P_BITMAP]++;
+		c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
+
+		if (err <= 0) {
+			if (err < 0)
+				goto out;
+			break;
+		}
+		if (!drbd_recv_header(mdev, &cmd, &data_size))
+			goto out;
+	}
+
+	INFO_bm_xfer_stats(mdev, "receive", &c);
+
+	if (mdev->state.conn == C_WF_BITMAP_T) {
+		enum drbd_state_rv rv;
+
+		ok = !drbd_send_bitmap(mdev);
+		if (!ok)
+			goto out;
+		/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
+		rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		D_ASSERT(rv == SS_SUCCESS);
+	} else if (mdev->state.conn != C_WF_BITMAP_S) {
+		/* admin may have requested C_DISCONNECTING,
+		 * other threads may have noticed network errors */
+		dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
+		    drbd_conn_str(mdev->state.conn));
+	}
+
+	ok = true;
+ out:
+	drbd_bm_unlock(mdev);
+	if (ok && mdev->state.conn == C_WF_BITMAP_S)
+		drbd_start_resync(mdev, C_SYNC_SOURCE);
+	free_page((unsigned long) buffer);
+	return ok;
+}
+
+STATIC int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	/* TODO zero copy sink :) */
+	static char sink[128];
+	int size, want, r;
+
+	dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
+		 cmd, data_size);
+
+	size = data_size;
+	while (size > 0) {
+		want = min_t(int, size, sizeof(sink));
+		r = drbd_recv(mdev, sink, want);
+		ERR_IF(r <= 0) break;
+		size -= r;
+	}
+	return size == 0;
+}
+
+STATIC int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	if (mdev->state.disk >= D_INCONSISTENT)
+		drbd_kick_lo(mdev);
+
+	/* Make sure we've acked all the TCP data associated
+	 * with the data requests being unplugged */
+	drbd_tcp_quickack(mdev->data.socket);
+
+	return true;
+}
+
+STATIC int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct p_block_desc *p = &mdev->data.rbuf.block_desc;
+
+	switch (mdev->state.conn) {
+	case C_WF_SYNC_UUID:
+	case C_WF_BITMAP_T:
+	case C_BEHIND:
+		break;
+	default:
+		dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
+				drbd_conn_str(mdev->state.conn));
+	}
+
+	drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
+
+	return true;
+}
+
+typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
+
+struct data_cmd {
+	int expect_payload;
+	size_t pkt_size;
+	drbd_cmd_handler_f function;
+};
+
+static struct data_cmd drbd_cmd_handler[] = {
+	[P_DATA]	    = { 1, sizeof(struct p_data), receive_Data },
+	[P_DATA_REPLY]	    = { 1, sizeof(struct p_data), receive_DataReply },
+	[P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
+	[P_BARRIER]	    = { 0, sizeof(struct p_barrier), receive_Barrier } ,
+	[P_BITMAP]	    = { 1, sizeof(struct p_header80), receive_bitmap } ,
+	[P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
+	[P_UNPLUG_REMOTE]   = { 0, sizeof(struct p_header80), receive_UnplugRemote },
+	[P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_SYNC_PARAM]	    = { 1, sizeof(struct p_header80), receive_SyncParam },
+	[P_SYNC_PARAM89]    = { 1, sizeof(struct p_header80), receive_SyncParam },
+	[P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_UUIDS]	    = { 0, sizeof(struct p_uuids), receive_uuids },
+	[P_SIZES]	    = { 0, sizeof(struct p_sizes), receive_sizes },
+	[P_STATE]	    = { 0, sizeof(struct p_state), receive_state },
+	[P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
+	[P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
+	[P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
+	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+	[P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
+	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
+	/* anything missing from this table is in
+	 * the asender_tbl, see get_asender_cmd */
+	[P_MAX_CMD]	    = { 0, 0, NULL },
+};
+
+/* All handler functions that expect a sub-header get that sub-header in
+   mdev->data.rbuf.header.head.payload.
+
+   Usually the callback can find the usual p_header in
+   mdev->data.rbuf.header.head, but it may not rely on that, since there
+   is also p_header95!
+ */
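+/* Hypothetical walk-through of the dispatch below: for P_SIZES,
+ * pkt_size is sizeof(struct p_sizes), so shs covers the fixed part of
+ * the packet past the common header; drbdd() reads that fixed part into
+ * the receive buffer itself, and receive_sizes() then only parses it,
+ * without doing its own recv() for it. */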
+
+STATIC void drbdd(struct drbd_conf *mdev)
+{
+	union p_header *header = &mdev->data.rbuf.header;
+	unsigned int packet_size;
+	enum drbd_packets cmd;
+	size_t shs; /* sub header size */
+	int rv;
+
+	while (get_t_state(&mdev->receiver) == Running) {
+		drbd_thread_current_set_cpu(mdev);
+		if (!drbd_recv_header(mdev, &cmd, &packet_size))
+			goto err_out;
+
+		if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
+			dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
+			goto err_out;
+		}
+
+		shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
+		if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
+			dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
+			goto err_out;
+		}
+
+		if (shs) {
+			rv = drbd_recv(mdev, &header->h80.payload, shs);
+			if (unlikely(rv != shs)) {
+				if (!signal_pending(current))
+					dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv);
+				goto err_out;
+			}
+		}
+
+		rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
+
+		if (unlikely(!rv)) {
+			dev_err(DEV, "error receiving %s, l: %d!\n",
+			    cmdname(cmd), packet_size);
+			goto err_out;
+		}
+
+		trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf,
+				__FILE__, __LINE__);
+	}
+
+	if (0) {
+	err_out:
+		drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+	}
+	/* If we leave here, we probably want to update at least the
+	 * "Connected" indicator on stable storage. Do so explicitly here. */
+	drbd_md_sync(mdev);
+}
+
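+/* Drain the data.work queue: queue a no-op barrier work item and wait for
+ * the worker to reach it; everything queued before it has then been run
+ * (w_prev_work_done presumably just completes barr.done). */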
+void drbd_flush_workqueue(struct drbd_conf *mdev)
+{
+	struct drbd_wq_barrier barr;
+
+	barr.w.cb = w_prev_work_done;
+	init_completion(&barr.done);
+	drbd_queue_work(&mdev->data.work, &barr.w);
+	wait_for_completion(&barr.done);
+}
+
+void drbd_free_tl_hash(struct drbd_conf *mdev)
+{
+	struct hlist_head *h;
+
+	spin_lock_irq(&mdev->req_lock);
+
+	if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
+		spin_unlock_irq(&mdev->req_lock);
+		return;
+	}
+	/* paranoia code */
+	for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
+		if (h->first)
+			dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
+				(int)(h - mdev->ee_hash), h->first);
+	kfree(mdev->ee_hash);
+	mdev->ee_hash = NULL;
+	mdev->ee_hash_s = 0;
+
+	/* paranoia code */
+	for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
+		if (h->first)
+			dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
+				(int)(h - mdev->tl_hash), h->first);
+	kfree(mdev->tl_hash);
+	mdev->tl_hash = NULL;
+	mdev->tl_hash_s = 0;
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+STATIC void drbd_disconnect(struct drbd_conf *mdev)
+{
+	enum drbd_fencing_p fp;
+	union drbd_state os, ns;
+	int rv = SS_UNKNOWN_ERROR;
+	unsigned int i;
+
+	if (mdev->state.conn == C_STANDALONE)
+		return;
+
+	/* We are about to start the cleanup after connection loss.
+	 * Make sure drbd_make_request knows about that.
+	 * Usually we should be in some network failure state already,
+	 * but just in case we are not, we fix it up here.
+	 */
+	drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+
+	/* asender does not clean up anything. it must not interfere, either */
+	drbd_thread_stop(&mdev->asender);
+	drbd_free_sock(mdev);
+
+	/* wait for current activity to cease. */
+	spin_lock_irq(&mdev->req_lock);
+	_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+	_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
+	_drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	/* We do not have data structures that would allow us to
+	 * get the rs_pending_cnt down to 0 again.
+	 *  * On C_SYNC_TARGET we do not have any data structures describing
+	 *    the pending RSDataRequest's we have sent.
+	 *  * On C_SYNC_SOURCE there is no data structure that tracks
+	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
+	 *  And no, it is not the sum of the reference counts in the
+	 *  resync_LRU. The resync_LRU tracks the whole operation including
+	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
+	 *  on the fly. */
+	drbd_rs_cancel_all(mdev);
+	mdev->rs_total = 0;
+	mdev->rs_failed = 0;
+	atomic_set(&mdev->rs_pending_cnt, 0);
+	wake_up(&mdev->misc_wait);
+
+	/* make sure syncer is stopped and w_resume_next_sg queued */
+	del_timer_sync(&mdev->resync_timer);
+	resync_timer_fn((unsigned long)mdev);
+
+	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
+	 * w_make_resync_request etc. which may still be on the worker queue
+	 * to be "canceled" */
+	drbd_flush_workqueue(mdev);
+
+	/* This also does reclaim_net_ee().  If we do this too early, we might
+	 * miss some resync ee and pages. */
+	drbd_process_done_ee(mdev);
+
+	kfree(mdev->p_uuid);
+	mdev->p_uuid = NULL;
+
+	if (!is_susp(mdev->state))
+		tl_clear(mdev);
+
+	dev_info(DEV, "Connection closed\n");
+
+	drbd_md_sync(mdev);
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
+		drbd_try_outdate_peer_async(mdev);
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+	if (os.conn >= C_UNCONNECTED) {
+		/* Do not restart in case we are C_DISCONNECTING */
+		ns.i = os.i;
+		ns.conn = C_UNCONNECTED;
+		DRBD_STATE_DEBUG_INIT_VAL(ns);
+		rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (os.conn == C_DISCONNECTING) {
+		wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
+
+		crypto_free_hash(mdev->cram_hmac_tfm);
+		mdev->cram_hmac_tfm = NULL;
+
+		kfree(mdev->net_conf);
+		mdev->net_conf = NULL;
+		drbd_request_state(mdev, NS(conn, C_STANDALONE));
+	}
+
+	/* serialize with bitmap writeout triggered by the state change,
+	 * if any. */
+	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+
+	/* tcp_close and release of sendpage pages can be deferred.  I don't
+	 * want to use SO_LINGER, because apparently it can be deferred for
+	 * more than 20 seconds (longest time I checked).
+	 *
+	 * Actually we don't care exactly when the network stack does its
+	 * put_page(), but release our reference on these pages right here.
+	 */
+	i = drbd_release_ee(mdev, &mdev->net_ee);
+	if (i)
+		dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
+	i = atomic_read(&mdev->pp_in_use_by_net);
+	if (i)
+		dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
+	i = atomic_read(&mdev->pp_in_use);
+	if (i)
+		dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
+
+	D_ASSERT(list_empty(&mdev->read_ee));
+	D_ASSERT(list_empty(&mdev->active_ee));
+	D_ASSERT(list_empty(&mdev->sync_ee));
+	D_ASSERT(list_empty(&mdev->done_ee));
+
+	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+	atomic_set(&mdev->current_epoch->epoch_size, 0);
+	D_ASSERT(list_empty(&mdev->current_epoch->list));
+}
+
+/*
+ * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
+ * we can agree on is stored in agreed_pro_version.
+ *
+ * feature flags and the reserved array should be enough room for future
+ * enhancements of the handshake protocol, and possible plugins...
+ *
+ * for now, they are expected to be zero, but ignored.
+ */
+STATIC int drbd_send_handshake(struct drbd_conf *mdev)
+{
+	/* ASSERT current == mdev->receiver ... */
+	struct p_handshake *p = &mdev->data.sbuf.handshake;
+	int ok;
+
+	if (mutex_lock_interruptible(&mdev->data.mutex)) {
+		dev_err(DEV, "interrupted during initial handshake\n");
+		return 0; /* interrupted. not ok. */
+	}
+
+	if (mdev->data.socket == NULL) {
+		mutex_unlock(&mdev->data.mutex);
+		return 0;
+	}
+
+	memset(p, 0, sizeof(*p));
+	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
+	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+	ok = _drbd_send_cmd(mdev, mdev->data.socket, P_HAND_SHAKE,
+			    (struct p_header80 *)p, sizeof(*p), 0);
+	mutex_unlock(&mdev->data.mutex);
+	return ok;
+}
+
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ */
+STATIC int drbd_do_handshake(struct drbd_conf *mdev)
+{
+	/* ASSERT current == mdev->receiver ... */
+	struct p_handshake *p = &mdev->data.rbuf.handshake;
+	const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
+	unsigned int length;
+	enum drbd_packets cmd;
+	int rv;
+
+	rv = drbd_send_handshake(mdev);
+	if (!rv)
+		return 0;
+
+	rv = drbd_recv_header(mdev, &cmd, &length);
+	if (!rv)
+		return 0;
+
+	if (cmd != P_HAND_SHAKE) {
+		dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
+		     cmdname(cmd), cmd);
+		return -1;
+	}
+
+	if (length != expect) {
+		dev_err(DEV, "expected HandShake length: %u, received: %u\n",
+		     expect, length);
+		return -1;
+	}
+
+	rv = drbd_recv(mdev, &p->head.payload, expect);
+
+	if (rv != expect) {
+		if (!signal_pending(current))
+			dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv);
+		return 0;
+	}
+
+	trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf,
+			__FILE__, __LINE__);
+
+	p->protocol_min = be32_to_cpu(p->protocol_min);
+	p->protocol_max = be32_to_cpu(p->protocol_max);
+	if (p->protocol_max == 0)
+		p->protocol_max = p->protocol_min;
+
+	if (PRO_VERSION_MAX < p->protocol_min ||
+	    PRO_VERSION_MIN > p->protocol_max)
+		goto incompat;
+
+	mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
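+	/* e.g. (made-up numbers) if we support 86..96 and the peer
+	 * announced 87..95, the ranges overlap and we agree on 95 */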
+
+	dev_info(DEV, "Handshake successful: "
+	     "Agreed network protocol version %d\n", mdev->agreed_pro_version);
+
+	return 1;
+
+ incompat:
+	dev_err(DEV, "incompatible DRBD dialects: "
+	    "I support %d-%d, peer supports %d-%d\n",
+	    PRO_VERSION_MIN, PRO_VERSION_MAX,
+	    p->protocol_min, p->protocol_max);
+	return -1;
+}
+
+#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
+STATIC int drbd_do_auth(struct drbd_conf *mdev)
+{
+	dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
+	dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+	return -1;
+}
+#else
+#define CHALLENGE_LEN 64
+
+/* Return value:
+	1 - auth succeeded,
+	0 - failed, try again (network error),
+	-1 - auth failed, don't try again.
+*/
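+/* Rough shape of the (symmetric) exchange implemented below:
+ *   send P_AUTH_CHALLENGE with my_challenge
+ *   recv the peer's challenge into peers_ch
+ *   send P_AUTH_RESPONSE with HMAC(shared_secret, peers_ch)
+ *   recv the peer's response and compare it against
+ *   HMAC(shared_secret, my_challenge)
+ */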
+
+STATIC int drbd_do_auth(struct drbd_conf *mdev)
+{
+	char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
+	struct scatterlist sg;
+	char *response = NULL;
+	char *right_response = NULL;
+	char *peers_ch = NULL;
+	unsigned int key_len = strlen(mdev->net_conf->shared_secret);
+	unsigned int resp_size;
+	struct hash_desc desc;
+	enum drbd_packets cmd;
+	unsigned int length;
+	int rv;
+
+	desc.tfm = mdev->cram_hmac_tfm;
+	desc.flags = 0;
+
+	rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
+				(u8 *)mdev->net_conf->shared_secret, key_len);
+	if (rv) {
+		dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
+		rv = -1;
+		goto fail;
+	}
+
+	get_random_bytes(my_challenge, CHALLENGE_LEN);
+
+	rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
+	if (!rv)
+		goto fail;
+
+	rv = drbd_recv_header(mdev, &cmd, &length);
+	if (!rv)
+		goto fail;
+
+	if (cmd != P_AUTH_CHALLENGE) {
+		dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+		    cmdname(cmd), cmd);
+		rv = 0;
+		goto fail;
+	}
+
+	if (length > CHALLENGE_LEN * 2) {
+		dev_err(DEV, "AuthChallenge payload too big.\n");
+		rv = -1;
+		goto fail;
+	}
+
+	peers_ch = kmalloc(length, GFP_NOIO);
+	if (peers_ch == NULL) {
+		dev_err(DEV, "kmalloc of peers_ch failed\n");
+		rv = -1;
+		goto fail;
+	}
+
+	rv = drbd_recv(mdev, peers_ch, length);
+
+	if (rv != length) {
+		if (!signal_pending(current))
+			dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
+	response = kmalloc(resp_size, GFP_NOIO);
+	if (response == NULL) {
+		dev_err(DEV, "kmalloc of response failed\n");
+		rv = -1;
+		goto fail;
+	}
+
+	sg_init_table(&sg, 1);
+	sg_set_buf(&sg, peers_ch, length);
+
+	rv = crypto_hash_digest(&desc, &sg, sg.length, response);
+	if (rv) {
+		dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+		rv = -1;
+		goto fail;
+	}
+
+	rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
+	if (!rv)
+		goto fail;
+
+	rv = drbd_recv_header(mdev, &cmd, &length);
+	if (!rv)
+		goto fail;
+
+	if (cmd != P_AUTH_RESPONSE) {
+		dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
+			cmdname(cmd), cmd);
+		rv = 0;
+		goto fail;
+	}
+
+	if (length != resp_size) {
+		dev_err(DEV, "AuthResponse payload has wrong size\n");
+		rv = 0;
+		goto fail;
+	}
+
+	rv = drbd_recv(mdev, response, resp_size);
+
+	if (rv != resp_size) {
+		if (!signal_pending(current))
+			dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	right_response = kmalloc(resp_size, GFP_NOIO);
+	if (right_response == NULL) {
+		dev_err(DEV, "kmalloc of right_response failed\n");
+		rv = -1;
+		goto fail;
+	}
+
+	sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
+
+	rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
+	if (rv) {
+		dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+		rv = -1;
+		goto fail;
+	}
+
+	rv = !memcmp(response, right_response, resp_size);
+
+	if (rv)
+		dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
+		     resp_size, mdev->net_conf->cram_hmac_alg);
+	else
+		rv = -1;
+
+ fail:
+	kfree(peers_ch);
+	kfree(response);
+	kfree(right_response);
+
+	return rv;
+}
+#endif
+
+int drbdd_init(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	unsigned int minor = mdev_to_minor(mdev);
+	int h;
+
+	sprintf(current->comm, "drbd%d_receiver", minor);
+
+	dev_info(DEV, "receiver (re)started\n");
+
+	do {
+		h = drbd_connect(mdev);
+		if (h == 0) {
+			drbd_disconnect(mdev);
+			schedule_timeout_interruptible(HZ);
+		}
+		if (h == -1) {
+			dev_warn(DEV, "Discarding network configuration.\n");
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		}
+	} while (h == 0);
+
+	if (h > 0) {
+		if (get_net_conf(mdev)) {
+			drbdd(mdev);
+			put_net_conf(mdev);
+		}
+	}
+
+	drbd_disconnect(mdev);
+
+	dev_info(DEV, "receiver terminated\n");
+	return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+STATIC int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	struct p_req_state_reply *p = (struct p_req_state_reply *)h;
+
+	int retcode = be32_to_cpu(p->retcode);
+
+	if (retcode >= SS_SUCCESS) {
+		set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
+	} else {
+		set_bit(CL_ST_CHG_FAIL, &mdev->flags);
+		dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
+		    drbd_set_st_err_str(retcode), retcode);
+	}
+	wake_up(&mdev->state_wait);
+
+	return true;
+}
+
+STATIC int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	return drbd_send_ping_ack(mdev);
+}
+
+STATIC int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	/* restore idle timeout */
+	mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+	if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
+		wake_up(&mdev->misc_wait);
+
+	return true;
+}
+
+STATIC int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+	int blksize = be32_to_cpu(p->blksize);
+
+	D_ASSERT(mdev->agreed_pro_version >= 89);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (get_ldev(mdev)) {
+		drbd_rs_complete_io(mdev, sector);
+		drbd_set_in_sync(mdev, sector, blksize);
+		/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+		mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+		put_ldev(mdev);
+	}
+	dec_rs_pending(mdev);
+	atomic_add(blksize >> 9, &mdev->rs_sect_in);
+
+	return true;
+}
+
+/* when we receive the ACK for a write request,
+ * verify that we actually know about it */
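+/* The block_id in the ACK is the value of the request pointer we sent out;
+ * we look it up in the transfer-log hash and cross-check the sector to
+ * guard against a stale or reused id. */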
+static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
+	u64 id, sector_t sector)
+{
+	struct hlist_head *slot = tl_hash_slot(mdev, sector);
+	struct hlist_node *n;
+	struct drbd_request *req;
+
+	hlist_for_each_entry(req, n, slot, collision) {
+		if ((unsigned long)req == (unsigned long)id) {
+			if (req->sector != sector) {
+				dev_err(DEV, "_ack_id_to_req: found req %p but it has "
+				    "wrong sector (%llus versus %llus)\n", req,
+				    (unsigned long long)req->sector,
+				    (unsigned long long)sector);
+				break;
+			}
+			return req;
+		}
+	}
+	return NULL;
+}
+
+typedef struct drbd_request *(req_validator_fn)
+	(struct drbd_conf *mdev, u64 id, sector_t sector);
+
+static int validate_req_change_req_state(struct drbd_conf *mdev,
+	u64 id, sector_t sector, req_validator_fn validator,
+	const char *func, enum drbd_req_event what)
+{
+	struct drbd_request *req;
+	struct bio_and_error m;
+
+	spin_lock_irq(&mdev->req_lock);
+	req = validator(mdev, id, sector);
+	if (unlikely(!req)) {
+		spin_unlock_irq(&mdev->req_lock);
+
+		dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func,
+			(void *)(unsigned long)id, (unsigned long long)sector);
+		return false;
+	}
+	__req_mod(req, what, &m);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+	return true;
+}
+
+STATIC int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+	int blksize = be32_to_cpu(p->blksize);
+	enum drbd_req_event what;
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (is_syncer_block_id(p->block_id)) {
+		drbd_set_in_sync(mdev, sector, blksize);
+		dec_rs_pending(mdev);
+		return true;
+	}
+	switch (be16_to_cpu(h->command)) {
+	case P_RS_WRITE_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		what = write_acked_by_peer_and_sis;
+		break;
+	case P_WRITE_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		what = write_acked_by_peer;
+		break;
+	case P_RECV_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
+		what = recv_acked_by_peer;
+		break;
+	case P_DISCARD_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		what = conflict_discarded_by_peer;
+		break;
+	default:
+		D_ASSERT(0);
+		return false;
+	}
+
+	return validate_req_change_req_state(mdev, p->block_id, sector,
+		_ack_id_to_req, __func__ , what);
+}
+
+STATIC int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+	int size = be32_to_cpu(p->blksize);
+	struct drbd_request *req;
+	struct bio_and_error m;
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (is_syncer_block_id(p->block_id)) {
+		dec_rs_pending(mdev);
+		drbd_rs_failed_io(mdev, sector, size);
+		return true;
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	req = _ack_id_to_req(mdev, p->block_id, sector);
+	if (!req) {
+		spin_unlock_irq(&mdev->req_lock);
+		if (mdev->net_conf->wire_protocol == DRBD_PROT_A ||
+		    mdev->net_conf->wire_protocol == DRBD_PROT_B) {
+			/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+			   The master bio might already be completed, therefore the
+			   request is no longer in the collision hash.
+			   => Do not try to validate block_id as request. */
+			/* In Protocol B we might already have got a P_RECV_ACK
+			   but then get a P_NEG_ACK afterwards. */
+			drbd_set_out_of_sync(mdev, sector, size);
+			return true;
+		} else {
+			dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__,
+				(void *)(unsigned long)p->block_id, (unsigned long long)sector);
+			return false;
+		}
+	}
+	__req_mod(req, neg_acked, &m);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+	return true;
+}
+
+STATIC int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+	dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
+	    (unsigned long long)sector, be32_to_cpu(p->blksize));
+
+	return validate_req_change_req_state(mdev, p->block_id, sector,
+		_ar_id_to_req, __func__ , neg_acked);
+}
+
+STATIC int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	sector_t sector;
+	int size;
+	struct p_block_ack *p = (struct p_block_ack *)h;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	dec_rs_pending(mdev);
+
+	if (get_ldev_if_state(mdev, D_FAILED)) {
+		drbd_rs_complete_io(mdev, sector);
+		switch (be16_to_cpu(h->command)) {
+		case P_NEG_RS_DREPLY:
+			drbd_rs_failed_io(mdev, sector, size);
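+			/* fall through */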
+		case P_RS_CANCEL:
+			break;
+		default:
+			D_ASSERT(0);
+			put_ldev(mdev);
+			return false;
+		}
+		put_ldev(mdev);
+	}
+
+	return true;
+}
+
+STATIC int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	struct p_barrier_ack *p = (struct p_barrier_ack *)h;
+
+	tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
+
+	if (mdev->state.conn == C_AHEAD &&
+	    atomic_read(&mdev->ap_in_flight) == 0 &&
+	    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
+		mdev->start_resync_timer.expires = jiffies + HZ;
+		add_timer(&mdev->start_resync_timer);
+	}
+
+	return true;
+}
+
+STATIC int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	struct drbd_work *w;
+	sector_t sector;
+	int size;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
+		drbd_ov_oos_found(mdev, sector, size);
+	else
+		ov_oos_print(mdev);
+
+	if (!get_ldev(mdev))
+		return true;
+
+	drbd_rs_complete_io(mdev, sector);
+	dec_rs_pending(mdev);
+
+	--mdev->ov_left;
+
+	/* let's advance progress step marks only for every other megabyte */
+	if ((mdev->ov_left & 0x200) == 0x200)
+		drbd_advance_rs_marks(mdev, mdev->ov_left);
+
+	if (mdev->ov_left == 0) {
+		w = kmalloc(sizeof(*w), GFP_NOIO);
+		if (w) {
+			w->cb = w_ov_finished;
+			drbd_queue_work_front(&mdev->data.work, w);
+		} else {
+			dev_err(DEV, "kmalloc(w) failed.\n");
+			ov_oos_print(mdev);
+			drbd_resync_finished(mdev);
+		}
+	}
+	put_ldev(mdev);
+	return true;
+}
+
+STATIC int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
+{
+	return true;
+}
+
+struct asender_cmd {
+	size_t pkt_size;
+	int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
+};
+
+static struct asender_cmd *get_asender_cmd(int cmd)
+{
+	static struct asender_cmd asender_tbl[] = {
+		/* anything missing from this table is in
+		 * the drbd_cmd_handler (drbd_default_handler) table,
+		 * see the beginning of drbdd() */
+	[P_PING]	    = { sizeof(struct p_header80), got_Ping },
+	[P_PING_ACK]	    = { sizeof(struct p_header80), got_PingAck },
+	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_WRITE_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_DISCARD_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_NEG_ACK]	    = { sizeof(struct p_block_ack), got_NegAck },
+	[P_NEG_DREPLY]	    = { sizeof(struct p_block_ack), got_NegDReply },
+	[P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply},
+	[P_OV_RESULT]	    = { sizeof(struct p_block_ack), got_OVResult },
+	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
+	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
+	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
+	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
+	[P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply},
+	[P_MAX_CMD]	    = { 0, NULL },
+	};
+	if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
+		return NULL;
+	return &asender_tbl[cmd];
+}
+
+int drbd_asender(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	struct p_header80 *h = &mdev->meta.rbuf.header.h80;
+	struct asender_cmd *cmd = NULL;
+
+	int rv, len;
+	void *buf    = h;
+	int received = 0;
+	int expect   = sizeof(struct p_header80);
+	int empty;
+	int ping_timeout_active = 0;
+
+	sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
+
+	current->policy = SCHED_RR;  /* Make this a realtime task! */
+	current->rt_priority = 2;    /* more important than all other tasks */
+
+	while (get_t_state(thi) == Running) {
+		drbd_thread_current_set_cpu(mdev);
+		if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
+			ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
+			mdev->meta.socket->sk->sk_rcvtimeo =
+				mdev->net_conf->ping_timeo*HZ/10;
+			ping_timeout_active = 1;
+		}
+
+		/* conditionally cork;
+		 * it may hurt latency if we cork without much to send */
+		if (!mdev->net_conf->no_cork &&
+			3 < atomic_read(&mdev->unacked_cnt))
+			drbd_tcp_cork(mdev->meta.socket);
+		while (1) {
+			clear_bit(SIGNAL_ASENDER, &mdev->flags);
+			flush_signals(current);
+			if (!drbd_process_done_ee(mdev))
+				goto reconnect;
+			/* to avoid race with newly queued ACKs */
+			set_bit(SIGNAL_ASENDER, &mdev->flags);
+			spin_lock_irq(&mdev->req_lock);
+			empty = list_empty(&mdev->done_ee);
+			spin_unlock_irq(&mdev->req_lock);
+			/* new ack may have been queued right here,
+			 * but then there is also a signal pending,
+			 * and we start over... */
+			if (empty)
+				break;
+		}
+		/* but unconditionally uncork unless disabled */
+		if (!mdev->net_conf->no_cork)
+			drbd_tcp_uncork(mdev->meta.socket);
+
+		/* short circuit, recv_msg would return EINTR anyways. */
+		if (signal_pending(current))
+			continue;
+
+		rv = drbd_recv_short(mdev, mdev->meta.socket,
+				     buf, expect-received, 0);
+		clear_bit(SIGNAL_ASENDER, &mdev->flags);
+
+		flush_signals(current);
+
+		/* Note:
+		 * -EINTR	 (on meta) we got a signal
+		 * -EAGAIN	 (on meta) rcvtimeo expired
+		 * -ECONNRESET	 other side closed the connection
+		 * -ERESTARTSYS  (on data) we got a signal
+		 * rv <  0	 other than above: unexpected error!
+		 * rv == expected: full header or command
+		 * rv <  expected: "woken" by signal during receive
+		 * rv == 0	 : "connection shut down by peer"
+		 */
+		if (likely(rv > 0)) {
+			received += rv;
+			buf	 += rv;
+		} else if (rv == 0) {
+			dev_err(DEV, "meta connection shut down by peer.\n");
+			goto reconnect;
+		} else if (rv == -EAGAIN) {
+			/* If the data socket received something meanwhile,
+			 * that is good enough: peer is still alive. */
+			if (time_after(mdev->last_received,
+				jiffies - mdev->meta.socket->sk->sk_rcvtimeo))
+				continue;
+			if (ping_timeout_active) {
+				dev_err(DEV, "PingAck did not arrive in time.\n");
+				goto reconnect;
+			}
+			set_bit(SEND_PING, &mdev->flags);
+			continue;
+		} else if (rv == -EINTR) {
+			continue;
+		} else {
+			dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+			goto reconnect;
+		}
+
+		if (received == expect && cmd == NULL) {
+			if (unlikely(h->magic != BE_DRBD_MAGIC)) {
+				dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
+				    be32_to_cpu(h->magic),
+				    be16_to_cpu(h->command),
+				    be16_to_cpu(h->length));
+				goto reconnect;
+			}
+			cmd = get_asender_cmd(be16_to_cpu(h->command));
+			len = be16_to_cpu(h->length);
+			if (unlikely(cmd == NULL)) {
+				dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
+				    be32_to_cpu(h->magic),
+				    be16_to_cpu(h->command),
+				    be16_to_cpu(h->length));
+				goto disconnect;
+			}
+			expect = cmd->pkt_size;
+			ERR_IF(len != expect-sizeof(struct p_header80)) {
+				trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__);
+				DUMPI(expect);
+				goto reconnect;
+			}
+		}
+		if (received == expect) {
+			mdev->last_received = jiffies;
+			D_ASSERT(cmd != NULL);
+			trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__);
+			if (!cmd->process(mdev, h))
+				goto reconnect;
+
+			/* the idle_timeout (ping-int)
+			 * has been restored in got_PingAck() */
+			if (cmd == get_asender_cmd(P_PING_ACK))
+				ping_timeout_active = 0;
+
+			buf	 = h;
+			received = 0;
+			expect	 = sizeof(struct p_header80);
+			cmd	 = NULL;
+		}
+	}
+
+	if (0) {
+reconnect:
+		drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+		drbd_md_sync(mdev);
+	}
+	if (0) {
+disconnect:
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		drbd_md_sync(mdev);
+	}
+	clear_bit(SIGNAL_ASENDER, &mdev->flags);
+
+	D_ASSERT(mdev->state.conn < C_CONNECTED);
+	dev_info(DEV, "asender terminated\n");
+
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_req.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_req.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_req.c	2015-01-21 12:02:58.386823859 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_req.c	2015-01-21 12:02:58.386823859 +0300
@@ -0,0 +1,1333 @@
+/*
+   drbd_req.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_req.h"
+
+
+/* We only support diskstats for 2.6.16 and up.
+ * see also commit a362357b6cd62643d4dda3b152639303d78473da
+ * Author: Jens Axboe <axboe@suse.de>
+ * Date:   Tue Nov 1 09:26:16 2005 +0100
+ *     [BLOCK] Unify the separate read/write io stat fields into arrays */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
+#define _drbd_start_io_acct(...) do {} while (0)
+#define _drbd_end_io_acct(...)   do {} while (0)
+#else
+
+/* Update disk stats at start of I/O request */
+static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
+{
+	const int rw = bio_data_dir(bio);
+#ifndef __disk_stat_inc
+	int cpu;
+#endif
+
+#ifdef __disk_stat_inc
+	__disk_stat_inc(mdev->vdisk, ios[rw]);
+	__disk_stat_add(mdev->vdisk, sectors[rw], bio_sectors(bio));
+	disk_round_stats(mdev->vdisk);
+	mdev->vdisk->in_flight++;
+#else
+	cpu = part_stat_lock();
+	part_round_stats(cpu, &mdev->vdisk->part0);
+	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
+	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+	part_inc_in_flight(&mdev->vdisk->part0, rw);
+	part_stat_unlock();
+#endif
+}
+
+/* Update disk stats when completing request upwards */
+static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
+{
+	int rw = bio_data_dir(req->master_bio);
+	unsigned long duration = jiffies - req->start_time;
+#ifndef __disk_stat_inc
+	int cpu;
+#endif
+
+#ifdef __disk_stat_add
+	__disk_stat_add(mdev->vdisk, ticks[rw], duration);
+	disk_round_stats(mdev->vdisk);
+	mdev->vdisk->in_flight--;
+#else
+	cpu = part_stat_lock();
+	part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
+	part_round_stats(cpu, &mdev->vdisk->part0);
+	part_dec_in_flight(&mdev->vdisk->part0, rw);
+	part_stat_unlock();
+#endif
+}
+
+#endif
+
+/* rw is bio_data_dir(), only READ or WRITE */
+static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
+{
+	const unsigned long s = req->rq_state;
+
+	/* remove it from the transfer log.
+	 * well, only if it had been there in the first
+	 * place... if it had not (local only or conflicting
+	 * and never sent), it should still be "empty" as
+	 * initialized in drbd_req_new(), so we can list_del() it
+	 * here unconditionally */
+	list_del(&req->tl_requests);
+
+	/* if it was a write, we may have to set the corresponding
+	 * bit(s) out-of-sync first. If it had a local part, we need to
+	 * release the reference to the activity log. */
+	if (rw == WRITE) {
+		/* Set out-of-sync unless both OK flags are set
+		 * (local only or remote failed).
+		 * Other places where we set out-of-sync:
+		 * READ with local io-error */
+		if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+			drbd_set_out_of_sync(mdev, req->sector, req->size);
+
+		if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+			drbd_set_in_sync(mdev, req->sector, req->size);
+
+		/* one might be tempted to move the drbd_al_complete_io
+		 * to the local io completion callback drbd_endio_pri.
+		 * but, if this was a mirror write, we may only
+		 * drbd_al_complete_io after this is RQ_NET_DONE,
+		 * otherwise the extent could be dropped from the al
+		 * before it has actually been written on the peer.
+		 * if we crash before our peer knows about the request,
+		 * but after the extent has been dropped from the al,
+		 * we would forget to resync the corresponding extent.
+		 */
+		if (s & RQ_LOCAL_MASK) {
+			if (get_ldev_if_state(mdev, D_FAILED)) {
+				if (s & RQ_IN_ACT_LOG)
+					drbd_al_complete_io(mdev, req->sector);
+				put_ldev(mdev);
+			} else if (DRBD_ratelimit(5*HZ, 3)) {
+				dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
+				     "but my Disk seems to have failed :(\n",
+				     (unsigned long long) req->sector);
+			}
+		}
+	}
+
+	drbd_req_free(req);
+}
+
+static void queue_barrier(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+
+	/* We are within the req_lock. Once we queued the barrier for sending,
+	 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
+	 * barrier/epoch object is added. This is the only place this bit is
+	 * set. It indicates that the barrier for this epoch is already queued,
+	 * and no new epoch has been created yet. */
+	if (test_bit(CREATE_BARRIER, &mdev->flags))
+		return;
+
+	b = mdev->newest_tle;
+	b->w.cb = w_send_barrier;
+	/* inc_ap_pending done here, so we won't
+	 * get imbalanced on connection loss.
+	 * dec_ap_pending will be done in got_BarrierAck
+	 * or (on connection loss) in tl_clear.  */
+	inc_ap_pending(mdev);
+	drbd_queue_work(&mdev->data.work, &b->w);
+	set_bit(CREATE_BARRIER, &mdev->flags);
+}
+
+static void _about_to_complete_local_write(struct drbd_conf *mdev,
+	struct drbd_request *req)
+{
+	const unsigned long s = req->rq_state;
+	struct drbd_request *i;
+	struct drbd_epoch_entry *e;
+	struct hlist_node *n;
+	struct hlist_head *slot;
+
+	/* Before we can signal completion to the upper layers,
+	 * we may need to close the current epoch.
+	 * We can skip this, if this request has not even been sent, because we
+	 * did not have a fully established connection yet/anymore, during
+	 * bitmap exchange, or while we are C_AHEAD due to congestion policy.
+	 */
+	if (mdev->state.conn >= C_CONNECTED &&
+	    (s & RQ_NET_SENT) != 0 &&
+	    req->epoch == mdev->newest_tle->br_number)
+		queue_barrier(mdev);
+
+	/* we need to do the conflict detection stuff,
+	 * if we have the ee_hash (two_primaries) and
+	 * this has been on the network */
+	if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
+		const sector_t sector = req->sector;
+		const int size = req->size;
+
+		/* ASSERT:
+		 * there must be no conflicting requests, since
+		 * they must have been failed on the spot */
+#define OVERLAPS overlaps(sector, size, i->sector, i->size)
+		slot = tl_hash_slot(mdev, sector);
+		hlist_for_each_entry(i, n, slot, collision) {
+			if (OVERLAPS) {
+				dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
+				      "other: %p %llus +%u\n",
+				      req, (unsigned long long)sector, size,
+				      i, (unsigned long long)i->sector, i->size);
+			}
+		}
+
+		/* maybe "wake" those conflicting epoch entries
+		 * that wait for this request to finish.
+		 *
+		 * currently, there can be only _one_ such ee
+		 * (well, or some more, which would be pending
+		 * P_DISCARD_ACK not yet sent by the asender...),
+		 * since we block the receiver thread upon the
+		 * first conflict detection, which will wait on
+		 * misc_wait.  maybe we want to assert that?
+		 *
+		 * anyways, if we found one,
+		 * we just have to do a wake_up.  */
+#undef OVERLAPS
+#define OVERLAPS overlaps(sector, size, e->sector, e->size)
+		slot = ee_hash_slot(mdev, req->sector);
+		hlist_for_each_entry(e, n, slot, collision) {
+			if (OVERLAPS) {
+				wake_up(&mdev->misc_wait);
+				break;
+			}
+		}
+	}
+#undef OVERLAPS
+}
+
+void complete_master_bio(struct drbd_conf *mdev,
+		struct bio_and_error *m)
+{
+	trace_drbd_bio(mdev, "Rq", m->bio, 1, NULL);
+	bio_endio(m->bio, m->error);
+	dec_ap_bio(mdev);
+}
+
+/* Helper for __req_mod().
+ * Set m->bio to the master bio, if it is fit to be completed,
+ * or leave it alone (it is initialized to NULL in __req_mod),
+ * if it has already been completed, or cannot be completed yet.
+ * If m->bio is set, the error status to be returned is placed in m->error.
+ */
+void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
+{
+	const unsigned long s = req->rq_state;
+	struct drbd_conf *mdev = req->mdev;
+	int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
+
+	trace_drbd_req(req, nothing, "_req_may_be_done");
+
+	/* we must not complete the master bio, while it is
+	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
+	 *	not yet acknowledged by the peer
+	 *	not yet completed by the local io subsystem
+	 * these flags may get cleared in any order by
+	 *	the worker,
+	 *	the receiver,
+	 *	the bio_endio completion callbacks.
+	 */
+	if (s & RQ_NET_QUEUED)
+		return;
+	if (s & RQ_NET_PENDING)
+		return;
+	if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
+		return;
+
+	if (req->master_bio) {
+		/* this is data_received (remote read)
+		 * or protocol C P_WRITE_ACK
+		 * or protocol B P_RECV_ACK
+		 * or protocol A "handed_over_to_network" (SendAck)
+		 * or canceled or failed,
+		 * or killed from the transfer log due to connection loss.
+		 */
+
+		/*
+		 * figure out whether to report success or failure.
+		 *
+		 * report success when at least one of the operations succeeded.
+		 * or, to put it the other way,
+		 * only report failure, when both operations failed.
+		 *
+		 * what to do about the failures is handled elsewhere.
+		 * what we need to do here is just: complete the master_bio.
+		 *
+		 * local completion error, if any, has been stored as ERR_PTR
+		 * in private_bio within drbd_endio_pri.
+		 */
+		int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+		int error = PTR_ERR(req->private_bio);
+
+		/* remove the request from the conflict detection
+		 * respective block_id verification hash */
+		if (!hlist_unhashed(&req->collision))
+			hlist_del(&req->collision);
+		else
+			D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
+
+		/* for writes we need to do some extra housekeeping */
+		if (rw == WRITE)
+			_about_to_complete_local_write(mdev, req);
+
+		/* Update disk stats */
+		_drbd_end_io_acct(mdev, req);
+
+		m->error = ok ? 0 : (error ?: -EIO);
+		m->bio = req->master_bio;
+		req->master_bio = NULL;
+	}
+
+	if (s & RQ_LOCAL_PENDING)
+		return;
+
+	if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
+		/* this is disconnected (local only) operation,
+		 * or protocol C P_WRITE_ACK,
+		 * or protocol A or B P_BARRIER_ACK,
+		 * or killed from the transfer log due to connection loss. */
+		_req_is_done(mdev, req, rw);
+	}
+	/* else: network part and not DONE yet. that is
+	 * protocol A or B, barrier ack still pending... */
+}
+
+static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
+{
+	struct drbd_conf *mdev = req->mdev;
+
+	if (!is_susp(mdev->state))
+		_req_may_be_done(req, m);
+}
+
+/*
+ * checks whether there was an overlapping request
+ * or ee already registered.
+ *
+ * if so, return 1, in which case this request is completed on the spot,
+ * without ever being submitted or sent.
+ *
+ * return 0 if it is ok to submit this request.
+ *
+ * NOTE:
+ * paranoia: assume something above us is broken, and issues different write
+ * requests for the same block simultaneously...
+ *
+ * To ensure these won't be reordered differently on both nodes, resulting in
+ * diverging data sets, we discard the later one(s). Not that this is supposed
+ * to happen, but this is the rationale why we also have to check for
+ * conflicting requests with local origin, and why we have to do so regardless
+ * of whether we allowed multiple primaries.
+ *
+ * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
+ * second hlist_for_each_entry becomes a noop. This is even simpler than to
+ * grab a reference on the net_conf, and check for the two_primaries flag...
+ */
+STATIC int _req_conflicts(struct drbd_request *req)
+{
+	struct drbd_conf *mdev = req->mdev;
+	const sector_t sector = req->sector;
+	const int size = req->size;
+	struct drbd_request *i;
+	struct drbd_epoch_entry *e;
+	struct hlist_node *n;
+	struct hlist_head *slot;
+
+	D_ASSERT(hlist_unhashed(&req->collision));
+
+	if (!get_net_conf(mdev))
+		return 0;
+
+	/* BUG_ON */
+	ERR_IF (mdev->tl_hash_s == 0)
+		goto out_no_conflict;
+	BUG_ON(mdev->tl_hash == NULL);
+
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+	slot = tl_hash_slot(mdev, sector);
+	hlist_for_each_entry(i, n, slot, collision) {
+		if (OVERLAPS) {
+			dev_alert(DEV, "%s[%u] Concurrent local write detected! "
+			      "[DISCARD L] new: %llus +%u; "
+			      "pending: %llus +%u\n",
+			      current->comm, current->pid,
+			      (unsigned long long)sector, size,
+			      (unsigned long long)i->sector, i->size);
+			goto out_conflict;
+		}
+	}
+
+	if (mdev->ee_hash_s) {
+		/* now, check for overlapping requests with remote origin */
+		BUG_ON(mdev->ee_hash == NULL);
+#undef OVERLAPS
+#define OVERLAPS overlaps(e->sector, e->size, sector, size)
+		slot = ee_hash_slot(mdev, sector);
+		hlist_for_each_entry(e, n, slot, collision) {
+			if (OVERLAPS) {
+				dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
+				      " [DISCARD L] new: %llus +%u; "
+				      "pending: %llus +%u\n",
+				      current->comm, current->pid,
+				      (unsigned long long)sector, size,
+				      (unsigned long long)e->sector, e->size);
+				goto out_conflict;
+			}
+		}
+	}
+#undef OVERLAPS
+
+out_no_conflict:
+	/* this is as it should be, and what we expected.
+	 * our users do behave after all... */
+	put_net_conf(mdev);
+	return 0;
+
+out_conflict:
+	put_net_conf(mdev);
+	return 1;
+}
+
+/* obviously this could be coded as many single functions
+ * instead of one huge switch,
+ * or by putting the code directly in the respective locations
+ * (as it has been before).
+ *
+ * but having it this way
+ *  enforces that it is all in this one place, where it is easier to audit,
+ *  it makes it obvious that whatever "event" "happens" to a request should
+ *  happen "atomically" within the req_lock,
+ *  and it enforces that we have to think in a very structured manner
+ *  about the "events" that may happen to a request during its life time ...
+ */
+int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m)
+{
+	struct drbd_conf *mdev = req->mdev;
+	int rv = 0;
+	m->bio = NULL;
+
+	trace_drbd_req(req, what, NULL);
+
+	switch (what) {
+	default:
+		dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
+		break;
+
+	/* does not happen...
+	 * initialization done in drbd_req_new
+	case created:
+		break;
+		*/
+
+	case to_be_send: /* via network */
+		/* reached via drbd_make_request_common
+		 * and from w_read_retry_remote */
+		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+		req->rq_state |= RQ_NET_PENDING;
+		inc_ap_pending(mdev);
+		break;
+
+	case to_be_submitted: /* locally */
+		/* reached via drbd_make_request_common */
+		D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
+		req->rq_state |= RQ_LOCAL_PENDING;
+		break;
+
+	case completed_ok:
+		if (req->rq_state & RQ_WRITE)
+			mdev->writ_cnt += req->size>>9;
+		else
+			mdev->read_cnt += req->size>>9;
+
+		req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+
+		_req_may_be_done_not_susp(req, m);
+		break;
+
+	case abort_disk_io:
+		req->rq_state |= RQ_LOCAL_ABORTED;
+		if (req->rq_state & RQ_WRITE)
+			_req_may_be_done_not_susp(req, m);
+		else
+			goto goto_queue_for_net_read;
+		break;
+
+	case write_completed_with_error:
+		req->rq_state |= RQ_LOCAL_COMPLETED;
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+
+		__drbd_chk_io_error(mdev, false);
+		_req_may_be_done_not_susp(req, m);
+		break;
+
+	case read_ahead_completed_with_error:
+		/* it is legal to fail READA */
+		req->rq_state |= RQ_LOCAL_COMPLETED;
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+		_req_may_be_done_not_susp(req, m);
+		break;
+
+	case read_completed_with_error:
+		drbd_set_out_of_sync(mdev, req->sector, req->size);
+
+		req->rq_state |= RQ_LOCAL_COMPLETED;
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+
+		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+
+		__drbd_chk_io_error(mdev, false);
+
+	goto_queue_for_net_read:
+
+		/* no point in retrying if there is no good remote data,
+		 * or we have no connection. */
+		if (mdev->state.pdsk != D_UP_TO_DATE) {
+			_req_may_be_done_not_susp(req, m);
+			break;
+		}
+
+		/* _req_mod(req,to_be_send); oops, recursion... */
+		req->rq_state |= RQ_NET_PENDING;
+		inc_ap_pending(mdev);
+		/* fall through: _req_mod(req,queue_for_net_read); */
+
+	case queue_for_net_read:
+		/* READ or READA, and
+		 * no local disk,
+		 * or target area marked as invalid,
+		 * or just got an io-error. */
+		/* from drbd_make_request_common
+		 * or from bio_endio during read io-error recovery */
+
+		/* so we can verify the handle in the answer packet
+		 * corresponding hlist_del is in _req_may_be_done() */
+		hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector));
+
+		set_bit(UNPLUG_REMOTE, &mdev->flags);
+
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		req->rq_state |= RQ_NET_QUEUED;
+		req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
+			? w_read_retry_remote
+			: w_send_read_req;
+		drbd_queue_work(&mdev->data.work, &req->w);
+		break;
+
+	case queue_for_net_write:
+		/* assert something? */
+		/* from drbd_make_request_common only */
+
+		hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector));
+		/* corresponding hlist_del is in _req_may_be_done() */
+
+		/* NOTE
+		 * In case the req ended up on the transfer log before being
+		 * queued on the worker, it could lead to this request being
+		 * missed during cleanup after connection loss.
+		 * So we have to do both operations here,
+		 * within the same lock that protects the transfer log.
+		 *
+		 * _req_add_to_epoch(req); this has to be after the
+		 * _maybe_start_new_epoch(req); which happened in
+		 * drbd_make_request_common, because we now may set the bit
+		 * again ourselves to close the current epoch.
+		 *
+		 * Add req to the (now) current epoch (barrier). */
+
+		/* otherwise we may lose an unplug, which may cause some remote
+		 * io-scheduler timeout to expire, increasing maximum latency,
+		 * hurting performance. */
+		set_bit(UNPLUG_REMOTE, &mdev->flags);
+
+		/* see drbd_make_request_common,
+		 * just after it grabs the req_lock */
+		D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
+
+		req->epoch = mdev->newest_tle->br_number;
+
+		/* increment size of current epoch */
+		mdev->newest_tle->n_writes++;
+
+		/* queue work item to send data */
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		req->rq_state |= RQ_NET_QUEUED;
+		req->w.cb =  w_send_dblock;
+		drbd_queue_work(&mdev->data.work, &req->w);
+
+		/* close the epoch, in case it outgrew the limit */
+		if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
+			queue_barrier(mdev);
+
+		break;
+
+	case queue_for_send_oos:
+		req->rq_state |= RQ_NET_QUEUED;
+		req->w.cb =  w_send_oos;
+		drbd_queue_work(&mdev->data.work, &req->w);
+		break;
+
+	case read_retry_remote_canceled:
+	case send_canceled:
+	case send_failed:
+		/* real cleanup will be done from tl_clear.  just update flags
+		 * so it is no longer marked as on the worker queue */
+		req->rq_state &= ~RQ_NET_QUEUED;
+		/* if we did it right, tl_clear should be scheduled only after
+		 * this, so this should not be necessary! */
+		_req_may_be_done_not_susp(req, m);
+		break;
+
+	case handed_over_to_network:
+		/* assert something? */
+		if (bio_data_dir(req->master_bio) == WRITE)
+			atomic_add(req->size>>9, &mdev->ap_in_flight);
+
+		if (bio_data_dir(req->master_bio) == WRITE &&
+		    mdev->net_conf->wire_protocol == DRBD_PROT_A) {
+			/* this is what is dangerous about protocol A:
+			 * pretend it was successfully written on the peer. */
+			if (req->rq_state & RQ_NET_PENDING) {
+				dec_ap_pending(mdev);
+				req->rq_state &= ~RQ_NET_PENDING;
+				req->rq_state |= RQ_NET_OK;
+			} /* else: neg-ack was faster... */
+			/* it is still not yet RQ_NET_DONE until the
+			 * corresponding epoch barrier got acked as well,
+			 * so we know what to dirty on connection loss */
+		}
+		req->rq_state &= ~RQ_NET_QUEUED;
+		req->rq_state |= RQ_NET_SENT;
+		_req_may_be_done_not_susp(req, m);
+		break;
+
+	case oos_handed_to_network:
+		/* Was not set PENDING, no longer QUEUED, so is now DONE
+		 * as far as this connection is concerned. */
+		req->rq_state &= ~RQ_NET_QUEUED;
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done_not_susp(req, m);
+		break;
+
+	case connection_lost_while_pending:
+		/* transfer log cleanup after connection loss */
+		/* assert something? */
+		if (req->rq_state & RQ_NET_PENDING)
+			dec_ap_pending(mdev);
+		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+		req->rq_state |= RQ_NET_DONE;
+		if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
+			atomic_sub(req->size>>9, &mdev->ap_in_flight);
+
+		/* if it is still queued, we may not complete it here.
+		 * it will be canceled soon. */
+		if (!(req->rq_state & RQ_NET_QUEUED))
+			_req_may_be_done(req, m); /* Allowed while state.susp */
+		break;
+
+	case conflict_discarded_by_peer:
+		/* for discarded conflicting writes of multiple primaries,
+		 * there is no need to keep anything in the tl, potential
+		 * node crashes are covered by the activity log. */
+		if (what == conflict_discarded_by_peer)
+			dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
+			      " DRBD is not a random data generator!\n",
+			      (unsigned long long)req->sector, req->size);
+		req->rq_state |= RQ_NET_DONE;
+		/* fall through */
+	case write_acked_by_peer_and_sis:
+	case write_acked_by_peer:
+		if (what == write_acked_by_peer_and_sis)
+			req->rq_state |= RQ_NET_SIS;
+		/* protocol C; successfully written on peer.
+		 * Nothing more to do here.
+		 * We want to keep the tl in place for all protocols, to cater
+		 * for volatile write-back caches on lower level devices. */
+
+		/* fall through */
+	case recv_acked_by_peer:
+		/* protocol B; pretends to be successfully written on peer.
+		 * see also notes above in handed_over_to_network about
+		 * protocol != C */
+		req->rq_state |= RQ_NET_OK;
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		dec_ap_pending(mdev);
+		atomic_sub(req->size>>9, &mdev->ap_in_flight);
+		req->rq_state &= ~RQ_NET_PENDING;
+		_req_may_be_done_not_susp(req, m);
+		break;
+
+	case neg_acked:
+		/* assert something? */
+		if (req->rq_state & RQ_NET_PENDING) {
+			dec_ap_pending(mdev);
+			atomic_sub(req->size>>9, &mdev->ap_in_flight);
+		}
+		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done_not_susp(req, m);
+		/* else: done by handed_over_to_network */
+		break;
+
+	case fail_frozen_disk_io:
+		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+			break;
+
+		_req_may_be_done(req, m); /* Allowed while state.susp */
+		break;
+
+	case restart_frozen_disk_io:
+		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+			break;
+
+		req->rq_state &= ~RQ_LOCAL_COMPLETED;
+
+		rv = MR_READ;
+		if (bio_data_dir(req->master_bio) == WRITE)
+			rv = MR_WRITE;
+
+		get_ldev(mdev);
+		req->w.cb = w_restart_disk_io;
+		drbd_queue_work(&mdev->data.work, &req->w);
+		break;
+
+	case resend:
+		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
+		   before the connection loss (B&C only); only P_BARRIER_ACK was missing.
+		   Throwing them out of the TL here by pretending we got a P_BARRIER_ACK;
+		   we ensure that the peer was not rebooted. */
+		if (!(req->rq_state & RQ_NET_OK)) {
+			if (req->w.cb) {
+				drbd_queue_work(&mdev->data.work, &req->w);
+				rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
+			}
+			break;
+		}
+		/* else, fall through to barrier_acked */
+
+	case barrier_acked:
+		if (!(req->rq_state & RQ_WRITE))
+			break;
+
+		if (req->rq_state & RQ_NET_PENDING) {
+			/* barrier came in before all requests have been acked.
+			 * this is bad, because if the connection is lost now,
+			 * we won't be able to clean them up... */
+			dev_err(DEV, "FIXME (barrier_acked but pending)\n");
+			trace_drbd_req(req, nothing, "FIXME (barrier_acked but pending)");
+			list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
+		}
+		if ((req->rq_state & RQ_NET_MASK) != 0) {
+			req->rq_state |= RQ_NET_DONE;
+			if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
+				atomic_sub(req->size>>9, &mdev->ap_in_flight);
+		}
+		_req_may_be_done(req, m); /* Allowed while state.susp */
+		break;
+
+	case data_received:
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		dec_ap_pending(mdev);
+		req->rq_state &= ~RQ_NET_PENDING;
+		req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
+		_req_may_be_done_not_susp(req, m);
+		break;
+	}
+
+	return rv;
+}
+
+/* we may do a local read if:
+ * - we are consistent (of course),
+ * - or we are generally inconsistent,
+ *   BUT we are still/already IN SYNC for this area.
+ *   since size may be bigger than BM_BLOCK_SIZE,
+ *   we may need to check several bits.
+ */
+STATIC int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	unsigned long sbnr, ebnr;
+	sector_t esector, nr_sectors;
+
+	if (mdev->state.disk == D_UP_TO_DATE)
+		return 1;
+	if (mdev->state.disk >= D_OUTDATED)
+		return 0;
+	if (mdev->state.disk <  D_INCONSISTENT)
+		return 0;
+	/* state.disk == D_INCONSISTENT   We will have a look at the BitMap */
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	D_ASSERT(sector  < nr_sectors);
+	D_ASSERT(esector < nr_sectors);
+
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
+}
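+
+/* Illustrative example (a sketch; assumes the usual 4 KiB bitmap granularity,
+ * i.e. BM_SECT_TO_BIT(x) == x >> 3): an 8 KiB READ at sector 8 gives
+ *   esector = 8 + (8192 >> 9) - 1 = 23,
+ *   sbnr = BM_SECT_TO_BIT(8) = 1,  ebnr = BM_SECT_TO_BIT(23) = 2,
+ * so the read is served locally iff bitmap bits 1..2 are both clear. */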
+
+STATIC int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+{
+	const int rw = bio_rw(bio);
+	const int size = bio->bi_size;
+	const sector_t sector = bio->bi_sector;
+	struct drbd_tl_epoch *b = NULL;
+	struct drbd_request *req;
+	int local, remote, send_oos = 0;
+	int err = -EIO;
+	int ret = 0;
+	union drbd_state s;
+
+	/* allocate outside of all locks; */
+	req = drbd_req_new(mdev, bio);
+	if (!req) {
+		dec_ap_bio(mdev);
+		/* only pass the error to the upper layers.
+		 * if user cannot handle io errors, that's not our business. */
+		dev_err(DEV, "could not kmalloc() req\n");
+		bio_endio(bio, -ENOMEM);
+		return 0;
+	}
+	req->start_time = start_time;
+
+	trace_drbd_bio(mdev, "Rq", bio, 0, req);
+
+	local = get_ldev(mdev);
+	if (!local) {
+		bio_put(req->private_bio); /* or we get a bio leak */
+		req->private_bio = NULL;
+	}
+	if (rw == WRITE) {
+		remote = 1;
+	} else {
+		/* READ || READA */
+		if (local) {
+			if (!drbd_may_do_local_read(mdev, sector, size)) {
+				/* we could kick the syncer to
+				 * sync this extent asap, wait for
+				 * it, then continue locally.
+				 * Or just issue the request remotely.
+				 */
+				local = 0;
+				bio_put(req->private_bio);
+				req->private_bio = NULL;
+				put_ldev(mdev);
+			}
+		}
+		remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
+	}
+
+	/* If we have a disk, but a READA request is mapped to remote,
+	 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
+	 * Just fail that READA request right here.
+	 *
+	 * THINK: maybe fail all READA when not local?
+	 *        or make this configurable...
+	 *        if network is slow, READA won't do any good.
+	 */
+	if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
+		err = -EWOULDBLOCK;
+		goto fail_and_free_req;
+	}
+
+	/* For WRITES going to the local disk, grab a reference on the target
+	 * extent.  This waits for any resync activity in the corresponding
+	 * resync extent to finish, and, if necessary, pulls in the target
+	 * extent into the activity log, which involves further disk io because
+	 * of transactional on-disk meta data updates. */
+	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+		req->rq_state |= RQ_IN_ACT_LOG;
+		drbd_al_begin_io(mdev, sector);
+	}
+
+	s = mdev->state;
+	remote = remote && drbd_should_do_remote(s);
+	send_oos = rw == WRITE && drbd_should_send_oos(s);
+	D_ASSERT(!(remote && send_oos));
+
+	if (!(local || remote) && !is_susp(mdev->state)) {
+		if (DRBD_ratelimit(5*HZ, 3))
+			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+		goto fail_free_complete;
+	}
+
+	/* For WRITE request, we have to make sure that we have an
+	 * unused_spare_tle, in case we need to start a new epoch.
+	 * I try to be smart and avoid always pre-allocating "just in case",
+	 * but there is a race between testing the bit and pointer outside the
+	 * spinlock, and grabbing the spinlock.
+	 * if we lost that race, we retry.  */
+	if (rw == WRITE && (remote || send_oos) &&
+	    mdev->unused_spare_tle == NULL &&
+	    test_bit(CREATE_BARRIER, &mdev->flags)) {
+allocate_barrier:
+		b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
+		if (!b) {
+			dev_err(DEV, "Failed to alloc barrier.\n");
+			err = -ENOMEM;
+			goto fail_free_complete;
+		}
+	}
+
+	/* GOOD, everything prepared, grab the spin_lock */
+	spin_lock_irq(&mdev->req_lock);
+
+	if (is_susp(mdev->state)) {
+		/* If we got suspended, use the retry mechanism of
+		   drbd_make_request() to restart processing of this
+		   bio. In the next call to drbd_make_request
+		   we sleep in inc_ap_bio() */
+		ret = 1;
+		spin_unlock_irq(&mdev->req_lock);
+		goto fail_free_complete;
+	}
+
+	if (remote || send_oos) {
+		remote = drbd_should_do_remote(mdev->state);
+		send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+		D_ASSERT(!(remote && send_oos));
+
+		if (!(remote || send_oos))
+			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
+		if (!(local || remote)) {
+			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+			spin_unlock_irq(&mdev->req_lock);
+			goto fail_free_complete;
+		}
+	}
+
+	if (b && mdev->unused_spare_tle == NULL) {
+		mdev->unused_spare_tle = b;
+		b = NULL;
+	}
+	if (rw == WRITE && (remote || send_oos) &&
+	    mdev->unused_spare_tle == NULL &&
+	    test_bit(CREATE_BARRIER, &mdev->flags)) {
+		/* someone closed the current epoch
+		 * while we were grabbing the spinlock */
+		spin_unlock_irq(&mdev->req_lock);
+		goto allocate_barrier;
+	}
+
+
+	/* Update disk stats */
+	_drbd_start_io_acct(mdev, req, bio);
+
+	/* _maybe_start_new_epoch(mdev);
+	 * If we need to generate a write barrier packet, we have to add the
+	 * new epoch (barrier) object, and queue the barrier packet for sending,
+	 * and queue the req's data after it _within the same lock_, otherwise
+	 * we have race conditions where the reorder domains could be mixed up.
+	 *
+	 * Even read requests may start a new epoch and queue the corresponding
+	 * barrier packet.  To get the write ordering right, we only have to
+	 * make sure that, if this is a write request and it triggered a
+	 * barrier packet, this request is queued within the same spinlock. */
+	if ((remote || send_oos) && mdev->unused_spare_tle &&
+	    test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
+		_tl_add_barrier(mdev, mdev->unused_spare_tle);
+		mdev->unused_spare_tle = NULL;
+	} else {
+		D_ASSERT(!(remote && rw == WRITE &&
+			   test_bit(CREATE_BARRIER, &mdev->flags)));
+	}
+
+	/* NOTE
+	 * Actually, 'local' may be wrong here already, since we may have failed
+	 * to write to the meta data, and it may become wrong at any time due to
+	 * a local io-error for some other request, which would lead to us
+	 * "detaching" the local disk.
+	 *
+	 * 'remote' may become wrong any time because the network could fail.
+	 *
+	 * This is a harmless race condition, though, since it is handled
+	 * correctly at the appropriate places; so it just defers the failure
+	 * of the respective operation.
+	 */
+
+	/* mark them early for readability.
+	 * this just sets some state flags. */
+	if (remote)
+		_req_mod(req, to_be_send);
+	if (local)
+		_req_mod(req, to_be_submitted);
+
+	/* check this request on the collision detection hash tables.
+	 * if we have a conflict, just complete it here.
+	 * THINK do we want to check reads, too? (I don't think so...) */
+	if (rw == WRITE && _req_conflicts(req))
+		goto fail_conflicting;
+
+	list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+
+	/* NOTE remote first: to get the concurrent write detection right,
+	 * we must register the request before start of local IO.  */
+	if (remote) {
+		/* either WRITE and C_CONNECTED,
+		 * or READ, and no local disk,
+		 * or READ, but not in sync.
+		 */
+		_req_mod(req, (rw == WRITE)
+				? queue_for_net_write
+				: queue_for_net_read);
+	}
+	if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
+		_req_mod(req, queue_for_send_oos);
+
+	if (remote &&
+	    mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) {
+		int congested = 0;
+
+		if (mdev->net_conf->cong_fill &&
+		    atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
+			dev_info(DEV, "Congestion-fill threshold reached\n");
+			congested = 1;
+		}
+
+		if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
+			dev_info(DEV, "Congestion-extents threshold reached\n");
+			congested = 1;
+		}
+
+		if (congested) {
+			queue_barrier(mdev); /* last barrier, after mirrored writes */
+
+			if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
+				_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
+			else  /*mdev->net_conf->on_congestion == OC_DISCONNECT */
+				_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
+		}
+	}
+
+	spin_unlock_irq(&mdev->req_lock);
+	kfree(b); /* if someone else has beaten us to it... */
+
+	if (local) {
+		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+
+		trace_drbd_bio(mdev, "Pri", req->private_bio, 0, NULL);
+
+		/* State may have changed since we grabbed our reference on the
+		 * mdev->ldev member. Double check, and short-circuit to endio.
+		 * In case the last activity log transaction failed to get on
+		 * stable storage, and this is a WRITE, we may not even submit
+		 * this bio. */
+		if (get_ldev(mdev)) {
+			if (drbd_insert_fault(mdev,   rw == WRITE ? DRBD_FAULT_DT_WR
+						    : rw == READ  ? DRBD_FAULT_DT_RD
+						    :               DRBD_FAULT_DT_RA))
+				bio_endio(req->private_bio, -EIO);
+			else
+				generic_make_request(req->private_bio);
+			put_ldev(mdev);
+		} else
+			bio_endio(req->private_bio, -EIO);
+	}
+
+	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
+	 * we plug after submit, so we won't miss an unplug event */
+	drbd_plug_device(mdev);
+
+	return 0;
+
+fail_conflicting:
+	/* this is a conflicting request.
+	 * even though it may have been only _partially_
+	 * overlapping with one of the currently pending requests,
+	 * without even submitting or sending it, we will
+	 * pretend that it was successfully served right now.
+	 */
+	_drbd_end_io_acct(mdev, req);
+	spin_unlock_irq(&mdev->req_lock);
+	if (remote)
+		dec_ap_pending(mdev);
+	/* THINK: do we want to fail it (-EIO), or pretend success?
+	 * this pretends success. */
+	err = 0;
+
+fail_free_complete:
+	if (req->rq_state & RQ_IN_ACT_LOG)
+		drbd_al_complete_io(mdev, sector);
+fail_and_free_req:
+	if (local) {
+		bio_put(req->private_bio);
+		req->private_bio = NULL;
+		put_ldev(mdev);
+	}
+	if (!ret)
+		bio_endio(bio, err);
+
+	drbd_req_free(req);
+	dec_ap_bio(mdev);
+	kfree(b);
+
+	return ret;
+}
+
+/* helper function for drbd_make_request
+ * if we can determine just by the mdev (state) that this request will fail,
+ * return 1
+ * otherwise return 0
+ */
+static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
+{
+	if (mdev->state.role != R_PRIMARY &&
+		(!allow_oos || is_write)) {
+		if (DRBD_ratelimit(5*HZ, 5)) {
+			dev_err(DEV, "Process %s[%u] tried to %s; "
+			    "since we are not in Primary state, "
+			    "we cannot allow this\n",
+			    current->comm, current->pid,
+			    is_write ? "WRITE" : "READ");
+		}
+		return 1;
+	}
+
+	return 0;
+}
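+
+/* The check above boils down to (a sketch, not part of the original comment):
+ *   Primary:                      READ ok,       WRITE ok
+ *   Secondary, allow_oos set:     READ ok,       WRITE rejected
+ *   Secondary, allow_oos clear:   READ rejected, WRITE rejected
+ * i.e. the allow_oos module parameter only ever relaxes reads. */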
+
+MAKE_REQUEST_TYPE drbd_make_request(struct request_queue *q, struct bio *bio)
+{
+	unsigned int s_enr, e_enr;
+	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+	unsigned long start_time;
+
+	if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
+		bio_endio(bio, -EPERM);
+		MAKE_REQUEST_RETURN;
+	}
+
+	/* We never supported BIO_RW_BARRIER.
+	 * We don't need to anymore, either: starting with kernel 2.6.36,
+	 * we have REQ_FUA and REQ_FLUSH, which will be handled transparently
+	 * by the block layer. */
+	if (unlikely(bio->bi_rw & DRBD_REQ_HARDBARRIER)) {
+		bio_endio(bio, -EOPNOTSUPP);
+		MAKE_REQUEST_RETURN;
+	}
+
+	start_time = jiffies;
+
+	/*
+	 * what we "blindly" assume:
+	 */
+	D_ASSERT(bio->bi_size > 0);
+	D_ASSERT((bio->bi_size & 0x1ff) == 0);
+
+	/* to make some things easier, force alignment of requests within the
+	 * granularity of our hash tables */
+	s_enr = bio->bi_sector >> HT_SHIFT;
+	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
+
+	if (likely(s_enr == e_enr)) {
+		do {
+			inc_ap_bio(mdev, 1);
+		} while (drbd_make_request_common(mdev, bio, start_time));
+		MAKE_REQUEST_RETURN;
+	}
+
+	/* can this bio be split generically?
+	 * Maybe add our own split-arbitrary-bios function. */
+	if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
+		/* rather error out here than BUG in bio_split */
+		dev_err(DEV, "bio would need to, but cannot, be split: "
+		    "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
+		    bio->bi_vcnt, bio->bi_idx, bio->bi_size,
+		    (unsigned long long)bio->bi_sector);
+		bio_endio(bio, -EINVAL);
+	} else {
+		/* This bio crosses some boundary, so we have to split it. */
+		struct bio_pair *bp;
+		/* works for the "do not cross hash slot boundaries" case
+		 * e.g. sector 262269, size 4096
+		 * s_enr = 262269 >> 6 = 4097
+		 * e_enr = (262269+8-1) >> 6 = 4098
+		 * HT_SHIFT = 6
+		 * sps = 64, mask = 63
+		 * first_sectors = 64 - (262269 & 63) = 3
+		 */
+		const sector_t sect = bio->bi_sector;
+		const int sps = 1 << HT_SHIFT; /* sectors per slot */
+		const int mask = sps - 1;
+		const sector_t first_sectors = sps - (sect & mask);
+		bp = bio_split(bio,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+				bio_split_pool,
+#endif
+				first_sectors);
+
+		/* we need to get a "reference count" (ap_bio_cnt)
+		 * to avoid races with the disconnect/reconnect/suspend code.
+		 * In case we need to split the bio here, we need to get three references
+		 * atomically, otherwise we might deadlock when trying to submit the
+		 * second one! */
+		inc_ap_bio(mdev, 3);
+
+		D_ASSERT(e_enr == s_enr + 1);
+
+		while (drbd_make_request_common(mdev, &bp->bio1, start_time))
+			inc_ap_bio(mdev, 1);
+
+		while (drbd_make_request_common(mdev, &bp->bio2, start_time))
+			inc_ap_bio(mdev, 1);
+
+		dec_ap_bio(mdev);
+
+		bio_pair_release(bp);
+	}
+	MAKE_REQUEST_RETURN;
+}
+
+/* This is called by bio_add_page().  With this function we reduce
+ * the number of BIOs that span multiple DRBD_MAX_BIO_SIZE units
+ * (formerly AL_EXTENTs).
+ *
+ * we do the calculation within the lower 32bit of the byte offsets,
+ * since we don't care about the actual offset, but only check whether it
+ * would cross "activity log extent" boundaries.
+ *
+ * As long as the BIO is empty we have to allow at least one bvec,
+ * regardless of size and offset.  so the resulting bio may still
+ * cross extent boundaries.  those are dealt with (bio_split) in
+ * drbd_make_request.
+ */
+int drbd_merge_bvec(struct request_queue *q,
+#ifdef HAVE_bvec_merge_data
+		struct bvec_merge_data *bvm,
+#else
+		struct bio *bvm,
+#endif
+		struct bio_vec *bvec)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+	unsigned int bio_offset =
+		(unsigned int)bvm->bi_sector << 9; /* 32 bit */
+	unsigned int bio_size = bvm->bi_size;
+	int limit, backing_limit;
+
+	limit = DRBD_MAX_BIO_SIZE
+	      - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size);
+	if (limit < 0)
+		limit = 0;
+	if (bio_size == 0) {
+		if (limit <= bvec->bv_len)
+			limit = bvec->bv_len;
+	} else if (limit && get_ldev(mdev)) {
+		struct request_queue * const b =
+			mdev->ldev->backing_bdev->bd_disk->queue;
+		if (b->merge_bvec_fn) {
+			backing_limit = b->merge_bvec_fn(b, bvm, bvec);
+			limit = min(limit, backing_limit);
+		}
+		put_ldev(mdev);
+	}
+	return limit;
+}
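+
+/* Worked example (illustrative; assumes DRBD_MAX_BIO_SIZE == 128 KiB, i.e.
+ * 1 << 17): a bio currently 4 KiB long, starting 120 KiB into a 128 KiB
+ * window, gets
+ *   limit = 131072 - ((122880 & 131071) + 4096) = 4096,
+ * so bio_add_page() may grow it by at most one more 4 KiB page before it
+ * would cross the boundary that bio_split handles in drbd_make_request. */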
+
+void request_timer_fn(unsigned long data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+	struct drbd_request *req; /* oldest request */
+	struct list_head *le;
+	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+	unsigned long now;
+
+	if (get_net_conf(mdev)) {
+		if (mdev->state.conn >= C_WF_REPORT_PARAMS)
+			ent = mdev->net_conf->timeout*HZ/10
+				* mdev->net_conf->ko_count;
+		put_net_conf(mdev);
+	}
+	if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
+		dt = mdev->ldev->dc.disk_timeout * HZ / 10;
+		put_ldev(mdev);
+	}
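+	/* Illustrative numbers (an example, not values from any real config):
+	 * with net_conf->timeout == 60 (the knob is in 0.1 s units, as the
+	 * *HZ/10 above implies, so 6 s) and ko_count == 7, ent becomes 42 s
+	 * worth of jiffies; dt == 0 means the disk timeout is disabled, and
+	 * min_not_zero() then yields et == ent. */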
+	et = min_not_zero(dt, ent);
+
+	if (!et)
+		return; /* Recurring timer stopped */
+
+	now = jiffies;
+
+	spin_lock_irq(&mdev->req_lock);
+	le = &mdev->oldest_tle->requests;
+	if (list_empty(le)) {
+		spin_unlock_irq(&mdev->req_lock);
+		mod_timer(&mdev->request_timer, now + et);
+		return;
+	}
+
+	le = le->prev;
+	req = list_entry(le, struct drbd_request, tl_requests);
+
+	/* The request is considered timed out, if
+	 * - we have some effective timeout from the configuration,
+	 *   with above state restrictions applied,
+	 * - the oldest request is waiting for a response from the network
+	 *   or the local disk, respectively,
+	 * - the oldest request is in fact older than the effective timeout,
+	 * - the connection was established (or the disk attached, respectively)
+	 *   for longer than the timeout already.
+	 * Note that for 32bit jiffies and very stable connections/disks,
+	 * we may have a wrap around, which is caught by
+	 *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
+	 *
+	 * Side effect: once per 32bit wrap-around interval, which means every
+	 * ~198 days with 250 HZ, we have a window where the timeout would need
+	 * to expire twice (worst case) to become effective. Good enough.
+	 */
+	if (ent && req->rq_state & RQ_NET_PENDING &&
+		 time_after(now, req->start_time + ent) &&
+		!time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
+		dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
+		_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
+	}
+	if (dt && req->rq_state & RQ_LOCAL_PENDING &&
+		 time_after(now, req->start_time + dt) &&
+		!time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
+		dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
+		__drbd_chk_io_error(mdev, 1);
+	}
+	nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
+	spin_unlock_irq(&mdev->req_lock);
+	mod_timer(&mdev->request_timer, nt);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_req.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_req.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_req.h	2015-01-21 12:02:58.386823859 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_req.h	2015-01-21 12:02:58.386823859 +0300
@@ -0,0 +1,386 @@
+/*
+   drbd_req.h
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+   Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+
+   DRBD is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   DRBD is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_REQ_H
+#define _DRBD_REQ_H
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+/* The request callbacks will be called in irq context by the IDE drivers,
+   and in Softirqs/Tasklets/BH context by the SCSI drivers,
+   and by the receiver and worker in kernel-thread context.
+   Try to get the locking right :) */
+
+/*
+ * Objects of type struct drbd_request only exist on an R_PRIMARY node, and are
+ * associated with IO requests originating from the block layer above us.
+ *
+ * There are quite a few things that may happen to a drbd request
+ * during its lifetime.
+ *
+ *  It will be created.
+ *  It will be marked with the intention to be
+ *    submitted to local disk and/or
+ *    sent via the network.
+ *
+ *  It has to be placed on the transfer log and other housekeeping lists,
+ *  in case we have a network connection.
+ *
+ *  It may be identified as a concurrent (write) request
+ *    and be handled accordingly.
+ *
+ *  It may be handed over to the local disk subsystem.
+ *  It may be completed by the local disk subsystem,
+ *    either successfully or with io-error.
+ *  In case it is a READ request, and it failed locally,
+ *    it may be retried remotely.
+ *
+ *  It may be queued for sending.
+ *  It may be handed over to the network stack,
+ *    which may fail.
+ *  It may be acknowledged by the "peer" according to the wire_protocol in use.
+ *    this may be a negative ack.
+ *  It may receive a faked ack when the network connection is lost and the
+ *  transfer log is cleaned up.
+ *  Sending may be canceled due to network connection loss.
+ *  When it finally has outlived its time,
+ *    corresponding dirty bits in the resync-bitmap may be cleared or set,
+ *    it will be destroyed,
+ *    and completion will be signalled to the originator,
+ *      with or without "success".
+ */
+
+enum drbd_req_event {
+	created,
+	to_be_send,
+	to_be_submitted,
+
+	/* XXX yes, now I am inconsistent...
+	 * these are not "events" but "actions"
+	 * oh, well... */
+	queue_for_net_write,
+	queue_for_net_read,
+	queue_for_send_oos,
+
+	send_canceled,
+	send_failed,
+	handed_over_to_network,
+	oos_handed_to_network,
+	connection_lost_while_pending,
+	read_retry_remote_canceled,
+	recv_acked_by_peer,
+	write_acked_by_peer,
+	write_acked_by_peer_and_sis, /* and set_in_sync */
+	conflict_discarded_by_peer,
+	neg_acked,
+	barrier_acked, /* in protocol A and B */
+	data_received, /* (remote read) */
+
+	read_completed_with_error,
+	read_ahead_completed_with_error,
+	write_completed_with_error,
+	abort_disk_io,
+	completed_ok,
+	resend,
+	fail_frozen_disk_io,
+	restart_frozen_disk_io,
+	nothing, /* for tracing only */
+};
+
+/* encoding of request states for now.  we don't actually need that many bits.
+ * we don't need to do atomic bit operations either, since most of the time we
+ * need to look at the connection state and/or manipulate some lists at the
+ * same time, so we should hold the request lock anyways.
+ */
+enum drbd_req_state_bits {
+	/* 3210
+	 * 0000: no local possible
+	 * 0001: to be submitted
+	 *    UNUSED, we could map: 0011: submitted, completion still pending
+	 * 0110: completed ok
+	 * 0010: completed with error
+	 * 1001: Aborted (before completion)
+	 * 1x10: Aborted and completed -> free
+	 */
+	__RQ_LOCAL_PENDING,
+	__RQ_LOCAL_COMPLETED,
+	__RQ_LOCAL_OK,
+	__RQ_LOCAL_ABORTED,
+
+	/* 87654
+	 * 00000: no network possible
+	 * 00001: to be send
+	 * 00011: to be send, on worker queue
+	 * 00101: sent, expecting recv_ack (B) or write_ack (C)
+	 * 11101: sent,
+	 *        recv_ack (B) or implicit "ack" (A),
+	 *        still waiting for the barrier ack.
+	 *        master_bio may already be completed and invalidated.
+	 * 11100: write_acked (C),
+	 *        data_received (for remote read, any protocol)
+	 *        or finally the barrier ack has arrived (B,A)...
+	 *        request can be freed
+	 * 01100: neg-acked (write, protocol C)
+	 *        or neg-d-acked (read, any protocol)
+	 *        or killed from the transfer log
+	 *        during cleanup after connection loss
+	 *        request can be freed
+	 * 01000: canceled or send failed...
+	 *        request can be freed
+	 */
+
+	/* if "SENT" is not set, yet, this can still fail or be canceled.
+	 * if "SENT" is set already, we still wait for an Ack packet.
+	 * when cleared, the master_bio may be completed.
+ * in (B,A) the request object may still linger on the transfer log
+	 * until the corresponding barrier ack comes in */
+	__RQ_NET_PENDING,
+
+	/* If it is QUEUED, and it is a WRITE, it is also registered in the
+	 * transfer log. Currently we need this flag to avoid conflicts between
+	 * worker canceling the request and tl_clear_barrier killing it from the
+	 * transfer log.  We should restructure the code so this conflict does
+	 * no longer occur. */
+	__RQ_NET_QUEUED,
+
+	/* well, actually only "handed over to the network stack".
+	 *
+	 * TODO can potentially be dropped because of the similar meaning
+	 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
+	 * however it is not exactly the same. before we drop it
+	 * we must ensure that we can tell a request with network part
+	 * from a request without, regardless of what happens to it. */
+	__RQ_NET_SENT,
+
+	/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
+	 * basically this means the corresponding P_BARRIER_ACK was received */
+	__RQ_NET_DONE,
+
+	/* whether or not we know (C) or pretend (B,A) that the write
+	 * was successfully written on the peer.
+	 */
+	__RQ_NET_OK,
+
+	/* peer called drbd_set_in_sync() for this write */
+	__RQ_NET_SIS,
+
+	/* keep this last, it's for the RQ_NET_MASK */
+	__RQ_NET_MAX,
+
+	/* Set when this is a write, clear for a read */
+	__RQ_WRITE,
+
+	/* Should call drbd_al_complete_io() for this request... */
+	__RQ_IN_ACT_LOG,
+};
+
+#define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED   (1UL << __RQ_LOCAL_ABORTED)
+
+#define RQ_LOCAL_MASK      ((RQ_LOCAL_ABORTED << 1)-1)
+
+#define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
+#define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_SENT        (1UL << __RQ_NET_SENT)
+#define RQ_NET_DONE        (1UL << __RQ_NET_DONE)
+#define RQ_NET_OK          (1UL << __RQ_NET_OK)
+#define RQ_NET_SIS         (1UL << __RQ_NET_SIS)
+
+/* 0x3f0 */
+#define RQ_NET_MASK        (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+
+#define RQ_WRITE           (1UL << __RQ_WRITE)
+#define RQ_IN_ACT_LOG      (1UL << __RQ_IN_ACT_LOG)
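+
+/* Resulting values with the enum above (a sketch, easy to verify):
+ *   __RQ_LOCAL_ABORTED == 3  =>  RQ_LOCAL_MASK == 0x00f (bits 0..3)
+ *   __RQ_NET_MAX       == 10 =>  RQ_NET_MASK   == 0x3f0 (bits 4..9)
+ *   RQ_WRITE == 0x800, RQ_IN_ACT_LOG == 0x1000,
+ * so the local, net and write/AL flags occupy disjoint bit ranges. */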
+
+/* For waking up the frozen transfer log, mod_req() has to return whether the
+   request should be counted in the epoch object */
+#define MR_WRITE_SHIFT 0
+#define MR_WRITE       (1 << MR_WRITE_SHIFT)
+#define MR_READ_SHIFT  1
+#define MR_READ        (1 << MR_READ_SHIFT)
+
+/* epoch entries */
+static inline
+struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+	BUG_ON(mdev->ee_hash_s == 0);
+	return mdev->ee_hash +
+		((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
+}
+
+/* transfer log (drbd_request objects) */
+static inline
+struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+	BUG_ON(mdev->tl_hash_s == 0);
+	return mdev->tl_hash +
+		((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
+}
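+
+/* Example (reusing the numbers from the bio_split comment in
+ * drbd_make_request, where HT_SHIFT == 6): sector 262269 hashes to bucket
+ *   (262269 >> 6) % tl_hash_s == 4097 % tl_hash_s,
+ * so runs of 64 consecutive sectors share one slot. */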
+
+/* application reads (drbd_request objects) */
+static inline struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+	return mdev->app_reads_hash
+		+ ((unsigned int)(sector) % APP_R_HSIZE);
+}
+
+/* when we receive the answer for a read request,
+ * verify that we actually know about it */
+static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
+	u64 id, sector_t sector)
+{
+	struct hlist_head *slot = ar_hash_slot(mdev, sector);
+	struct hlist_node *n;
+	struct drbd_request *req;
+
+	hlist_for_each_entry(req, n, slot, collision) {
+		if ((unsigned long)req == (unsigned long)id) {
+			D_ASSERT(req->sector == sector);
+			return req;
+		}
+	}
+	return NULL;
+}
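+
+/* As the comparison above shows, the block_id cookie echoed by the peer is
+ * simply the request's own kernel address sent as an opaque u64, so matching
+ * it back is a pointer compare plus the sector sanity check. */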
+
+static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
+{
+	struct bio *bio;
+	bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+	req->private_bio = bio;
+
+	bio->bi_private  = req;
+	bio->bi_end_io   = drbd_endio_pri;
+	bio->bi_next     = NULL;
+}
+
+static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
+	struct bio *bio_src)
+{
+	struct drbd_request *req =
+		mempool_alloc(drbd_request_mempool, GFP_NOIO);
+	if (likely(req)) {
+		drbd_req_make_private_bio(req, bio_src);
+
+		req->rq_state    = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
+		req->mdev        = mdev;
+		req->master_bio  = bio_src;
+		req->epoch       = 0;
+		req->sector      = bio_src->bi_sector;
+		req->size        = bio_src->bi_size;
+		INIT_HLIST_NODE(&req->collision);
+		INIT_LIST_HEAD(&req->tl_requests);
+		INIT_LIST_HEAD(&req->w.list);
+	}
+	return req;
+}
+
+static inline void drbd_req_free(struct drbd_request *req)
+{
+	mempool_free(req, drbd_request_mempool);
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+	return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
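+
+/* Worked example (illustrative): lengths are in bytes, so l>>9 is a sector
+ * count.  Two 4 KiB requests at sectors 0 and 4 share sectors 4..7:
+ *   overlaps(0, 4096, 4, 4096) == !((0+8 <= 4) || (0 >= 4+8)) == 1
+ * while merely adjacent requests at sectors 0 and 8 do not:
+ *   overlaps(0, 4096, 8, 4096) == !((0+8 <= 8) || (0 >= 8+8)) == 0 */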
+
+/* Short lived temporary struct on the stack.
+ * We could squirrel the error to be returned into
+ * bio->bi_size, or similar. But that would be too ugly. */
+struct bio_and_error {
+	struct bio *bio;
+	int error;
+};
+
+extern void _req_may_be_done(struct drbd_request *req,
+		struct bio_and_error *m);
+extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m);
+extern void complete_master_bio(struct drbd_conf *mdev,
+		struct bio_and_error *m);
+extern void request_timer_fn(unsigned long data);
+extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
+
+/* use this if you don't want to deal with calling complete_master_bio()
+ * outside the spinlock, e.g. when walking some list on cleanup. */
+static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
+{
+	struct drbd_conf *mdev = req->mdev;
+	struct bio_and_error m;
+	int rv;
+
+	/* __req_mod possibly frees req, do not touch req after that! */
+	rv = __req_mod(req, what, &m);
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+
+	return rv;
+}
+
+/* completion of master bio is outside of spinlock.
+ * If you need it irqsave, do it yourself!
+ * Which means: don't use from bio endio callback. */
+static inline int req_mod(struct drbd_request *req,
+		enum drbd_req_event what)
+{
+	struct drbd_conf *mdev = req->mdev;
+	struct bio_and_error m;
+	int rv;
+
+	spin_lock_irq(&mdev->req_lock);
+	rv = __req_mod(req, what, &m);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+
+	return rv;
+}
+
+static inline bool drbd_should_do_remote(union drbd_state s)
+{
+	return s.pdsk == D_UP_TO_DATE ||
+		(s.pdsk >= D_INCONSISTENT &&
+		 s.conn >= C_WF_BITMAP_T &&
+		 s.conn < C_AHEAD);
+	/* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
+	   That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
+	   states. */
+}
+static inline bool drbd_should_send_oos(union drbd_state s)
+{
+	return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
+	/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
+	   since we enter state C_AHEAD only if proto >= 96 */
+}
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_strings.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_strings.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_strings.c	2015-01-21 12:02:58.386823859 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_strings.c	2015-01-21 12:02:58.386823859 +0300
@@ -0,0 +1,115 @@
+/*
+  drbd.h
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/drbd.h>
+
+static const char *drbd_conn_s_names[] = {
+	[C_STANDALONE]       = "StandAlone",
+	[C_DISCONNECTING]    = "Disconnecting",
+	[C_UNCONNECTED]      = "Unconnected",
+	[C_TIMEOUT]          = "Timeout",
+	[C_BROKEN_PIPE]      = "BrokenPipe",
+	[C_NETWORK_FAILURE]  = "NetworkFailure",
+	[C_PROTOCOL_ERROR]   = "ProtocolError",
+	[C_WF_CONNECTION]    = "WFConnection",
+	[C_WF_REPORT_PARAMS] = "WFReportParams",
+	[C_TEAR_DOWN]        = "TearDown",
+	[C_CONNECTED]        = "Connected",
+	[C_STARTING_SYNC_S]  = "StartingSyncS",
+	[C_STARTING_SYNC_T]  = "StartingSyncT",
+	[C_WF_BITMAP_S]      = "WFBitMapS",
+	[C_WF_BITMAP_T]      = "WFBitMapT",
+	[C_WF_SYNC_UUID]     = "WFSyncUUID",
+	[C_SYNC_SOURCE]      = "SyncSource",
+	[C_SYNC_TARGET]      = "SyncTarget",
+	[C_PAUSED_SYNC_S]    = "PausedSyncS",
+	[C_PAUSED_SYNC_T]    = "PausedSyncT",
+	[C_VERIFY_S]         = "VerifyS",
+	[C_VERIFY_T]         = "VerifyT",
+	[C_AHEAD]            = "Ahead",
+	[C_BEHIND]           = "Behind",
+};
+
+static const char *drbd_role_s_names[] = {
+	[R_PRIMARY]   = "Primary",
+	[R_SECONDARY] = "Secondary",
+	[R_UNKNOWN]   = "Unknown"
+};
+
+static const char *drbd_disk_s_names[] = {
+	[D_DISKLESS]     = "Diskless",
+	[D_ATTACHING]    = "Attaching",
+	[D_FAILED]       = "Failed",
+	[D_NEGOTIATING]  = "Negotiating",
+	[D_INCONSISTENT] = "Inconsistent",
+	[D_OUTDATED]     = "Outdated",
+	[D_UNKNOWN]      = "DUnknown",
+	[D_CONSISTENT]   = "Consistent",
+	[D_UP_TO_DATE]   = "UpToDate",
+};
+
+static const char *drbd_state_sw_errors[] = {
+	[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
+	[-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
+	[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
+	[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
+	[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
+	[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
+	[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
+	[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
+	[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
+	[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
+	[-SS_DEVICE_IN_USE] = "Device is held open by someone",
+	[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
+	[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
+	[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
+	[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
+	[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
+	[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
+	[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+};
+
+const char *drbd_conn_str(enum drbd_conns s)
+{
+	/* enums are unsigned... */
+	return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s];
+}
+
+const char *drbd_role_str(enum drbd_role s)
+{
+	return s > R_SECONDARY   ? "TOO_LARGE" : drbd_role_s_names[s];
+}
+
+const char *drbd_disk_str(enum drbd_disk_state s)
+{
+	return s > D_UP_TO_DATE    ? "TOO_LARGE" : drbd_disk_s_names[s];
+}
+
+const char *drbd_set_st_err_str(enum drbd_state_rv err)
+{
+	return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
+	       err > SS_TWO_PRIMARIES ? "TOO_LARGE"
+			: drbd_state_sw_errors[-err];
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_tracing.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_tracing.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_tracing.c	2015-01-21 12:02:58.387823833 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_tracing.c	2015-01-21 12:02:58.387823833 +0300
@@ -0,0 +1,760 @@
+/*
+   drbd_tracing.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/ctype.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include <linux/drbd_tag_magic.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Philipp Reisner, Lars Ellenberg");
+MODULE_DESCRIPTION("DRBD tracepoint probes");
+MODULE_PARM_DESC(trace_mask, "Bitmap of events to trace, see drbd_tracing.c");
+MODULE_PARM_DESC(trace_level, "Current tracing level (changeable in /sys)");
+MODULE_PARM_DESC(trace_devs, "Bitmap of devices to trace (changeable in /sys)");
+
+unsigned int trace_mask = 0;  /* Bitmap of events to trace */
+int trace_level;              /* Current trace level */
+int trace_devs;		      /* Bitmap of devices to trace */
+
+module_param(trace_mask, uint, 0444);
+module_param(trace_level, int, 0644);
+module_param(trace_devs, int, 0644);
+
+enum {
+	TRACE_PACKET  = 0x0001,
+	TRACE_RQ      = 0x0002,
+	TRACE_UUID    = 0x0004,
+	TRACE_RESYNC  = 0x0008,
+	TRACE_EE      = 0x0010,
+	TRACE_UNPLUG  = 0x0020,
+	TRACE_NL      = 0x0040,
+	TRACE_AL_EXT  = 0x0080,
+	TRACE_INT_RQ  = 0x0100,
+	TRACE_MD_IO   = 0x0200,
+	TRACE_EPOCH   = 0x0400,
+};
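+
+/* The trace_mask module parameter is a bitwise OR of the flags above;
+ * e.g. trace_mask=0x0003 on the module command line selects
+ * TRACE_PACKET | TRACE_RQ.  trace_level and trace_devs are registered
+ * 0644 above, so they can also be changed at runtime through
+ * /sys/module/<module>/parameters/ (the exact module name depends on
+ * how this file is built into the kernel). */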
+
+/* Buffer printing support
+ * dbg_print_flags: used for Flags arg to drbd_print_buffer
+ * - DBGPRINT_BUFFADDR; if set, each line starts with the
+ *	 virtual address of the line being output. If clear,
+ *	 each line starts with the offset from the beginning
+ *	 of the buffer. */
+enum dbg_print_flags {
+    DBGPRINT_BUFFADDR = 0x0001,
+};
+
+/* Macro stuff */
+STATIC char *nl_packet_name(int packet_type)
+{
+/* Generate packet type strings */
+#define NL_PACKET(name, number, fields) \
+	[P_ ## name] = # name,
+#define NL_RESPONSE(name, number) \
+	[P_ ## name] = # name,
+#define NL_INTEGER Argh!
+#define NL_BIT Argh!
+#define NL_INT64 Argh!
+#define NL_STRING Argh!
+
+	static char *nl_tag_name[P_nl_after_last_packet] = {
+#include "linux/drbd_nl.h"
+	};
+
+	return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ?
+	    nl_tag_name[packet_type] : "*Unknown*";
+}
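+
+/* Illustration of the generator above: an entry such as
+ *
+ *	NL_PACKET(primary, 1, fields)
+ *
+ * in drbd_nl.h expands, via the NL_PACKET define, to
+ *
+ *	[P_primary] = "primary",
+ *
+ * so nl_tag_name[] gets one string per packet type, while the
+ * NL_INTEGER/NL_BIT/NL_INT64/NL_STRING "Argh!" defines make sure that
+ * field-level macros cannot silently leak into this context.
+ * (Packet name chosen for illustration.) */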
+/* /Macro stuff */
+
+static inline int is_mdev_trace(struct drbd_conf *mdev, unsigned int level)
+{
+	return trace_level >= level && ((1 << mdev_to_minor(mdev)) & trace_devs);
+}
+
+static void probe_drbd_unplug(struct drbd_conf *mdev, char *msg)
+{
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	dev_info(DEV, "%s, ap_bio_count=%d\n", msg, atomic_read(&mdev->ap_bio_cnt));
+}
+
+static void probe_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
+{
+	static char *uuid_str[UI_EXTENDED_SIZE] = {
+		[UI_CURRENT] = "CURRENT",
+		[UI_BITMAP] = "BITMAP",
+		[UI_HISTORY_START] = "HISTORY_START",
+		[UI_HISTORY_END] = "HISTORY_END",
+		[UI_SIZE] = "SIZE",
+		[UI_FLAGS] = "FLAGS",
+	};
+
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	if (index >= UI_EXTENDED_SIZE) {
+		dev_warn(DEV, " uuid_index >= UI_EXTENDED_SIZE\n");
+		return;
+	}
+
+	dev_info(DEV, " uuid[%s] now %016llX\n",
+		 uuid_str[index],
+		 (unsigned long long)mdev->ldev->md.uuid[index]);
+}
+
+static void probe_drbd_md_io(struct drbd_conf *mdev, int rw,
+			     struct drbd_backing_dev *bdev)
+{
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	dev_info(DEV, " %s metadata superblock now\n",
+		 rw == READ ? "Reading" : "Writing");
+}
+
+static void probe_drbd_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg)
+{
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	dev_info(DEV, "EE %s sec=%llus size=%u e=%p\n",
+		 msg, (unsigned long long)e->sector, e->size, e);
+}
+
+static void probe_drbd_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch,
+			     enum epoch_event ev)
+{
+	static char *epoch_event_str[] = {
+		[EV_PUT] = "put",
+		[EV_GOT_BARRIER_NR] = "got_barrier_nr",
+		[EV_BARRIER_DONE] = "barrier_done",
+		[EV_BECAME_LAST] = "became_last",
+		[EV_TRACE_FLUSH] = "issuing_flush",
+		[EV_TRACE_ADD_BARRIER] = "added_barrier",
+		[EV_TRACE_SETTING_BI] = "just set barrier_in_next_epoch",
+	};
+
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	ev &= ~EV_CLEANUP;
+
+	switch (ev) {
+	case EV_TRACE_ALLOC:
+		dev_info(DEV, "Allocate epoch %p/xxxx { } nr_epochs=%d\n", epoch, mdev->epochs);
+		break;
+	case EV_TRACE_FREE:
+		dev_info(DEV, "Freeing epoch %p/%d { size=%d } nr_epochs=%d\n",
+			 epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size),
+			 mdev->epochs);
+		break;
+	default:
+		dev_info(DEV, "Update epoch  %p/%d { size=%d active=%d %c%c n%c%c } ev=%s\n",
+			 epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size),
+			 atomic_read(&epoch->active),
+			 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) ? 'n' : '-',
+			 test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) ? 'b' : '-',
+			 test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) ? 'i' : '-',
+			 test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ? 'd' : '-',
+			 epoch_event_str[ev]);
+	}
+}
+
+static void probe_drbd_netlink(void *data, int is_req)
+{
+	struct cn_msg *msg = data;
+
+	if (is_req) {
+		struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)msg->data;
+
+		printk(KERN_INFO "drbd%d: "
+			 "Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n",
+			 nlp->drbd_minor,
+			 nl_packet_name(nlp->packet_type),
+			 nlp->packet_type,
+			 msg->seq, msg->ack, msg->len);
+	} else {
+		struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply *)msg->data;
+
+		printk(KERN_INFO "drbd%d: "
+		       "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x ret: %d\n",
+		       nlp->minor,
+		       nl_packet_name(nlp->packet_type),
+		       nlp->packet_type,
+		       msg->seq, msg->ack, msg->len, nlp->ret_code);
+	}
+}
+
+static void probe_drbd_actlog(struct drbd_conf *mdev, sector_t sector, char* msg)
+{
+	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	dev_info(DEV, "%s (sec=%llus, al_enr=%u, rs_enr=%d)\n",
+		 msg, (unsigned long long) sector, enr,
+		 (int)BM_SECT_TO_EXT(sector));
+}
+
+/**
+ * drbd_print_buffer() - Hexdump arbitrary binary data to the kernel log
+ * @prefix:	String output at the beginning of each line.
+ * @flags:	Currently only defined flag: DBGPRINT_BUFFADDR; if set, each
+ *		line starts with the virtual address of the line being
+ *		output. If clear, each line starts with the offset from the
+ *		beginning of the buffer.
+ * @size:	Indicates the size of each entry in the buffer. Supported
+ * 		values are sizeof(char), sizeof(short) and sizeof(int)
+ * @buffer:	Start address of buffer
+ * @buffer_va:	Virtual address of start of buffer (normally the same
+ *		as @buffer, but keeping it separate allows it to hold a
+ *		file offset, for example)
+ * @length:	length of buffer
+ */
+static void drbd_print_buffer(const char *prefix, unsigned int flags, int size,
+			      const void *buffer, const void *buffer_va,
+			      unsigned int length)
+
+#define LINE_SIZE       16
+#define LINE_ENTRIES    (int)(LINE_SIZE/size)
+{
+	const unsigned char *pstart;
+	const unsigned char *pstart_va;
+	const unsigned char *pend;
+	char bytes_str[LINE_SIZE*3+8], ascii_str[LINE_SIZE+8];
+	char *pbytes = bytes_str, *pascii = ascii_str;
+	int  offset = 0;
+	long sizemask;
+	int  field_width;
+	int  index;
+	const unsigned char *pend_str;
+	const unsigned char *p;
+	int count;
+
+	/* verify size parameter */
+	if (size != sizeof(char) &&
+	    size != sizeof(short) &&
+	    size != sizeof(int)) {
+		printk(KERN_DEBUG "drbd_print_buffer: "
+			"ERROR invalid size %d\n", size);
+		return;
+	}
+
+	sizemask = size-1;
+	field_width = size*2;
+
+	/* Adjust start/end to be on appropriate boundary for size */
+	buffer = (const char *)((long)buffer & ~sizemask);
+	pend   = (const unsigned char *)
+		(((long)buffer + length + sizemask) & ~sizemask);
+
+	if (flags & DBGPRINT_BUFFADDR) {
+		/* Move start back to nearest multiple of line size,
+		 * if printing address. This results in nicely formatted output
+		 * with addresses being on line size (16) byte boundaries */
+		pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1));
+	} else {
+		pstart = (const unsigned char *)buffer;
+	}
+
+	/* Set value of start VA to print if addresses asked for */
+	pstart_va = (const unsigned char *)buffer_va
+		 - ((const unsigned char *)buffer-pstart);
+
+	/* Calculate end position to nicely align right hand side */
+	pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1));
+
+	/* Init strings */
+	*pbytes = *pascii = '\0';
+
+	/* Start at beginning of first line */
+	p = pstart;
+	count = 0;
+
+	while (p < pend_str) {
+		if (p < (const unsigned char *)buffer || p >= pend) {
+			/* Before start of buffer or after end - print spaces */
+			pbytes += sprintf(pbytes, "%*c ", field_width, ' ');
+			pascii += sprintf(pascii, "%*c", size, ' ');
+			p += size;
+		} else {
+			/* Add hex and ascii to strings */
+			int val;
+			switch (size) {
+			default:
+			case 1:
+				val = *(unsigned char *)p;
+				break;
+			case 2:
+				val = *(unsigned short *)p;
+				break;
+			case 4:
+				val = *(unsigned int *)p;
+				break;
+			}
+
+			pbytes += sprintf(pbytes, "%0*x ", field_width, val);
+
+			for (index = size; index; index--) {
+				*pascii++ = isprint(*p) ? *p : '.';
+				p++;
+			}
+		}
+
+		count++;
+
+		if (count == LINE_ENTRIES || p >= pend_str) {
+			/* Null terminate and print record */
+			*pascii = '\0';
+			printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n",
+			       prefix,
+			       (flags & DBGPRINT_BUFFADDR)
+			       ? (long)pstart_va:(long)offset,
+			       LINE_ENTRIES*(field_width+1), bytes_str,
+			       LINE_SIZE, ascii_str);
+
+			/* Move onto next line */
+			pstart_va += (p-pstart);
+			pstart = p;
+			count  = 0;
+			offset += LINE_SIZE;
+
+			/* Re-init strings */
+			pbytes = bytes_str;
+			pascii = ascii_str;
+			*pbytes = *pascii = '\0';
+		}
+	}
+}
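+
+/* A full line for size = sizeof(char) and flags = 0 looks roughly like
+ * this (offsets, not addresses, since DBGPRINT_BUFFADDR is clear):
+ *
+ *	00000000: 48 65 6c 6c 6f 20 77 6f 72 6c 64 00 00 00 00 00 |Hello world.....|
+ *
+ * i.e. the "%8.8lx" offset, LINE_SIZE hex entries, then the ASCII
+ * rendering with non-printable bytes shown as '.'.
+ * (Buffer contents illustrative.) */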
+
+static void probe_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, va_list args)
+{
+	char str[256];
+
+	if (!is_mdev_trace(mdev, level))
+		return;
+
+	if (vsnprintf(str, 256, fmt, args) >= 256)
+		str[255] = 0;
+
+	printk(KERN_INFO "%s %s: %s", dev_driver_string(disk_to_dev(mdev->vdisk)),
+	       dev_name(disk_to_dev(mdev->vdisk)), str);
+}
+
+static void probe_drbd_bio(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete,
+			   struct drbd_request *r)
+{
+#if defined(CONFIG_LBDAF) || defined(CONFIG_LBD)
+#define SECTOR_FORMAT "%Lx"
+#else
+#define SECTOR_FORMAT "%lx"
+#endif
+#define SECTOR_SHIFT 9
+
+	unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT);
+	char *faddr = (char *)(lowaddr);
+	char rb[sizeof(void *)*2+6] = { 0, };
+	struct bio_vec *bvec;
+	int segno;
+
+	const int rw = bio->bi_rw;
+	const int biorw      = (rw & (RW_MASK|RWA_MASK));
+	const int biobarrier = (rw & (1<<BIO_RW_BARRIER));
+#ifdef BIO_RW_SYNC
+	const int biosync = (rw & (1<<BIO_RW_SYNC));
+#else
+	const int biosync = (rw & ((1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG)));
+#endif
+
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	if (r)
+		sprintf(rb, "Req:%p ", r);
+
+	dev_info(DEV, "%s %s:%s%s%s Bio:%p %s- %soffset " SECTOR_FORMAT ", size %x\n",
+		 complete ? "<<<" : ">>>",
+		 pfx,
+		 biorw == WRITE ? "Write" : "Read",
+		 biobarrier ? " : B" : "",
+		 biosync ? " : S" : "",
+		 bio,
+		 rb,
+		 complete ? (bio_flagged(bio, BIO_UPTODATE) ? "Success, " : "Failed, ") : "",
+		 bio->bi_sector << SECTOR_SHIFT,
+		 bio->bi_size);
+
+	if (trace_level >= TRACE_LVL_METRICS &&
+	    ((biorw == WRITE) ^ complete)) {
+		printk(KERN_DEBUG "  ind     page   offset   length\n");
+		bio_for_each_segment(bvec, bio, segno) {
+			printk(KERN_DEBUG "  [%d] %p %8.8x %8.8x\n", segno,
+			       bvec->bv_page, bvec->bv_offset, bvec->bv_len);
+
+			if (trace_level >= TRACE_LVL_ALL) {
+				char *bvec_buf;
+				unsigned long flags;
+
+				bvec_buf = bvec_kmap_irq(bvec, &flags);
+
+				drbd_print_buffer("    ", DBGPRINT_BUFFADDR, 1,
+						  bvec_buf,
+						  faddr,
+						  (bvec->bv_len <= 0x80)
+						  ? bvec->bv_len : 0x80);
+
+				bvec_kunmap_irq(bvec_buf, &flags);
+
+				if (bvec->bv_len > 0x40)
+					printk(KERN_DEBUG "    ....\n");
+
+				faddr += bvec->bv_len;
+			}
+		}
+	}
+}
+
+static void probe_drbd_req(struct drbd_request *req, enum drbd_req_event what, char *msg)
+{
+	static const char *rq_event_names[] = {
+		[created] = "created",
+		[to_be_send] = "to_be_send",
+		[to_be_submitted] = "to_be_submitted",
+		[queue_for_net_write] = "queue_for_net_write",
+		[queue_for_net_read] = "queue_for_net_read",
+		[send_canceled] = "send_canceled",
+		[send_failed] = "send_failed",
+		[handed_over_to_network] = "handed_over_to_network",
+		[connection_lost_while_pending] =
+					"connection_lost_while_pending",
+		[recv_acked_by_peer] = "recv_acked_by_peer",
+		[write_acked_by_peer] = "write_acked_by_peer",
+		[neg_acked] = "neg_acked",
+		[conflict_discarded_by_peer] = "conflict_discarded_by_peer",
+		[barrier_acked] = "barrier_acked",
+		[data_received] = "data_received",
+		[read_completed_with_error] = "read_completed_with_error",
+		[read_ahead_completed_with_error] = "reada_completed_with_error",
+		[write_completed_with_error] = "write_completed_with_error",
+		[completed_ok] = "completed_ok",
+	};
+
+	struct drbd_conf *mdev = req->mdev;
+
+	const int rw = (req->master_bio == NULL ||
+			bio_data_dir(req->master_bio) == WRITE) ?
+		'W' : 'R';
+	const unsigned long s = req->rq_state;
+
+	if (what != nothing) {
+		dev_info(DEV, "__req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]);
+	} else {
+		dev_info(DEV, "%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n",
+			 msg, req, rw,
+			 s & RQ_LOCAL_PENDING ? 'p' : '-',
+			 s & RQ_LOCAL_COMPLETED ? 'c' : '-',
+			 s & RQ_LOCAL_OK ? 'o' : '-',
+			 s & RQ_NET_PENDING ? 'p' : '-',
+			 s & RQ_NET_QUEUED ? 'q' : '-',
+			 s & RQ_NET_SENT ? 's' : '-',
+			 s & RQ_NET_DONE ? 'd' : '-',
+			 s & RQ_NET_OK ? 'o' : '-',
+			 req->epoch,
+			 (unsigned long long)req->sector,
+			 req->size,
+			 drbd_conn_str(mdev->state.conn));
+	}
+}
+
+
+#define drbd_peer_str drbd_role_str
+#define drbd_pdsk_str drbd_disk_str
+
+#define PSM(A)							\
+do {								\
+	if (mask.A) {						\
+		int i = snprintf(p, len, " " #A "( %s )",	\
+				 drbd_##A##_str(val.A));	\
+		if (i >= len)					\
+			return op;				\
+		p += i;						\
+		len -= i;					\
+	}							\
+} while (0)
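+
+/* PSM(role), for instance, pastes together to roughly:
+ *
+ *	if (mask.role) {
+ *		int i = snprintf(p, len, " role( %s )",
+ *				 drbd_role_str(val.role));
+ *		...
+ *	}
+ *
+ * so each state field set in @mask is appended as " field( Value )".
+ * The drbd_peer_str/drbd_pdsk_str aliases above exist exactly so that
+ * PSM(peer) and PSM(pdsk) paste to valid function names. */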
+
+STATIC char *dump_st(char *p, int len, union drbd_state mask, union drbd_state val)
+{
+	char *op = p;
+	*p = '\0';
+	PSM(role);
+	PSM(peer);
+	PSM(conn);
+	PSM(disk);
+	PSM(pdsk);
+
+	return op;
+}
+
+#define INFOP(fmt, args...) \
+do { \
+	if (trace_level >= TRACE_LVL_ALL) { \
+		dev_info(DEV, "%s:%d: %s [%d] %s %s " fmt , \
+		     file, line, current->comm, current->pid, \
+		     sockname, recv ? "<<<" : ">>>" , \
+		     ## args); \
+	} else { \
+		dev_info(DEV, "%s %s " fmt, sockname, \
+		     recv ? "<<<" : ">>>" , \
+		     ## args); \
+	} \
+} while (0)
+
+STATIC char *_dump_block_id(u64 block_id, char *buff)
+{
+	if (is_syncer_block_id(block_id))
+		strcpy(buff, "SyncerId");
+	else
+		sprintf(buff, "%llx", (unsigned long long)block_id);
+
+	return buff;
+}
+
+static void probe_drbd_packet(struct drbd_conf *mdev, struct socket *sock,
+			      int recv, union p_polymorph *p, char *file, int line)
+{
+	char *sockname = sock == mdev->meta.socket ? "meta" : "data";
+	int cmd;
+	char tmp[300];
+	union drbd_state m, v;
+
+	cmd = be16_to_cpu(p->header.h80.magic == BE_DRBD_MAGIC ?
+			  p->header.h80.command : p->header.h95.command);
+
+	switch (cmd) {
+	case P_HAND_SHAKE:
+		INFOP("%s (protocol %u-%u)\n", cmdname(cmd),
+			be32_to_cpu(p->handshake.protocol_min),
+			be32_to_cpu(p->handshake.protocol_max));
+		break;
+
+	case P_BITMAP: /* don't report this */
+	case P_COMPRESSED_BITMAP: /* don't report this */
+		break;
+
+	case P_DATA:
+		INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd),
+		      (unsigned long long)be64_to_cpu(p->data.sector),
+		      _dump_block_id(p->data.block_id, tmp),
+		      be32_to_cpu(p->data.seq_num),
+		      be32_to_cpu(p->data.dp_flags)
+			);
+		break;
+
+	case P_DATA_REPLY:
+	case P_RS_DATA_REPLY:
+		INFOP("%s (sector %llus, id %s)\n", cmdname(cmd),
+		      (unsigned long long)be64_to_cpu(p->data.sector),
+		      _dump_block_id(p->data.block_id, tmp)
+			);
+		break;
+
+	case P_RECV_ACK:
+	case P_WRITE_ACK:
+	case P_RS_WRITE_ACK:
+	case P_DISCARD_ACK:
+	case P_NEG_ACK:
+	case P_NEG_RS_DREPLY:
+		INFOP("%s (sector %llus, size %u, id %s, seq %u)\n",
+			cmdname(cmd),
+		      (long long)be64_to_cpu(p->block_ack.sector),
+		      be32_to_cpu(p->block_ack.blksize),
+		      _dump_block_id(p->block_ack.block_id, tmp),
+		      be32_to_cpu(p->block_ack.seq_num)
+			);
+		break;
+
+	case P_DATA_REQUEST:
+	case P_RS_DATA_REQUEST:
+		INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd),
+		      (long long)be64_to_cpu(p->block_req.sector),
+		      be32_to_cpu(p->block_req.blksize),
+		      _dump_block_id(p->block_req.block_id, tmp)
+			);
+		break;
+
+	case P_BARRIER:
+	case P_BARRIER_ACK:
+		INFOP("%s (barrier %u)\n", cmdname(cmd), p->barrier.barrier);
+		break;
+
+	case P_SYNC_PARAM:
+	case P_SYNC_PARAM89:
+		INFOP("%s (rate %u, verify-alg \"%.64s\", csums-alg \"%.64s\")\n",
+			cmdname(cmd), be32_to_cpu(p->rs_param_89.rate),
+			p->rs_param_89.verify_alg, p->rs_param_89.csums_alg);
+		break;
+
+	case P_UUIDS:
+		INFOP("%s Curr:%016llX, Bitmap:%016llX, "
+		      "HisSt:%016llX, HisEnd:%016llX\n",
+		      cmdname(cmd),
+		      (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_CURRENT]),
+		      (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_BITMAP]),
+		      (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_START]),
+		      (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_END]));
+		break;
+
+	case P_SIZES:
+		INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, "
+		      "max bio %x, q order %x)\n",
+		      cmdname(cmd),
+		      (long long)(be64_to_cpu(p->sizes.d_size)>>(20-9)),
+		      (long long)(be64_to_cpu(p->sizes.u_size)>>(20-9)),
+		      (long long)(be64_to_cpu(p->sizes.c_size)>>(20-9)),
+		      be32_to_cpu(p->sizes.max_bio_size),
+		      be32_to_cpu(p->sizes.queue_order_type));
+		break;
+
+	case P_STATE:
+		v.i = be32_to_cpu(p->state.state);
+		m.i = 0xffffffff;
+		dump_st(tmp, sizeof(tmp), m, v);
+		INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp);
+		break;
+
+	case P_STATE_CHG_REQ:
+		m.i = be32_to_cpu(p->req_state.mask);
+		v.i = be32_to_cpu(p->req_state.val);
+		dump_st(tmp, sizeof(tmp), m, v);
+		INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp);
+		break;
+
+	case P_STATE_CHG_REPLY:
+		INFOP("%s (ret %x)\n", cmdname(cmd),
+		      be32_to_cpu(p->req_state_reply.retcode));
+		break;
+
+	case P_PING:
+	case P_PING_ACK:
+		/*
+		 * Don't trace pings at summary level
+		 */
+		if (trace_level < TRACE_LVL_ALL)
+			break;
+		/* fall through... */
+	default:
+		INFOP("%s (%u)\n", cmdname(cmd), cmd);
+		break;
+	}
+}
+
+
+static int __init drbd_trace_init(void)
+{
+	int ret;
+
+	if (trace_mask & TRACE_UNPLUG) {
+		ret = register_trace_drbd_unplug(probe_drbd_unplug);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_UUID) {
+		ret = register_trace_drbd_uuid(probe_drbd_uuid);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_EE) {
+		ret = register_trace_drbd_ee(probe_drbd_ee);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_PACKET) {
+		ret = register_trace_drbd_packet(probe_drbd_packet);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_MD_IO) {
+		ret = register_trace_drbd_md_io(probe_drbd_md_io);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_EPOCH) {
+		ret = register_trace_drbd_epoch(probe_drbd_epoch);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_NL) {
+		ret = register_trace_drbd_netlink(probe_drbd_netlink);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_AL_EXT) {
+		ret = register_trace_drbd_actlog(probe_drbd_actlog);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_RQ) {
+		ret = register_trace_drbd_bio(probe_drbd_bio);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_INT_RQ) {
+		ret = register_trace_drbd_req(probe_drbd_req);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_RESYNC) {
+		ret = register_trace__drbd_resync(probe_drbd_resync);
+		WARN_ON(ret);
+	}
+	return 0;
+}
+
+module_init(drbd_trace_init);
+
+static void __exit drbd_trace_exit(void)
+{
+	if (trace_mask & TRACE_UNPLUG)
+		unregister_trace_drbd_unplug(probe_drbd_unplug);
+	if (trace_mask & TRACE_UUID)
+		unregister_trace_drbd_uuid(probe_drbd_uuid);
+	if (trace_mask & TRACE_EE)
+		unregister_trace_drbd_ee(probe_drbd_ee);
+	if (trace_mask & TRACE_PACKET)
+		unregister_trace_drbd_packet(probe_drbd_packet);
+	if (trace_mask & TRACE_MD_IO)
+		unregister_trace_drbd_md_io(probe_drbd_md_io);
+	if (trace_mask & TRACE_EPOCH)
+		unregister_trace_drbd_epoch(probe_drbd_epoch);
+	if (trace_mask & TRACE_NL)
+		unregister_trace_drbd_netlink(probe_drbd_netlink);
+	if (trace_mask & TRACE_AL_EXT)
+		unregister_trace_drbd_actlog(probe_drbd_actlog);
+	if (trace_mask & TRACE_RQ)
+		unregister_trace_drbd_bio(probe_drbd_bio);
+	if (trace_mask & TRACE_INT_RQ)
+		unregister_trace_drbd_req(probe_drbd_req);
+	if (trace_mask & TRACE_RESYNC)
+		unregister_trace__drbd_resync(probe_drbd_resync);
+
+	tracepoint_synchronize_unregister();
+}
+
+module_exit(drbd_trace_exit);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_tracing.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_tracing.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_tracing.h	2015-01-21 12:02:58.387823833 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_tracing.h	2015-01-21 12:02:58.387823833 +0300
@@ -0,0 +1,87 @@
+/*
+   drbd_tracing.h
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#ifndef DRBD_TRACING_H
+#define DRBD_TRACING_H
+
+#include <linux/tracepoint.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+enum {
+	TRACE_LVL_ALWAYS = 0,
+	TRACE_LVL_SUMMARY,
+	TRACE_LVL_METRICS,
+	TRACE_LVL_ALL,
+	TRACE_LVL_MAX
+};
+
+DECLARE_TRACE(drbd_unplug,
+	TP_PROTO(struct drbd_conf *mdev, char* msg),
+	TP_ARGS(mdev, msg));
+
+DECLARE_TRACE(drbd_uuid,
+	TP_PROTO(struct drbd_conf *mdev, enum drbd_uuid_index index),
+	TP_ARGS(mdev, index));
+
+DECLARE_TRACE(drbd_ee,
+	TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg),
+	TP_ARGS(mdev, e, msg));
+
+DECLARE_TRACE(drbd_md_io,
+	TP_PROTO(struct drbd_conf *mdev, int rw, struct drbd_backing_dev *bdev),
+	TP_ARGS(mdev, rw, bdev));
+
+DECLARE_TRACE(drbd_epoch,
+	TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev),
+	TP_ARGS(mdev, epoch, ev));
+
+DECLARE_TRACE(drbd_netlink,
+	TP_PROTO(void *data, int is_req),
+	TP_ARGS(data, is_req));
+
+DECLARE_TRACE(drbd_actlog,
+	TP_PROTO(struct drbd_conf *mdev, sector_t sector, char* msg),
+	TP_ARGS(mdev, sector, msg));
+
+DECLARE_TRACE(drbd_bio,
+	TP_PROTO(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete,
+		 struct drbd_request *r),
+	TP_ARGS(mdev, pfx, bio, complete, r));
+
+DECLARE_TRACE(drbd_req,
+	TP_PROTO(struct drbd_request *req, enum drbd_req_event what, char *msg),
+	      TP_ARGS(req, what, msg));
+
+DECLARE_TRACE(drbd_packet,
+	TP_PROTO(struct drbd_conf *mdev, struct socket *sock,
+		 int recv, union p_polymorph *p, char *file, int line),
+	TP_ARGS(mdev, sock, recv, p, file, line));
+
+DECLARE_TRACE(_drbd_resync,
+	TP_PROTO(struct drbd_conf *mdev, int level, const char *fmt, va_list args),
+	TP_ARGS(mdev, level, fmt, args));
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_vli.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_vli.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_vli.h	2015-01-21 12:02:58.387823833 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_vli.h	2015-01-21 12:02:58.387823833 +0300
@@ -0,0 +1,351 @@
+/*
+-*- linux-c -*-
+   drbd_vli.h
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_VLI_H
+#define _DRBD_VLI_H
+
+/*
+ * At a granularity of 4KiB storage represented per bit,
+ * and storage sizes of several TiB,
+ * and possibly small-bandwidth replication,
+ * the bitmap transfer time can take much too long,
+ * if transmitted in plain text.
+ *
+ * We try to reduce the transferred bitmap information
+ * by encoding runlengths of bit polarity.
+ *
+ * We never actually need to encode a "zero" (runlengths are positive).
+ * But then we have to store the value of the first bit.
+ * The first bit of information thus shall encode if the first runlength
+ * gives the number of set or unset bits.
+ *
+ * We assume that large areas are either completely set or unset,
+ * which gives good compression with any runlength method,
+ * even when encoding the runlength as fixed size 32bit/64bit integers.
+ *
+ * Still, there may be areas where the polarity flips every few bits,
+ * and encoding the runlength sequence of those areas with fixed-size
+ * integers would be much worse than plaintext.
+ *
+ * We want to encode small runlength values with minimum code length,
+ * while still being able to encode a Huge run of all zeros.
+ *
+ * Thus we need a Variable Length Integer encoding, VLI.
+ *
+ * For some cases, we produce more code bits than plaintext input.
+ * We need to send incompressible chunks as plaintext, skip over them
+ * and then see if the next chunk compresses better.
+ *
+ * We don't care too much about "excellent" compression ratio for large
+ * runlengths (all set/all clear): whether we achieve a factor of 100
+ * or 1000 is not that much of an issue.
+ * We do not want to waste too much on short runlengths in the "noisy"
+ * parts of the bitmap, though.
+ *
+ * There are endless variants of VLI; we experimented with:
+ *  * simple byte-based
+ *  * various bit based with different code word length.
+ *
+ * To avoid yet another configuration parameter (choice of bitmap compression
+ * algorithm) which was difficult to explain and tune, we just chose the one
+ * variant that turned out best in all test cases.
+ * Based on real world usage patterns, with device sizes ranging from a few GiB
+ * to several TiB, file server/mailserver/webserver/mysql/postgres,
+ * mostly idle to really busy, the all time winner (though sometimes only
+ * marginally better) is:
+ */
+
+/*
+ * encoding is "visualised" as
+ * __little endian__ bitstream, least significant bit first (leftmost)
+ *
+ * this particular encoding is chosen so that the prefix code
+ * starts as unary encoding the level, then modified so that
+ * 10 levels can be described in 8bit, with minimal overhead
+ * for the smaller levels.
+ *
+ * The number of data bits follows the Fibonacci sequence, with the exception
+ * of the last level (+1 data bit, so it makes 64bit total).  The only worse
+ * code when encoding bit polarity runlength is 1 plain bit => 2 code bits.
+prefix    data bits                                    max val  Nº data bits
+0 x                                                         0x2            1
+10 x                                                        0x4            1
+110 xx                                                      0x8            2
+1110 xxx                                                   0x10            3
+11110 xxx xx                                               0x30            5
+111110 xx xxxxxx                                          0x130            8
+11111100  xxxxxxxx xxxxx                                 0x2130           13
+11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
+11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
+11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
+ * maximum encodable value: 0x100000400202130 == 2**56 + some */
+
+/* compression "table":
+ transmitted   x                                0.29
+ as plaintext x                                  ........................
+             x                                   ........................
+            x                                    ........................
+           x    0.59                         0.21........................
+          x      ........................................................
+         x       .. c ...................................................
+        x    0.44.. o ...................................................
+       x .......... d ...................................................
+      x  .......... e ...................................................
+     X.............   ...................................................
+    x.............. b ...................................................
+2.0x............... i ...................................................
+ #X................ t ...................................................
+ #................. s ...........................  plain bits  ..........
+-+-----------------------------------------------------------------------
+ 1             16              32                              64
+*/
+
+/* LEVEL: (total bits, prefix bits, prefix value),
+ * sorted ascending by number of total bits.
+ * The rest of the code table is calculated at compiletime from this. */
+
+/* fibonacci data 1, 1, ... */
+#define VLI_L_1_1() do { \
+	LEVEL( 2, 1, 0x00); \
+	LEVEL( 3, 2, 0x01); \
+	LEVEL( 5, 3, 0x03); \
+	LEVEL( 7, 4, 0x07); \
+	LEVEL(10, 5, 0x0f); \
+	LEVEL(14, 6, 0x1f); \
+	LEVEL(21, 8, 0x3f); \
+	LEVEL(29, 8, 0x7f); \
+	LEVEL(42, 8, 0xbf); \
+	LEVEL(64, 8, 0xff); \
+	} while (0)
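+
+/* Worked example of the table above: LEVEL(2, 1, 0x00) covers the two
+ * smallest runlengths (adj = 1):
+ *
+ *	in = 1  ->  code 0b00  (prefix 0, data bit 0), 2 bits total
+ *	in = 2  ->  code 0b10  (prefix 0, data bit 1), 2 bits total
+ *
+ * LEVEL(3, 2, 0x01) then starts at adj = 3:
+ *
+ *	in = 3  ->  code 0b001 (prefix 10, data bit 0), 3 bits total
+ *	in = 4  ->  code 0b101 (prefix 10, data bit 1), 3 bits total
+ *
+ * Remember the bitstream is little endian: the prefix occupies the least
+ * significant bits, so 0b101 is transmitted as "1", "0", "1", LSB first. */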
+
+/* finds a suitable level to decode the least significant part of in.
+ * returns number of bits consumed.
+ *
+ * BUG() for bad input, as that would mean a buggy code table. */
+static inline int vli_decode_bits(u64 *out, const u64 in)
+{
+	u64 adj = 1;
+
+#define LEVEL(t,b,v)					\
+	do {						\
+		if ((in & ((1 << b) -1)) == v) {	\
+			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
+			return t;			\
+		}					\
+		adj += 1ULL << (t - b);			\
+	} while (0)
+
+	VLI_L_1_1();
+
+	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
+	BUG();
+#undef LEVEL
+}
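+
+/* Continuing the example above: vli_decode_bits(&out, 0b101) matches
+ * LEVEL(3, 2, 0x01) because (0b101 & 0b11) == 0x01; it extracts the data
+ * bit (0b101 >> 2) == 1, adds adj == 3, stores out = 4 and returns 3
+ * consumed bits -- the exact inverse of __vli_encode_bits() below. */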
+
+/* return number of code bits needed,
+ * or negative error number */
+static inline int __vli_encode_bits(u64 *out, const u64 in)
+{
+	u64 max = 0;
+	u64 adj = 1;
+
+	if (in == 0)
+		return -EINVAL;
+
+#define LEVEL(t,b,v) do {		\
+		max += 1ULL << (t - b);	\
+		if (in <= max) {	\
+			if (out)	\
+				*out = ((in - adj) << b) | v;	\
+			return t;	\
+		}			\
+		adj = max + 1;		\
+	} while (0)
+
+	VLI_L_1_1();
+
+	return -EOVERFLOW;
+#undef LEVEL
+}
+
+#undef VLI_L_1_1
+
+/* code from here down is independent of the actually used bit code */
+
+/*
+ * Code length is determined by some unique (e.g. unary) prefix.
+ * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
+ * not a byte stream.
+ */
+
+/* for the bitstream, we need a cursor */
+struct bitstream_cursor {
+	/* the current byte */
+	u8 *b;
+	/* the current bit within *b, normalized: 0..7 */
+	unsigned int bit;
+};
+
+/* initialize cursor to point to first bit of stream */
+static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
+{
+	cur->b = s;
+	cur->bit = 0;
+}
+
+/* advance cursor by that many bits; maximum expected input value: 64,
+ * but depending on VLI implementation, it may be more. */
+static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
+{
+	bits += cur->bit;
+	cur->b = cur->b + (bits >> 3);
+	cur->bit = bits & 7;
+}
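+
+/* E.g. a cursor at byte b, bit 6, advanced by 13 bits: bits becomes 19,
+ * so the cursor moves to byte b + 2 (19 >> 3), bit 3 (19 & 7). */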
+
+/* the bitstream itself knows its length */
+struct bitstream {
+	struct bitstream_cursor cur;
+	unsigned char *buf;
+	size_t buf_len;		/* in bytes */
+
+	/* for input stream:
+	 * number of trailing 0 bits for padding
+	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
+	unsigned int pad_bits;
+};
+
+static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
+{
+	bs->buf = s;
+	bs->buf_len = len;
+	bs->pad_bits = pad_bits;
+	bitstream_cursor_reset(&bs->cur, bs->buf);
+}
+
+static inline void bitstream_rewind(struct bitstream *bs)
+{
+	bitstream_cursor_reset(&bs->cur, bs->buf);
+	memset(bs->buf, 0, bs->buf_len);
+}
+
+/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
+ * Ignores "pad_bits".
+ * Returns zero if bits == 0 (nothing to do).
+ * Returns number of bits used if successful.
+ *
+ * If there is not enough room left in bitstream,
+ * leaves bitstream unchanged and returns -ENOBUFS.
+ */
+static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
+{
+	unsigned char *b = bs->cur.b;
+	unsigned int tmp;
+
+	if (bits == 0)
+		return 0;
+
+	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
+		return -ENOBUFS;
+
+	/* paranoia: strip off hi bits; they should not be set anyways. */
+	if (bits < 64)
+		val &= ~0ULL >> (64 - bits);
+
+	*b++ |= (val & 0xff) << bs->cur.bit;
+
+	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
+		*b++ |= (val >> tmp) & 0xff;
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	return bits;
+}
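+
+/* Worked example: putting 5 bits of val = 0b10110 with the cursor at
+ * bit 6 of byte b.  The first statement ORs val << 6 into *b, which
+ * (after truncation to a byte) deposits val bits 0..1 into bits 6..7 of
+ * b[0]; the loop then runs once with tmp = 2 and ORs (val >> 2) & 0xff,
+ * i.e. val bits 2..4, into bits 0..2 of b[1].  The cursor ends at
+ * byte b + 1, bit 3 (6 + 5 = 11 = 8 + 3). */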
+
+/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
+ *
+ * If more than 64 bits are requested, returns -EINVAL and leaves *out unchanged.
+ *
+ * If there are fewer than the requested number of valid bits left in the
+ * bitstream, still fetches all available bits.
+ *
+ * Returns number of actually fetched bits.
+ */
+static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
+{
+	u64 val;
+	unsigned int n;
+
+	if (bits > 64)
+		return -EINVAL;
+
+	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
+		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
+			- bs->cur.bit - bs->pad_bits;
+
+	if (bits == 0) {
+		*out = 0;
+		return 0;
+	}
+
+	/* get the high bits */
+	val = 0;
+	n = (bs->cur.bit + bits + 7) >> 3;
+	/* n may be at most 9, if cur.bit + bits > 64 */
+	/* which means this copies at most 8 bytes */
+	if (n) {
+		memcpy(&val, bs->cur.b+1, n - 1);
+		val = le64_to_cpu(val) << (8 - bs->cur.bit);
+	}
+
+	/* we still need the low bits */
+	val |= bs->cur.b[0] >> bs->cur.bit;
+
+	/* and mask out bits we don't want */
+	val &= ~0ULL >> (64 - bits);
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	*out = val;
+
+	return bits;
+}
+
+/* encodes @in as vli into @bs;
+ *
+ * return values
+ *  > 0: number of bits successfully stored in bitstream
+ * -ENOBUFS @bs is full
+ * -EINVAL input zero (invalid)
+ * -EOVERFLOW input too large for this vli code (invalid)
+ */
+static inline int vli_encode_bits(struct bitstream *bs, u64 in)
+{
+	u64 code = code; /* self-init quiets a bogus "may be used uninitialized" warning */
+	int bits = __vli_encode_bits(&code, in);
+
+	if (bits <= 0)
+		return bits;
+
+	return bitstream_put_bits(bs, code, bits);
+}
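+
+/* Usage sketch (illustrative -- the buffer size and runlengths are
+ * invented for the example):
+ *
+ *	u64 rl[] = { 1, 4, 300, 7 };
+ *	unsigned char buf[64];
+ *	struct bitstream bs;
+ *	int i, bits;
+ *
+ *	bitstream_init(&bs, buf, sizeof(buf), 0);
+ *	for (i = 0; i < 4; i++) {
+ *		bits = vli_encode_bits(&bs, rl[i]);
+ *		if (bits <= 0)
+ *			break;	// -ENOBUFS: chunk full; -EOVERFLOW: send plain
+ *	}
+ *
+ * This is how a bitmap sender can pack polarity runlengths until the
+ * chunk is full, falling back to plaintext as described at the top. */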
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_worker.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_worker.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_worker.c	2015-01-21 12:02:58.388823806 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_worker.c	2015-01-21 12:02:58.388823806 +0300
@@ -0,0 +1,1793 @@
+/*
+   drbd_worker.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_tracing.h"
+
+#ifdef HAVE_LINUX_SCATTERLIST_H
+/* 2.6.11 (suse 9.3, fc4) does not include requisites
+ * from linux/scatterlist.h :( */
+#include <asm/scatterlist.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#endif
+
+STATIC int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
+
+
+
+/* endio handlers:
+ *   drbd_md_io_complete (defined here)
+ *   drbd_endio_pri (defined here)
+ *   drbd_endio_sec (defined here)
+ *   bm_async_io_complete (defined in drbd_bitmap.c)
+ *
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
+
+
+/* About the global_state_lock
+   Each state transition on a device holds a read lock. In case we have
+   to evaluate the sync after dependencies, we grab a write lock, because
+   we need stable states on all devices for that.  */
+rwlock_t global_state_lock;
+
+/* used for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io()
+ */
+BIO_ENDIO_TYPE drbd_md_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error)
+{
+	struct drbd_md_io *md_io;
+	struct drbd_conf *mdev;
+
+	BIO_ENDIO_FN_START;
+
+	md_io = (struct drbd_md_io *)bio->bi_private;
+	mdev = container_of(md_io, struct drbd_conf, md_io);
+
+	md_io->error = error;
+
+	trace_drbd_bio(mdev, "Md", bio, 1, NULL);
+
+	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+	 * to timeout on the lower level device, and eventually detach from it.
+	 * If this io completion runs after that timeout expired, this
+	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
+	 * During normal operation, this only puts that extra reference
+	 * down to 1 again.
+	 * Make sure we first drop the reference, and only then signal
+	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+	 * next drbd_md_sync_page_io(), that we trigger the
+	 * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there.
+	 */
+	drbd_md_put_buffer(mdev);
+	md_io->done = 1;
+	wake_up(&mdev->misc_wait);
+	bio_put(bio);
+	put_ldev(mdev);
+
+	BIO_ENDIO_FN_RETURN;
+}
+
+/* reads on behalf of the partner,
+ * "submitted" by the receiver
+ */
+void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
+{
+	unsigned long flags = 0;
+	struct drbd_conf *mdev = e->mdev;
+
+	D_ASSERT(e->block_id != ID_VACANT);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	mdev->read_cnt += e->size >> 9;
+	list_del(&e->w.list);
+	if (list_empty(&mdev->read_ee))
+		wake_up(&mdev->ee_wait);
+	if (test_bit(__EE_WAS_ERROR, &e->flags))
+		__drbd_chk_io_error(mdev, false);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	trace_drbd_ee(mdev, e, "read completed");
+	drbd_queue_work(&mdev->data.work, &e->w);
+	put_ldev(mdev);
+}
+
+static int is_failed_barrier(int ee_flags)
+{
+	return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
+			== (EE_IS_BARRIER|EE_WAS_ERROR);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver, final stage.  */
+static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
+{
+	unsigned long flags = 0;
+	struct drbd_conf *mdev = e->mdev;
+	sector_t e_sector;
+	int do_wake;
+	int is_syncer_req;
+	int do_al_complete_io;
+
+	/* if this is a failed barrier request, disable use of barriers,
+	 * and schedule for resubmission */
+	if (is_failed_barrier(e->flags)) {
+		drbd_bump_write_ordering(mdev, WO_bdev_flush);
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		list_del(&e->w.list);
+		e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
+		e->w.cb = w_e_reissue;
+		/* put_ldev actually happens below, once we come here again. */
+		__release(local);
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+		drbd_queue_work(&mdev->data.work, &e->w);
+		return;
+	}
+
+	D_ASSERT(e->block_id != ID_VACANT);
+
+	/* after we moved e to done_ee,
+	 * we may no longer access it,
+	 * it may be freed/reused already!
+	 * (as soon as we release the req_lock) */
+	e_sector = e->sector;
+	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
+	is_syncer_req = is_syncer_block_id(e->block_id);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	mdev->writ_cnt += e->size >> 9;
+	list_del(&e->w.list); /* has been on active_ee or sync_ee */
+	list_add_tail(&e->w.list, &mdev->done_ee);
+
+	trace_drbd_ee(mdev, e, "write completed");
+
+	/* No hlist_del_init(&e->collision) here, we did not send the Ack yet,
+	 * neither did we wake possibly waiting conflicting requests.
+	 * done from "drbd_process_done_ee" within the appropriate w.cb
+	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
+
+	do_wake = is_syncer_req
+		? list_empty(&mdev->sync_ee)
+		: list_empty(&mdev->active_ee);
+
+	if (test_bit(__EE_WAS_ERROR, &e->flags))
+		__drbd_chk_io_error(mdev, false);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	if (is_syncer_req)
+		drbd_rs_complete_io(mdev, e_sector);
+
+	if (do_wake)
+		wake_up(&mdev->ee_wait);
+
+	if (do_al_complete_io)
+		drbd_al_complete_io(mdev, e_sector);
+
+	wake_asender(mdev);
+	put_ldev(mdev);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+BIO_ENDIO_TYPE drbd_endio_sec BIO_ENDIO_ARGS(struct bio *bio, int error)
+{
+	struct drbd_epoch_entry *e = bio->bi_private;
+	struct drbd_conf *mdev = e->mdev;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+	int is_write = bio_data_dir(bio) == WRITE;
+
+	BIO_ENDIO_FN_START;
+	if (error && DRBD_ratelimit(5*HZ, 5))
+		dev_warn(DEV, "%s: error=%d s=%llus\n",
+				is_write ? "write" : "read", error,
+				(unsigned long long)e->sector);
+	if (!error && !uptodate) {
+		if (DRBD_ratelimit(5*HZ, 5))
+			dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
+					is_write ? "write" : "read",
+					(unsigned long long)e->sector);
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+	if (error)
+		set_bit(__EE_WAS_ERROR, &e->flags);
+
+	trace_drbd_bio(mdev, "Sec", bio, 1, NULL);
+	bio_put(bio); /* no need for the bio anymore */
+	if (atomic_dec_and_test(&e->pending_bios)) {
+		if (is_write)
+			drbd_endio_write_sec_final(e);
+		else
+			drbd_endio_read_sec_final(e);
+	}
+	BIO_ENDIO_FN_RETURN;
+}
+
+/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
+ */
+BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error)
+{
+	unsigned long flags;
+	struct drbd_request *req = bio->bi_private;
+	struct drbd_conf *mdev = req->mdev;
+	struct bio_and_error m;
+	enum drbd_req_event what;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+	BIO_ENDIO_FN_START;
+	if (!error && !uptodate) {
+		dev_warn(DEV, "p %s: setting error to -EIO\n",
+			 bio_data_dir(bio) == WRITE ? "write" : "read");
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+	trace_drbd_bio(mdev, "Pri", bio, 1, NULL);
+
+	/* to avoid recursion in __req_mod */
+	if (unlikely(error)) {
+		what = (bio_data_dir(bio) == WRITE)
+			? write_completed_with_error
+			: (bio_rw(bio) == READ)
+			  ? read_completed_with_error
+			  : read_ahead_completed_with_error;
+	} else
+		what = completed_ok;
+
+	bio_put(req->private_bio);
+	req->private_bio = ERR_PTR(error);
+
+	/* not req_mod(), we need irqsave here! */
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	__req_mod(req, what, &m);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+	put_ldev(mdev);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+	BIO_ENDIO_FN_RETURN;
+}
+
+int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+	/* We should not detach for read io-error,
+	 * but try to WRITE the P_DATA_REPLY to the failed location,
+	 * to give the disk the chance to relocate that block */
+
+	spin_lock_irq(&mdev->req_lock);
+	if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
+		_req_mod(req, read_retry_remote_canceled);
+		spin_unlock_irq(&mdev->req_lock);
+		return 1;
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	return w_send_read_req(mdev, w, 0);
+}
+
+void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
+{
+	struct hash_desc desc;
+	struct scatterlist sg;
+	struct page *page = e->pages;
+	struct page *tmp;
+	unsigned len;
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	sg_init_table(&sg, 1);
+	crypto_hash_init(&desc);
+
+	while ((tmp = page_chain_next(page))) {
+		/* all but the last page will be fully used */
+		sg_set_page(&sg, page, PAGE_SIZE, 0);
+		crypto_hash_update(&desc, &sg, sg.length);
+		page = tmp;
+	}
+	/* and now the last, possibly only partially used page */
+	len = e->size & (PAGE_SIZE - 1);
+	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
+	crypto_hash_update(&desc, &sg, sg.length);
+	crypto_hash_final(&desc, digest);
+}
+
+void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
+{
+	struct hash_desc desc;
+	struct scatterlist sg;
+	struct bio_vec *bvec;
+	int i;
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	sg_init_table(&sg, 1);
+	crypto_hash_init(&desc);
+
+	bio_for_each_segment(bvec, bio, i) {
+		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
+		crypto_hash_update(&desc, &sg, sg.length);
+	}
+	crypto_hash_final(&desc, digest);
+}
+
+/* TODO merge common code with w_e_end_ov_req */
+int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int digest_size;
+	void *digest;
+	int ok = 1;
+
+	D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
+
+	if (unlikely(cancel))
+		goto out;
+
+	if (likely((e->flags & EE_WAS_ERROR) != 0))
+		goto out;
+
+	digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+	digest = kmalloc(digest_size, GFP_NOIO);
+	if (digest) {
+		sector_t sector = e->sector;
+		unsigned int size = e->size;
+		drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
+		/* Free e and pages before send.
+		 * In case we block on congestion, we could otherwise run into
+		 * some distributed deadlock, if the other side blocks on
+		 * congestion as well, because our receiver blocks in
+		 * drbd_pp_alloc due to pp_in_use > max_buffers. */
+		drbd_free_ee(mdev, e);
+		e = NULL;
+		inc_rs_pending(mdev);
+		ok = drbd_send_drequest_csum(mdev, sector, size,
+					     digest, digest_size,
+					     P_CSUM_RS_REQUEST);
+		kfree(digest);
+	} else {
+		dev_err(DEV, "kmalloc() of digest failed.\n");
+		ok = 0;
+	}
+
+out:
+	if (e)
+		drbd_free_ee(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
+	return ok;
+}
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+STATIC int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	struct drbd_epoch_entry *e;
+
+	if (!get_ldev(mdev))
+		return -EIO;
+
+	if (drbd_rs_should_slow_down(mdev, sector))
+		goto defer;
+
+	/* GFP_TRY, because if there is no memory available right now, this may
+	 * be rescheduled for later. It is "only" background resync, after all. */
+	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
+	if (!e)
+		goto defer;
+
+	e->w.cb = w_e_send_csum;
+	spin_lock_irq(&mdev->req_lock);
+	list_add(&e->w.list, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	atomic_add(size >> 9, &mdev->rs_sect_ev);
+	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
+		return 0;
+
+	/* If it failed because of ENOMEM, retry should help.  If it failed
+	 * because bio_add_page failed (probably broken lower level driver),
+	 * retry may or may not help.
+	 * If it does not, you may need to force disconnect. */
+	spin_lock_irq(&mdev->req_lock);
+	list_del(&e->w.list);
+	spin_unlock_irq(&mdev->req_lock);
+
+	drbd_free_ee(mdev, e);
+defer:
+	put_ldev(mdev);
+	return -EAGAIN;
+}
+
+int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	switch (mdev->state.conn) {
+	case C_VERIFY_S:
+		w_make_ov_request(mdev, w, cancel);
+		break;
+	case C_SYNC_TARGET:
+		w_make_resync_request(mdev, w, cancel);
+		break;
+	}
+
+	return 1;
+}
+
+void resync_timer_fn(unsigned long data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+	if (list_empty(&mdev->resync_work.list))
+		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
+}
+
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+	int i;
+
+	for (i = 0; i < fb->size; i++)
+		fb->values[i] = value;
+}
+
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+	int ov;
+
+	ov = fb->values[fb->head_index];
+	fb->values[fb->head_index++] = value;
+
+	if (fb->head_index >= fb->size)
+		fb->head_index = 0;
+
+	return ov;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+	int i;
+
+	for (i = 0; i < fb->size; i++)
+		fb->values[i] += value;
+}
+
+STATIC int drbd_rs_controller(struct drbd_conf *mdev)
+{
+	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
+	unsigned int want;     /* The number of sectors we want in the proxy */
+	int req_sect; /* Number of sectors to request in this turn */
+	int correction; /* Number of sectors more we need in the proxy*/
+	int cps; /* correction per invocation of drbd_rs_controller() */
+	int steps; /* Number of time steps to plan ahead */
+	int curr_corr;
+	int max_sect;
+
+	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
+	mdev->rs_in_flight -= sect_in;
+
+	spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
+
+	steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+
+	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
+		want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
+	} else { /* normal path */
+		want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
+			sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
+	}
+
+	correction = want - mdev->rs_in_flight - mdev->rs_planed;
+
+	/* Plan ahead */
+	cps = correction / steps;
+	fifo_add_val(&mdev->rs_plan_s, cps);
+	mdev->rs_planed += cps * steps;
+
+	/* What we do in this step */
+	curr_corr = fifo_push(&mdev->rs_plan_s, 0);
+	spin_unlock(&mdev->peer_seq_lock);
+	mdev->rs_planed -= curr_corr;
+
+	req_sect = sect_in + curr_corr;
+	if (req_sect < 0)
+		req_sect = 0;
+
+	max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
+	if (req_sect > max_sect)
+		req_sect = max_sect;
+
+	/*
+	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+		 sect_in, mdev->rs_in_flight, want, correction,
+		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
+	*/
+
+	return req_sect;
+}
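+
+/* In short: the controller steers the amount of in-flight resync data
+ * towards "want" (either the static c_fill_target, or derived from the
+ * observed incoming rate and c_delay_target), spreads the correction
+ * over "steps" future intervals through the plan FIFO, and never
+ * requests more than c_max_rate allows per interval.  The commented-out
+ * dev_warn above is handy when tuning these sync_conf knobs. */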
+
+STATIC int drbd_rs_number_requests(struct drbd_conf *mdev)
+{
+	int number;
+	if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
+		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
+		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+	} else {
+		mdev->c_sync_rate = mdev->sync_conf.rate;
+		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
+	}
+
+	/* ignore the amount of pending requests, the resync controller should
+	 * throttle down to incoming reply rate soon enough anyways. */
+	return number;
+}
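+
+/* Back-of-the-envelope example for the fixed-rate branch (assuming the
+ * usual SLEEP_TIME of 100ms, i.e. HZ/10, and BM_BLOCK_SIZE of 4KiB):
+ * with sync_conf.rate = 40000 (KiB/s), number = 0.1s * 40000 / 4 = 1000
+ * resync requests are generated per scheduling interval. */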
+
+STATIC int w_make_resync_request(struct drbd_conf *mdev,
+				 struct drbd_work *w, int cancel)
+{
+	unsigned long bit;
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+	int max_bio_size;
+	int number, rollback_i, size;
+	int align, queued, sndbuf;
+	int i = 0;
+
+	PARANOIA_BUG_ON(w != &mdev->resync_work);
+
+	if (unlikely(cancel))
+		return 1;
+
+	if (mdev->rs_total == 0) {
+		/* empty resync? */
+		drbd_resync_finished(mdev);
+		return 1;
+	}
+
+	if (!get_ldev(mdev)) {
+		/* Since we only need to access mdev->rsync, a
+		   get_ldev_if_state(mdev, D_FAILED) would be sufficient, but
+		   to continue resync with a broken disk makes no sense at
+		   all */
+		dev_err(DEV, "Disk broke down during resync!\n");
+		return 1;
+	}
+
+	max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
+	number = drbd_rs_number_requests(mdev);
+	if (number == 0)
+		goto requeue;
+
+	for (i = 0; i < number; i++) {
+		/* Stop generating RS requests, when half of the send buffer is filled */
+		mutex_lock(&mdev->data.mutex);
+		if (mdev->data.socket) {
+			queued = mdev->data.socket->sk->sk_wmem_queued;
+			sndbuf = mdev->data.socket->sk->sk_sndbuf;
+		} else {
+			queued = 1;
+			sndbuf = 0;
+		}
+		mutex_unlock(&mdev->data.mutex);
+		if (queued > sndbuf / 2)
+			goto requeue;
+
+next_sector:
+		size = BM_BLOCK_SIZE;
+		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
+
+		if (bit == DRBD_END_OF_BITMAP) {
+			mdev->bm_resync_fo = drbd_bm_bits(mdev);
+			put_ldev(mdev);
+			return 1;
+		}
+
+		sector = BM_BIT_TO_SECT(bit);
+
+		if (drbd_rs_should_slow_down(mdev, sector) ||
+		    drbd_try_rs_begin_io(mdev, sector)) {
+			mdev->bm_resync_fo = bit;
+			goto requeue;
+		}
+		mdev->bm_resync_fo = bit + 1;
+
+		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
+			drbd_rs_complete_io(mdev, sector);
+			goto next_sector;
+		}
+
+#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
+		/* try to find some adjacent bits.
+		 * we stop if we have already the maximum req size.
+		 *
+		 * Additionally always align bigger requests, in order to
+		 * be prepared for all stripe sizes of software RAIDs.
+		 */
+		align = 1;
+		rollback_i = i;
+		for (;;) {
+			if (size + BM_BLOCK_SIZE > max_bio_size)
+				break;
+
+			/* Be always aligned */
+			if (sector & ((1<<(align+3))-1))
+				break;
+
+			/* do not cross extent boundaries */
+			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
+				break;
+			/* now, is it actually dirty, after all?
+			 * caution, drbd_bm_test_bit is tri-state for some
+			 * obscure reason; testing (b == 0) would treat the
+			 * out-of-band value as dirty, and would only be
+			 * accidentally right because of the "oddly sized"
+			 * adjustment below */
+			if (drbd_bm_test_bit(mdev, bit+1) != 1)
+				break;
+			bit++;
+			size += BM_BLOCK_SIZE;
+			if ((BM_BLOCK_SIZE << align) <= size)
+				align++;
+			i++;
+		}
+		/* if we merged some,
+		 * reset the offset to start the next drbd_bm_find_next from */
+		if (size > BM_BLOCK_SIZE)
+			mdev->bm_resync_fo = bit + 1;
+#endif
+
+		/* adjust very last sectors, in case we are oddly sized */
+		if (sector + (size>>9) > capacity)
+			size = (capacity-sector)<<9;
+		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
+			switch (read_for_csum(mdev, sector, size)) {
+			case -EIO: /* Disk failure */
+				put_ldev(mdev);
+				return 0;
+			case -EAGAIN: /* allocation failed, or ldev busy */
+				drbd_rs_complete_io(mdev, sector);
+				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+				i = rollback_i;
+				goto requeue;
+			case 0:
+				/* everything ok */
+				break;
+			default:
+				BUG();
+			}
+		} else {
+			inc_rs_pending(mdev);
+			if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
+					       sector, size, ID_SYNCER)) {
+				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
+				dec_rs_pending(mdev);
+				put_ldev(mdev);
+				return 0;
+			}
+		}
+	}
+
+	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
+		/* the last syncer _request_ was sent,
+		 * but the P_RS_DATA_REPLY was not yet received.  sync will end
+		 * (and the next sync group will resume) as soon as we receive
+		 * the last resync data block and the last bit is cleared.
+		 * until then resync "work" is "inactive" ...
+		 */
+		put_ldev(mdev);
+		return 1;
+	}
+
+ requeue:
+	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
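+	/* i requests of BM_BLOCK_SIZE each went out; account for them in
+	 * 512-byte sectors (hence BM_BLOCK_SHIFT - 9), so the controller
+	 * can balance in-flight data against incoming replies. */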
+	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
+	put_ldev(mdev);
+	return 1;
+}
+
+STATIC int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	int number, i, size;
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+
+	if (unlikely(cancel))
+		return 1;
+
+	number = drbd_rs_number_requests(mdev);
+
+	sector = mdev->ov_position;
+	for (i = 0; i < number; i++) {
+		if (sector >= capacity) {
+			return 1;
+		}
+
+		size = BM_BLOCK_SIZE;
+
+		if (drbd_rs_should_slow_down(mdev, sector) ||
+		    drbd_try_rs_begin_io(mdev, sector)) {
+			mdev->ov_position = sector;
+			goto requeue;
+		}
+
+		if (sector + (size>>9) > capacity)
+			size = (capacity-sector)<<9;
+
+		inc_rs_pending(mdev);
+		if (!drbd_send_ov_request(mdev, sector, size)) {
+			dec_rs_pending(mdev);
+			return 0;
+		}
+		sector += BM_SECT_PER_BIT;
+	}
+	mdev->ov_position = sector;
+
+ requeue:
+	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
+	return 1;
+}
+
+
+void start_resync_timer_fn(unsigned long data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+	drbd_queue_work(&mdev->data.work, &mdev->start_resync_work);
+}
+
+int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
+		dev_warn(DEV, "w_start_resync later...\n");
+		mdev->start_resync_timer.expires = jiffies + HZ/10;
+		add_timer(&mdev->start_resync_timer);
+		return 1;
+	}
+
+	drbd_start_resync(mdev, C_SYNC_SOURCE);
+	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
+	return 1;
+}
+
+int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	kfree(w);
+	ov_oos_print(mdev);
+	drbd_resync_finished(mdev);
+
+	return 1;
+}
+
+STATIC int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	kfree(w);
+
+	drbd_resync_finished(mdev);
+
+	return 1;
+}
+
+STATIC void ping_peer(struct drbd_conf *mdev)
+{
+	clear_bit(GOT_PING_ACK, &mdev->flags);
+	request_ping(mdev);
+	wait_event(mdev->misc_wait,
+		   test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
+}
+
+int drbd_resync_finished(struct drbd_conf *mdev)
+{
+	unsigned long db, dt, dbdt;
+	unsigned long n_oos;
+	union drbd_state os, ns;
+	struct drbd_work *w;
+	char *khelper_cmd = NULL;
+	int verify_done = 0;
+
+	/* Remove all elements from the resync LRU. Future actions might set
+	 * bits in the (main) bitmap, and then the entries in the resync LRU
+	 * would be wrong. */
+	if (drbd_rs_del_all(mdev)) {
+		/* In case this is not possible now, most probably because
+		 * there are P_RS_DATA_REPLY packets lingering on the worker's
+		 * queue (or even the read operations for those packets
+		 * are not finished by now).   Retry in 100ms. */
+
+		drbd_kick_lo(mdev);
+		schedule_timeout_interruptible(HZ / 10);
+		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
+		if (w) {
+			w->cb = w_resync_finished;
+			drbd_queue_work(&mdev->data.work, w);
+			return 1;
+		}
+		dev_err(DEV, "Warning: failed to drbd_rs_del_all() and to kmalloc(w).\n");
+	}
+
+	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+	if (dt <= 0)
+		dt = 1;
+	db = mdev->rs_total;
+	dbdt = Bit2KB(db/dt);
+	mdev->rs_paused /= HZ;
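+	/* db is the number of bitmap bits covered by this run and dt the
+	 * unpaused elapsed time in seconds; Bit2KB() scales 4 KiB bits to
+	 * KiB, making dbdt the mean throughput in KiB/s reported below. */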
+
+	if (!get_ldev(mdev))
+		goto out;
+
+	ping_peer(mdev);
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+
+	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
+
+	/* This protects us against multiple calls (that can happen in the presence
+	   of application IO), and against connectivity loss just before we arrive here. */
+	if (os.conn <= C_CONNECTED)
+		goto out_unlock;
+
+	ns.i = os.i;
+	ns.conn = C_CONNECTED;
+
+	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+	     verify_done ? "Online verify " : "Resync",
+	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);
+
+	n_oos = drbd_bm_total_weight(mdev);
+
+	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
+		if (n_oos) {
+			dev_alert(DEV, "Online verify found %lu %dk blocks out of sync!\n",
+			      n_oos, Bit2KB(1));
+			khelper_cmd = "out-of-sync";
+		}
+	} else {
+		D_ASSERT((n_oos - mdev->rs_failed) == 0);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
+			khelper_cmd = "after-resync-target";
+
+		if (mdev->csums_tfm && mdev->rs_total) {
+			const unsigned long s = mdev->rs_same_csum;
+			const unsigned long t = mdev->rs_total;
+			const int ratio =
+				(t == 0)     ? 0 :
+				(t < 100000) ? ((s*100)/t) : (s/(t/100));
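+			/* integer percentage without overflow: (s*100)/t is
+			 * exact for small totals, while s/(t/100) avoids the
+			 * unsigned long overflow that s*100 could produce on
+			 * 32-bit for very large bitmaps. */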
+			dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; "
+			     "transferred %luK total %luK\n",
+			     ratio,
+			     Bit2KB(mdev->rs_same_csum),
+			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
+			     Bit2KB(mdev->rs_total));
+		}
+	}
+
+	if (mdev->rs_failed) {
+		dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			ns.disk = D_INCONSISTENT;
+			ns.pdsk = D_UP_TO_DATE;
+		} else {
+			ns.disk = D_UP_TO_DATE;
+			ns.pdsk = D_INCONSISTENT;
+		}
+	} else {
+		ns.disk = D_UP_TO_DATE;
+		ns.pdsk = D_UP_TO_DATE;
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			if (mdev->p_uuid) {
+				int i;
+				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
+					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
+				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
+				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
+			} else {
+				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
+			}
+		}
+
+		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
+			/* for verify runs, we don't update uuids here,
+			 * so there would be nothing to report. */
+			drbd_uuid_set_bm(mdev, 0UL);
+			drbd_print_uuids(mdev, "updated UUIDs");
+			if (mdev->p_uuid) {
+				/* Now the two UUID sets are equal, update what we
+				 * know of the peer. */
+				int i;
+				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+					mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
+			}
+		}
+	}
+
+	DRBD_STATE_DEBUG_INIT_VAL(ns);
+	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+out_unlock:
+	spin_unlock_irq(&mdev->req_lock);
+	put_ldev(mdev);
+out:
+	mdev->rs_total  = 0;
+	mdev->rs_failed = 0;
+	mdev->rs_paused = 0;
+	if (verify_done)
+		mdev->ov_start_sector = 0;
+
+	drbd_md_sync(mdev);
+
+	if (khelper_cmd)
+		drbd_khelper(mdev, khelper_cmd);
+
+	return 1;
+}
+
+/* helper */
+static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+	if (drbd_ee_has_active_page(e)) {
+		/* This might happen if sendpage() has not finished */
+		int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
+		atomic_add(i, &mdev->pp_in_use_by_net);
+		atomic_sub(i, &mdev->pp_in_use);
+		spin_lock_irq(&mdev->req_lock);
+		list_add_tail(&e->w.list, &mdev->net_ee);
+		spin_unlock_irq(&mdev->req_lock);
+		wake_up(&drbd_pp_wait);
+	} else
+		drbd_free_ee(mdev, e);
+}
+
+/**
+ * w_e_end_data_req() - Worker callback to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+		ok = drbd_send_block(mdev, P_DATA_REPLY, e);
+	} else {
+		if (DRBD_ratelimit(5*HZ, 5))
+			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
+			    (unsigned long long)e->sector);
+
+		ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
+	}
+
+	dec_unacked(mdev);
+
+	move_to_net_ee_or_free(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_block() failed\n");
+	return ok;
+}
+
+/**
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	if (get_ldev_if_state(mdev, D_FAILED)) {
+		drbd_rs_complete_io(mdev, e->sector);
+		put_ldev(mdev);
+	}
+
+	if (mdev->state.conn == C_AHEAD) {
+		ok = drbd_send_ack(mdev, P_RS_CANCEL, e);
+	} else if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
+			inc_rs_pending(mdev);
+			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+		} else {
+			if (DRBD_ratelimit(5*HZ, 5))
+				dev_err(DEV, "Not sending RSDataReply, "
+				    "partner DISKLESS!\n");
+			ok = 1;
+		}
+	} else {
+		if (DRBD_ratelimit(5*HZ, 5))
+			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
+			    (unsigned long long)e->sector);
+
+		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+
+		/* update resync data with failure */
+		drbd_rs_failed_io(mdev, e->sector, e->size);
+	}
+
+	dec_unacked(mdev);
+
+	move_to_net_ee_or_free(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_block() failed\n");
+	return ok;
+}
+
+int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	struct digest_info *di;
+	int digest_size;
+	void *digest = NULL;
+	int ok, eq = 0;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	if (get_ldev(mdev)) {
+		drbd_rs_complete_io(mdev, e->sector);
+		put_ldev(mdev);
+	}
+
+	di = e->digest;
+
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+		/* quick hack to try to avoid a race against reconfiguration.
+		 * a real fix would be much more involved,
+		 * introducing more locking mechanisms */
+		if (mdev->csums_tfm) {
+			digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+			D_ASSERT(digest_size == di->digest_size);
+			digest = kmalloc(digest_size, GFP_NOIO);
+		}
+		if (digest) {
+			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+
+		if (eq) {
+			drbd_set_in_sync(mdev, e->sector, e->size);
+			/* rs_same_csums unit is BM_BLOCK_SIZE */
+			mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
+			ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
+		} else {
+			inc_rs_pending(mdev);
+			e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+			e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
+			kfree(di);
+			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+		}
+	} else {
+		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+		if (DRBD_ratelimit(5*HZ, 5))
+			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
+	}
+
+	dec_unacked(mdev);
+	move_to_net_ee_or_free(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_block/ack() failed\n");
+	return ok;
+}
+
+/* TODO merge common code with w_e_send_csum */
+int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	sector_t sector = e->sector;
+	unsigned int size = e->size;
+	int digest_size;
+	void *digest;
+	int ok = 1;
+
+	if (unlikely(cancel))
+		goto out;
+
+	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+	digest = kmalloc(digest_size, GFP_NOIO);
+	if (!digest) {
+		ok = 0;	/* terminate the connection in case the allocation failed */
+		goto out;
+	}
+
+	if (likely(!(e->flags & EE_WAS_ERROR)))
+		drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
+	else
+		memset(digest, 0, digest_size);
+
+	/* Free e and pages before send.
+	 * In case we block on congestion, we could otherwise run into
+	 * some distributed deadlock, if the other side blocks on
+	 * congestion as well, because our receiver blocks in
+	 * drbd_pp_alloc due to pp_in_use > max_buffers. */
+	drbd_free_ee(mdev, e);
+	e = NULL;
+	inc_rs_pending(mdev);
+	ok = drbd_send_drequest_csum(mdev, sector, size,
+				     digest, digest_size,
+				     P_OV_REPLY);
+	if (!ok)
+		dec_rs_pending(mdev);
+	kfree(digest);
+
+out:
+	if (e)
+		drbd_free_ee(mdev, e);
+	dec_unacked(mdev);
+	return ok;
+}
+
+void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
+		mdev->ov_last_oos_size += size>>9;
+	} else {
+		mdev->ov_last_oos_start = sector;
+		mdev->ov_last_oos_size = size>>9;
+	}
+	drbd_set_out_of_sync(mdev, sector, size);
+}
+
+int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	struct digest_info *di;
+	void *digest;
+	sector_t sector = e->sector;
+	unsigned int size = e->size;
+	int digest_size;
+	int ok, eq = 0;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
+	 * the resync lru has been cleaned up already */
+	if (get_ldev(mdev)) {
+		drbd_rs_complete_io(mdev, e->sector);
+		put_ldev(mdev);
+	}
+
+	di = e->digest;
+
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+		digest = kmalloc(digest_size, GFP_NOIO);
+		if (digest) {
+			drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
+
+			D_ASSERT(digest_size == di->digest_size);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+	}
+
+	/* Free e and pages before send.
+	 * In case we block on congestion, we could otherwise run into
+	 * some distributed deadlock, if the other side blocks on
+	 * congestion as well, because our receiver blocks in
+	 * drbd_pp_alloc due to pp_in_use > max_buffers. */
+	drbd_free_ee(mdev, e);
+	if (!eq)
+		drbd_ov_oos_found(mdev, sector, size);
+	else
+		ov_oos_print(mdev);
+
+	ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
+			      eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+
+	dec_unacked(mdev);
+
+	--mdev->ov_left;
+
+	/* let's advance progress step marks only for every other megabyte */
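+	/* ov_left counts 4 KiB blocks; bit 9 (0x200) stays set for runs of
+	 * 512 blocks (2 MiB), so the marks advance only during every other
+	 * such stretch. */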
+	if ((mdev->ov_left & 0x200) == 0x200)
+		drbd_advance_rs_marks(mdev, mdev->ov_left);
+
+	if (mdev->ov_left == 0) {
+		ov_oos_print(mdev);
+		drbd_resync_finished(mdev);
+	}
+
+	return ok;
+}
+
+int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
+	complete(&b->done);
+	return 1;
+}
+
+int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
+	struct p_barrier *p = &mdev->data.sbuf.barrier;
+	int ok = 1;
+
+	/* really avoid racing with tl_clear.  w.cb may have been referenced
+	 * just before it was reassigned and re-queued, so double check that.
+	 * actually, this race was harmless, since we only try to send the
+	 * barrier packet here, and otherwise do nothing with the object.
+	 * but compare with the head of w_clear_epoch */
+	spin_lock_irq(&mdev->req_lock);
+	if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
+		cancel = 1;
+	spin_unlock_irq(&mdev->req_lock);
+	if (cancel)
+		return 1;
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+	p->barrier = b->br_number;
+	/* inc_ap_pending was done where this was queued.
+	 * dec_ap_pending will be done in got_BarrierAck
+	 * or (on connection loss) in w_clear_epoch.  */
+	ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
+				(struct p_header80 *)p, sizeof(*p), 0);
+	drbd_put_data_sock(mdev);
+
+	return ok;
+}
+
+int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	if (cancel)
+		return 1;
+	return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
+}
+
+int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled);
+		return 1;
+	}
+
+	ok = drbd_send_oos(mdev, req);
+	req_mod(req, oos_handed_to_network);
+
+	return ok;
+}
+
+/**
+ * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled);
+		return 1;
+	}
+
+	ok = drbd_send_dblock(mdev, req);
+	req_mod(req, ok ? handed_over_to_network : send_failed);
+
+	return ok;
+}
+
+/**
+ * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled);
+		return 1;
+	}
+
+	ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
+				(unsigned long)req);
+
+	if (!ok) {
+		/* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
+		 * so this is probably redundant */
+		if (mdev->state.conn >= C_CONNECTED)
+			drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+	}
+	req_mod(req, ok ? handed_over_to_network : send_failed);
+
+	return ok;
+}
+
+int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
+		drbd_al_begin_io(mdev, req->sector);
+	/* Calling drbd_al_begin_io() out of the worker might theoretically
+	   deadlock. In practice it cannot, since this is only used when
+	   unfreezing IOs. All the extents of the requests that made it
+	   into the TL are already active */
+
+	drbd_req_make_private_bio(req, req->master_bio);
+	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+	generic_make_request(req->private_bio);
+
+	return 1;
+}
+
+STATIC int _drbd_may_sync_now(struct drbd_conf *mdev)
+{
+	struct drbd_conf *odev = mdev;
+
+	while (1) {
+		if (odev->sync_conf.after == -1)
+			return 1;
+		odev = minor_to_mdev(odev->sync_conf.after);
+		ERR_IF(!odev) return 1;
+		if ((odev->state.conn >= C_SYNC_SOURCE &&
+		     odev->state.conn <= C_PAUSED_SYNC_T) ||
+		    odev->state.aftr_isp || odev->state.peer_isp ||
+		    odev->state.user_isp)
+			return 0;
+	}
+}
+
+/**
+ * _drbd_pause_after() - Pause resync on all devices that may not resync now
+ * @mdev:	DRBD device.
+ *
+ * Called from process context only (admin command and after_state_ch).
+ */
+STATIC int _drbd_pause_after(struct drbd_conf *mdev)
+{
+	struct drbd_conf *odev;
+	int i, rv = 0;
+
+	for (i = 0; i < minor_count; i++) {
+		odev = minor_to_mdev(i);
+		if (!odev)
+			continue;
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (!_drbd_may_sync_now(odev))
+			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
+			       != SS_NOTHING_TO_DO);
+	}
+
+	return rv;
+}
+
+/**
+ * _drbd_resume_next() - Resume resync on all devices that may resync now
+ * @mdev:	DRBD device.
+ *
+ * Called from process context only (admin command and worker).
+ */
+STATIC int _drbd_resume_next(struct drbd_conf *mdev)
+{
+	struct drbd_conf *odev;
+	int i, rv = 0;
+
+	for (i = 0; i < minor_count; i++) {
+		odev = minor_to_mdev(i);
+		if (!odev)
+			continue;
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (odev->state.aftr_isp) {
+			if (_drbd_may_sync_now(odev))
+				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
+							CS_HARD, NULL)
+				       != SS_NOTHING_TO_DO) ;
+		}
+	}
+	return rv;
+}
+
+void resume_next_sg(struct drbd_conf *mdev)
+{
+	write_lock_irq(&global_state_lock);
+	_drbd_resume_next(mdev);
+	write_unlock_irq(&global_state_lock);
+}
+
+void suspend_other_sg(struct drbd_conf *mdev)
+{
+	write_lock_irq(&global_state_lock);
+	_drbd_pause_after(mdev);
+	write_unlock_irq(&global_state_lock);
+}
+
+static int sync_after_error(struct drbd_conf *mdev, int o_minor)
+{
+	struct drbd_conf *odev;
+
+	if (o_minor == -1)
+		return NO_ERROR;
+	if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
+		return ERR_SYNC_AFTER;
+
+	/* check for loops */
+	odev = minor_to_mdev(o_minor);
+	while (1) {
+		if (odev == mdev)
+			return ERR_SYNC_AFTER_CYCLE;
+
+		/* dependency chain ends here, no cycles. */
+		if (odev->sync_conf.after == -1)
+			return NO_ERROR;
+
+		/* follow the dependency chain */
+		odev = minor_to_mdev(odev->sync_conf.after);
+	}
+}
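+/* The walk above terminates: sync-after chains are validated on every
+ * change (under global_state_lock), so an already configured cycle that
+ * does not pass through mdev cannot exist. */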
+
+int drbd_alter_sa(struct drbd_conf *mdev, int na)
+{
+	int changes;
+	int retcode;
+
+	write_lock_irq(&global_state_lock);
+	retcode = sync_after_error(mdev, na);
+	if (retcode == NO_ERROR) {
+		mdev->sync_conf.after = na;
+		do {
+			changes  = _drbd_pause_after(mdev);
+			changes |= _drbd_resume_next(mdev);
+		} while (changes);
+	}
+	write_unlock_irq(&global_state_lock);
+	return retcode;
+}
+
+void drbd_rs_controller_reset(struct drbd_conf *mdev)
+{
+	atomic_set(&mdev->rs_sect_in, 0);
+	atomic_set(&mdev->rs_sect_ev, 0);
+	mdev->rs_in_flight = 0;
+	mdev->rs_planed = 0;
+	spin_lock(&mdev->peer_seq_lock);
+	fifo_set(&mdev->rs_plan_s, 0);
+	spin_unlock(&mdev->peer_seq_lock);
+}
+
+/**
+ * drbd_start_resync() - Start the resync process
+ * @mdev:	DRBD device.
+ * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
+ *
+ * This function might bring you directly into one of the
+ * C_PAUSED_SYNC_* states.
+ */
+void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
+{
+	union drbd_state ns;
+	int r;
+
+	if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) {
+		dev_err(DEV, "Resync already running!\n");
+		return;
+	}
+
+	trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, "Resync starting: side=%s\n",
+			  side == C_SYNC_TARGET ? "SyncTarget" : "SyncSource");
+
+	if (mdev->state.conn < C_AHEAD) {
+		/* In case a previous resync run was aborted by an IO error/detach on the peer. */
+		drbd_rs_cancel_all(mdev);
+		/* This should be done when we abort the resync. We definitely do not
+		   want to have this for connections going back and forth between
+		   Ahead/Behind and SyncSource/SyncTarget */
+	}
+
+	if (side == C_SYNC_TARGET) {
+		/* Since application IO was locked out during C_WF_BITMAP_T and
+		   C_WF_SYNC_UUID we are still unmodified. Before becoming
+		   C_SYNC_TARGET we check whether we may make the data inconsistent. */
+		r = drbd_khelper(mdev, "before-resync-target");
+		r = (r >> 8) & 0xff;
+		if (r > 0) {
+			dev_info(DEV, "before-resync-target handler returned %d, "
+			     "dropping connection.\n", r);
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			return;
+		}
+	} else /* C_SYNC_SOURCE */ {
+		r = drbd_khelper(mdev, "before-resync-source");
+		r = (r >> 8) & 0xff;
+		if (r > 0) {
+			if (r == 3) {
+				dev_info(DEV, "before-resync-source handler returned %d, "
+					 "ignoring. Old userland tools?\n", r);
+			} else {
+				dev_info(DEV, "before-resync-source handler returned %d, "
+					 "dropping connection.\n", r);
+				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+				return;
+			}
+		}
+	}
+
+	drbd_state_lock(mdev);
+	write_lock_irq(&global_state_lock);
+	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		write_unlock_irq(&global_state_lock);
+		drbd_state_unlock(mdev);
+		return;
+	}
+
+	ns.i = mdev->state.i;
+
+	ns.aftr_isp = !_drbd_may_sync_now(mdev);
+
+	ns.conn = side;
+
+	if (side == C_SYNC_TARGET)
+		ns.disk = D_INCONSISTENT;
+	else /* side == C_SYNC_SOURCE */
+		ns.pdsk = D_INCONSISTENT;
+
+	DRBD_STATE_DEBUG_INIT_VAL(ns);
+	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	ns.i = mdev->state.i;
+
+	if (ns.conn < C_CONNECTED)
+		r = SS_UNKNOWN_ERROR;
+
+	if (r == SS_SUCCESS) {
+		unsigned long tw = drbd_bm_total_weight(mdev);
+		unsigned long now = jiffies;
+		int i;
+
+		mdev->rs_failed    = 0;
+		mdev->rs_paused    = 0;
+		mdev->rs_same_csum = 0;
+		mdev->rs_last_events = 0;
+		mdev->rs_last_sect_ev = 0;
+		mdev->rs_total     = tw;
+		mdev->rs_start     = now;
+		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+			mdev->rs_mark_left[i] = tw;
+			mdev->rs_mark_time[i] = now;
+		}
+		_drbd_pause_after(mdev);
+	}
+	write_unlock_irq(&global_state_lock);
+
+	if (r == SS_SUCCESS) {
+		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+		     drbd_conn_str(ns.conn),
+		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
+		     (unsigned long) mdev->rs_total);
+		if (side == C_SYNC_TARGET)
+			mdev->bm_resync_fo = 0;
+
+		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
+		 * with w_send_oos, or the sync target will get confused as to
+		 * how many bits to resync.  We cannot do that always, because for an
+		 * empty resync and protocol < 95, we need to do it here, as we call
+		 * drbd_resync_finished from here in that case.
+		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
+		 * and from after_state_ch otherwise. */
+		if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96)
+			drbd_gen_and_send_sync_uuid(mdev);
+
+		if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
+			/* This still has a race (about when exactly the peers
+			 * detect connection loss) that can lead to a full sync
+			 * on next handshake. In 8.3.9 we fixed this with explicit
+			 * resync-finished notifications, but the fix
+			 * introduces a protocol change.  Sleeping for some
+			 * time longer than the ping interval + timeout on the
+			 * SyncSource, to give the SyncTarget the chance to
+			 * detect connection loss, then waiting for a ping
+			 * response (implicit in drbd_resync_finished) reduces
+			 * the race considerably, but does not solve it. */
+			if (side == C_SYNC_SOURCE)
+				schedule_timeout_interruptible(
+					mdev->net_conf->ping_int * HZ +
+					mdev->net_conf->ping_timeo*HZ/9);
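+			/* ping_int is in seconds, ping_timeo in tenths of a
+			 * second; dividing by 9 rather than 10 makes the sleep
+			 * slightly longer than ping-int + ping-timeout, as the
+			 * comment above requires. */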
+			drbd_resync_finished(mdev);
+		}
+
+		drbd_rs_controller_reset(mdev);
+		/* ns.conn may already be != mdev->state.conn,
+		 * we may have been paused in between, or become paused until
+		 * the timer triggers.
+		 * No matter, that is handled in resync_timer_fn() */
+		if (ns.conn == C_SYNC_TARGET)
+			mod_timer(&mdev->resync_timer, jiffies);
+
+		drbd_md_sync(mdev);
+	}
+	put_ldev(mdev);
+	drbd_state_unlock(mdev);
+}
+
+int drbd_worker(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	struct drbd_work *w = NULL;
+	LIST_HEAD(work_list);
+	int intr = 0, i;
+
+	sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
+
+	while (get_t_state(thi) == Running) {
+		drbd_thread_current_set_cpu(mdev);
+
+		if (down_trylock(&mdev->data.work.s)) {
+			mutex_lock(&mdev->data.mutex);
+			if (mdev->data.socket && !mdev->net_conf->no_cork)
+				drbd_tcp_uncork(mdev->data.socket);
+			mutex_unlock(&mdev->data.mutex);
+
+			intr = down_interruptible(&mdev->data.work.s);
+
+			mutex_lock(&mdev->data.mutex);
+			if (mdev->data.socket && !mdev->net_conf->no_cork)
+				drbd_tcp_cork(mdev->data.socket);
+			mutex_unlock(&mdev->data.mutex);
+		}
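+		/* while idle we uncorked the data socket above so that
+		 * partially filled TCP frames go out; once woken up with more
+		 * work we cork it again to batch small packets. */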
+
+		if (intr) {
+			D_ASSERT(intr == -EINTR);
+			flush_signals(current);
+			ERR_IF (get_t_state(thi) == Running)
+				continue;
+			break;
+		}
+
+		if (get_t_state(thi) != Running)
+			break;
+		/* With this break, we have done a down() but not consumed
+		   the entry from the list. The cleanup code takes care of
+		   this...   */
+
+		w = NULL;
+		spin_lock_irq(&mdev->data.work.q_lock);
+		ERR_IF(list_empty(&mdev->data.work.q)) {
+			/* something terribly wrong in our logic.
+			 * we were able to down() the semaphore,
+			 * but the list is empty... doh.
+			 *
+			 * what is the best thing to do now?
+			 * try again from scratch, restarting the receiver,
+			 * asender, whatnot? could break even more ugly,
+			 * e.g. when we are primary, but no good local data.
+			 *
+			 * I'll try to get away just starting over this loop.
+			 */
+			spin_unlock_irq(&mdev->data.work.q_lock);
+			continue;
+		}
+		w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
+		list_del_init(&w->list);
+		spin_unlock_irq(&mdev->data.work.q_lock);
+
+		if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
+			/* dev_warn(DEV, "worker: a callback failed! \n"); */
+			if (mdev->state.conn >= C_CONNECTED)
+				drbd_force_state(mdev,
+						NS(conn, C_NETWORK_FAILURE));
+		}
+	}
+	D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
+	D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
+
+	spin_lock_irq(&mdev->data.work.q_lock);
+	i = 0;
+	while (!list_empty(&mdev->data.work.q)) {
+		list_splice_init(&mdev->data.work.q, &work_list);
+		spin_unlock_irq(&mdev->data.work.q_lock);
+
+		while (!list_empty(&work_list)) {
+			w = list_entry(work_list.next, struct drbd_work, list);
+			list_del_init(&w->list);
+			w->cb(mdev, w, 1);
+			i++; /* dead debugging code */
+		}
+
+		spin_lock_irq(&mdev->data.work.q_lock);
+	}
+	sema_init(&mdev->data.work.s, 0);
+	/* DANGEROUS race: if someone did queue his work within the spinlock,
+	 * but up() ed outside the spinlock, we could get an up() on the
+	 * semaphore without corresponding list entry.
+	 * So don't do that.
+	 */
+	spin_unlock_irq(&mdev->data.work.q_lock);
+
+	D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
+	/* _drbd_set_state only uses stop_nowait.
+	 * wait here for the Exiting receiver. */
+	drbd_thread_stop(&mdev->receiver);
+	drbd_mdev_cleanup(mdev);
+
+	dev_info(DEV, "worker terminated\n");
+
+	clear_bit(DEVICE_DYING, &mdev->flags);
+	clear_bit(CONFIG_PENDING, &mdev->flags);
+	wake_up(&mdev->state_wait);
+
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_wrappers.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_wrappers.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/drbd_wrappers.h	2015-01-21 12:02:58.388823806 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/drbd_wrappers.h	2015-01-21 12:02:58.388823806 +0300
@@ -0,0 +1,1114 @@
+#ifndef _DRBD_WRAPPERS_H
+#define _DRBD_WRAPPERS_H
+
+#include <linux/ctype.h>
+#include <linux/net.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+# error "use a 2.6 kernel, please"
+#endif
+
+/* The history of blkdev_issue_flush()
+
+   It had 2 arguments before fbd9b09a177a481eda256447c881f014f29034fe,
+   after it had 4 arguments. (With that commit came BLKDEV_IFL_WAIT)
+
+   It had 4 arguments before dd3932eddf428571762596e17b65f5dc92ca361b,
+   after it got 3 arguments. (With that commit came BLKDEV_DISCARD_SECURE
+   and BLKDEV_IFL_WAIT disappeared again.) */
+#include <linux/blkdev.h>
+#ifndef BLKDEV_IFL_WAIT
+#ifndef BLKDEV_DISCARD_SECURE
+/* before fbd9b09a177 */
+#define blkdev_issue_flush(b, gfpf, s)	blkdev_issue_flush(b, s)
+#endif
+/* after dd3932eddf4 no define at all */
+#else
+/* between fbd9b09a177 and dd3932eddf4 */
+#define blkdev_issue_flush(b, gfpf, s)	blkdev_issue_flush(b, gfpf, s, BLKDEV_IFL_WAIT)
+#endif
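+/* Either way, callers can use the newest (three-argument) form
+ * unconditionally, e.g.
+ *	blkdev_issue_flush(bdev, GFP_NOIO, NULL);
+ * and the macros above rewrite the call for older kernels. */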
+
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+
+/* for the proc_create wrapper */
+#include <linux/proc_fs.h>
+
+/* struct page has a union in 2.6.15 ...
+ * an anonymous union and struct since 2.6.16
+ * or in fc5 "2.6.15" */
+#include <linux/mm.h>
+#ifndef page_private
+# define page_private(page)		((page)->private)
+# define set_page_private(page, v)	((page)->private = (v))
+#endif
+
+/* mutex was not available before 2.6.16.
+ * various vendors provide various degrees of backports.
+ * we provide the missing parts ourselves, if necessary.
+ * this one is for RHEL/Centos 4 */
+#if defined(mutex_lock) && !defined(mutex_is_locked)
+#define mutex_is_locked(m) (atomic_read(&(m)->count) != 1)
+#endif
+
+/* see get_sb_bdev and bd_claim */
+extern char *drbd_sec_holder;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)
+static inline unsigned short queue_logical_block_size(struct request_queue *q)
+{
+	int retval = 512;
+	if (q && q->hardsect_size)
+		retval = q->hardsect_size;
+	return retval;
+}
+
+static inline sector_t bdev_logical_block_size(struct block_device *bdev)
+{
+	return queue_logical_block_size(bdev_get_queue(bdev));
+}
+
+static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
+	/* before upstream commit ba066f3a0469dfc6d8fbdf70fabfd8c069fbf306,
+	 * there is no max_hw_sectors. Simply use max_sectors here,
+	 * it should be good enough. Affected: sles9. */
+	return q->max_sectors;
+#else
+	return q->max_hw_sectors;
+#endif
+}
+
+static inline unsigned int queue_max_sectors(struct request_queue *q)
+{
+	return q->max_sectors;
+}
+
+static inline void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
+{
+	q->hardsect_size = size;
+}
+#endif
+
+/* Returns the number of 512 byte sectors of the device */
+static inline sector_t drbd_get_capacity(struct block_device *bdev)
+{
+	/* return bdev ? get_capacity(bdev->bd_disk) : 0; */
+	return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0;
+}
+
+#ifdef COMPAT_HAVE_VOID_MAKE_REQUEST
+/* In commit 5a7bbad27a410350e64a2d7f5ec18fc73836c14f (between Linux-3.1 and 3.2)
+   make_request() became void. Before that it returned int. */
+#define MAKE_REQUEST_TYPE void
+#define MAKE_REQUEST_RETURN return
+#else
+#define MAKE_REQUEST_TYPE int
+#define MAKE_REQUEST_RETURN return 0
+#endif
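+/* A make_request function written against these macros, as a sketch:
+ *
+ *	MAKE_REQUEST_TYPE drbd_make_request(struct request_queue *q, struct bio *bio)
+ *	{
+ *		...
+ *		MAKE_REQUEST_RETURN;
+ *	}
+ */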
+
+/* sets the number of 512 byte sectors of our virtual device */
+static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
+					sector_t size)
+{
+	/* set_capacity(mdev->this_bdev->bd_disk, size); */
+	set_capacity(mdev->vdisk, size);
+	mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
+}
+
+#ifndef COMPAT_HAVE_FMODE_T
+typedef unsigned fmode_t;
+#endif
+
+#ifndef COMPAT_HAVE_BLKDEV_GET_BY_PATH
+/* see kernel 2.6.37,
+ * d4d7762 block: clean up blkdev_get() wrappers and their users
+ * e525fd8 block: make blkdev_get/put() handle exclusive access
+ * and kernel 2.6.28
+ * 30c40d2 [PATCH] propagate mode through open_bdev_excl/close_bdev_excl
+ * Also note that there is no FMODE_EXCL before
+ * 86d434d [PATCH] eliminate use of ->f_flags in block methods
+ */
+#ifndef COMPAT_HAVE_OPEN_BDEV_EXCLUSIVE
+#ifndef FMODE_EXCL
+#define FMODE_EXCL 0
+#endif
+static inline
+struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
+{
+	/* drbd does not open read-only, but try to be correct anyway */
+	return open_bdev_excl(path, (mode & FMODE_WRITE) ? 0 : MS_RDONLY, holder);
+}
+static inline
+void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
+{
+	/* mode ignored. */
+	close_bdev_excl(bdev);
+}
+#endif
+static inline struct block_device *blkdev_get_by_path(const char *path,
+		fmode_t mode, void *holder)
+{
+	return open_bdev_exclusive(path, mode, holder);
+}
+
+static inline int drbd_blkdev_put(struct block_device *bdev, fmode_t mode)
+{
+	/* blkdev_put != close_bdev_exclusive, in general, so this is obviously
+	 * not correct, and there should be some if (mode & FMODE_EXCL) ...
+	 * But this is the only way it is used in DRBD,
+	 * and for <= 2.6.27, there is no FMODE_EXCL anyways. */
+	close_bdev_exclusive(bdev, mode);
+
+	/* blkdev_put seems to not have useful return values,
+	 * close_bdev_exclusive is void. */
+	return 0;
+}
+#define blkdev_put(b, m)	drbd_blkdev_put(b, m)
+#endif
+
+#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+/* Before Linux-2.6.24 bio_endio() had the size of the bio as its second
+   argument. See 6712ecf8f648118c3363c142196418f89a510b90 */
+#define bio_endio(B,E) bio_endio(B, (B)->bi_size, E)
+#define BIO_ENDIO_TYPE int
+#define BIO_ENDIO_ARGS(b,e) (b, unsigned int bytes_done, e)
+#define BIO_ENDIO_FN_START if (bio->bi_size) return 1
+#define BIO_ENDIO_FN_RETURN return 0
+#else
+#define BIO_ENDIO_TYPE void
+#define BIO_ENDIO_ARGS(b,e) (b,e)
+#define BIO_ENDIO_FN_START do {} while (0)
+#define BIO_ENDIO_FN_RETURN return
+#endif
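+/* An endio handler written against these macros, as a sketch:
+ *
+ *	BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error)
+ *	{
+ *		BIO_ENDIO_FN_START;  (pre-2.6.24: return early while bi_size != 0)
+ *		...
+ *		BIO_ENDIO_FN_RETURN;
+ *	}
+ */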
+
+/* bi_end_io handlers */
+extern BIO_ENDIO_TYPE drbd_md_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error);
+extern BIO_ENDIO_TYPE drbd_endio_sec BIO_ENDIO_ARGS(struct bio *bio, int error);
+extern BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32)
+#define part_inc_in_flight(A, B) part_inc_in_flight(A)
+#define part_dec_in_flight(A, B) part_dec_in_flight(A)
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+/* Before 2.6.23 (with 20c2df83d25c6a95affe6157a4c9cac4cf5ffaac) kmem_cache_create had a
+   ctor and a dtor */
+#define kmem_cache_create(N,S,A,F,C) kmem_cache_create(N,S,A,F,C,NULL)
+#endif
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+# undef HAVE_bvec_merge_data
+# define HAVE_bvec_merge_data 1
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+static inline void sg_set_page(struct scatterlist *sg, struct page *page,
+			       unsigned int len, unsigned int offset)
+{
+	sg->page   = page;
+	sg->offset = offset;
+	sg->length = len;
+}
+
+#define sg_init_table(S,N) ({})
+
+#ifdef NEED_SG_SET_BUF
+static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
+			      unsigned int buflen)
+{
+	sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
+}
+#endif
+
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
+# define BD_OPS_USE_FMODE
+#endif
+
+/* how to get to the kobj of a gendisk.
+ * see also upstream commits
+ * edfaa7c36574f1bf09c65ad602412db9da5f96bf
+ * ed9e1982347b36573cd622ee5f4e2a7ccd79b3fd
+ * 548b10eb2959c96cef6fc29fc96e0931eeb53bc5
+ */
+#ifndef dev_to_disk
+# define disk_to_kobj(disk) (&(disk)->kobj)
+#else
+# ifndef disk_to_dev
+#  define disk_to_dev(disk) (&(disk)->dev)
+# endif
+# define disk_to_kobj(disk) (&disk_to_dev(disk)->kobj)
+#endif
+static inline void drbd_kobject_uevent(struct drbd_conf *mdev)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,15)
+	kobject_uevent(disk_to_kobj(mdev->vdisk), KOBJ_CHANGE, NULL);
+#else
+	kobject_uevent(disk_to_kobj(mdev->vdisk), KOBJ_CHANGE);
+	/* rhel4 / sles9 and older don't have this at all,
+	 * which means user space (udev) won't get events about possible changes of
+	 * corresponding resource + disk names after the initial drbd minor creation.
+	 */
+#endif
+#endif
+}
+
+
+/*
+ * used to submit our private bio
+ */
+static inline void drbd_generic_make_request(struct drbd_conf *mdev,
+					     int fault_type, struct bio *bio)
+{
+	__release(local);
+	if (!bio->bi_bdev) {
+		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
+				"bio->bi_bdev == NULL\n",
+		       mdev_to_minor(mdev));
+		dump_stack();
+		bio_endio(bio, -ENODEV);
+		return;
+	}
+
+	if (drbd_insert_fault(mdev, fault_type))
+		bio_endio(bio, -EIO);
+	else
+		generic_make_request(bio);
+}
+
+/* see 7eaceac block: remove per-queue plugging */
+#ifdef blk_queue_plugged
+static inline void drbd_plug_device(struct drbd_conf *mdev)
+{
+	struct request_queue *q;
+	q = bdev_get_queue(mdev->this_bdev);
+
+	spin_lock_irq(q->queue_lock);
+
+/* XXX the check on !blk_queue_plugged is redundant,
+ * implicitly checked in blk_plug_device */
+
+	if (!blk_queue_plugged(q)) {
+		blk_plug_device(q);
+		del_timer(&q->unplug_timer);
+		/* unplugging should not happen automatically... */
+	}
+	spin_unlock_irq(q->queue_lock);
+}
+#else
+static inline void drbd_plug_device(struct drbd_conf *mdev)
+{
+}
+#endif
+
+static inline int drbd_backing_bdev_events(struct drbd_conf *mdev)
+{
+	struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15)
+	/* very old kernel */
+	return (int)disk_stat_read(disk, read_sectors)
+	     + (int)disk_stat_read(disk, write_sectors);
+#elif defined(__disk_stat_inc)
+	/* older kernel */
+	return (int)disk_stat_read(disk, sectors[0])
+	     + (int)disk_stat_read(disk, sectors[1]);
+#else
+	/* recent kernel */
+	return (int)part_stat_read(&disk->part0, sectors[0])
+	     + (int)part_stat_read(&disk->part0, sectors[1]);
+#endif
+}
+
+#ifdef DEFINE_SOCK_CREATE_KERN
+#define sock_create_kern sock_create
+#endif
+
+#ifdef USE_KMEM_CACHE_S
+#define kmem_cache kmem_cache_s
+#endif
+
+#ifdef DEFINE_KERNEL_SOCK_SHUTDOWN
+enum sock_shutdown_cmd {
+	SHUT_RD = 0,
+	SHUT_WR = 1,
+	SHUT_RDWR = 2,
+};
+static inline int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
+{
+	return sock->ops->shutdown(sock, how);
+}
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+static inline void drbd_unregister_blkdev(unsigned int major, const char *name)
+{
+	int ret = unregister_blkdev(major, name);
+	if (ret)
+		printk(KERN_ERR "drbd: unregister of device failed\n");
+}
+#else
+#define drbd_unregister_blkdev unregister_blkdev
+#endif
+
+#ifdef NEED_BACKPORT_OF_ATOMIC_ADD
+
+#if defined(__x86_64__)
+
+static __inline__ int atomic_add_return(int i, atomic_t *v)
+{
+	int __i = i;
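+	/* LOCK xaddl writes the old counter value back into %0 (i), so
+	 * i + __i below is old value plus addend, the post-add result. */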
+	__asm__ __volatile__(
+		LOCK_PREFIX "xaddl %0, %1;"
+		:"=r"(i)
+		:"m"(v->counter), "0"(i));
+	return i + __i;
+}
+
+static __inline__ int atomic_sub_return(int i, atomic_t *v)
+{
+	return atomic_add_return(-i, v);
+}
+
+#define atomic_inc_return(v)  (atomic_add_return(1,v))
+#define atomic_dec_return(v)  (atomic_sub_return(1,v))
+
+#elif defined(__i386__) || defined(__arch_um__)
+
+static __inline__ int atomic_add_return(int i, atomic_t *v)
+{
+	int __i;
+#ifdef CONFIG_M386
+	unsigned long flags;
+	if(unlikely(boot_cpu_data.x86==3))
+		goto no_xadd;
+#endif
+	/* Modern 486+ processor */
+	__i = i;
+	__asm__ __volatile__(
+		LOCK_PREFIX "xaddl %0, %1;"
+		:"=r"(i)
+		:"m"(v->counter), "0"(i));
+	return i + __i;
+
+#ifdef CONFIG_M386
+no_xadd: /* Legacy 386 processor */
+	local_irq_save(flags);
+	__i = atomic_read(v);
+	atomic_set(v, i + __i);
+	local_irq_restore(flags);
+	return i + __i;
+#endif
+}
+
+static __inline__ int atomic_sub_return(int i, atomic_t *v)
+{
+	return atomic_add_return(-i, v);
+}
+
+#define atomic_inc_return(v)  (atomic_add_return(1,v))
+#define atomic_dec_return(v)  (atomic_sub_return(1,v))
+
+#else
+# error "You need to copy/paste atomic_inc_return()/atomic_dec_return() here"
+# error "for your architecture. (Hint: Kernels after 2.6.10 have those"
+# error "by default! Using a later kernel might be less effort!)"
+#endif
+
+#endif
+
+#if !defined(CRYPTO_ALG_ASYNC)
+/* With Linux-2.6.19 the crypto API changed! */
+/* This is not a generic backport of the new API; it just implements
+   the corner case of "hmac(xxx)". */
+
+#define CRYPTO_ALG_ASYNC 4711
+#define CRYPTO_ALG_TYPE_HASH CRYPTO_ALG_TYPE_DIGEST
+
+struct crypto_hash {
+	struct crypto_tfm *base;
+	const u8 *key;
+	int keylen;
+};
+
+struct hash_desc {
+	struct crypto_hash *tfm;
+	u32 flags;
+};
+
+static inline struct crypto_hash *
+crypto_alloc_hash(char *alg_name, u32 type, u32 mask)
+{
+	struct crypto_hash *ch;
+	char *closing_bracket;
+
+	/* alg_name contains "hmac(xxx)"; we need that xxx. */
+	closing_bracket = strchr(alg_name, ')');
+	if (!closing_bracket) {
+		ch = kmalloc(sizeof(struct crypto_hash), GFP_KERNEL);
+		if (!ch)
+			return ERR_PTR(-ENOMEM);
+		ch->base = crypto_alloc_tfm(alg_name, 0);
+		if (ch->base == NULL) {
+			kfree(ch);
+			return ERR_PTR(-ENOMEM);
+		}
+		return ch;
+	}
+	if (closing_bracket-alg_name < 6)
+		return ERR_PTR(-ENOENT);
+
+	ch = kmalloc(sizeof(struct crypto_hash), GFP_KERNEL);
+	if (!ch)
+		return ERR_PTR(-ENOMEM);
+
+	*closing_bracket = 0;
+	ch->base = crypto_alloc_tfm(alg_name + 5, 0);
+	*closing_bracket = ')';
+
+	if (ch->base == NULL) {
+		kfree(ch);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return ch;
+}
+
+static inline int
+crypto_hash_setkey(struct crypto_hash *hash, const u8 *key, unsigned int keylen)
+{
+	hash->key = key;
+	hash->keylen = keylen;
+
+	return 0;
+}
+
+static inline int
+crypto_hash_digest(struct hash_desc *desc, struct scatterlist *sg,
+		   unsigned int nbytes, u8 *out)
+{
+
+	crypto_hmac(desc->tfm->base, (u8 *)desc->tfm->key,
+		    &desc->tfm->keylen, sg, 1 /* ! */ , out);
+	/* ! this is not generic. Would need to convert nbytes -> nsg */
+
+	return 0;
+}
+
+static inline void crypto_free_hash(struct crypto_hash *tfm)
+{
+	if (!tfm)
+		return;
+	crypto_free_tfm(tfm->base);
+	kfree(tfm);
+}
+
+static inline unsigned int crypto_hash_digestsize(struct crypto_hash *tfm)
+{
+	return crypto_tfm_alg_digestsize(tfm->base);
+}
+
+static inline struct crypto_tfm *crypto_hash_tfm(struct crypto_hash *tfm)
+{
+	return tfm->base;
+}
+
+static inline int crypto_hash_init(struct hash_desc *desc)
+{
+	crypto_digest_init(desc->tfm->base);
+	return 0;
+}
+
+static inline int crypto_hash_update(struct hash_desc *desc,
+				     struct scatterlist *sg,
+				     unsigned int nbytes)
+{
+	crypto_digest_update(desc->tfm->base,sg,1 /* ! */ );
+	/* ! this is not generic. Would need to convert nbytes -> nsg */
+
+	return 0;
+}
+
+static inline int crypto_hash_final(struct hash_desc *desc, u8 *out)
+{
+	crypto_digest_final(desc->tfm->base, out);
+	return 0;
+}
+
+#endif
+
+static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
+{
+#ifdef CRYPTO_ALG_TYPE_HASH_MASK
+	/* see include/linux/crypto.h */
+	return !((crypto_tfm_alg_type(tfm) ^ CRYPTO_ALG_TYPE_HASH)
+		& CRYPTO_ALG_TYPE_HASH_MASK);
+#else
+	return crypto_tfm_alg_type(tfm) == CRYPTO_ALG_TYPE_HASH;
+#endif
+}
+
+
+#ifdef NEED_BACKPORT_OF_KZALLOC
+static inline void *kzalloc(size_t size, int flags)
+{
+	void *rv = kmalloc(size, flags);
+	if (rv)
+		memset(rv, 0, size);
+
+	return rv;
+}
+#endif
+
+/* see upstream commit 2d3854a37e8b767a51aba38ed6d22817b0631e33 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30)
+#ifndef cpumask_bits
+#define nr_cpu_ids NR_CPUS
+#define nr_cpumask_bits nr_cpu_ids
+
+typedef cpumask_t cpumask_var_t[1];
+#define cpumask_bits(maskp) ((unsigned long*)(maskp))
+#define cpu_online_mask &(cpu_online_map)
+
+static inline void cpumask_clear(cpumask_t *dstp)
+{
+	bitmap_zero(cpumask_bits(dstp), NR_CPUS);
+}
+
+static inline int cpumask_equal(const cpumask_t *src1p,
+				const cpumask_t *src2p)
+{
+	return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p),
+						 nr_cpumask_bits);
+}
+
+static inline void cpumask_copy(cpumask_t *dstp,
+				cpumask_t *srcp)
+{
+	bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits);
+}
+
+static inline unsigned int cpumask_weight(const cpumask_t *srcp)
+{
+	return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
+static inline void cpumask_set_cpu(unsigned int cpu, cpumask_t *dstp)
+{
+	set_bit(cpu, cpumask_bits(dstp));
+}
+
+static inline void cpumask_setall(cpumask_t *dstp)
+{
+	bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
+}
+
+static inline void free_cpumask_var(cpumask_var_t mask)
+{
+}
+#endif
+/* see upstream commit 0281b5dc0350cbf6dd21ed558a33cccce77abc02 */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+static inline int zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	return alloc_cpumask_var(mask, flags | __GFP_ZERO);
+}
+#else
+static inline int zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	cpumask_clear(*mask);
+	return 1;
+}
+#endif
+/* see upstream commit cd8ba7cd9be0192348c2836cb6645d9b2cd2bfd2 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+/* As macro because RH has it in 2.6.18-128.4.1.el5, but not exported to modules !?!? */
+#define set_cpus_allowed_ptr(P, NM) set_cpus_allowed(P, *NM)
+#endif
+#endif
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+#define __bitmap_parse(BUF, BUFLEN, ISUSR, MASKP, NMASK) \
+	backport_bitmap_parse(BUF, BUFLEN, ISUSR, MASKP, NMASK)
+
+#define CHUNKSZ                         32
+#define nbits_to_hold_value(val)        fls(val)
+#define unhex(c)                        (isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10))
+
+static inline int backport_bitmap_parse(const char *buf, unsigned int buflen,
+		int is_user, unsigned long *maskp,
+		int nmaskbits)
+{
+	int c, old_c, totaldigits, ndigits, nchunks, nbits;
+	u32 chunk;
+	const char __user *ubuf = buf;
+
+	bitmap_zero(maskp, nmaskbits);
+
+	nchunks = nbits = totaldigits = c = 0;
+	do {
+		chunk = ndigits = 0;
+
+		/* Get the next chunk of the bitmap */
+		while (buflen) {
+			old_c = c;
+			if (is_user) {
+				if (__get_user(c, ubuf++))
+					return -EFAULT;
+			}
+			else
+				c = *buf++;
+			buflen--;
+			if (isspace(c))
+				continue;
+
+			/*
+			 * If the last character was a space and the current
+			 * character isn't '\0', we've got embedded whitespace.
+			 * This is a no-no, so throw an error.
+			 */
+			if (totaldigits && c && isspace(old_c))
+				return -EINVAL;
+
+			/* A '\0' or a ',' signal the end of the chunk */
+			if (c == '\0' || c == ',')
+				break;
+
+			if (!isxdigit(c))
+				return -EINVAL;
+
+			/*
+			 * Make sure there are at least 4 free bits in 'chunk'.
+			 * If not, this hexdigit will overflow 'chunk', so
+			 * throw an error.
+			 */
+			if (chunk & ~((1UL << (CHUNKSZ - 4)) - 1))
+				return -EOVERFLOW;
+
+			chunk = (chunk << 4) | unhex(c);
+			ndigits++; totaldigits++;
+		}
+		if (ndigits == 0)
+			return -EINVAL;
+		if (nchunks == 0 && chunk == 0)
+			continue;
+
+		bitmap_shift_left(maskp, maskp, CHUNKSZ, nmaskbits);
+		*maskp |= chunk;
+		nchunks++;
+		nbits += (nchunks == 1) ? nbits_to_hold_value(chunk) : CHUNKSZ;
+		if (nbits > nmaskbits)
+			return -EOVERFLOW;
+	} while (buflen && c == ',');
+
+	return 0;
+}
+#endif
+
+#ifndef __CHECKER__
+# undef __cond_lock
+# define __cond_lock(x,c) (c)
+#endif
+
+#ifndef KERNEL_HAS_GFP_T
+#define KERNEL_HAS_GFP_T
+typedef unsigned gfp_t;
+#endif
+
+
+/* struct kvec didn't exist before 2.6.8, this is an ugly
+ * #define to work around it ... - jt */
+
+#ifndef KERNEL_HAS_KVEC
+#define kvec iovec
+#endif
+
+#ifndef net_random
+#define random32 net_random
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30)
+#define BDI_async_congested BDI_write_congested
+#define BDI_sync_congested  BDI_read_congested
+#endif
+
+/* see upstream commits
+ * 2d3a4e3666325a9709cc8ea2e88151394e8f20fc (in 2.6.25-rc1)
+ * 59b7435149eab2dd06dd678742faff6049cb655f (in 2.6.26-rc1)
+ * this "backport" does not close the race that lead to the API change,
+ * but only provides an equivalent function call.
+ */
+#ifndef KERNEL_HAS_PROC_CREATE_DATA
+static inline struct proc_dir_entry *proc_create_data(const char *name,
+	mode_t mode, struct proc_dir_entry *parent,
+	struct file_operations *proc_fops, void *data)
+{
+	struct proc_dir_entry *pde = create_proc_entry(name, mode, parent);
+	if (pde) {
+		pde->proc_fops = proc_fops;
+		pde->data = data;
+	}
+	return pde;
+}
+
+#endif
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30)
+#define TP_PROTO(args...)	args
+#define TP_ARGS(args...)		args
+
+#undef DECLARE_TRACE
+#define DECLARE_TRACE(name, proto, args)				\
+	static inline void _do_trace_##name(struct tracepoint *tp, proto) \
+	{ }								\
+	static inline void trace_##name(proto)				\
+	{ }								\
+	static inline int register_trace_##name(void (*probe)(proto))	\
+	{								\
+		return -ENOSYS;						\
+	}								\
+	static inline int unregister_trace_##name(void (*probe)(proto))	\
+	{								\
+		return -ENOSYS;						\
+	}
+
+#undef DEFINE_TRACE
+#define DEFINE_TRACE(name)
+
+#endif
+
+#ifdef NEED_BLK_QUEUE_MAX_HW_SECTORS
+static inline void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max)
+{
+	blk_queue_max_sectors(q, max);
+}
+#elif defined(USE_BLK_QUEUE_MAX_SECTORS_ANYWAYS)
+	/* For kernel versions 2.6.31 to 2.6.33 inclusive, even though
+	 * blk_queue_max_hw_sectors is present, we actually need to use
+	 * blk_queue_max_sectors to set max_hw_sectors. :-(
+	 * RHEL6 2.6.32 chose to be different and already has eliminated
+	 * blk_queue_max_sectors as upstream 2.6.34 did.
+	 */
+#define blk_queue_max_hw_sectors(q, max)	blk_queue_max_sectors(q, max)
+#endif
+
+#ifdef NEED_BLK_QUEUE_MAX_SEGMENTS
+static inline void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
+{
+	blk_queue_max_phys_segments(q, max_segments);
+	blk_queue_max_hw_segments(q, max_segments);
+#define BLK_MAX_SEGMENTS MAX_HW_SEGMENTS /* or max MAX_PHYS_SEGMENTS. Probably does not matter */
+}
+#endif
+
+#ifdef NEED_ATOMIC_ADD_UNLESS
+#ifndef atomic_xchg
+static inline int atomic_xchg(atomic_t *v, int new)
+{
+	return xchg(&v->counter, new);
+}
+#endif
+#ifndef atomic_cmpxchg
+static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+	return cmpxchg(&v->counter, old, new);
+}
+#endif
+
+/**
+ * atomic_add_unless - add unless the number is already a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as @v was not already @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int c, old;
+	c = atomic_read(v);
+	for (;;) {
+		if (unlikely(c == (u)))
+			break;
+		old = atomic_cmpxchg((v), c, c + (a));
+		if (likely(old == c))
+			break;
+		c = old;
+	}
+	return c != (u);
+}
+#endif
+
+#ifdef NEED_BOOL_TYPE
+typedef _Bool                   bool;
+enum {
+	false = 0,
+	true = 1
+};
+#endif
+
+/* REQ_* and BIO_RW_* flags have been moved around in the tree,
+ * and have finally been "merged" with
+ * 7b6d91daee5cac6402186ff224c3af39d79f4a0e and
+ * 7cc015811ef8992dfcce314d0ed9642bc18143d1
+ * We communicate between different systems,
+ * so we have to somehow semantically map the bi_rw flags
+ * bi_rw (some kernel version) -> data packet flags -> bi_rw (other kernel version)
+ */
+
+/* RHEL 6.1 backported FLUSH/FUA as BIO_RW_FLUSH/FUA
+ * and at that time also introduced the defines BIO_FLUSH/FUA.
+ * There is also REQ_FLUSH/FUA, but these do NOT share
+ * the same value space as the bio rw flags, yet.
+ */
+#ifdef BIO_FLUSH
+
+#define DRBD_REQ_FLUSH		(1UL << BIO_RW_FLUSH)
+#define DRBD_REQ_FUA		(1UL << BIO_RW_FUA)
+#define DRBD_REQ_HARDBARRIER	(1UL << BIO_RW_BARRIER)
+#define DRBD_REQ_DISCARD	(1UL << BIO_RW_DISCARD)
+#define DRBD_REQ_SYNC		(1UL << BIO_RW_SYNCIO)
+#define DRBD_REQ_UNPLUG		(1UL << BIO_RW_UNPLUG)
+
+#elif defined(REQ_FLUSH)	/* introduced in 2.6.36,
+				 * now equivalent to bi_rw */
+
+#define DRBD_REQ_SYNC		REQ_SYNC
+#define DRBD_REQ_FLUSH		REQ_FLUSH
+#define DRBD_REQ_FUA		REQ_FUA
+#define DRBD_REQ_DISCARD	REQ_DISCARD
+/* REQ_HARDBARRIER has been around for a long time,
+ * without being directly related to bi_rw.
+ * so the ifdef is only useful inside the ifdef REQ_FLUSH!
+ * commit 7cc0158 (v2.6.36-rc1) made it a bi_rw flag, ...  */
+#ifdef REQ_HARDBARRIER
+#define DRBD_REQ_HARDBARRIER	REQ_HARDBARRIER
+#else
+/* ... but REQ_HARDBARRIER was removed again in 02e031c (v2.6.37-rc4). */
+#define DRBD_REQ_HARDBARRIER	0
+#endif
+
+/* again: testing on this _inside_ the ifdef REQ_FLUSH,
+ * see 721a960 block: kill off REQ_UNPLUG */
+#ifdef REQ_UNPLUG
+#define DRBD_REQ_UNPLUG		REQ_UNPLUG
+#else
+#define DRBD_REQ_UNPLUG		0
+#endif
+
+#else				/* "older", and hopefully not
+				 * "partially backported" kernel */
+
+#if defined(BIO_RW_SYNC)
+/* see upstream commits
+ * 213d9417fec62ef4c3675621b9364a667954d4dd,
+ * 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8
+ * later, the defines even became an enum ;-) */
+#define DRBD_REQ_SYNC		(1UL << BIO_RW_SYNC)
+#define DRBD_REQ_UNPLUG		(1UL << BIO_RW_SYNC)
+#else
+/* cannot test on defined(BIO_RW_SYNCIO), it may be an enum */
+#define DRBD_REQ_SYNC		(1UL << BIO_RW_SYNCIO)
+#define DRBD_REQ_UNPLUG		(1UL << BIO_RW_UNPLUG)
+#endif
+
+#define DRBD_REQ_FLUSH		(1UL << BIO_RW_BARRIER)
+/* REQ_FUA has been around for a longer time,
+ * without a direct equivalent in bi_rw. */
+#define DRBD_REQ_FUA		(1UL << BIO_RW_BARRIER)
+#define DRBD_REQ_HARDBARRIER	(1UL << BIO_RW_BARRIER)
+
+/* we don't support DISCARDs yet, anyway.
+ * cannot test on defined(BIO_RW_DISCARD), it may be an enum */
+#define DRBD_REQ_DISCARD	0
+#endif
+
+/* this results in:
+	bi_rw   -> dp_flags
+
+< 2.6.28
+	SYNC	-> SYNC|UNPLUG
+	BARRIER	-> FUA|FLUSH
+	there is no DISCARD
+2.6.28
+	SYNC	-> SYNC|UNPLUG
+	BARRIER	-> FUA|FLUSH
+	DISCARD	-> DISCARD
+2.6.29
+	SYNCIO	-> SYNC
+	UNPLUG	-> UNPLUG
+	BARRIER	-> FUA|FLUSH
+	DISCARD	-> DISCARD
+2.6.36
+	SYNC	-> SYNC
+	UNPLUG	-> UNPLUG
+	FUA	-> FUA
+	FLUSH	-> FLUSH
+	DISCARD	-> DISCARD
+--------------------------------------
+	dp_flags   -> bi_rw
+< 2.6.28
+	SYNC	-> SYNC (and unplug)
+	UNPLUG	-> SYNC (and unplug)
+	FUA	-> BARRIER
+	FLUSH	-> BARRIER
+	there is no DISCARD,
+	it will be silently ignored on the receiving side.
+2.6.28
+	SYNC	-> SYNC (and unplug)
+	UNPLUG	-> SYNC (and unplug)
+	FUA	-> BARRIER
+	FLUSH	-> BARRIER
+	DISCARD -> DISCARD
+	(if that fails, we handle it like any other IO error)
+2.6.29
+	SYNC	-> SYNCIO
+	UNPLUG	-> UNPLUG
+	FUA	-> BARRIER
+	FLUSH	-> BARRIER
+	DISCARD -> DISCARD
+2.6.36
+	SYNC	-> SYNC
+	UNPLUG	-> UNPLUG
+	FUA	-> FUA
+	FLUSH	-> FLUSH
+	DISCARD	-> DISCARD
+
+NOTE: DISCARDs likely need some work still.  We should actually never see
+DISCARD requests, as our queue does not announce QUEUE_FLAG_DISCARD yet.
+*/
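+
+/* To make the mapping concrete: a sketch of the translation the
+ * DRBD_REQ_* macros above enable (the DP_* wire-flag names and this
+ * helper are illustrative; they need not match the code in this patch):
+ *
+ *	static u32 bio_flags_to_wire(unsigned long bi_rw)
+ *	{
+ *		u32 dp_flags = 0;
+ *
+ *		if (bi_rw & DRBD_REQ_SYNC)
+ *			dp_flags |= DP_RW_SYNC;
+ *		if (bi_rw & DRBD_REQ_FUA)
+ *			dp_flags |= DP_FUA;
+ *		if (bi_rw & DRBD_REQ_FLUSH)
+ *			dp_flags |= DP_FLUSH;
+ *		if (bi_rw & DRBD_REQ_DISCARD)
+ *			dp_flags |= DP_DISCARD;
+ *		return dp_flags;
+ *	}
+ *
+ * The receiving side does the inverse, OR-ing its own kernel's
+ * DRBD_REQ_* values back into bi_rw; that is how the two halves of the
+ * table above stay consistent across kernel versions.
+ */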
+
+#ifndef COMPLETION_INITIALIZER_ONSTACK
+#define COMPLETION_INITIALIZER_ONSTACK(work) \
+	({ init_completion(&work); work; })
+#endif
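+
+/* Usage, as in mainline: declare and initialize a completion on the stack:
+ *
+ *	struct completion done = COMPLETION_INITIALIZER_ONSTACK(done);
+ */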
+
+#ifdef NEED_SCHEDULE_TIMEOUT_INTERR
+static inline signed long schedule_timeout_interruptible(signed long timeout)
+{
+	__set_current_state(TASK_INTERRUPTIBLE);
+	return schedule_timeout(timeout);
+}
+
+static inline signed long schedule_timeout_uninterruptible(signed long timeout)
+{
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	return schedule_timeout(timeout);
+}
+#endif
+
+#ifndef CONFIG_DYNAMIC_DEBUG
+/* At least in 2.6.34 the function-like macro dynamic_dev_dbg() is broken when
+   compiling without CONFIG_DYNAMIC_DEBUG: it has 'format' in the argument list
+   but references 'fmt' in its body. */
+#ifdef dynamic_dev_dbg
+#undef dynamic_dev_dbg
+#define dynamic_dev_dbg(dev, fmt, ...)                               \
+        do { if (0) dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); } while (0)
+#endif
+#endif
+
+#ifndef min_not_zero
+#define min_not_zero(x, y) ({			\
+	typeof(x) __x = (x);			\
+	typeof(y) __y = (y);			\
+	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
+#endif
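+
+/* Usage sketch (hypothetical values): min_not_zero() is handy when
+ * stacking queue limits, where 0 conventionally means "no limit":
+ *
+ *	unsigned int a = 0, b = 512;
+ *	unsigned int lim = min_not_zero(a, b);
+ *
+ * lim ends up 512, whereas a plain min(a, b) would wrongly pick the
+ * "unlimited" 0.
+ */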
+
+/* Introduced with 2.6.26. See include/linux/jiffies.h */
+#ifndef time_is_before_eq_jiffies
+#define time_is_before_jiffies(a) time_after(jiffies, a)
+#define time_is_after_jiffies(a) time_before(jiffies, a)
+#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)
+#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)
+#endif
+
+#ifndef time_in_range
+#define time_in_range(a,b,c) \
+	(time_after_eq(a,b) && \
+	 time_before_eq(a,c))
+#endif
+
+#ifdef COMPAT_HAVE_BIOSET_CREATE
+#ifndef COMPAT_HAVE_BIOSET_CREATE_FRONT_PAD
+/*
+ * upstream commit (included in 2.6.29)
+ * commit bb799ca0202a360fa74d5f17039b9100caebdde7
+ * Author: Jens Axboe <jens.axboe@oracle.com>
+ * Date:   Wed Dec 10 15:35:05 2008 +0100
+ *
+ *     bio: allow individual slabs in the bio_set
+ *
+ * does
+ * -struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
+ * +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
+ *
+ * Note that up until 2.6.21 inclusive, it was
+ * struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
+ * so if we want to support old kernels (RHEL5), we will need an additional compat check.
+ *
+ * This also means that we must not use the front_pad trick as long as we want
+ * to keep compatibility with < 2.6.29.
+ */
+#ifdef COMPAT_BIOSET_CREATE_HAS_THREE_PARAMETERS
+#define bioset_create(pool_size, front_pad)    bioset_create(pool_size, pool_size, 1)
+#else
+#define bioset_create(pool_size, front_pad)    bioset_create(pool_size, pool_size)
+#endif
+#endif /* COMPAT_HAVE_BIOSET_CREATE_FRONT_PAD */
+#else /* COMPAT_HAVE_BIOSET_CREATE */
+/* Old kernel, no bioset_create at all!
+ * Just do plain bio_alloc, and forget about the dedicated bioset */
+static inline struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
+{
+	return NULL;
+}
+static inline void bioset_free(struct bio_set *bs)
+{
+	BUG();
+}
+static inline void bio_free(struct bio *bio, struct bio_set *bs)
+{
+	BUG();
+}
+static inline struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+{
+	BUG();
+	return NULL;
+}
+#endif /* COMPAT_HAVE_BIOSET_CREATE */
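+
+/* Sketch of how a caller copes with the no-bioset fallback above
+ * (illustrative only; the pool size and nr_iovecs are made up):
+ *
+ *	struct bio_set *bs = bioset_create(128, 0);
+ *	struct bio *bio;
+ *
+ *	if (bs)
+ *		bio = bio_alloc_bioset(GFP_NOIO, nr_iovecs, bs);
+ *	else
+ *		bio = bio_alloc(GFP_NOIO, nr_iovecs);
+ *
+ * i.e. a NULL bio_set is not an error here, it merely means "use the
+ * global bio pool".
+ */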
+
+/*
+ * In commit c4945b9e (v2.6.39-rc1), the little-endian bit operations have been
+ * renamed to be less weird.
+ */
+#ifndef COMPAT_HAVE_FIND_NEXT_ZERO_BIT_LE
+#define find_next_zero_bit_le(addr, size, offset) \
+	generic_find_next_zero_le_bit(addr, size, offset)
+#define find_next_bit_le(addr, size, offset) \
+	generic_find_next_le_bit(addr, size, offset)
+#define test_bit_le(nr, addr) \
+	generic_test_le_bit(nr, addr)
+#define __test_and_set_bit_le(nr, addr) \
+	generic___test_and_set_le_bit(nr, addr)
+#define __test_and_clear_bit_le(nr, addr) \
+	generic___test_and_clear_le_bit(nr, addr)
+#endif
+
+#ifdef COMPAT_KREF_PUT_HAS_SINGLE_ARG
+#define kref_put(KREF, RELEASE)	({ (KREF)->release=RELEASE; kref_put(KREF); })
+#endif
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/lru_cache.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/lru_cache.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/drbd/lru_cache.c	2015-01-21 12:02:58.389823779 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/drbd/lru_cache.c	2015-01-21 12:02:58.389823779 +0300
@@ -0,0 +1,540 @@
+/*
+   lru_cache.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/string.h> /* for memset */
+#include <linux/seq_file.h> /* for seq_printf */
+#include <linux/lru_cache.h>
+
+/* this is a developer aid only.
+ * it catches concurrent access (lack of locking on the user's part) */
+#define PARANOIA_ENTRY() do {		\
+	BUG_ON(!lc);			\
+	BUG_ON(!lc->nr_elements);	\
+	BUG_ON(test_and_set_bit(__LC_PARANOIA, &lc->flags)); \
+} while (0)
+
+#define RETURN(x...)     do { \
+	clear_bit(__LC_PARANOIA, &lc->flags); \
+	smp_mb__after_clear_bit(); return x ; } while (0)
+
+/* BUG() if e is not one of the elements tracked by lc */
+#define PARANOIA_LC_ELEMENT(lc, e) do {	\
+	struct lru_cache *lc_ = (lc);	\
+	struct lc_element *e_ = (e);	\
+	unsigned i = e_->lc_index;	\
+	BUG_ON(i >= lc_->nr_elements);	\
+	BUG_ON(lc_->lc_element[i] != e_); } while (0)
+
+/**
+ * lc_create - prepares to track objects in an active set
+ * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details
+ * @cache: kmem_cache from which the tracked objects are allocated
+ * @e_count: number of elements allowed to be active simultaneously
+ * @e_size: size of the tracked objects
+ * @e_off: offset to the &struct lc_element member in a tracked object
+ *
+ * Returns a pointer to a newly initialized struct lru_cache on success,
+ * or NULL on (allocation) failure.
+ */
+struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
+		unsigned e_count, size_t e_size, size_t e_off)
+{
+	struct hlist_head *slot = NULL;
+	struct lc_element **element = NULL;
+	struct lru_cache *lc;
+	struct lc_element *e;
+	unsigned cache_obj_size = kmem_cache_size(cache);
+	unsigned i;
+
+	WARN_ON(cache_obj_size < e_size);
+	if (cache_obj_size < e_size)
+		return NULL;
+
+	/* e_count too big; it would probably fail the allocation below anyway.
+	 * for typical use cases, e_count should be a few thousand at most. */
+	if (e_count > LC_MAX_ACTIVE)
+		return NULL;
+
+	slot = kzalloc(e_count * sizeof(struct hlist_head), GFP_KERNEL);
+	if (!slot)
+		goto out_fail;
+	element = kzalloc(e_count * sizeof(struct lc_element *), GFP_KERNEL);
+	if (!element)
+		goto out_fail;
+
+	lc = kzalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc)
+		goto out_fail;
+
+	INIT_LIST_HEAD(&lc->in_use);
+	INIT_LIST_HEAD(&lc->lru);
+	INIT_LIST_HEAD(&lc->free);
+
+	lc->name = name;
+	lc->element_size = e_size;
+	lc->element_off = e_off;
+	lc->nr_elements = e_count;
+	lc->new_number = LC_FREE;
+	lc->lc_cache = cache;
+	lc->lc_element = element;
+	lc->lc_slot = slot;
+
+	/* preallocate all objects */
+	for (i = 0; i < e_count; i++) {
+		void *p = kmem_cache_alloc(cache, GFP_KERNEL);
+		if (!p)
+			break;
+		memset(p, 0, lc->element_size);
+		e = p + e_off;
+		e->lc_index = i;
+		e->lc_number = LC_FREE;
+		list_add(&e->list, &lc->free);
+		element[i] = e;
+	}
+	if (i == e_count)
+		return lc;
+
+	/* else: could not allocate all elements, give up */
+	while (i--) {
+		void *p = element[i];
+		kmem_cache_free(cache, p - e_off);
+	}
+	kfree(lc);
+out_fail:
+	kfree(element);
+	kfree(slot);
+	return NULL;
+}
+
+void lc_free_by_index(struct lru_cache *lc, unsigned i)
+{
+	void *p = lc->lc_element[i];
+	WARN_ON(!p);
+	if (p) {
+		p -= lc->element_off;
+		kmem_cache_free(lc->lc_cache, p);
+	}
+}
+
+/**
+ * lc_destroy - frees memory allocated by lc_create()
+ * @lc: the lru cache to destroy
+ */
+void lc_destroy(struct lru_cache *lc)
+{
+	unsigned i;
+	if (!lc)
+		return;
+	for (i = 0; i < lc->nr_elements; i++)
+		lc_free_by_index(lc, i);
+	kfree(lc->lc_element);
+	kfree(lc->lc_slot);
+	kfree(lc);
+}
+
+/**
+ * lc_reset - does a full reset for @lc and the hash table slots.
+ * @lc: the lru cache to operate on
+ *
+ * It is roughly the equivalent of re-allocating a fresh lru_cache object,
+ * basically a shortcut to lc_destroy(lc); lc = lc_create(...);
+ */
+void lc_reset(struct lru_cache *lc)
+{
+	unsigned i;
+
+	INIT_LIST_HEAD(&lc->in_use);
+	INIT_LIST_HEAD(&lc->lru);
+	INIT_LIST_HEAD(&lc->free);
+	lc->used = 0;
+	lc->hits = 0;
+	lc->misses = 0;
+	lc->starving = 0;
+	lc->dirty = 0;
+	lc->changed = 0;
+	lc->flags = 0;
+	lc->changing_element = NULL;
+	lc->new_number = LC_FREE;
+	memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements);
+
+	for (i = 0; i < lc->nr_elements; i++) {
+		struct lc_element *e = lc->lc_element[i];
+		void *p = e;
+		p -= lc->element_off;
+		memset(p, 0, lc->element_size);
+		/* re-init it */
+		e->lc_index = i;
+		e->lc_number = LC_FREE;
+		list_add(&e->list, &lc->free);
+	}
+}
+
+/**
+ * lc_seq_printf_stats - print stats about @lc into @seq
+ * @seq: the seq_file to print into
+ * @lc: the lru cache to print statistics of
+ */
+size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
+{
+	/* NOTE:
+	 * total calls to lc_get are
+	 * (starving + hits + misses)
+	 * misses include the "dirty" count (an update from another thread
+	 * was in progress) and "changed", when this in fact led to a
+	 * successful update of the cache.
+	 */
+	return seq_printf(seq, "\t%s: used:%u/%u "
+		"hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
+		lc->name, lc->used, lc->nr_elements,
+		lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
+}
+
+static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
+{
+	return lc->lc_slot + (enr % lc->nr_elements);
+}
+
+
+/**
+ * lc_find - find element by label, if present in the hash table
+ * @lc: The lru_cache object
+ * @enr: element number
+ *
+ * Returns the pointer to an element, if the element with the requested
+ * "label" or element number is present in the hash table,
+ * or NULL if not found. Does not change the refcnt.
+ */
+struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
+{
+	struct hlist_node *n;
+	struct lc_element *e;
+
+	BUG_ON(!lc);
+	BUG_ON(!lc->nr_elements);
+	hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
+		if (e->lc_number == enr)
+			return e;
+	}
+	return NULL;
+}
+
+/* returned element will be "recycled" immediately */
+static struct lc_element *lc_evict(struct lru_cache *lc)
+{
+	struct list_head  *n;
+	struct lc_element *e;
+
+	if (list_empty(&lc->lru))
+		return NULL;
+
+	n = lc->lru.prev;
+	e = list_entry(n, struct lc_element, list);
+
+	PARANOIA_LC_ELEMENT(lc, e);
+
+	list_del(&e->list);
+	hlist_del(&e->colision);
+	return e;
+}
+
+/**
+ * lc_del - removes an element from the cache
+ * @lc: The lru_cache object
+ * @e: The element to remove
+ *
+ * @e must be unused (refcnt == 0). Moves @e from "lru" to "free" list,
+ * sets @e->lc_number to %LC_FREE.
+ */
+void lc_del(struct lru_cache *lc, struct lc_element *e)
+{
+	PARANOIA_ENTRY();
+	PARANOIA_LC_ELEMENT(lc, e);
+	BUG_ON(e->refcnt);
+
+	e->lc_number = LC_FREE;
+	hlist_del_init(&e->colision);
+	list_move(&e->list, &lc->free);
+	RETURN();
+}
+
+static struct lc_element *lc_get_unused_element(struct lru_cache *lc)
+{
+	struct list_head *n;
+
+	if (list_empty(&lc->free))
+		return lc_evict(lc);
+
+	n = lc->free.next;
+	list_del(n);
+	return list_entry(n, struct lc_element, list);
+}
+
+static int lc_unused_element_available(struct lru_cache *lc)
+{
+	if (!list_empty(&lc->free))
+		return 1; /* something on the free list */
+	if (!list_empty(&lc->lru))
+		return 1;  /* something to evict */
+
+	return 0;
+}
+
+
+/**
+ * lc_get - get element by label, maybe change the active set
+ * @lc: the lru cache to operate on
+ * @enr: the label to look up
+ *
+ * Finds an element in the cache, increases its usage count,
+ * "touches" and returns it.
+ *
+ * In case the requested number is not present, it needs to be added to the
+ * cache. Therefore it is possible that another element gets evicted from
+ * the cache. In either case, the user is notified, so he is able to e.g. keep
+ * a persistent log of the cache changes, and therefore of the objects in use.
+ *
+ * Return values:
+ *  NULL
+ *     The cache was marked %LC_STARVING,
+ *     or the requested label was not in the active set
+ *     and a changing transaction is still pending (@lc was marked %LC_DIRTY).
+ *     Or no unused or free element could be recycled (@lc will be marked as
+ *     %LC_STARVING, blocking further lc_get() operations).
+ *
+ *  pointer to the element with the REQUESTED element number.
+ *     In this case, it can be used right away
+ *
+ *  pointer to an UNUSED element with some different element number,
+ *          where that different number may also be %LC_FREE.
+ *
+ *          In this case, the cache is marked %LC_DIRTY (blocking further changes),
+ *          and the returned element pointer is removed from the lru list and
+ *          hash collision chains.  The user now should do whatever housekeeping
+ *          is necessary.
+ *          Then he must call lc_changed(lc, element_pointer) to finish
+ *          the change.
+ *
+ * NOTE: The user needs to check the lc_number on EACH use, so he recognizes
+ *       any cache set change.
+ */
+struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
+{
+	struct lc_element *e;
+
+	PARANOIA_ENTRY();
+	if (lc->flags & LC_STARVING) {
+		++lc->starving;
+		RETURN(NULL);
+	}
+
+	e = lc_find(lc, enr);
+	if (e) {
+		++lc->hits;
+		if (e->refcnt++ == 0)
+			lc->used++;
+		list_move(&e->list, &lc->in_use); /* Not evictable... */
+		RETURN(e);
+	}
+
+	++lc->misses;
+
+	/* In case there is nothing available and we cannot kick out
+	 * the LRU element, we have to wait ...
+	 */
+	if (!lc_unused_element_available(lc)) {
+		__set_bit(__LC_STARVING, &lc->flags);
+		RETURN(NULL);
+	}
+
+	/* it was not present in the active set.
+	 * we are going to recycle an unused (or even "free") element.
+	 * user may need to commit a transaction to record that change.
+	 * we serialize on flags & LC_DIRTY */
+	if (test_and_set_bit(__LC_DIRTY, &lc->flags)) {
+		++lc->dirty;
+		RETURN(NULL);
+	}
+
+	e = lc_get_unused_element(lc);
+	BUG_ON(!e);
+
+	clear_bit(__LC_STARVING, &lc->flags);
+	BUG_ON(++e->refcnt != 1);
+	lc->used++;
+
+	lc->changing_element = e;
+	lc->new_number = enr;
+
+	RETURN(e);
+}
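+
+/* Typical calling protocol, a sketch derived from the lc_get()
+ * documentation above (my_lock, write_transaction() and the retry
+ * policy are hypothetical, the user's own business):
+ *
+ *	spin_lock(&my_lock);
+ *	e = lc_get(lc, enr);
+ *	if (e && e->lc_number != enr) {
+ *		write_transaction(lc, e);
+ *		lc_changed(lc, e);
+ *	}
+ *	spin_unlock(&my_lock);
+ *	if (!e)
+ *		wait until LC_STARVING/LC_DIRTY clear, then retry;
+ *
+ *	use the element; when done with it:
+ *	lc_put(lc, e);
+ */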
+
+/* Similar to lc_get,
+ * but only gets a new reference on an existing element.
+ * You either get the requested element, or NULL.
+ * Will be consolidated into one function.
+ */
+struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
+{
+	struct lc_element *e;
+
+	PARANOIA_ENTRY();
+	if (lc->flags & LC_STARVING) {
+		++lc->starving;
+		RETURN(NULL);
+	}
+
+	e = lc_find(lc, enr);
+	if (e) {
+		++lc->hits;
+		if (e->refcnt++ == 0)
+			lc->used++;
+		list_move(&e->list, &lc->in_use); /* Not evictable... */
+	}
+	RETURN(e);
+}
+
+/**
+ * lc_changed - tell @lc that the change has been recorded
+ * @lc: the lru cache to operate on
+ * @e: the element pending label change
+ */
+void lc_changed(struct lru_cache *lc, struct lc_element *e)
+{
+	PARANOIA_ENTRY();
+	BUG_ON(e != lc->changing_element);
+	PARANOIA_LC_ELEMENT(lc, e);
+	++lc->changed;
+	e->lc_number = lc->new_number;
+	list_add(&e->list, &lc->in_use);
+	hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number));
+	lc->changing_element = NULL;
+	lc->new_number = LC_FREE;
+	clear_bit(__LC_DIRTY, &lc->flags);
+	smp_mb__after_clear_bit();
+	RETURN();
+}
+
+
+/**
+ * lc_put - give up refcnt of @e
+ * @lc: the lru cache to operate on
+ * @e: the element to put
+ *
+ * If refcnt reaches zero, the element is moved to the lru list,
+ * and %LC_STARVING (if set) is cleared.
+ * Returns the new (post-decrement) refcnt.
+ */
+unsigned int lc_put(struct lru_cache *lc, struct lc_element *e)
+{
+	PARANOIA_ENTRY();
+	PARANOIA_LC_ELEMENT(lc, e);
+	BUG_ON(e->refcnt == 0);
+	BUG_ON(e == lc->changing_element);
+	if (--e->refcnt == 0) {
+		/* move it to the front of LRU. */
+		list_move(&e->list, &lc->lru);
+		lc->used--;
+		clear_bit(__LC_STARVING, &lc->flags);
+		smp_mb__after_clear_bit();
+	}
+	RETURN(e->refcnt);
+}
+
+/**
+ * lc_element_by_index
+ * @lc: the lru cache to operate on
+ * @i: the index of the element to return
+ */
+struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i)
+{
+	BUG_ON(i >= lc->nr_elements);
+	BUG_ON(lc->lc_element[i] == NULL);
+	BUG_ON(lc->lc_element[i]->lc_index != i);
+	return lc->lc_element[i];
+}
+
+/**
+ * lc_index_of
+ * @lc: the lru cache to operate on
+ * @e: the element to query for its index position in lc->lc_element
+ */
+unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e)
+{
+	PARANOIA_LC_ELEMENT(lc, e);
+	return e->lc_index;
+}
+
+/**
+ * lc_set - associate index with label
+ * @lc: the lru cache to operate on
+ * @enr: the label to set
+ * @index: the element index to associate label with.
+ *
+ * Used to initialize the active set to some previously recorded state.
+ */
+void lc_set(struct lru_cache *lc, unsigned int enr, int index)
+{
+	struct lc_element *e;
+
+	if (index < 0 || index >= lc->nr_elements)
+		return;
+
+	e = lc_element_by_index(lc, index);
+	e->lc_number = enr;
+
+	hlist_del_init(&e->colision);
+	hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
+	list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
+}
+
+/**
+ * lc_seq_dump_details - Dump a complete LRU cache to seq in textual form.
+ * @lc: the lru cache to operate on
+ * @seq: the &struct seq_file pointer to seq_printf into
+ * @utext: user supplied "heading" or other info
+ * @detail: function pointer the user may provide to dump further details
+ * of the object the lc_element is embedded in.
+ */
+void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
+	     void (*detail) (struct seq_file *, struct lc_element *))
+{
+	unsigned int nr_elements = lc->nr_elements;
+	struct lc_element *e;
+	int i;
+
+	seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
+	for (i = 0; i < nr_elements; i++) {
+		e = lc_element_by_index(lc, i);
+		if (e->lc_number == LC_FREE) {
+			seq_printf(seq, "\t%2d: FREE\n", i);
+		} else {
+			seq_printf(seq, "\t%2d: %4u %4u    ", i,
+				   e->lc_number, e->refcnt);
+			detail(seq, e);
+		}
+	}
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/ASSUMPTIONS linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/ASSUMPTIONS
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/ASSUMPTIONS	2015-01-21 12:02:54.707921511 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/ASSUMPTIONS	2015-01-21 12:02:54.707921511 +0300
@@ -0,0 +1,23 @@
+1. When mapping is not available (still not allocated or just
+   not read from disk), the bio preparation stage allows bios only up
+   to the virtual block boundary. In this case bulk writeback
+   self-synchronizes to sending bios spanning the whole block, and we do
+   not have to waste time/io padding those bios to the block size when
+   creating a new block, or reading the original block when making a delta.
+
+2. The block size is calculated from the block size of the top delta.
+
+   It would be useful to have e.g. a base image with block=128K
+   and deltas with block=4K. In this case we quantize everything
+   to 4K.
+
+   Here is the assumption: successive deltas cannot increase block
+   size. NB: a raw image has infinite block size.
+
+   BUT! When we start a new delta, some bios can be already prepared,
+   and in this case we can see a bio spanning several blocks.
+   Right now we do not support this, therefore all the deltas
+   and the base image must have the same block size. As for a
+   raw image, we just assign a default block size to the base image
+   and all the deltas must inherit it.
+
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/Makefile
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/Makefile	2015-01-21 12:02:54.707921511 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/Makefile	2015-01-21 12:02:55.548899187 +0300
@@ -0,0 +1,23 @@
+#
+# Makefile for Parallels loop device
+#
+CFLAGS_io_direct.o = -I$(src)
+CFLAGS_ploop_events.o = -I$(src)
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= ploop.o
+ploop-objs := dev.o map.o io.o sysfs.o tracker.o freeblks.o ploop_events.o discard.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pfmt_ploop1.o
+pfmt_ploop1-objs := fmt_ploop1.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pfmt_raw.o
+pfmt_raw-objs := fmt_raw.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pio_direct.o
+pio_direct-objs := io_direct.o io_direct_map.o compat.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pio_kaio.o
+pio_kaio-objs := io_kaio.o io_kaio_map.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pio_nfs.o
+pio_nfs-objs := io_nfs.o
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/QUESTIONS linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/QUESTIONS
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/QUESTIONS	2015-01-21 12:02:54.707921511 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/QUESTIONS	2015-01-21 12:02:54.707921511 +0300
@@ -0,0 +1,21 @@
+1. It seems Axboe's fast loop uses GFP_KERNEL in the loop thread. What the fuck?
+
+2. bio_add_page() is buggy. When an existing bvec is extended, it passes
+   some crap to merge_bvec_fn
+
+3. BIO_RW_AHEAD. Nobody uses it. Should we implement this?
+
+4. Switching io_context. Dubious. Definitely broken in 2.6.18
+
+5. current->backing_dev_info. Investigate.
+
+6. Waiting for a free request. Shouldn't we redo all the checks after waiting?
+
+7. Cached write is used when allocating new index cluster. Should we
+   get rid of this?
+
+8. Investigate *congestion* methods. We should do something, no doubt.
+
+9. Can/should we register_shrinker() to balance our internal caches?
+
+
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/STATE_MACHINE linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/STATE_MACHINE
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/STATE_MACHINE	2015-01-21 12:02:54.707921511 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/STATE_MACHINE	2015-01-21 12:02:55.774893189 +0300
@@ -0,0 +1,63 @@
+Finite set of requests. Each incoming bio is either redirected directly,
+or attached to a request. Both cases are accounted in "active_reqs".
+
+Requests can be:
+
+- in entry_queue
+	Idle requests, still not processed.
+	The main thread fetches them one by one, only if ready_queue is empty.
+	If the head of the entry queue is a barrier, the machine processes only
+	ready_queue until all active requests are drained.
+
+- in ready_queue
+	Requests which are ready for the next step or for completion.
+
+- in one of internal queues
+	Some private logic is responsible for placing them into ready_queue,
+	when it is ready to proceed.
+
+- in flight, attached only to a set of bios. In this case, after completion
+  of all the child bios, it moves to some internal queue or to ready_queue
+  via the bio completion callback.
+
+
+States of main process:
+
+- Pre-barrier flush. A barrier request is at the head of the entry queue.
+  The entry queue is not processed. ready_queue is processed. This state
+  is exited when "active_reqs" drops to zero; then Barrier state is entered.
+
+- Barrier state. Entry queue is frozen. ready_queue is processed until
+  "active_reqs" is zero.
+
+- Running. If something is on ready_queue, it is processed.
+  If ready_queue is empty, entry_queue is checked.
+  If we see a barrier, we enter Pre-barrier flush.
+
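+A sketch of the main loop in C (helper names made up for illustration;
+the real code lives in dev.c):
+
+	for (;;) {
+		if (!list_empty(&plo->ready_queue))
+			process_one(plo, pick(&plo->ready_queue));
+		else if (entry_head_is_barrier(plo))
+			enter_pre_barrier_flush(plo); /* drain active_reqs */
+		else if (!list_empty(&plo->entry_queue))
+			process_one(plo, pick(&plo->entry_queue));
+		else
+			sleep_until_woken(plo);
+	}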
+
+EVENTS:
+
+- Mitigation timer expires:
+	if (sleeping && entry_queue && !barrier_state)
+		wakeup()
+
+- Fast bio completion
+	--active_reqs
+	if (active_reqs == 0 && sleeping && entry_queue && !barrier_state)
+		wakeup();
+
+- Bio completion
+	add preq to ready queue
+	if (sleeping)
+		wakeup();
+
+
+Request states.
+
+1. E_ENTRY
+	Request is still not processed, it is either in entry queue or
+	in some internal queue, or it returned from that internal
+	queue to ready_queue. In any case, processing must be started
+	from the very beginning.
+
+2.
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/TODO linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/TODO
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/TODO	2015-01-21 12:02:54.707921511 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/TODO	2015-01-21 12:02:55.774893189 +0300
@@ -0,0 +1,171 @@
+1. DM-like snapshots
+--------------------
+
+   Not done because WILL NOT DO. This functionality is
+   available, if we use DM over ploop or acronis trueimage on ploop
+   device.
+
+2. Tracking changes
+-------------------
+
+   [ Done. Approach B. ]
+
+   This functionality is required to implement
+   iterative migration. A naive implementation is very easy: just
+   a huge bitmap of dirty bits and an ioctl to read&clear a chunk of this map.
+   The real implementation can become tricky, because we definitely
+   will not want to copy all the data from the original image etc. etc.
+
+   Not done for three reasons: first, it requires non-trivial
+   support from migration tools (similar to vzfs-track+rsync);
+   second, it is easy to add at any moment; third, it is not clear
+   that this is required: the case is covered by item #3 in this list,
+   suboptimally, but covered.
+
+   Anyway, even here we have a fork:
+
+   A. We can track virtual sectors.
+   B. We can track offsets in backing files.
+
+   Fork (B) allows copying raw images via the network (an obvious speedup),
+	but those images can contain lots of dead data: unused and covered
+	by deltas.
+   Fork (A) allows copying only the relevant data, dropping dead deltas
+	and even omitting space not used by the fs,
+	but then we lose snapshots and cannot stream raw files.
+
+3. Merging writable delta
+-------------------------
+
+   [Done]
+
+   Also a MUST for migration with a shared image, where all the changes
+   after the image was frozen go to a delta.
+   Obviously, we must not continue to work over the delta after
+   migration is complete.
+
+   The only solution that I see is a new kind of delta, a "transparent delta".
+   It is a super-top delta above the top delta. We read from it, but we
+   write to the top delta, and a background process copies data from
+   super-top to top. It seems the implementation is quite hairy.
+   One idea is to keep two maps: one is normal, not considering
+   super-top at all, another is just for super-top. Each incoming
+   request looks up the super-top map first; if it does not find anything,
+   it proceeds along the normal path. If it finds an entry and this is a
+   read, it is just a read. If it is a write, it must read data from
+   super-top (if not a whole cluster), write/allocate data in the normal
+   map, update the normal map, and upon completion clear the map entry in
+   super-top. The background process just scans super-top and triggers
+   zero-length writes on everything it finds. After a single pass the
+   super-top delta is clean.
+
+   A more general approach. If we need to get rid of multiple deltas,
+   it is utterly stupid to merge deltas one by one. Instead we
+   can split the delta list into two parts: folding and normal. Folding
+   deltas are the batch of deltas to get rid of; they are all folded
+   into the top-level normal delta. Now it is obvious that we must have
+   two maps: normal and folding. Outline of the algorithm:
+
+	1. Look up the folding map. If there is no map, proceed along the
+	   normal path.
+	2. Otherwise, schedule a read of the folding map.
+	3. When the map is calculated: if it is a read request, just read.
+	4. If it is a write, schedule a read from the folded delta.
+	   The cluster is locked out.
+	5. When it is complete, copy the new data there and do the same
+	   things as we used to do in the normal path, but after completion
+	   of the update of the normal index (or completion of the write, if
+	   the index is not changed), we schedule zeroing of the folded delta
+	   index. Here is a problem: we cannot just zero the folded index, it
+	   can uncover an obsolete entry in a lower folded delta. It seems we
+	   have to reserve a special index value to mark an invalidated entry.
+
+    OK, the plan is ready, but technically it is not easy to accomplish.
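+
+    A dispatch sketch for steps 1-4 above (entirely hypothetical: neither
+    the folding map nor any of these helper functions exists yet):
+
+	if (!folding_map_has(plo, preq->req_cluster))
+		return process_normal_path(plo, preq);
+	schedule_folding_map_read(plo, preq);
+	/* ...later, from the map-read completion callback: */
+	if (!(preq->req_rw & WRITE))
+		return submit_read(plo, preq);
+	lock_out_cluster(plo, preq->req_cluster);
+	schedule_read_from_folded_delta(plo, preq);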
+
+4. Shrinking image
+------------------
+
+   [ Done ]
+
+   The naive solution is mostly trivial. We snapshot the image, syncing
+   the journal, then scan the image in user space to collect unused blocks
+   and to plan the optimal order of copies. The process of shrinking has to
+   be done by the kernel, because we must switch maps atomically, but all the
+   logically non-trivial part resides in user space. Then we can merge the
+   accumulated delta back (and spoil the image again :-))
+
+   Reality hurts. Such a merge will result in a severely reordered image
+   unless we copy all of it to a new place. Copying everything is optimal
+   from the viewpoint of future performance, but expensive. Naive shrinking
+   will kill performance, but it is quick. Apparently, the correct solution
+   is somewhere in between. Where?
+
+
+5. Reliability of EXT3 when we are out of disk space
+----------------------------------------------------
+
+   Right now it behaves very well from the viewpoint of image integrity.
+   No errors after replaying the journal. But when we run out of space the
+   kernel spews a lot of errors; this behaviour is still not acceptable.
+
+6. Alignment disaster
+---------------------
+
+If ploop is partitioned, e.g. for use with a Parallels VM, by default
+each partition starts at an odd sector offset, which means writes are
+suboptimal. The solutions are:
+
+	A. Give a "correct" disk geometry to the VM. Not the crippled LBA
+	   default, but something with tracks aligned at least to pages.
+	B. dmonakhov's idea. Each cluster is augmented with 8 additional
+	   sectors. Alignment is established by the first write.
+	   1 sector of the 8 is not used and can store, e.g., some
+	   metainformation: checksums, a back reference from the cluster
+	   to its location in the image, which would allow recovery even
+	   if indices are corrupted. Drawback: severe impact on performance
+	   for unknown reasons, apparently something specific to modern SATA
+	   disks.
+	C. Ignore the issue. Only growing images are impacted.
+	   And frankly, growing == bad performance in any case.
+
+
+Take C for now. The impact on performance in compilebench and bonnie
+is practically invisible.
+
+
+7. ext3 block allocation eats disk space
+----------------------------------------
+
+The ext3 crock quickly sweeps the whole disk, even if you delete and
+create the same set of files. This is creepy. Moreover,
+rm followed by creat almost always allocates a new set of blocks,
+because the transaction truncating the file is still not committed. Grr...
+
+This means growing images with ext3 inside are just nonsense.
+
+	One solution (ignoring the obvious: never use growing images)
+	is to create a minimal-size ext3 and resize it when it is full.
+	Difficult to embed into a block device because of hierarchy violation.
+
+
+8. Memory allocation
+--------------------
+
+Currently bogus. This will take some time and a lot of brain effort.
+All the allocations in ploop are made with GFP_NOFS, which means
+we can run out of resources.
+
+Obviously, we must supply allocation pools for all the objects and
+balance the pool sizes in a way that makes deadlocks impossible.
+
+Right now it is obviously wrong. And, I guess, this bogosity is shared
+with mainstream. E.g. we allocate at least one new bio for each incoming
+bio in the fast path (or a lot of bios, when the image is fragmented).
+Imagine that a burst of writeback exhausted the whole bio mempool and all
+of them are sent to us. We cannot allocate a new bio before some bio is
+released, a bio cannot be released before we make some forward progress,
+and we cannot make progress without allocating a new bio. Of course, this
+is a marginal pathological case, but there are too many such cases to
+expect that deadlocks will never occur in real life.
+
+
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/compat.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/compat.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/compat.c	2015-01-21 12:02:54.708921485 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/compat.c	2015-01-21 12:02:54.708921485 +0300
@@ -0,0 +1,111 @@
+#include <linux/version.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+
+/* These functions compensate for nice features which are present
+ * in newer kernels but absent in 2.6.18.
+ */
+
+static int _add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+				  pgoff_t offset, gfp_t gfp_mask)
+{
+	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (ret == 0) {
+		struct pagevec lru_pvec;
+		pagevec_init(&lru_pvec, 0);
+
+		page_cache_get(page);
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+		pagevec_lru_add(&lru_pvec);
+	}
+	return ret;
+}
+
+
+static struct page *
+__grab_cache_page(struct address_space *mapping, pgoff_t index)
+{
+	int status;
+	struct page *page;
+repeat:
+	page = find_lock_page(mapping, index);
+	if (likely(page))
+		return page;
+
+	page = page_cache_alloc(mapping);
+	if (!page)
+		return NULL;
+	status = _add_to_page_cache_lru(page, mapping, index, mapping_gfp_mask(mapping));
+	if (unlikely(status)) {
+		page_cache_release(page);
+		if (status == -EEXIST)
+			goto repeat;
+		return NULL;
+	}
+	return page;
+}
+
+
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+
+	int ret;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+	struct inode *inode = mapping->host;
+	struct page *page;
+
+	page = __grab_cache_page(mapping, index);
+	*pagep = page;
+	if (!page)
+		return -ENOMEM;
+
+	ret = aops->prepare_write(file, page, offset, offset+len);
+	if (ret) {
+		unlock_page(page);
+		page_cache_release(page);
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
+	}
+	return ret;
+}
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+	int ret;
+
+	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+	struct inode *inode = mapping->host;
+
+	flush_dcache_page(page);
+	ret = aops->commit_write(file, page, offset, offset+len);
+	unlock_page(page);
+#if 0
+	/* Not really, we are not interested. */
+	mark_page_accessed(page);
+#endif
+	page_cache_release(page);
+
+	if (ret < 0) {
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
+	} else if (ret > 0)
+		ret = min_t(size_t, copied, ret);
+	else
+		ret = copied;
+
+	return ret;
+}
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/dev.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/dev.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/dev.c	2015-01-21 12:02:54.709921458 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/dev.c	2015-01-21 12:02:57.839838377 +0300
@@ -0,0 +1,4907 @@
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <linux/statfs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+
+#include <trace/events/block.h>
+
+#include <linux/ploop/ploop.h>
+#include "ploop_events.h"
+#include "freeblks.h"
+#include "discard.h"
+
+/* Structures and terms:
+ *
+ * ploop_device is root of everything.
+ *	Normally we use local variable "plo" to refer to it.
+ *
+ * ploop_device -> list of ploop_delta's.
+ *	Head of list is "top delta", tail of list is "root delta".
+ *	"top delta" is delta, where all the modifications are written,
+ *	"root delta" is base image. "Level" is distance from root.
+ *
+ * ploop_delta  -> { ops, priv } refers to a particular format of delta.
+ *		-> ploop_io refers to image on disk.
+ *
+ * ploop_io	-> list of ploop_file, each file maps an area in image.
+ *	*** What follows is the "ideal" design; right now we support only
+ *	*** one ploop_file and we do not support creation of new ploop_file's.
+ *		-> { ops , priv } generic image ops, mostly creation
+ *		   of new chunks.
+ *
+ * ploop_file	-> { file, ops, priv } how we do real IO on this file.
+ */
+
+static int ploop_max __read_mostly = PLOOP_DEVICE_RANGE;
+static int ploop_major __read_mostly = PLOOP_DEVICE_MAJOR;
+int max_map_pages __read_mostly;
+
+static long root_threshold __read_mostly = 2L * 1024 * 1024; /* 2GB in KB */
+static long user_threshold __read_mostly = 4L * 1024 * 1024; /* 4GB in KB */
+
+static int large_disk_support __read_mostly = 1; /* true */
+
+static struct rb_root ploop_devices_tree = RB_ROOT;
+static DEFINE_MUTEX(ploop_devices_mutex);
+
+static LIST_HEAD(ploop_formats);
+static DEFINE_MUTEX(ploop_formats_mutex);
+
+int ploop_register_format(struct ploop_delta_ops * ops)
+{
+	mutex_lock(&ploop_formats_mutex);
+	list_add(&ops->list, &ploop_formats);
+	mutex_unlock(&ploop_formats_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ploop_register_format);
+
+void ploop_unregister_format(struct ploop_delta_ops * ops)
+{
+	mutex_lock(&ploop_formats_mutex);
+	list_del(&ops->list);
+	mutex_unlock(&ploop_formats_mutex);
+}
+EXPORT_SYMBOL(ploop_unregister_format);
+
+struct ploop_delta_ops * ploop_format_get(unsigned int id)
+{
+	struct ploop_delta_ops * ops;
+
+	mutex_lock(&ploop_formats_mutex);
+	list_for_each_entry(ops, &ploop_formats, list) {
+		if (ops->id == id && try_module_get(ops->owner)) {
+			mutex_unlock(&ploop_formats_mutex);
+			return ops;
+		}
+	}
+	mutex_unlock(&ploop_formats_mutex);
+	return NULL;
+}
+
+void ploop_format_put(struct ploop_delta_ops * ops)
+{
+	module_put(ops->owner);
+}
+
+void ploop_msg_once(struct ploop_device *plo, const char *fmt, ...)
+{
+	va_list args;
+
+	if (test_and_set_bit(PLOOP_S_ONCE, &plo->state))
+		return;
+
+	va_start(args, fmt);
+	printk("ploop(%d): ", plo->index);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+EXPORT_SYMBOL(ploop_msg_once);
+
+static void mitigation_timeout(unsigned long data)
+{
+	struct ploop_device * plo = (void*)data;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return;
+
+	spin_lock_irq(&plo->lock);
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    (!list_empty(&plo->entry_queue) ||
+	     ((plo->bio_head || !bio_list_empty(&plo->bio_discard_list)) &&
+	      !list_empty(&plo->free_list))) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void freeze_timeout(unsigned long data)
+{
+	struct ploop_device * plo = (void*)data;
+
+	spin_lock_irq(&plo->lock);
+	if (waitqueue_active(&plo->freeze_waitq))
+		wake_up_interruptible(&plo->freeze_waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void ploop_congest(struct ploop_device *plo)
+{
+	if (!test_bit(PLOOP_S_CONGESTED, &plo->state) &&
+	    PLOOP_CONGESTED(plo) > plo->tune.congestion_high_watermark)
+		set_bit(PLOOP_S_CONGESTED, &plo->state);
+}
+
+static void ploop_uncongest(struct ploop_device *plo)
+{
+	if (PLOOP_CONGESTED(plo) <= plo->tune.congestion_low_watermark &&
+	    test_and_clear_bit(PLOOP_S_CONGESTED, &plo->state)) {
+		struct backing_dev_info *bdi = &plo->queue->backing_dev_info;
+
+		if (waitqueue_active(&bdi->cong_waitq))
+			wake_up_all(&bdi->cong_waitq);
+	}
+}
+
+static struct ploop_request *
+ploop_alloc_request(struct ploop_device * plo)
+{
+	struct ploop_request * preq;
+
+	/* We allow only a finite amount of requests in process.
+	 * If the caller does not stop congesting us, we force him to wait.
+	 *
+	 * _XXX_ I am afraid this logic is flawed. The justification is
+	 * that conventional devices, using request queues, do a similar thing,
+	 * blocking in add_request(), but I am still not sure that logic
+	 * applies here.
+	 */
+	if (list_empty(&plo->free_list)) {
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&plo->req_waitq, &_wait, TASK_UNINTERRUPTIBLE);
+			if (!list_empty(&plo->free_list))
+				break;
+			plo->st.bio_full++;
+			spin_unlock_irq(&plo->lock);
+			io_schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&plo->req_waitq, &_wait);
+	}
+
+	preq = list_entry(plo->free_list.next, struct ploop_request, list);
+	list_del_init(&preq->list);
+	ploop_congest(plo);
+	return preq;
+}
+
+static void ploop_grab_iocontext(struct bio *bio)
+{
+	struct io_context **ioc_pp = (struct io_context **)(&bio->bi_bdev);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+	/* This is WRONG. It seems in 2.6.18 we can do nothing about it.
+	 * But leave this piece of art for now.
+	 */
+	if (current->io_context) {
+		*ioc_pp = current->io_context;
+		if (!atomic_inc_not_zero((*ioc_pp)->refcount))
+			*ioc_pp = NULL;
+		set_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+	}
+#else
+	if (current->io_context) {
+		*ioc_pp = ioc_task_link(current->io_context);
+		set_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+	}
+#endif
+}
+
+/* always called with plo->lock held */
+static inline void preq_unlink(struct ploop_request * preq,
+			       struct list_head *drop_list)
+{
+	list_del(&preq->list);
+	ploop_entry_qlen_dec(preq);
+	list_add(&preq->list, drop_list);
+}
+
+/* always called with plo->lock released */
+void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list,
+		      int keep_locked)
+{
+	struct ploop_request * preq;
+
+	list_for_each_entry(preq, drop_list, list) {
+		if (preq->ioc) {
+			ioc_task_unlink(preq->ioc);
+			preq->ioc = NULL;
+		}
+
+		BUG_ON (test_bit(PLOOP_REQ_ZERO, &preq->state));
+	}
+
+	spin_lock_irq(&plo->lock);
+
+	list_splice_init(drop_list, plo->free_list.prev);
+	if (waitqueue_active(&plo->req_waitq))
+		wake_up(&plo->req_waitq);
+	else if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+		waitqueue_active(&plo->waitq) &&
+		(plo->bio_head || !bio_list_empty(&plo->bio_discard_list)))
+		wake_up_interruptible(&plo->waitq);
+
+	ploop_uncongest(plo);
+
+	if (!keep_locked)
+		spin_unlock_irq(&plo->lock);
+}
+
+static void merge_rw_flags_to_req(unsigned long rw,
+				  struct ploop_request * preq)
+{
+	if (rw & BIO_FLUSH)
+		preq->req_rw |= BIO_FLUSH;
+	if (rw & BIO_FUA)
+		preq->req_rw |= BIO_FUA;
+	if (rw & (1 << BIO_RW_SYNCIO))
+		preq->req_rw |= (1 << BIO_RW_SYNCIO);
+}
+
+static void preq_set_sync_bit(struct ploop_request * preq)
+{
+	if (!test_bit(PLOOP_REQ_SYNC, &preq->state)) {
+		if (!(preq->req_rw & WRITE) || (preq->req_rw & (BIO_FLUSH|BIO_FUA))) {
+			preq->plo->read_sync_reqs++;
+			__set_bit(PLOOP_REQ_RSYNC, &preq->state);
+		}
+		__set_bit(PLOOP_REQ_SYNC, &preq->state);
+	}
+}
+
+static void overlap_forward(struct ploop_device * plo,
+			    struct ploop_request * preq,
+			    struct ploop_request * preq1,
+			    struct list_head *drop_list)
+{
+	struct rb_node * n;
+
+	if (preq->req_sector + preq->req_size == preq1->req_sector) {
+		preq->bl.tail->bi_next = preq1->bl.head;
+		preq->bl.tail = preq1->bl.tail;
+		preq1->bl.head = preq1->bl.tail = NULL;
+		preq->req_size += preq1->req_size;
+		if (test_bit(PLOOP_REQ_SYNC, &preq1->state))
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(preq1->req_rw, preq);
+		rb_erase(&preq1->lockout_link, &plo->entry_tree[preq1->req_rw & WRITE]);
+		preq_unlink(preq1, drop_list);
+		plo->st.coal_mforw++;
+	}
+
+	while ((n = rb_next(&preq->lockout_link)) != NULL) {
+		preq1 = rb_entry(n, struct ploop_request, lockout_link);
+		if (preq->req_sector + preq->req_size <= preq1->req_sector)
+			break;
+		rb_erase(n, &plo->entry_tree[preq->req_rw & WRITE]);
+		__clear_bit(PLOOP_REQ_SORTED, &preq1->state);
+		plo->st.coal_oforw++;
+	}
+}
+
+static void overlap_backward(struct ploop_device * plo,
+			     struct ploop_request * preq,
+			     struct ploop_request * preq1,
+			     struct list_head *drop_list)
+{
+	struct rb_node * n;
+
+	if (preq1->req_sector + preq1->req_size == preq->req_sector) {
+		preq1->bl.tail->bi_next = preq->bl.head;
+		preq->bl.head = preq1->bl.head;
+		preq1->bl.head = preq1->bl.tail = NULL;
+		preq->req_size += preq1->req_size;
+		preq->req_sector = preq1->req_sector;
+		if (test_bit(PLOOP_REQ_SYNC, &preq1->state))
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(preq1->req_rw, preq);
+		rb_erase(&preq1->lockout_link, &plo->entry_tree[preq->req_rw & WRITE]);
+		preq_unlink(preq1, drop_list);
+		plo->st.coal_mback++;
+	}
+
+	while ((n = rb_prev(&preq->lockout_link)) != NULL) {
+		preq1 = rb_entry(n, struct ploop_request, lockout_link);
+		if (preq1->req_sector + preq1->req_size <= preq->req_sector)
+			break;
+		rb_erase(n, &plo->entry_tree[preq->req_rw & WRITE]);
+		__clear_bit(PLOOP_REQ_SORTED, &preq1->state);
+		plo->st.coal_oback++;
+	}
+}
+
+static int try_merge(struct ploop_device *plo, struct ploop_request * preq,
+		     struct bio * bio, struct list_head *drop_list)
+{
+	struct rb_node * n;
+
+	/* Merge to tail */
+	if (bio->bi_sector == preq->req_sector + preq->req_size) {
+		preq->bl.tail->bi_next = bio;
+		preq->bl.tail = bio;
+		preq->req_size += (bio->bi_size >> 9);
+		preq->tstamp = jiffies;
+		if (bio_rw_flagged(bio, BIO_RW_UNPLUG))
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(bio->bi_rw, preq);
+		plo->st.coal_forw++;
+		n = rb_next(&preq->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == preq->req_cluster &&
+			    preq->req_sector + preq->req_size >= preq1->req_sector)
+				overlap_forward(plo, preq, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	if (bio->bi_sector + (bio->bi_size >> 9) == preq->req_sector) {
+		bio->bi_next = preq->bl.head;
+		preq->bl.head = bio;
+		preq->req_size += (bio->bi_size >> 9);
+		preq->req_sector = bio->bi_sector;
+		preq->tstamp = jiffies;
+		plo->st.coal_back++;
+		if (bio_rw_flagged(bio, BIO_RW_UNPLUG))
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(bio->bi_rw, preq);
+		n = rb_prev(&preq->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == preq->req_cluster &&
+			    preq->req_sector <= preq1->req_sector + preq1->req_size)
+				overlap_backward(plo, preq, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	return 0;
+}
+
+static struct ploop_request *
+tree_insert(struct rb_root *root, struct ploop_request * preq0)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct ploop_request * preq;
+
+	while (*p) {
+		parent = *p;
+		preq = rb_entry(parent, struct ploop_request, lockout_link);
+
+		if (preq0->req_cluster < preq->req_cluster)
+			p = &(*p)->rb_left;
+		else if (preq0->req_cluster > preq->req_cluster)
+			p = &(*p)->rb_right;
+		else if (preq0->req_sector + preq0->req_size < preq->req_sector)
+			p = &(*p)->rb_left;
+		else if (preq0->req_sector > preq->req_sector + preq->req_size)
+			p = &(*p)->rb_right;
+		else
+			return preq;
+	}
+
+	rb_link_node(&preq0->lockout_link, parent, p);
+	rb_insert_color(&preq0->lockout_link, root);
+	__set_bit(PLOOP_REQ_SORTED, &preq0->state);
+	return NULL;
+}
+
+static int
+insert_entry_tree(struct ploop_device * plo, struct ploop_request * preq0,
+		  struct list_head *drop_list)
+{
+	struct ploop_request * clash;
+	struct rb_node * n;
+
+	clash = tree_insert(&plo->entry_tree[preq0->req_rw & WRITE], preq0);
+	if (!clash)
+		return 0;
+
+	if (preq0->req_sector == clash->req_sector + clash->req_size) {
+		clash->bl.tail->bi_next = preq0->bl.head;
+		clash->bl.tail = preq0->bl.tail;
+		clash->req_size += preq0->req_size;
+		clash->tstamp = jiffies;
+		if (test_bit(PLOOP_REQ_SYNC, &preq0->state))
+			preq_set_sync_bit(clash);
+		merge_rw_flags_to_req(preq0->req_rw, clash);
+		preq_unlink(preq0, drop_list);
+		plo->st.coal_forw2++;
+
+		n = rb_next(&clash->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == clash->req_cluster &&
+			    clash->req_sector + clash->req_size >= preq1->req_sector)
+				overlap_forward(plo, clash, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	if (clash->req_sector == preq0->req_sector + preq0->req_size) {
+		preq0->bl.tail->bi_next = clash->bl.head;
+		clash->bl.head = preq0->bl.head;
+		clash->req_size += preq0->req_size;
+		clash->req_sector = preq0->req_sector;
+		clash->tstamp = jiffies;
+		plo->st.coal_back2++;
+		if (test_bit(PLOOP_REQ_SYNC, &preq0->state))
+			preq_set_sync_bit(clash);
+		merge_rw_flags_to_req(preq0->req_rw, clash);
+		preq_unlink(preq0, drop_list);
+
+		n = rb_prev(&clash->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == clash->req_cluster &&
+			    clash->req_sector <= preq1->req_sector + preq1->req_size)
+				overlap_backward(plo, clash, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	plo->st.coal_overlap++;
+
+	return 0;
+}
+
+static void
+ploop_bio_queue(struct ploop_device * plo, struct bio * bio,
+		struct list_head *drop_list)
+{
+	struct ploop_request * preq;
+
+	BUG_ON (list_empty(&plo->free_list));
+	preq = list_entry(plo->free_list.next, struct ploop_request, list);
+	list_del_init(&preq->list);
+
+	preq->req_cluster = bio->bi_sector >> plo->cluster_log;
+	bio->bi_next = NULL;
+	preq->req_sector = bio->bi_sector;
+	preq->req_size = bio->bi_size >> 9;
+	preq->req_rw = bio->bi_rw;
+	bio->bi_rw &= ~(BIO_FLUSH | BIO_FUA);
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = 0;
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+	preq->prealloc_size = 0;
+
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
+		int clu_size = 1 << plo->cluster_log;
+		int i = (clu_size - 1) & bio->bi_sector;
+		int err = 0;
+
+		if (i) {
+			preq->req_cluster++;
+			if (preq->req_size >= clu_size)
+				preq->req_size -= clu_size - i;
+		}
+
+		if (preq->req_size < clu_size ||
+		    (err = ploop_discard_add_bio(plo->fbd, bio))) {
+			if (test_bit(BIO_BDEV_REUSED, &bio->bi_flags)) {
+				ioc_task_unlink((struct io_context *)(bio->bi_bdev));
+				bio->bi_bdev = plo->bdev;
+				clear_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+			}
+			BIO_ENDIO(plo->queue, bio, err);
+			list_add(&preq->list, &plo->free_list);
+			plo->bio_discard_qlen--;
+			plo->bio_total--;
+			return;
+		}
+
+		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_DISCARD);
+		preq->dst_iblock = 0;
+		preq->bl.head = preq->bl.tail = NULL;
+	} else
+		preq->bl.head = preq->bl.tail = bio;
+
+	if (test_bit(BIO_BDEV_REUSED, &bio->bi_flags)) {
+		preq->ioc = (struct io_context *)(bio->bi_bdev);
+		bio->bi_bdev = plo->bdev;
+		clear_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+	} else {
+		preq->ioc = NULL;
+	}
+
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_UNPLUG)))
+		__set_bit(PLOOP_REQ_SYNC, &preq->state);
+	if (unlikely(bio == plo->bio_sync)) {
+		__set_bit(PLOOP_REQ_SYNC, &preq->state);
+		plo->bio_sync = NULL;
+	}
+
+	__TRACE("A %p %u\n", preq, preq->req_cluster);
+
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD)))
+		plo->bio_discard_qlen--;
+	else
+		plo->bio_qlen--;
+
+	ploop_entry_add(plo, preq);
+
+	if (bio->bi_size && !bio_rw_flagged(bio, BIO_RW_DISCARD))
+		insert_entry_tree(plo, preq, drop_list);
+
+	trace_bio_queue(preq);
+}
+
+static inline struct ploop_request *
+ploop_get_request(struct ploop_device * plo, struct list_head * list)
+{
+	struct ploop_request * preq;
+
+	if (unlikely(list_empty(list)))
+		return NULL;
+
+	preq = list_first_entry(list, struct ploop_request, list);
+	list_del_init(&preq->list);
+	return preq;
+}
+
+static struct ploop_delta * find_delta(struct ploop_device * plo, int level)
+{
+	struct ploop_delta * delta;
+
+	list_for_each_entry(delta, &plo->map.delta_list, list) {
+		if (delta->level == level)
+			return delta;
+	}
+
+	return NULL;
+}
+
+DEFINE_BIO_CB(ploop_fast_end_io)
+{
+	unsigned long flags;
+	struct ploop_device * plo;
+	struct bio * orig = bio->bi_private;
+
+	plo = orig->bi_bdev->bd_disk->private_data;
+
+	BIO_ENDIO(plo->queue, orig, err);
+
+	/* The end of a fast bio wakes up the main process only when this
+	 * could mean an exit from the ATTENTION state.
+	 */
+	spin_lock_irqsave(&plo->lock, flags);
+	plo->active_reqs--;
+	plo->fastpath_reqs--;
+	plo->bio_total--;
+
+	if (plo->active_reqs == 0 &&
+	    test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    waitqueue_active(&plo->waitq) &&
+	    (test_bit(PLOOP_S_EXITING, &plo->state) ||
+	     !list_empty(&plo->entry_queue)))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irqrestore(&plo->lock, flags);
+
+	bio_put(bio);
+}
+END_BIO_CB(ploop_fast_end_io)
+
+static struct ploop_delta *
+ploop_fast_lookup(struct ploop_device * plo, sector_t sec,
+		  unsigned long rw, sector_t * isec)
+{
+	struct ploop_delta * top_delta, * delta;
+	int level;
+	cluster_t bio_cluster = sec >> plo->cluster_log;
+	iblock_t iblk;
+
+	level = ploop_fastmap(&plo->map, bio_cluster, &iblk);
+	if (level < 0)
+		return NULL;
+
+	top_delta = ploop_top_delta(plo);
+	delta = top_delta;
+
+	if (level != top_delta->level) {
+		/* _XXX_ here is a problem. At merge_bvec() time we do
+		 * not know whether a bio is a read or a write. If it is a read
+		 * we should check the backing map. This is a tradeoff:
+		 * either we direct reads to the slow path, or we
+		 * do not aggregate writes, which makes COW much
+		 * slower. For now we choose to optimize COW.
+		 */
+		if (rw & (1<<BIO_RW))
+			return NULL;
+
+		delta = find_delta(plo, level);
+	}
+	if (delta) {
+		*isec = ((sector_t)iblk << plo->cluster_log) +
+			(sec & ((1 << plo->cluster_log) - 1));
+	}
+	return delta;
+}
+
+/* Got a bio which maps 1-1 to the block device.
+ * There is a problem though: this bio could have bypassed the device
+ * merge functions, because we skipped them in our own merge_fn.
+ *
+ * We cannot split a bio in the fast path, but we can revalidate it.
+ *
+ * q->max_phys_segments and q->max_hw_segments must be set to the
+ * minimum over all participating backing devices.
+ */
+
+static int
+bio_fast_map(struct ploop_device * plo, struct bio * orig_bio, struct bio * bio)
+{
+	struct ploop_delta * delta;
+	sector_t isector;
+
+	if (orig_bio->bi_size == 0)
+		delta = ploop_top_delta(plo);
+	else
+		delta = ploop_fast_lookup(plo, orig_bio->bi_sector,
+					  orig_bio->bi_rw, &isector);
+	if (delta == NULL) {
+		plo->st.fast_neg_nomap++;
+		return 1;
+	}
+
+	if (delta->io.ops->fastmap == NULL)
+		return 1;
+
+	return delta->io.ops->fastmap(&delta->io, orig_bio, bio, isector);
+}
+
+static inline unsigned int block_vecs(struct ploop_device * plo)
+{
+	return 1 << (plo->cluster_log + 9 - PAGE_SHIFT);
+}
+
+static int whole_block(struct ploop_device * plo, struct ploop_request *preq)
+{
+	if (preq->req_size != (1<<plo->cluster_log))
+		return 0;
+	return !(preq->req_sector & ((1<<plo->cluster_log) - 1));
+}
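+
+/* A worked example of the cluster arithmetic above (the numbers are
+ * illustrative, not mandated by this code): with cluster_log == 11 a
+ * cluster is 2048 sectors = 1MB, block_vecs() yields
+ * 1 << (11 + 9 - 12) = 256 pages on 4K-page systems, and whole_block()
+ * returns 1 only for a 2048-sector request starting on a 2048-sector
+ * boundary.
+ */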
+
+static struct bio *
+preallocate_bio(struct bio * orig_bio, struct ploop_device * plo)
+{
+	struct bio * nbio = NULL;
+
+	if (plo->cached_bio) {
+		spin_lock_irq(&plo->lock);
+		nbio = plo->cached_bio;
+		if (nbio) {
+			if (orig_bio->bi_vcnt <= nbio->bi_max_vecs)
+				plo->cached_bio = NULL;
+			else
+				nbio = NULL;
+		}
+		spin_unlock_irq(&plo->lock);
+	}
+
+	if (nbio == NULL)
+		nbio = bio_alloc(GFP_NOFS, max(orig_bio->bi_max_vecs, block_vecs(plo)));
+	return nbio;
+}
+
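+/* Convert queued bios to preqs while free request slots last. Incoming
+ * bios wait on a singly-linked list (bio_head/bio_tail, chained via
+ * bi_next); ploop_bio_queue() takes a preq from free_list for each one
+ * and may complete a bio immediately on error.
+ */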
+static void process_bio_queue(struct ploop_device * plo, struct list_head *drop_list)
+{
+	while (plo->bio_head && !list_empty(&plo->free_list)) {
+		struct bio *tmp = plo->bio_head;
+
+		BUG_ON (!plo->bio_tail);
+		plo->bio_head = plo->bio_head->bi_next;
+		if (!plo->bio_head)
+			plo->bio_tail = NULL;
+
+		ploop_bio_queue(plo, tmp, drop_list);
+	}
+}
+
+static void process_discard_bio_queue(struct ploop_device * plo, struct list_head *drop_list)
+{
+	bool discard = test_bit(PLOOP_S_DISCARD, &plo->state);
+
+	while (!list_empty(&plo->free_list)) {
+		struct bio *tmp;
+
+		/* Only one discard bio can be handled concurrently */
+		if (discard && ploop_discard_is_inprogress(plo->fbd))
+			return;
+
+		tmp = bio_list_pop(&plo->bio_discard_list);
+		if (tmp == NULL)
+			break;
+
+		/* If PLOOP_S_DISCARD isn't set, ploop_bio_queue
+		 * will complete it with a proper error.
+		 */
+		ploop_bio_queue(plo, tmp, drop_list);
+	}
+}
+
+static int ploop_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct bio * nbio;
+	struct ploop_device * plo = q->queuedata;
+	unsigned long rw;
+	LIST_HEAD(drop_list);
+
+	trace_make_request(bio);
+
+	plo->st.bio_in++;
+
+	BUG_ON(bio->bi_idx);
+	BUG_ON(bio->bi_size & 511);
+
+	if (unlikely(bio->bi_size == 0)) {
+		/* Is this possible? It makes sense only if the request is
+		 * marked FLUSH; otherwise just warn and complete. */
+		if (!bio_rw_flagged(bio, BIO_RW_FLUSH)) {
+			WARN_ON(1);
+			BIO_ENDIO(q, bio, 0);
+			return 0;
+		}
+		/* useless to pass this bio further */
+		if (!plo->tune.pass_flushes) {
+			ploop_acc_ff_in(plo, bio->bi_rw);
+			BIO_ENDIO(q, bio, 0);
+			return 0;
+		}
+	}
+
+	/* This is crazy. The pattern is borrowed from raid0.c:
+	 * the bio layer assumes it can prepare a single-page bio
+	 * without regard to any alignment constraints. So be it.
+	 */
+	if (!bio_rw_flagged(bio, BIO_RW_DISCARD) && bio->bi_size &&
+	    (bio->bi_sector >> plo->cluster_log) !=
+	    ((bio->bi_sector + (bio->bi_size >> 9) - 1) >> plo->cluster_log)) {
+		struct bio_pair *bp;
+		unsigned int first_sectors = (1<<plo->cluster_log)
+			- (bio->bi_sector & ((1<<plo->cluster_log) - 1));
+
+		plo->st.bio_splits++;
+
+		BUG_ON(bio->bi_vcnt != 1 || bio->bi_idx != 0);
+
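+		/* E.g. (illustrative numbers): with cluster_log == 11
+		 * and bi_sector == 2000, first_sectors is
+		 * 2048 - (2000 & 2047) = 48, so bio1 carries the 48
+		 * sectors up to the cluster boundary and bio2 the rest.
+		 */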
+		bp = bio_split(bio, first_sectors);
+		ploop_make_request(q, &bp->bio1);
+		ploop_make_request(q, &bp->bio2);
+		bio_pair_release(bp);
+		return 0;
+	}
+
+	rw = bio->bi_rw;
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_FLUSH) &&
+		     !plo->tune.pass_flushes))
+		bio->bi_rw &= ~BIO_FLUSH;
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_FUA) &&
+		     !plo->tune.pass_fuas))
+		bio->bi_rw &= ~BIO_FUA;
+
+	/* Allocate new bio now. */
+	nbio = preallocate_bio(bio, plo);
+
+	if (!current->io_context)
+		(void)current_io_context(GFP_NOFS, -1);
+
+	spin_lock_irq(&plo->lock);
+	ploop_acc_ff_in_locked(plo, rw);
+	plo->bio_total++;
+
+	/* Device is aborted, everything is in error. This should not happen. */
+	if (unlikely(!test_bit(PLOOP_S_RUNNING, &plo->state) ||
+		     ((bio->bi_rw & (1<<BIO_RW)) &&
+		      test_bit(PLOOP_S_ABORT, &plo->state)))) {
+		plo->bio_total--;
+		spin_unlock_irq(&plo->lock);
+
+		BIO_ENDIO(q, bio, -EIO);
+		if (nbio)
+			bio_put(nbio);
+		return 0;
+	}
+
+	if (bio_rw_flagged(bio, BIO_RW_DISCARD)) {
+		bio_list_add(&plo->bio_discard_list, bio);
+		plo->bio_discard_qlen++;
+		goto queued;
+	}
+
+	/* Write tracking in fast path does not work at the moment. */
+	if (unlikely(test_bit(PLOOP_S_TRACK, &plo->state) &&
+		     (bio->bi_rw & WRITE)))
+		goto queue;
+
+	/* No fast path when maintenance is in progress.
+	 * (PLOOP_S_TRACK was checked immediately above.) */
+	if (FAST_PATH_DISABLED(plo->maintenance_type))
+		goto queue;
+
+	/* Attention state, always queue */
+	if (unlikely(test_bit(PLOOP_S_ATTENTION, &plo->state)))
+		goto queue;
+
+	/* Some barriers have already been enqueued; always queue */
+	if (unlikely(plo->barrier_reqs))
+		goto queue;
+
+	if (unlikely(nbio == NULL))
+		goto queue;
+
+	/* Try to merge before checking for the fast path. Maybe this
+	 * is not wise.
+	 */
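+	/* entry_tree is an rbtree of not-yet-processed preqs, one tree
+	 * per direction, keyed by req_cluster and then by sector range.
+	 * The walk below looks for a preq whose range adjoins or
+	 * overlaps this bio; try_merge() then tries to fold the bio
+	 * into that request.
+	 */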
+	if (!RB_EMPTY_ROOT(&plo->entry_tree[bio->bi_rw & WRITE]) &&
+	    bio->bi_size) {
+		struct ploop_request * preq;
+		struct rb_node * n = plo->entry_tree[bio->bi_rw & WRITE].rb_node;
+		u32 bio_cluster = bio->bi_sector >> plo->cluster_log;
+
+		while (n) {
+			preq = rb_entry(n, struct ploop_request, lockout_link);
+
+			if (bio_cluster < preq->req_cluster)
+				n = n->rb_left;
+			else if (bio_cluster > preq->req_cluster)
+				n = n->rb_right;
+			else if (bio->bi_sector + (bio->bi_size >> 9) < preq->req_sector)
+				n = n->rb_left;
+			else if (bio->bi_sector > preq->req_sector + preq->req_size)
+				n = n->rb_right;
+			else
+				break;
+		}
+
+		if (n && try_merge(plo, preq, bio, &drop_list))
+			goto out;
+	}
+
+	/* Try fast path. If all the mappings are available
+	 * and bio can be remapped without split, just do it.
+	 */
+	if (!bio_fast_map(plo, bio, nbio)) {
+		/* Here is a little problem. It would be really good to
+		 * remap the original bio and return 1; that is how the
+		 * make_request() engine is supposed to work.
+		 * Nevertheless, this logic is flawed.
+		 *
+		 * We cannot return the remapped bio, because we would
+		 * lose track of it and have no way to wait for its IO to
+		 * end, e.g. to start a snapshot or to replace the image
+		 * file.
+		 */
+		trace_bio_fast_map(bio);
+		nbio->bi_private = bio;
+		nbio->bi_end_io = ploop_fast_end_io;
+		plo->active_reqs++;
+		plo->fastpath_reqs++;
+		plo->st.bio_fast++;
+		ploop_acc_ff_out_locked(plo, nbio->bi_rw);
+
+		spin_unlock_irq(&plo->lock);
+
+		generic_make_request(nbio);
+		return 0;
+	}
+
+	/* Otherwise: queue */
+
+queue:
+	BUG_ON (bio->bi_bdev != plo->bdev && bio_sectors(bio));
+	if (bio->bi_bdev == plo->bdev) {
+		BUG_ON (test_bit(BIO_BDEV_REUSED, &bio->bi_flags));
+		ploop_grab_iocontext(bio);
+	}
+
+	BUG_ON (bio->bi_next);
+	if (plo->bio_tail) {
+		BUG_ON (!plo->bio_head);
+		BUG_ON (plo->bio_tail->bi_next);
+		plo->bio_tail->bi_next = bio;
+		plo->bio_tail = bio;
+	} else {
+		BUG_ON (plo->bio_head);
+		plo->bio_head = plo->bio_tail = bio;
+	}
+	plo->bio_qlen++;
+	ploop_congest(plo);
+
+	/* second chance to merge requests */
+	process_bio_queue(plo, &drop_list);
+
+queued:
+	/* If the main thread is waiting for requests, wake it up.
+	 * But try to mitigate wakeups by delaying the wakeup for a short
+	 * time.
+	 */
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state)) {
+		/* Synchronous requests are not batched. */
+		if (plo->entry_qlen > plo->tune.batch_entry_qlen ||
+		    (bio->bi_rw & ((1<<BIO_RW_UNPLUG)|BIO_FLUSH|BIO_FUA)) ||
+		    (!bio_list_empty(&plo->bio_discard_list) && !list_empty(&plo->free_list))) {
+			wake_up_interruptible(&plo->waitq);
+		} else if (!timer_pending(&plo->mitigation_timer)) {
+			mod_timer(&plo->mitigation_timer,
+				  jiffies + plo->tune.batch_entry_delay);
+		}
+	}
+out:
+	if (nbio) {
+		if (!plo->cached_bio)
+			plo->cached_bio = nbio;
+		else
+			bio_put(nbio);
+	}
+	spin_unlock_irq(&plo->lock);
+
+	if (!list_empty(&drop_list))
+		ploop_preq_drop(plo, &drop_list, 0);
+
+	return 0;
+}
+
+/* q->merge_bvec_fn
+ *
+ * According to the API, this function returns the length which we are
+ * able to merge, but nobody actually uses that value, so we return
+ * either 0 or bvec->bv_len.
+ */
+
+static int
+ploop_merge_bvec(struct request_queue *q, struct bvec_merge_data *bm_data,
+		 struct bio_vec *bvec)
+{
+	struct ploop_device *plo = q->queuedata;
+	struct ploop_delta * delta;
+	sector_t sec;
+	sector_t isector;
+	unsigned int len, ret;
+	unsigned long flags;
+
+	sec = bm_data->bi_sector + get_start_sect(bm_data->bi_bdev);
+	len = bm_data->bi_size + bvec->bv_len;
+	ret = bvec->bv_len;
+
+	/* Always allow adding the first bvec. */
+	if (!bm_data->bi_size)
+		return ret;
+
+	/* Is this possible? It would not contradict anything. */
+	BUG_ON(len & 511);
+
+	len >>= 9;
+
+	if ((sec >> plo->cluster_log) !=
+	    ((sec + len - 1) >> plo->cluster_log)) {
+		plo->st.merge_neg_cluster++;
+		return 0;
+	}
+
+	/* We could return ret right now; the further action is an
+	 * optimization to prevent splitting overhead and to enable the
+	 * fast path.
+	 */
+	spin_lock_irqsave(&plo->lock, flags);
+	delta = ploop_fast_lookup(plo, sec, 0, &isector);
+	if (delta &&
+	    delta->io.ops->disable_merge &&
+	    delta->io.ops->disable_merge(&delta->io, isector, len)) {
+		plo->st.merge_neg_disable++;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&plo->lock, flags);
+
+	/* If no mapping is available, merge up to cluster boundary */
+	return ret;
+}
+
+static void ploop_unplug(struct request_queue *q)
+{
+	struct ploop_device *plo = q->queuedata;
+	unsigned long flags;
+
+	clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+
+	/* _XXX_ Figure out the context where this is called. We have to
+	 * unplug all the devices which have our IO queued, and we need
+	 * some protection to find them. We kick only the top-level delta,
+	 * but even this is not quite safe.
+	 *
+	 * _XXX_ Btw, this is wrong in any case: the unplug on the backing
+	 * device must be done after entry_queue is processed.
+	 */
+	clear_bit(PLOOP_S_SYNC, &plo->state);
+
+	if (plo->active_reqs) {
+		struct ploop_delta * top_delta;
+		top_delta = ploop_top_delta(plo);
+		if (top_delta->io.ops->unplug)
+			top_delta->io.ops->unplug(&top_delta->io);
+	}
+
+	/* And kick our "soft" queue too in case mitigation timer is in effect */
+	spin_lock_irqsave(&plo->lock, flags);
+	if (plo->bio_head) {
+		BUG_ON (!plo->bio_tail);
+		/* another way would be: bio_tail->bi_rw |= BIO_RW_SYNCIO; */
+		plo->bio_sync = plo->bio_tail;
+	} else if (!list_empty(&plo->entry_queue)) {
+		struct ploop_request * preq = list_entry(plo->entry_queue.prev, struct ploop_request, list);
+		preq_set_sync_bit(preq);
+	}
+
+	if ((!list_empty(&plo->entry_queue) ||
+	     (plo->bio_head && !list_empty(&plo->free_list))) &&
+	    test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irqrestore(&plo->lock, flags);
+}
+
+static int ploop_congested2(void *data, int bits)
+{
+	struct ploop_device * plo = data;
+
+	if (test_bit(PLOOP_S_CONGESTED, &plo->state))
+		return bits;
+
+	return 0;
+}
+
+static int ploop_congested(void *data, int bits)
+{
+	struct ploop_device * plo = data;
+	struct ploop_delta * top_delta;
+	int ret = 0;
+
+	top_delta = ploop_top_delta(plo);
+	if (top_delta->io.ops->congested)
+		ret |= top_delta->io.ops->congested(&top_delta->io, bits);
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,24) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)
+	if (plo->tune.congestion_detection &&
+	    plo->entry_qlen + plo->active_reqs - plo->fastpath_reqs
+	    > (3*plo->tune.max_requests)/4) {
+		if ((bits & (1<<BDI_write_congested)) && !(ret & (1<<BDI_write_congested))) {
+			ret |= (1<<BDI_write_congested);
+			set_bit(PLOOP_S_WRITE_CONG, &plo->state);
+		}
+		if ((bits & (1<<BDI_read_congested)) && !(ret & (1<<BDI_read_congested))) {
+			ret |= (1<<BDI_read_congested);
+			set_bit(PLOOP_S_READ_CONG, &plo->state);
+		}
+	}
+#endif
+
+	return ret;
+}
+
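+/* The lockout tree is an rbtree of in-flight requests keyed by
+ * req_cluster. A request that hits a locked-out cluster is parked on
+ * the owner's delay_list and re-enters the ready queue when the owner
+ * completes (see ploop_complete_request(), which splices delay_list
+ * back).
+ */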
+static int check_lockout(struct ploop_request *preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct rb_node * n = plo->lockout_tree.rb_node;
+	struct ploop_request * p;
+
+	if (n == NULL)
+		return 0;
+
+	if (test_bit(PLOOP_REQ_LOCKOUT, &preq->state))
+		return 0;
+
+	while (n) {
+		p = rb_entry(n, struct ploop_request, lockout_link);
+
+		if (preq->req_cluster < p->req_cluster)
+			n = n->rb_left;
+		else if (preq->req_cluster > p->req_cluster)
+			n = n->rb_right;
+		else {
+			list_add_tail(&preq->list, &p->delay_list);
+			plo->st.bio_lockouts++;
+			trace_preq_lockout(preq, p);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+int ploop_add_lockout(struct ploop_request *preq, int try)
+{
+	struct ploop_device * plo = preq->plo;
+	struct rb_node ** p = &plo->lockout_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ploop_request * pr;
+
+	if (test_bit(PLOOP_REQ_LOCKOUT, &preq->state))
+		return 0;
+
+	while (*p) {
+		parent = *p;
+		pr = rb_entry(parent, struct ploop_request, lockout_link);
+
+		if (preq->req_cluster == pr->req_cluster) {
+			if (try)
+				return 1;
+			BUG();
+		}
+
+		if (preq->req_cluster < pr->req_cluster)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	trace_add_lockout(preq);
+
+	rb_link_node(&preq->lockout_link, parent, p);
+	rb_insert_color(&preq->lockout_link, &plo->lockout_tree);
+	__set_bit(PLOOP_REQ_LOCKOUT, &preq->state);
+	return 0;
+}
+EXPORT_SYMBOL(ploop_add_lockout);
+
+void del_lockout(struct ploop_request *preq)
+{
+	struct ploop_device * plo = preq->plo;
+
+	if (!test_and_clear_bit(PLOOP_REQ_LOCKOUT, &preq->state))
+		return;
+
+	trace_del_lockout(preq);
+
+	rb_erase(&preq->lockout_link, &plo->lockout_tree);
+}
+
+static void ploop_discard_wakeup(struct ploop_request *preq, int err)
+{
+	struct ploop_device *plo = preq->plo;
+
+	if (err || !ploop_fb_get_n_free(plo->fbd)) {
+		/* Only one discard request is processed */
+		ploop_fb_reinit(plo->fbd, err);
+	} else
+		set_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		if (test_bit(PLOOP_S_DISCARD_LOADED, &plo->state) ||
+		    !test_bit(PLOOP_S_DISCARD, &plo->state))
+			complete(&plo->maintenance_comp);
+}
+
+static void ploop_complete_request(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	int nr_completed = 0;
+	struct io_context *ioc;
+
+	trace_complete_request(preq);
+
+	__TRACE("Z %p %u\n", preq, preq->req_cluster);
+
+	while (preq->bl.head) {
+		struct bio * bio = preq->bl.head;
+		preq->bl.head = bio->bi_next;
+		bio->bi_next = NULL;
+		BIO_ENDIO(plo->queue, bio, preq->error);
+		nr_completed++;
+	}
+	preq->bl.tail = NULL;
+
+	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
+	    test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+		if (preq->error)
+			set_bit(PLOOP_S_ABORT, &plo->state);
+
+		if (atomic_dec_and_test(&plo->maintenance_cnt))
+			complete(&plo->maintenance_comp);
+	} else if (test_bit(PLOOP_REQ_MERGE, &preq->state)) {
+		if (!preq->error) {
+			if (plo->merge_ptr < plo->trans_map->max_index) {
+				spin_lock_irq(&plo->lock);
+				if (preq->map) {
+					map_release(preq->map);
+					preq->map = NULL;
+				}
+				if (preq->trans_map) {
+					map_release(preq->trans_map);
+					preq->trans_map = NULL;
+				}
+
+				del_lockout(preq);
+
+				preq->req_cluster = ~0U;
+
+				if (!list_empty(&preq->delay_list))
+					list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+				plo->active_reqs--;
+
+				preq->eng_state = PLOOP_E_ENTRY;
+				ploop_entry_add(plo, preq);
+				spin_unlock_irq(&plo->lock);
+				return;
+			}
+		} else
+			set_bit(PLOOP_S_ABORT, &plo->state);
+
+		if (atomic_dec_and_test(&plo->maintenance_cnt))
+			complete(&plo->maintenance_comp);
+	} else if (test_bit(PLOOP_REQ_DISCARD, &preq->state))
+		ploop_discard_wakeup(preq, preq->error);
+
+	if (preq->aux_bio) {
+		int i;
+		struct bio * bio = preq->aux_bio;
+
+		for (i = 0; i < bio->bi_vcnt; i++)
+			put_page(bio->bi_io_vec[i].bv_page);
+
+		bio_put(bio);
+
+		preq->aux_bio = NULL;
+	}
+
+	spin_lock_irq(&plo->lock);
+
+	del_lockout(preq);
+
+	if (!list_empty(&preq->delay_list))
+		list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+
+	if (preq->map) {
+		map_release(preq->map);
+		preq->map = NULL;
+	}
+	if (preq->trans_map) {
+		map_release(preq->trans_map);
+		preq->trans_map = NULL;
+	}
+
+	ioc = preq->ioc;
+	preq->ioc = NULL;
+
+	plo->active_reqs--;
+
+	if (unlikely(test_bit(PLOOP_REQ_ZERO, &preq->state))) {
+		ploop_fb_put_zero_request(plo->fbd, preq);
+	} else {
+		ploop_uncongest(plo);
+		list_add(&preq->list, &plo->free_list);
+		if (waitqueue_active(&plo->req_waitq))
+			wake_up(&plo->req_waitq);
+		else if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+			 waitqueue_active(&plo->waitq) &&
+			 (plo->bio_head || !bio_list_empty(&plo->bio_discard_list)))
+			wake_up_interruptible(&plo->waitq);
+	}
+	plo->bio_total -= nr_completed;
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,24)
+	if (plo->tune.congestion_detection &&
+	    plo->entry_qlen + plo->active_reqs - plo->fastpath_reqs
+	    <= plo->tune.max_requests/2) {
+		if (test_and_clear_bit(PLOOP_S_WRITE_CONG, &plo->state))
+			clear_bdi_congested(&plo->queue->backing_dev_info, WRITE);
+		if (test_and_clear_bit(PLOOP_S_READ_CONG, &plo->state))
+			clear_bdi_congested(&plo->queue->backing_dev_info, READ);
+	}
+#endif
+
+	spin_unlock_irq(&plo->lock);
+
+	if (ioc)
+		ioc_task_unlink(ioc);
+}
+
+void ploop_fail_request(struct ploop_request * preq, int err)
+{
+	struct ploop_device * plo = preq->plo;
+
+	ploop_set_error(preq, err);
+
+	spin_lock_irq(&plo->lock);
+	if (err == -ENOSPC) {
+		set_bit(PLOOP_S_ENOSPC_EVENT, &plo->state);
+		list_add(&preq->list, &plo->ready_queue);
+		if (waitqueue_active(&plo->event_waitq))
+			wake_up_interruptible(&plo->event_waitq);
+	} else {
+		set_bit(PLOOP_S_ABORT, &plo->state);
+		list_add_tail(&preq->list, &plo->ready_queue);
+	}
+	spin_unlock_irq(&plo->lock);
+}
+EXPORT_SYMBOL(ploop_fail_request);
+
+void ploop_fail_immediate(struct ploop_request * preq, int err)
+{
+	struct ploop_device * plo = preq->plo;
+
+	ploop_set_error(preq, err);
+
+	set_bit(PLOOP_S_ABORT, &plo->state);
+	preq->eng_state = PLOOP_E_COMPLETE;
+	ploop_complete_request(preq);
+}
+
+void ploop_complete_io_state(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	unsigned long flags;
+
+	spin_lock_irqsave(&plo->lock, flags);
+	__TRACE("C %p %u\n", preq, preq->req_cluster);
+	if (preq->error)
+		set_bit(PLOOP_S_ABORT, &plo->state);
+
+	list_add_tail(&preq->list, &plo->ready_queue);
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irqrestore(&plo->lock, flags);
+}
+EXPORT_SYMBOL(ploop_complete_io_state);
+
+
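+/* Extend bio with newly allocated pages until it covers one whole
+ * cluster, then point it at cluster blk. Pages already present
+ * (bi_vcnt > 0) are kept; only the tail slots are filled in.
+ */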
+static int fill_bio(struct ploop_device *plo, struct bio * bio, cluster_t blk)
+{
+	int pages = block_vecs(plo);
+
+	for (; bio->bi_vcnt < pages; bio->bi_vcnt++) {
+		bio->bi_io_vec[bio->bi_vcnt].bv_page = alloc_page(GFP_NOFS);
+		if (bio->bi_io_vec[bio->bi_vcnt].bv_page == NULL)
+			return -ENOMEM;
+		bio->bi_io_vec[bio->bi_vcnt].bv_offset = 0;
+		bio->bi_io_vec[bio->bi_vcnt].bv_len = PAGE_SIZE;
+	}
+	bio->bi_sector = blk << plo->cluster_log;
+	bio->bi_size = (1 << (plo->cluster_log + 9));
+	return 0;
+}
+
+/* Not generic. We assume that dst is aligned properly, i.e. it is an
+ * array of whole pages starting at a cluster boundary.
+ */
+static void bio_bcopy(struct bio *dst, struct bio *src, struct ploop_device *plo)
+{
+	int i;
+	unsigned int doff, soff, bv_off;
+
+	doff = (src->bi_sector & ((1<<plo->cluster_log) - 1)) << 9;
+	soff = 0;
+	bv_off = 0;
+	i = 0;
+
+	while (soff < src->bi_size) {
+		struct bio_vec * bv = src->bi_io_vec + i;
+		unsigned int copy;
+		int didx;
+		int poff;
+		void * ksrc;
+
+		if (bv_off >= bv->bv_len) {
+			i++;
+			bv++;
+			bv_off = 0;
+		}
+
+		didx = doff / PAGE_SIZE;
+		poff = doff & (PAGE_SIZE-1);
+		copy = bv->bv_len - bv_off;
+		if (copy > PAGE_SIZE - poff)
+			copy = PAGE_SIZE - poff;
+
+		ksrc = kmap_atomic(bv->bv_page, KM_USER0);
+		memcpy(page_address(dst->bi_io_vec[didx].bv_page) + poff,
+		       ksrc + bv->bv_offset + bv_off,
+		       copy);
+		kunmap_atomic(ksrc, KM_USER0);
+
+		bv_off += copy;
+		doff += copy;
+		soff += copy;
+	}
+}
+
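+/* Return 1 iff every byte carried by the bio list is zero; used with
+ * the check_zeros tunable to avoid allocating an image block for
+ * all-zero writes to unmapped clusters. The scan is word-sized, so
+ * bv_len is expected to be a multiple of sizeof(unsigned long).
+ */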
+int check_zeros(struct bio_list * bl)
+{
+	struct bio * bio;
+
+	bio_list_for_each(bio, bl) {
+		int i;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			struct bio_vec * bv = bio->bi_io_vec + i;
+			unsigned long * ptr;
+			void * kaddr;
+			int k;
+
+			if (bv->bv_page == ZERO_PAGE(0))
+				continue;
+
+			kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+			ptr = kaddr + bv->bv_offset;
+			k = bv->bv_len/sizeof(unsigned long);
+			while (k) {
+				if (*ptr)
+					break;
+				ptr++;
+				k--;
+			}
+			kunmap_atomic(kaddr, KM_USER0);
+			if (k)
+				return 0;
+		}
+	}
+	return 1;
+}
+
+static int prepare_merge_req(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	u32 iblk;
+	int res;
+
+	BUG_ON (preq->trans_map == NULL);
+
+	if (trans_map_get_index(preq, preq->req_cluster, &iblk)) {
+		u32 cluster = preq->req_cluster;
+
+		preq->req_cluster = ~0U;
+
+		if (cluster + 1 != plo->merge_ptr)
+			goto drop_map;
+
+		do {
+			cluster++;
+
+			if (cluster >= plo->trans_map->max_index)
+				goto drop_map;
+
+			if (cluster > map_get_mn_end(preq->trans_map)) {
+				plo->merge_ptr = cluster;
+				goto drop_map;
+			}
+		} while (trans_map_get_index(preq, cluster, &iblk));
+
+		preq->req_cluster = cluster;
+		plo->merge_ptr = cluster + 1;
+	}
+
+	spin_lock_irq(&plo->lock);
+	res = ploop_add_lockout(preq, 1);
+	spin_unlock_irq(&plo->lock);
+	return res;
+
+drop_map:
+	spin_lock_irq(&plo->lock);
+	map_release(preq->trans_map);
+	preq->trans_map = NULL;
+	if (preq->map) {
+		map_release(preq->map);
+		preq->map = NULL;
+	}
+	spin_unlock_irq(&plo->lock);
+	return 1;
+}
+
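+/* Build a service request that zeroes the index entry of cluster clu
+ * in the top delta on behalf of orig_preq. The original request is
+ * parked on the zero request's delay_list and resumes in the engine
+ * state chosen below once the zero request completes.
+ */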
+void ploop_queue_zero_request(struct ploop_device *plo,
+			      struct ploop_request *orig_preq, cluster_t clu)
+{
+	struct ploop_request * preq;
+
+	spin_lock_irq(&plo->lock);
+
+	preq = ploop_fb_get_zero_request(plo->fbd);
+	preq->bl.tail = preq->bl.head = NULL;
+	preq->req_cluster = clu;
+	preq->req_size = 0;
+	preq->req_rw = WRITE_SYNC;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = (1 << PLOOP_REQ_ZERO);
+	if (test_bit(PLOOP_REQ_SYNC, &orig_preq->state))
+		preq->state |= (1 << PLOOP_REQ_SYNC);
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+
+	if (test_bit(PLOOP_REQ_RELOC_S, &orig_preq->state)) {
+		if (orig_preq->dst_iblock == ~0U)
+			orig_preq->eng_state = PLOOP_E_RELOC_COMPLETE;
+	} else {
+		orig_preq->eng_state = orig_preq->iblock ?
+			PLOOP_E_DELTA_ZERO_INDEX : PLOOP_E_ZERO_INDEX;
+	}
+	orig_preq->iblock = 0;
+	INIT_LIST_HEAD(&preq->delay_list);
+	list_add_tail(&orig_preq->list, &preq->delay_list);
+
+	list_add(&preq->list, &plo->ready_queue);
+	plo->active_reqs++;
+
+	spin_unlock_irq(&plo->lock);
+}
+
+static void
+ploop_reloc_sched_read(struct ploop_request *preq, iblock_t iblk)
+{
+	struct ploop_device *plo   = preq->plo;
+	struct ploop_delta  *delta = ploop_top_delta(plo);
+	struct bio_list sbl;
+
+	spin_lock_irq(&plo->lock);
+	if (check_lockout(preq)) {
+		__TRACE("l2 %p %u\n", preq, preq->req_cluster);
+		spin_unlock_irq(&plo->lock);
+		return;
+	}
+	ploop_add_lockout(preq, 0);
+	spin_unlock_irq(&plo->lock);
+
+	if (!preq->aux_bio) {
+		preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+		if (!preq->aux_bio ||
+		    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+			ploop_fail_immediate(preq, -ENOMEM);
+			return;
+		}
+	}
+
+	preq->iblock = iblk;
+	preq->eng_state = PLOOP_E_RELOC_DATA_READ;
+	sbl.head = sbl.tail = preq->aux_bio;
+	delta->io.ops->submit(&delta->io, preq, READ_SYNC,
+			      &sbl, iblk, 1<<plo->cluster_log);
+}
+
+/*
+ * Returns 0 if and only if a free block was successfully reused
+ */
+static int
+ploop_reuse_free_block(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	iblock_t  iblk;
+	cluster_t clu;
+	int	  rc;
+	unsigned long pin_state;
+
+	if (plo->maintenance_type != PLOOP_MNTN_FBLOADED &&
+	    plo->maintenance_type != PLOOP_MNTN_RELOC)
+		return -1;
+
+	rc = ploop_fb_get_free_block(plo->fbd, &clu, &iblk);
+
+	/* simple case - no free blocks left */
+	if (rc < 0)
+		return rc;
+
+	/* a free block to reuse requires zeroing index */
+	if (rc > 0) {
+		ploop_queue_zero_request(plo, preq, clu);
+		return 0;
+	}
+
+	/* 'rc == 0' - use iblk as a lost block */
+	pin_state = preq->iblock ? PLOOP_E_DELTA_ZERO_INDEX :
+				   PLOOP_E_ZERO_INDEX;
+	preq->iblock = iblk;
+
+	/* Is some reloc request already processing iblk? If so, pin preq to it. */
+	if (ploop_fb_check_reloc_req(plo->fbd, preq, pin_state))
+		return 0;
+
+	/* iblk is a lost block and nobody is relocating it now */
+	preq->eng_state = PLOOP_E_DATA_WBI;
+	__TRACE("T2 %p %u\n", preq, preq->req_cluster);
+	plo->st.bio_out++;
+
+	if (pin_state == PLOOP_E_ZERO_INDEX) {
+		top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+					  &preq->bl, preq->iblock,
+					  preq->req_size);
+	} else { /* PLOOP_E_DELTA_ZERO_INDEX: came via a delta read */
+		struct bio_list sbl;
+
+		BUG_ON (preq->aux_bio == NULL);
+		sbl.head = sbl.tail = preq->aux_bio;
+
+		top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+				      &sbl, preq->iblock, 1<<plo->cluster_log);
+	}
+
+	return 0;
+}
+
+/*
+ * Returns 0 if and only if zero preq was successfully processed
+ */
+static int
+ploop_entry_zero_req(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	int	 level;
+	iblock_t iblk = 0;
+	int	 err;
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err) {
+		if (err == 1) {
+			__TRACE("m %p %u\n", preq, preq->req_cluster);
+			return 0;
+		}
+		return err;
+	}
+
+	level = map_get_index(preq, preq->req_cluster, &iblk);
+	if (level != top_delta->level) {
+		printk("Can't zero index on wrong level=%d "
+		       "(top_level=%d req_cluster=%u iblk=%u/%u)\n",
+		       level, top_delta->level, preq->req_cluster,
+		       iblk, preq->iblock);
+		return -EIO;
+	}
+
+	ploop_index_update(preq);
+	return 0;
+}
+
+#define MAP_MAX_IND(preq) min(map_get_mn_end(preq->map),	\
+			      preq->plo->map.max_index - 1)
+
+/*
+ * Returns 0 if and only if a RELOC_A preq was successfully processed.
+ *
+ * Advance preq->req_cluster until it points to an *iblk in the grow
+ * range. When returning 0, always set *iblk to a meaningful value:
+ * either zero (if preq->req_cluster went out of the allowed range, or
+ * the map is being read) or the iblock in the grow range that
+ * preq->req_cluster points to.
+ */
+static int
+ploop_entry_reloc_a_req(struct ploop_request *preq, iblock_t *iblk)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	cluster_t           *clu       = &preq->req_cluster;
+	int level;
+	int err;
+	BUG_ON (*clu == ~0U);
+
+	while (*clu < plo->map.max_index) {
+		err = ploop_find_map(&plo->map, preq);
+		if (err) {
+			if (err == 1) {
+				__TRACE("m %p %u\n", preq, *clu);
+				*iblk = 0;
+				return 0;
+			}
+			return err;
+		}
+		BUG_ON (preq->map == NULL);
+
+		for (; *clu <= MAP_MAX_IND(preq); (*clu)++) {
+			level = map_get_index(preq, *clu, iblk);
+			if (level == top_delta->level &&
+			    *iblk >= plo->grow_start &&
+			    *iblk <= plo->grow_end)
+				break;
+		}
+
+		if (*clu <= MAP_MAX_IND(preq))
+			break;
+
+		spin_lock_irq(&plo->lock);
+		map_release(preq->map);
+		preq->map = NULL;
+		spin_unlock_irq(&plo->lock);
+	}
+
+	if (*clu >= plo->map.max_index) {
+		preq->eng_state = PLOOP_E_COMPLETE;
+		ploop_complete_request(preq);
+		*iblk = 0;
+		return 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Returns 0 if and only if a RELOC_S preq was successfully processed.
+ *
+ * Sets preq->req_cluster to the block we are going to relocate.
+ * When returning 0, always set *iblk to a meaningful value: either
+ * zero (if there are no more blocks to relocate, or the block to
+ * relocate is free and a zero-index op has been scheduled, or the map
+ * is being read) or the iblock that preq->req_cluster points to.
+ */
+static int
+ploop_entry_reloc_s_req(struct ploop_request *preq, iblock_t *iblk)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+
+	cluster_t from_clu, to_clu;
+	iblock_t from_iblk, to_iblk;
+	u32 free;
+	int level;
+	int err;
+
+	*iblk = 0;
+
+	if (preq->req_cluster == ~0U) {
+		cluster_t zero_cluster;
+
+		BUG_ON (preq->error);
+		err = ploop_fb_get_reloc_block(plo->fbd, &from_clu, &from_iblk,
+					       &to_clu, &to_iblk, &free);
+		if (err < 0) {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			return 0;
+		}
+
+		preq->req_cluster = from_clu;
+		preq->src_iblock  = from_iblk;
+		ploop_fb_add_reloc_req(plo->fbd, preq);
+
+		if (free) {
+			preq->dst_iblock  = ~0U;
+			preq->dst_cluster = ~0U;
+			zero_cluster = preq->req_cluster;
+		} else {
+			preq->dst_iblock  = to_iblk;
+			preq->dst_cluster = to_clu;
+			zero_cluster = preq->dst_cluster;
+		}
+
+		ploop_queue_zero_request(plo, preq, zero_cluster);
+		return 0;
+	}
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err) {
+		if (err == 1) {
+			__TRACE("m %p %u\n", preq, preq->req_cluster);
+			return 0;
+		}
+		return err;
+	}
+	BUG_ON (preq->map == NULL);
+
+	level = map_get_index(preq, preq->req_cluster, iblk);
+	if (level != top_delta->level) {
+		printk("Can't relocate block on wrong level=%d "
+		       "(top_level=%d req_cluster=%u iblk=%u/%u)\n",
+		       level, top_delta->level, preq->req_cluster,
+		       *iblk, preq->iblock);
+		return -EIO;
+	}
+	if (preq->src_iblock != *iblk) {
+		printk("Can't relocate block due to wrong mapping: "
+		       "req_cluster=%u should point to iblk=%u while "
+		       "map_get_index() calculated iblk=%u\n",
+		       preq->req_cluster, preq->src_iblock, *iblk);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/* dummy wrapper around ploop_entry_reloc_[a|s]_req() */
+static int
+ploop_entry_reloc_req(struct ploop_request *preq, iblock_t *iblk)
+{
+	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state))
+		return ploop_entry_reloc_a_req(preq, iblk);
+	else if (test_bit(PLOOP_REQ_RELOC_S, &preq->state))
+		return ploop_entry_reloc_s_req(preq, iblk);
+	else
+		BUG();
+}
+
+static int discard_get_index(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	int	 level;
+	int	 err;
+
+	preq->iblock = 0;
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err)
+		return err;
+
+	level = map_get_index(preq, preq->req_cluster, &preq->iblock);
+	if (level != top_delta->level)
+		preq->iblock = 0;
+
+	if (preq->map) {
+		spin_lock_irq(&plo->lock);
+		map_release(preq->map);
+		preq->map = NULL;
+		spin_unlock_irq(&plo->lock);
+	}
+
+	return 0;
+}
+
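+/* Walk the clusters covered by a discard request and report runs of
+ * physically contiguous image blocks to the free-block subsystem
+ * (plo->fbd) as free extents. dst_cluster/dst_iblock track the start
+ * of the current run; a run is flushed whenever contiguity breaks, and
+ * once more at the end.
+ */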
+static int ploop_entry_discard_req(struct ploop_request *preq)
+{
+	int err = 0;
+	struct ploop_device * plo = preq->plo;
+	unsigned int len = 0;
+	cluster_t last_clu;
+
+	if (!test_bit(PLOOP_S_DISCARD, &plo->state)) {
+		err = -EOPNOTSUPP;
+		goto err;
+	}
+
+	BUG_ON(plo->maintenance_type != PLOOP_MNTN_DISCARD);
+
+	last_clu = (preq->req_sector + preq->req_size) >> plo->cluster_log;
+
+	for (; preq->req_cluster < last_clu; preq->req_cluster++) {
+		len = preq->req_cluster - preq->dst_cluster;
+
+		err = discard_get_index(preq);
+		if (err) {
+			if (err == 1)
+				return 0;
+			goto err;
+		}
+
+		if (preq->dst_iblock &&
+		    (!preq->iblock || preq->dst_iblock + len != preq->iblock)) {
+			err = ploop_fb_add_free_extent(plo->fbd,
+							preq->dst_cluster,
+							preq->dst_iblock, len);
+			preq->dst_iblock = 0;
+			if (err)
+				goto err;
+		}
+
+		if (!preq->dst_iblock && preq->iblock) {
+			preq->dst_cluster = preq->req_cluster;
+			preq->dst_iblock = preq->iblock;
+		}
+	}
+
+	if (preq->dst_iblock) {
+		len = preq->req_cluster - preq->dst_cluster;
+		err = ploop_fb_add_free_extent(plo->fbd, preq->dst_cluster,
+						preq->dst_iblock, len);
+	}
+
+err:
+	preq->error = err;
+	preq->eng_state = PLOOP_E_COMPLETE;
+	ploop_complete_request(preq);
+
+	return 0;
+}
+
+/* Main preq state machine */
+
+static void
+ploop_entry_request(struct ploop_request * preq)
+{
+	struct ploop_device * plo       = preq->plo;
+	struct ploop_delta  * top_delta = ploop_top_delta(plo);
+	struct ploop_delta  * delta;
+	int level;
+	int err;
+	iblock_t iblk;
+
+	/* Control request. */
+	if (unlikely(preq->bl.head == NULL &&
+		     !test_bit(PLOOP_REQ_MERGE, &preq->state) &&
+		     !test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
+		     !test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
+		     !test_bit(PLOOP_REQ_DISCARD, &preq->state) &&
+		     !test_bit(PLOOP_REQ_ZERO, &preq->state))) {
+		complete(plo->quiesce_comp);
+		wait_for_completion(&plo->relax_comp);
+		ploop_complete_request(preq);
+		complete(&plo->relaxed_comp);
+		return;
+	}
+
+	/* Empty flush. */
+	if (unlikely(preq->req_size == 0 &&
+		     !test_bit(PLOOP_REQ_MERGE, &preq->state) &&
+		     !test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
+		     !test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
+		     !test_bit(PLOOP_REQ_ZERO, &preq->state))) {
+		if (preq->req_rw & BIO_FLUSH) {
+			if (top_delta->io.ops->issue_flush) {
+				top_delta->io.ops->issue_flush(&top_delta->io, preq);
+				return;
+			}
+		}
+
+		preq->eng_state = PLOOP_E_COMPLETE;
+		ploop_complete_request(preq);
+		return;
+	}
+
+restart:
+	if (test_bit(PLOOP_REQ_DISCARD, &preq->state)) {
+		err = ploop_entry_discard_req(preq);
+		if (err)
+			goto error;
+		return;
+	} else if (test_bit(PLOOP_REQ_ZERO, &preq->state)) {
+		err = ploop_entry_zero_req(preq);
+		if (err)
+			goto error;
+		return;
+	} else if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
+		   test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+		err = ploop_entry_reloc_req(preq, &iblk);
+		if (err)
+			goto error;
+		if (iblk)
+			ploop_reloc_sched_read(preq, iblk);
+		return;
+	} else if (preq->req_cluster == ~0U) {
+		BUG_ON(!test_bit(PLOOP_REQ_MERGE, &preq->state));
+		BUG_ON(preq->trans_map);
+		BUG_ON(preq->map);
+
+		preq->req_cluster = plo->merge_ptr;
+		plo->merge_ptr++;
+		if (preq->req_cluster >= plo->trans_map->max_index) {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			return;
+		}
+	}
+
+	if (check_lockout(preq)) {
+		__TRACE("l %p %u\n", preq, preq->req_cluster);
+		return;
+	}
+
+	if (plo->trans_map) {
+		err = ploop_find_trans_map(plo->trans_map, preq);
+		if (err) {
+			if (err == 1) {
+				__TRACE("tm %p %u\n", preq, preq->req_cluster);
+				return;
+			}
+			goto error;
+		}
+
+		if (preq->trans_map &&
+		    !(preq->req_rw & (1<<BIO_RW)) &&
+		    trans_map_get_index(preq, preq->req_cluster, &iblk) == 0) {
+			delta = map_top_delta(plo->trans_map);
+			preq->iblock = iblk;
+			preq->eng_state = PLOOP_E_COMPLETE;
+			plo->st.bio_out++;
+			__TRACE("tS %p %u\n", preq, preq->req_cluster);
+			delta->io.ops->submit(&delta->io, preq, preq->req_rw, &preq->bl,
+					      iblk, preq->req_size);
+			return;
+		}
+
+		if (test_bit(PLOOP_REQ_MERGE, &preq->state)) {
+			if (prepare_merge_req(preq))
+				goto restart;
+		}
+	}
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err) {
+		if (err == 1) {
+			__TRACE("m %p %u\n", preq, preq->req_cluster);
+			return;
+		}
+		goto error;
+	}
+
+	if (preq->trans_map &&
+	    trans_map_get_index(preq, preq->req_cluster, &iblk) == 0) {
+		struct bio_list sbl;
+
+		/* Read requests were served earlier. */
+		BUG_ON(!(preq->req_rw & (1<<BIO_RW)));
+
+		spin_lock_irq(&plo->lock);
+		ploop_add_lockout(preq, 0);
+		spin_unlock_irq(&plo->lock);
+
+		if (whole_block(plo, preq)) {
+			set_bit(PLOOP_REQ_TRANS, &preq->state);
+			plo->st.bio_trans_whole++;
+			goto delta_io;
+		}
+
+		plo->st.bio_cows++;
+
+		if (!preq->aux_bio)
+			preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+		if (!preq->aux_bio ||
+		    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+			ploop_fail_immediate(preq, -ENOMEM);
+			return;
+		}
+
+		delta = map_top_delta(plo->trans_map);
+
+		__TRACE("tDR %p %u\n", preq, preq->req_cluster);
+		preq->iblock = iblk;
+		preq->eng_state = PLOOP_E_TRANS_DELTA_READ;
+		sbl.head = sbl.tail = preq->aux_bio;
+		delta->io.ops->submit(&delta->io, preq, READ_SYNC,
+				      &sbl, iblk, 1<<plo->cluster_log);
+		plo->st.bio_trans_copy++;
+		return;
+	}
+
+delta_io:
+	BUG_ON(test_bit(PLOOP_REQ_MERGE, &preq->state));
+
+	delta = top_delta;
+
+	level = map_get_index(preq, preq->req_cluster, &iblk);
+	if (level < 0) {
+		delta = NULL;
+	} else if (level != top_delta->level) {
+		delta = find_delta(plo, level);
+		if (!delta) {
+			err = -EIO;
+			goto error;
+		}
+	}
+
+	if (!(preq->req_rw & (1<<BIO_RW))) {
+		/* Read direction. If we found an existing block in some
+		 * delta, we direct the bio there. If we did not, this
+		 * location was never written before. We return zero fill
+		 * and probably should log an alert.
+		 */
+		if (!delta) {
+			struct bio * bio;
+
+			if (map_index_fault(preq) == 0) {
+				__TRACE("i %p %u\n", preq, preq->req_cluster);
+				return;
+			}
+
+			__TRACE("X %p %u\n", preq, preq->req_cluster);
+			bio_list_for_each(bio, &preq->bl) {
+				zero_fill_bio(bio);
+			}
+			ploop_complete_request(preq);
+			plo->st.bio_rzero++;
+			return;
+		}
+		preq->iblock = iblk;
+		preq->eng_state = PLOOP_E_COMPLETE;
+		plo->st.bio_out++;
+		__TRACE("S %p %u\n", preq, preq->req_cluster);
+		delta->io.ops->submit(&delta->io, preq, preq->req_rw, &preq->bl,
+				      iblk, preq->req_size);
+	} else {
+		if (delta) {
+			if (delta == top_delta) {
+				/* Block exists in top delta. Good. */
+				if (plo->maintenance_type == PLOOP_MNTN_GROW ||
+				    plo->maintenance_type == PLOOP_MNTN_RELOC) {
+					spin_lock_irq(&plo->lock);
+					ploop_add_lockout(preq, 0);
+					spin_unlock_irq(&plo->lock);
+				}
+				preq->iblock = iblk;
+				preq->eng_state = PLOOP_E_COMPLETE;
+				__TRACE("T %p %u\n", preq, preq->req_cluster);
+				plo->st.bio_out++;
+				delta->io.ops->submit(&delta->io, preq, preq->req_rw,
+						      &preq->bl, iblk, preq->req_size);
+			} else if (whole_block(plo, preq)) {
+				__TRACE("O1 %p %u\n", preq, preq->req_cluster);
+				/* The block does not exist in the top
+				 * delta, but it exists in some delta.
+				 * BUT! By plain luck we have a full
+				 * block and can skip the read stage.
+				 */
+				plo->st.bio_whole_cows++;
+
+				/* About lockout: reads could proceed
+				 * without it.
+				 */
+				spin_lock_irq(&plo->lock);
+				ploop_add_lockout(preq, 0);
+				spin_unlock_irq(&plo->lock);
+
+				if (likely(ploop_reuse_free_block(preq)))
+					top_delta->ops->allocate(top_delta,
+								 preq, &preq->bl,
+								 preq->req_size);
+			} else {
+				struct bio_list sbl;
+
+				plo->st.bio_cows++;
+
+				if (!preq->aux_bio)
+					preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+				if (!preq->aux_bio ||
+				    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+					ploop_fail_immediate(preq, -ENOMEM);
+					return;
+				}
+				spin_lock_irq(&plo->lock);
+				ploop_add_lockout(preq, 0);
+				spin_unlock_irq(&plo->lock);
+
+				__TRACE("DR %p %u\n", preq, preq->req_cluster);
+				preq->iblock = iblk;
+				preq->eng_state = PLOOP_E_DELTA_READ;
+				sbl.head = sbl.tail = preq->aux_bio;
+				delta->io.ops->submit(&delta->io, preq, READ_SYNC,
+						      &sbl, iblk, 1<<plo->cluster_log);
+			}
+		} else {
+			if (!whole_block(plo, preq) && map_index_fault(preq) == 0) {
+				__TRACE("f %p %u\n", preq, preq->req_cluster);
+				return;
+			}
+
+			if (plo->tune.check_zeros && check_zeros(&preq->bl)) {
+				if (map_index_fault(preq) == 0) {
+					__TRACE("f %p %u\n", preq, preq->req_cluster);
+					return;
+				}
+				preq->eng_state = PLOOP_E_COMPLETE;
+				/* Not ploop_complete_request():
+				 * this can be a TRANS request.
+				 */
+				ploop_complete_io_state(preq);
+				if (whole_block(plo, preq))
+					plo->st.bio_alloc_whole++;
+				plo->st.bio_wzero++;
+				return;
+			}
+			if (whole_block(plo, preq))
+				plo->st.bio_alloc_whole++;
+
+			spin_lock_irq(&plo->lock);
+			ploop_add_lockout(preq, 0);
+			spin_unlock_irq(&plo->lock);
+
+			/* Block does not exist. */
+			if (likely(ploop_reuse_free_block(preq))) {
+				__TRACE("K %p %u\n", preq, preq->req_cluster);
+				plo->st.bio_alloc++;
+				top_delta->ops->allocate(top_delta, preq,
+							 &preq->bl,
+							 preq->req_size);
+			}
+		}
+	}
+	return;
+
+error:
+	ploop_fail_immediate(preq, err);
+}
+
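+/* Drive one request through the PLOOP_E_* engine states. The request
+ * runs with its submitter's io_context (and, with CONFIG_BEANCOUNTERS,
+ * beancounter) temporarily installed on this thread, so that
+ * backing-store IO is accounted to the submitter.
+ */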
+static void ploop_req_state_process(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct ploop_delta * top_delta;
+	struct io_context * saved_ioc = NULL;
+	int release_ioc = 0;
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter * uninitialized_var(saved_ub);
+#endif
+
+	trace_req_state_process(preq);
+
+	if (preq->ioc) {
+		saved_ioc = current->io_context;
+		current->io_context = preq->ioc;
+#ifdef CONFIG_BEANCOUNTERS
+		saved_ub = set_exec_ub(preq->ioc->ioc_ub);
+#endif
+		atomic_long_inc(&preq->ioc->refcount);
+		release_ioc = 1;
+	}
+
+	if (preq->eng_state != PLOOP_E_COMPLETE &&
+	    test_bit(PLOOP_REQ_SYNC, &preq->state))
+		set_bit(PLOOP_S_SYNC, &plo->state);
+
+	if (test_bit(PLOOP_REQ_TRACK, &preq->state)) {
+		sector_t sec;
+		clear_bit(PLOOP_REQ_TRACK, &preq->state);
+
+		sec = (sector_t)preq->track_cluster << plo->cluster_log;
+		if (sec < plo->track_end)
+			ploop_tracker_notify(plo, sec);
+	}
+
+restart:
+	__TRACE("ST %p %u %lu\n", preq, preq->req_cluster, preq->eng_state);
+	switch (preq->eng_state) {
+	case PLOOP_E_ENTRY:
+		/* First entry */
+		if (preq->error ||
+		    ((preq->req_rw & (1<<BIO_RW)) &&
+		     test_bit(PLOOP_S_ABORT, &plo->state))) {
+			ploop_fail_immediate(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		ploop_entry_request(preq);
+		break;
+
+	case PLOOP_E_RELOC_COMPLETE:
+		BUG_ON (!test_bit(PLOOP_REQ_RELOC_S, &preq->state));
+		if (!preq->error) {
+			ploop_fb_relocate_req_completed(plo->fbd);
+			ploop_fb_del_reloc_req(plo->fbd, preq);
+			spin_lock_irq(&plo->lock);
+			if (!list_empty(&preq->delay_list)) {
+				struct ploop_request *pr;
+				pr = list_entry(preq->delay_list.next,
+						struct ploop_request, list);
+				list_splice_init(&preq->delay_list,
+						 plo->ready_queue.prev);
+			}
+			spin_unlock_irq(&plo->lock);
+			preq->req_cluster = ~0U;
+			preq->src_iblock  = ~0U; /* redundant */
+			preq->dst_cluster = ~0U; /* redundant */
+			preq->dst_iblock  = ~0U; /* redundant */
+			preq->eng_state = PLOOP_E_ENTRY;
+			goto restart;
+		}
+		/* fall through to the PLOOP_E_COMPLETE case ... */
+	case PLOOP_E_COMPLETE:
+		if (unlikely(test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
+			     preq->error)) {
+			printk("RELOC_S completed with err %d"
+			       " (%u %u %u %u %u)\n",
+			       preq->error, preq->req_cluster, preq->iblock,
+			       preq->src_iblock, preq->dst_cluster,
+			       preq->dst_iblock);
+			ploop_fb_del_reloc_req(plo->fbd, preq);
+		}
+
+		if (!preq->error &&
+		    test_bit(PLOOP_REQ_TRANS, &preq->state)) {
+			u32 iblk;
+
+			__clear_bit(PLOOP_REQ_TRANS, &preq->state);
+			BUG_ON(!preq->trans_map);
+			if (!trans_map_get_index(preq, preq->req_cluster, &iblk)) {
+				spin_lock_irq(&plo->lock);
+				if (preq->map)
+					map_release(preq->map);
+				preq->map = preq->trans_map;
+				preq->trans_map = NULL;
+				spin_unlock_irq(&plo->lock);
+				preq->iblock = 0;
+				top_delta = map_top_delta(plo->trans_map);
+				top_delta->ops->allocate_complete(top_delta, preq);
+				plo->st.bio_trans_index++;
+				break;
+			}
+		}
+
+		ploop_complete_request(preq);
+		/* All done. */
+		break;
+
+	case PLOOP_E_DELTA_READ:
+	{
+		struct bio * b;
+
+		/* preq was scheduled for a read from a delta; aux_bio
+		 * covers a full block of data. Now we copy the data in
+		 * and proceed with the write.
+		 */
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			ploop_fail_immediate(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		bio_list_for_each(b, &preq->bl) {
+			bio_bcopy(preq->aux_bio, b, plo);
+		}
+
+		/* Fall through ... */
+	}
+	case PLOOP_E_DELTA_COPIED:
+	{
+		if (likely(ploop_reuse_free_block(preq))) {
+			struct bio_list sbl;
+			sbl.head = sbl.tail = preq->aux_bio;
+			top_delta = ploop_top_delta(plo);
+			top_delta->ops->allocate(top_delta, preq,
+						 &sbl, 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_ZERO_INDEX:
+	{
+		preq->eng_state = PLOOP_E_DATA_WBI;
+		top_delta = ploop_top_delta(plo);
+		plo->st.bio_out++;
+		if (whole_block(plo, preq)) {
+			top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+						  &preq->bl, preq->iblock,
+						  preq->req_size);
+		} else {
+			struct bio_list sbl;
+			struct bio * b;
+			int i;
+
+			if (!preq->aux_bio)
+				preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+			if (!preq->aux_bio ||
+			    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+				ploop_fail_immediate(preq, -ENOMEM);
+				break;
+			}
+
+			for (i = 0; i < preq->aux_bio->bi_vcnt; i++)
+				memset(page_address(preq->aux_bio->bi_io_vec[i].bv_page),
+				       0, PAGE_SIZE);
+
+			bio_list_for_each(b, &preq->bl) {
+				bio_bcopy(preq->aux_bio, b, plo);
+			}
+
+			sbl.head = sbl.tail = preq->aux_bio;
+			top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+						  &sbl, preq->iblock, 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_DELTA_ZERO_INDEX:
+	{
+		struct bio_list sbl;
+
+		BUG_ON (preq->aux_bio == NULL);
+
+		preq->eng_state = PLOOP_E_DATA_WBI;
+		sbl.head = sbl.tail = preq->aux_bio;
+		top_delta = ploop_top_delta(plo);
+		plo->st.bio_out++;
+		top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+					  &sbl, preq->iblock,
+					  1<<plo->cluster_log);
+		break;
+	}
+	case PLOOP_E_RELOC_DATA_READ:
+	{
+		struct bio_list sbl;
+
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			ploop_fail_immediate(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		BUG_ON (!preq->aux_bio);
+
+		top_delta = ploop_top_delta(plo);
+		sbl.head = sbl.tail = preq->aux_bio;
+
+		if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+			preq->eng_state = PLOOP_E_DATA_WBI;
+			plo->st.bio_out++;
+			preq->iblock = preq->dst_iblock;
+			top_delta->io.ops->submit(&top_delta->io, preq,
+						  preq->req_rw, &sbl,
+						  preq->iblock,
+						  1<<plo->cluster_log);
+		} else {
+			top_delta->ops->allocate(top_delta, preq, &sbl,
+						 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_RELOC_NULLIFY:
+	{
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			ploop_fail_immediate(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		BUG_ON (!preq->aux_bio);
+
+		if (++plo->grow_relocated > plo->grow_end - plo->grow_start) {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			break;
+		}
+
+		del_lockout(preq);
+		preq->eng_state = PLOOP_E_ENTRY;
+		preq->req_cluster++;
+		goto restart;
+	}
+	case PLOOP_E_TRANS_DELTA_READ:
+	{
+		struct bio * b;
+		struct bio_list sbl;
+		u32 iblk;
+
+		/* preq was scheduled for a read from a delta; aux_bio
+		 * covers a full block of data. Now we copy the data in
+		 * and proceed with the write.
+		 */
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			ploop_fail_immediate(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		bio_list_for_each(b, &preq->bl) {
+			bio_bcopy(preq->aux_bio, b, plo);
+		}
+
+		top_delta = ploop_top_delta(plo);
+		sbl.head = sbl.tail = preq->aux_bio;
+
+		__set_bit(PLOOP_REQ_TRANS, &preq->state);
+		if (map_get_index(preq, preq->req_cluster, &iblk) != top_delta->level) {
+			/*
+			 * we can be here only if merge is in progress and
+			 * merge can't happen concurrently with ballooning
+			 */
+			top_delta->ops->allocate(top_delta, preq, &sbl, 1<<plo->cluster_log);
+			plo->st.bio_trans_alloc++;
+		} else {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			preq->iblock = iblk;
+			top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+						  &sbl, iblk, 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_INDEX_READ:
+	case PLOOP_E_TRANS_INDEX_READ:
+		/* It was an index read. */
+		map_read_complete(preq);
+		preq->eng_state = PLOOP_E_ENTRY;
+		goto restart;
+
+	case PLOOP_E_DATA_WBI:
+		/* Data written. Index must be updated. */
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			ploop_fail_immediate(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		top_delta = ploop_top_delta(plo);
+		top_delta->ops->allocate_complete(top_delta, preq);
+		break;
+
+	case PLOOP_E_INDEX_WB:
+		/* Index write completed. */
+		ploop_index_wb_complete(preq);
+		break;
+
+	default:
+		BUG();
+	}
+
+	if (release_ioc) {
+		struct io_context * ioc = current->io_context;
+		current->io_context = saved_ioc;
+#ifdef CONFIG_BEANCOUNTERS
+		set_exec_ub(saved_ub);
+#endif
+		put_io_context(ioc);
+	}
+}
+
+static void ploop_wait(struct ploop_device * plo, int once)
+{
+	DEFINE_WAIT(_wait);
+	for (;;) {
+		prepare_to_wait(&plo->waitq, &_wait, TASK_INTERRUPTIBLE);
+
+		/* This is obvious. */
+		if (!list_empty(&plo->ready_queue))
+			break;
+
+		/* This is not. If we have something in entry queue... */
+		if (!list_empty(&plo->entry_queue)) {
+			/* And the entry queue is not suspended due to a
+			 * barrier, or all active requests have completed,
+			 * so that we can start/finish barrier processing.
+			 */
+			if (!once &&
+			    (!test_bit(PLOOP_S_ATTENTION, &plo->state) ||
+			     !plo->active_reqs))
+				break;
+		} else if (plo->bio_head ||
+			   (!bio_list_empty(&plo->bio_discard_list) &&
+			    !ploop_discard_is_inprogress(plo->fbd))) {
+			/* ready_queue and entry_queue are empty, but the
+			 * bio list is not. Obviously, we'd rather process
+			 * the bio list than sleep. */
+			if (!list_empty(&plo->free_list) &&
+			    (!test_bit(PLOOP_S_ATTENTION, &plo->state) ||
+			     !plo->active_reqs))
+				break;
+		}
+
+		if (kthread_should_stop() && !plo->active_reqs)
+			break;
+
+		set_bit(PLOOP_S_WAIT_PROCESS, &plo->state);
+		if (kthread_should_stop())
+			set_bit(PLOOP_S_EXITING, &plo->state);
+		once = 0;
+		spin_unlock_irq(&plo->lock);
+		if (test_and_clear_bit(PLOOP_S_SYNC, &plo->state) &&
+		    plo->active_reqs != plo->fastpath_reqs) {
+			struct ploop_delta * top_delta = ploop_top_delta(plo);
+			if (top_delta->io.ops->unplug)
+				top_delta->io.ops->unplug(&top_delta->io);
+		}
+		schedule();
+		spin_lock_irq(&plo->lock);
+		clear_bit(PLOOP_S_WAIT_PROCESS, &plo->state);
+	}
+	finish_wait(&plo->waitq, &_wait);
+}
+
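+/* A request that failed with -ENOSPC is frozen here: arm a 10s freeze
+ * timer, sleep on freeze_waitq (presumably until userspace resolves
+ * the space shortage or the timer fires), then release the request's
+ * per-cluster resources and restart it from PLOOP_E_ENTRY.
+ */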
+static void ploop_handle_enospc_req(struct ploop_request *preq)
+{
+	struct ploop_device * plo = preq->plo;
+	DEFINE_WAIT(_wait);
+
+	if (test_bit(PLOOP_S_ABORT, &plo->state))
+		return;
+
+	mod_timer(&plo->freeze_timer, jiffies + HZ * 10);
+
+	prepare_to_wait(&plo->freeze_waitq, &_wait, TASK_INTERRUPTIBLE);
+	spin_unlock_irq(&plo->lock);
+	schedule();
+	spin_lock_irq(&plo->lock);
+
+	finish_wait(&plo->freeze_waitq, &_wait);
+
+	spin_unlock_irq(&plo->lock);
+	if (preq->aux_bio) {
+		int i;
+		struct bio * bio = preq->aux_bio;
+
+		for (i = 0; i < bio->bi_vcnt; i++)
+			put_page(bio->bi_io_vec[i].bv_page);
+
+		bio_put(bio);
+
+		preq->aux_bio = NULL;
+	}
+	spin_lock_irq(&plo->lock);
+
+	del_lockout(preq);
+
+	if (!list_empty(&preq->delay_list))
+		list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+
+	if (preq->map) {
+		map_release(preq->map);
+		preq->map = NULL;
+	}
+	if (preq->trans_map) {
+		map_release(preq->trans_map);
+		preq->trans_map = NULL;
+	}
+
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+}
+
+/* Main thread. Processes the queues in the proper order, handling
+ * pre-barrier flushes and suspending the queue while a barrier is
+ * being processed.
+ */
+static int ploop_thread(void * data)
+{
+	int once = 0;
+	struct ploop_device * plo = data;
+	LIST_HEAD(drop_list);
+
+	set_user_nice(current, -20);
+
+	spin_lock_irq(&plo->lock);
+	for (;;) {
+		/* Convert bios to preqs early (at least before processing
+		 * entry queue) to increase chances of bio merge
+		 */
+	again:
+		BUG_ON (!list_empty(&drop_list));
+
+		process_bio_queue(plo, &drop_list);
+		process_discard_bio_queue(plo, &drop_list);
+
+		if (!list_empty(&drop_list)) {
+			spin_unlock_irq(&plo->lock);
+			ploop_preq_drop(plo, &drop_list, 1);
+			goto again;
+		}
+
+		if (!list_empty(&plo->ready_queue)) {
+			struct ploop_request * preq;
+			preq = ploop_get_request(plo, &plo->ready_queue);
+			if (preq->error == -ENOSPC)
+				ploop_handle_enospc_req(preq);
+			spin_unlock_irq(&plo->lock);
+
+			ploop_req_state_process(preq);
+
+			spin_lock_irq(&plo->lock);
+			continue;
+		}
+
+		/* Now ready_queue is empty */
+
+		if (plo->active_reqs == 0)
+			clear_bit(PLOOP_S_ATTENTION, &plo->state);
+
+		if (!list_empty(&plo->entry_queue) &&
+		    !test_bit(PLOOP_S_ATTENTION, &plo->state)) {
+			struct ploop_request * preq;
+
+			preq = ploop_get_request(plo, &plo->entry_queue);
+
+			if (test_bit(PLOOP_REQ_BARRIER, &preq->state)) {
+				set_bit(PLOOP_S_ATTENTION, &plo->state);
+				if (plo->active_reqs) {
+					list_add(&preq->list, &plo->entry_queue);
+					continue;
+				}
+				plo->barrier_reqs--;
+			} else {
+				if (!plo->read_sync_reqs &&
+				    plo->active_reqs > plo->tune.max_active_requests &&
+				    plo->active_reqs > plo->entry_qlen &&
+				    time_before(jiffies, preq->tstamp + plo->tune.batch_entry_delay) &&
+				    !kthread_should_stop()) {
+					list_add(&preq->list, &plo->entry_queue);
+					once = 1;
+					mod_timer(&plo->mitigation_timer, preq->tstamp + plo->tune.batch_entry_delay);
+					goto wait_more;
+				}
+			}
+
+			plo->active_reqs++;
+			ploop_entry_qlen_dec(preq);
+
+			if (test_bit(PLOOP_REQ_DISCARD, &preq->state)) {
+				BUG_ON(plo->maintenance_type != PLOOP_MNTN_DISCARD);
+				atomic_inc(&plo->maintenance_cnt);
+			}
+
+			if (test_bit(PLOOP_REQ_SORTED, &preq->state)) {
+				rb_erase(&preq->lockout_link, &plo->entry_tree[preq->req_rw & WRITE]);
+				__clear_bit(PLOOP_REQ_SORTED, &preq->state);
+			}
+			preq->eng_state = PLOOP_E_ENTRY;
+			spin_unlock_irq(&plo->lock);
+
+			ploop_req_state_process(preq);
+
+			spin_lock_irq(&plo->lock);
+			continue;
+		}
+
+		/* Termination condition: stop requested,
+		 * no requests are in process or in entry queue
+		 */
+		if (kthread_should_stop() && !plo->active_reqs &&
+		    list_empty(&plo->entry_queue) && !plo->bio_head &&
+		    bio_list_empty(&plo->bio_discard_list))
+			break;
+
+wait_more:
+		ploop_wait(plo, once);
+		once = 0;
+	}
+	spin_unlock_irq(&plo->lock);
+
+	return 0;
+}
+
+/* block device operations */
+static int ploop_open(struct block_device *bdev, fmode_t fmode)
+{
+	struct ploop_device * plo = bdev->bd_disk->private_data;
+
+	mutex_lock(&plo->ctl_mutex);
+
+	BUG_ON (plo->bdev && plo->bdev != bdev);
+	if (!plo->bdev)
+		plo->bdev = bdev;
+
+	atomic_inc(&plo->open_count);
+	mutex_unlock(&plo->ctl_mutex);
+
+	check_disk_change(bdev);
+
+	return 0;
+}
+
+static int ploop_release(struct gendisk *disk, fmode_t fmode)
+{
+	struct ploop_device *plo = disk->private_data;
+
+	mutex_lock(&plo->ctl_mutex);
+	if (atomic_dec_and_test(&plo->open_count)) {
+		ploop_tracker_stop(plo, 1);
+		plo->bdev = NULL;
+	}
+	mutex_unlock(&plo->ctl_mutex);
+
+	return 0;
+}
+
+static struct ploop_delta *
+init_delta(struct ploop_device * plo, struct ploop_ctl * ctl, int level)
+{
+	struct ploop_delta * delta;
+	struct ploop_delta_ops * ops;
+	int err;
+
+	ops = ploop_format_get(ctl->pctl_format);
+	if (ops == NULL)
+		return ERR_PTR(-EINVAL);
+
+	if (level < 0 && !list_empty(&plo->map.delta_list)) {
+		struct ploop_delta * top_delta = ploop_top_delta(plo);
+		err = -EINVAL;
+		if (top_delta->level >= 127)
+			goto out_err;
+		level = top_delta->level + 1;
+		if (ctl->pctl_cluster_log != plo->cluster_log)
+			goto out_err;
+		if (!(ops->capability & PLOOP_FMT_CAP_DELTA))
+			goto out_err;
+	} else if (level >= 0) {
+		struct ploop_delta * delta = find_delta(plo, level);
+		err = -EINVAL;
+		if (delta == NULL)
+			goto out_err;
+		if (ctl->pctl_cluster_log != plo->cluster_log)
+			goto out_err;
+		if (level && !(ops->capability & PLOOP_FMT_CAP_DELTA))
+			goto out_err;
+	}
+
+	if (level < 0)
+		level = 0;
+
+	err = -ENOMEM;
+	delta = kzalloc(sizeof(struct ploop_delta), GFP_KERNEL);
+	if (delta == NULL)
+		goto out_err;
+
+	__module_get(THIS_MODULE);
+
+	delta->level = level;
+	delta->cluster_log = ctl->pctl_cluster_log;
+	delta->plo = plo;
+	delta->ops = ops;
+	delta->flags = ctl->pctl_flags & PLOOP_FMT_FLAGS;
+	delta->max_delta_size = ULLONG_MAX;
+
+	KOBJECT_INIT(&delta->kobj, &ploop_delta_ktype);
+	return delta;
+
+out_err:
+	ploop_format_put(ops);
+	return ERR_PTR(err);
+}
+
+
+static int ploop_set_max_delta_size(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_delta * top_delta = ploop_top_delta(plo);
+	u64 max_delta_size;
+
+	if (copy_from_user(&max_delta_size, (void*)arg, sizeof(u64)))
+		return -EFAULT;
+
+	if (top_delta == NULL)
+		return -EINVAL;
+
+	top_delta->max_delta_size = max_delta_size;
+
+	return 0;
+}
+
+static int ploop_add_delta(struct ploop_device * plo, unsigned long arg)
+{
+	int err;
+	struct ploop_ctl ctl;
+	struct ploop_ctl_chunk chunk;
+	struct ploop_delta * delta;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+	if (ctl.pctl_chunks != 1)
+		return -EINVAL;
+	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
+			   sizeof(struct ploop_ctl_chunk)))
+		return -EFAULT;
+
+	if ((ctl.pctl_flags & PLOOP_FLAG_COOKIE) && !plo->cookie[0] &&
+	    copy_from_user(plo->cookie, (void*)arg + sizeof(struct ploop_ctl) +
+			   sizeof(struct ploop_ctl_chunk),
+			   PLOOP_COOKIE_SIZE - 1))
+		return -EFAULT;
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EBUSY;
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	delta = init_delta(plo, &ctl, -1);
+	if (IS_ERR(delta))
+		return PTR_ERR(delta);
+
+	err = delta->ops->compose(delta, 1, &chunk);
+	if (err)
+		goto out_destroy;
+
+	if (list_empty(&plo->map.delta_list))
+		plo->fmt_version = PLOOP_FMT_UNDEFINED;
+
+	err = delta->ops->open(delta);
+	if (err)
+		goto out_destroy;
+
+	if (list_empty(&plo->map.delta_list)) {
+		plo->cluster_log = delta->cluster_log;
+	} else {
+		struct ploop_delta * top_delta = ploop_top_delta(plo);
+
+		err = -EINVAL;
+		if (!(top_delta->flags & PLOOP_FMT_RDONLY))
+			goto out_close;
+	}
+
+	err = KOBJECT_ADD(&delta->kobj, kobject_get(&plo->kobj),
+			  "%d", delta->level);
+	if (err < 0) {
+		kobject_put(&plo->kobj);
+		goto out_close;
+	}
+
+	mutex_lock(&plo->sysfs_mutex);
+	list_add(&delta->list, &plo->map.delta_list);
+	mutex_unlock(&plo->sysfs_mutex);
+	set_bit(PLOOP_S_CHANGED, &plo->state);
+
+	return 0;
+
+out_close:
+	delta->ops->stop(delta);
+out_destroy:
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	return err;
+}
+
+static int ploop_replace_delta(struct ploop_device * plo, unsigned long arg)
+{
+	int err;
+	struct ploop_ctl ctl;
+	struct ploop_ctl_chunk chunk;
+	struct ploop_delta * delta, * old_delta;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+	if (ctl.pctl_chunks != 1)
+		return -EINVAL;
+	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
+			   sizeof(struct ploop_ctl_chunk)))
+		return -EFAULT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	old_delta = find_delta(plo, ctl.pctl_level);
+	if (old_delta == NULL)
+		return -ENOENT;
+
+	if ((old_delta->flags ^ ctl.pctl_flags) & PLOOP_FMT_RDONLY)
+		return -EINVAL;
+
+	delta = init_delta(plo, &ctl, ctl.pctl_level);
+	if (IS_ERR(delta))
+		return PTR_ERR(delta);
+
+	err = delta->ops->compose(delta, 1, &chunk);
+	if (err)
+		goto out_destroy;
+
+	err = delta->ops->open(delta);
+	if (err)
+		goto out_destroy;
+
+	kobject_del(&old_delta->kobj);
+
+	err = KOBJECT_ADD(&delta->kobj, kobject_get(&plo->kobj),
+			  "%d", delta->level);
+	kobject_put(&plo->kobj);
+
+	if (err < 0) {
+		kobject_put(&plo->kobj);
+		goto out_close;
+	}
+
+	ploop_quiesce(plo);
+	ploop_map_destroy(&plo->map);
+	list_replace_init(&old_delta->list, &delta->list);
+	ploop_relax(plo);
+
+	old_delta->ops->stop(old_delta);
+	old_delta->ops->destroy(old_delta);
+	kobject_put(&old_delta->kobj);
+	return 0;
+
+out_close:
+	delta->ops->stop(delta);
+out_destroy:
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	return err;
+}
+
+
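+/* Quiesce the device: inject a sync barrier request and wait on qcomp
+ * until the engine has drained all in-flight requests. Must be paired
+ * with ploop_relax(), which releases the engine via relax_comp /
+ * relaxed_comp. Both are no-ops if the device is not running.
+ */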
+void ploop_quiesce(struct ploop_device * plo)
+{
+	struct completion qcomp;
+	struct ploop_request * preq;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return;
+
+	spin_lock_irq(&plo->lock);
+	preq = ploop_alloc_request(plo);
+	preq->bl.head = preq->bl.tail = NULL;
+	preq->req_size = 0;
+	preq->req_rw = 0;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_BARRIER);
+	preq->error = 0;
+	preq->tstamp = jiffies;
+
+	init_completion(&qcomp);
+	init_completion(&plo->relax_comp);
+	init_completion(&plo->relaxed_comp);
+	plo->quiesce_comp = &qcomp;
+
+	ploop_entry_add(plo, preq);
+	plo->barrier_reqs++;
+
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+
+	wait_for_completion(&qcomp);
+	plo->quiesce_comp = NULL;
+}
+
+void ploop_relax(struct ploop_device * plo)
+{
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return;
+
+	complete(&plo->relax_comp);
+	wait_for_completion(&plo->relaxed_comp);
+}
+
+/* Search the disk for the first partition bdev with a mounted fs and freeze it */
+static struct super_block *find_and_freeze_bdev(struct gendisk *disk,
+						struct block_device ** bdev_pp)
+{
+	struct super_block  * sb   = NULL;
+	struct block_device * bdev = NULL;
+	int i;
+
+	for (i = 0; i <= (*bdev_pp)->bd_part_count; i++) {
+		bdev = bdget_disk(disk, i);
+		if (!bdev)
+			break;
+
+		sb = freeze_bdev(bdev);
+		if (sb)
+			break;
+
+		thaw_bdev(bdev, sb);
+		bdput(bdev);
+		bdev = NULL;
+	}
+
+	*bdev_pp = bdev;
+	return sb;
+}
+
+static int ploop_snapshot(struct ploop_device * plo, unsigned long arg,
+			  struct block_device * bdev)
+{
+	int err;
+	struct ploop_ctl ctl;
+	struct ploop_ctl_chunk chunk;
+	struct ploop_delta * delta, * top_delta;
+	struct ploop_snapdata snapdata;
+	struct super_block * sb;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return ploop_add_delta(plo, arg);
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+	if (ctl.pctl_chunks != 1)
+		return -EINVAL;
+	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
+			   sizeof(struct ploop_ctl_chunk)))
+		return -EFAULT;
+
+	delta = init_delta(plo, &ctl, -1);
+	if (IS_ERR(delta))
+		return PTR_ERR(delta);
+
+	err = delta->ops->compose(delta, 1, &chunk);
+	if (err)
+		goto out_destroy;
+
+	err = delta->ops->open(delta);
+	if (err)
+		goto out_destroy;
+
+	err = KOBJECT_ADD(&delta->kobj, kobject_get(&plo->kobj),
+			  "%d", delta->level);
+	if (err)
+		goto out_close;
+
+	top_delta = ploop_top_delta(plo);
+
+	err = top_delta->ops->prepare_snapshot(top_delta, &snapdata);
+	if (err)
+		goto out_close2;
+
+	/* _XXX_ only one mounted fs per ploop-device is supported */
+	sb = NULL;
+	if (ctl.pctl_flags & PLOOP_FLAG_FS_SYNC) {
+		/* freeze_bdev() may trigger ploop_bd_full() */
+		plo->maintenance_type = PLOOP_MNTN_SNAPSHOT;
+		mutex_unlock(&plo->ctl_mutex);
+		sb = find_and_freeze_bdev(plo->disk, &bdev);
+		mutex_lock(&plo->ctl_mutex);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+		if (IS_ERR(sb)) {
+			err = PTR_ERR(sb);
+			fput(snapdata.file);
+			goto out_close2;
+		}
+	}
+
+	ploop_quiesce(plo);
+	err = top_delta->ops->complete_snapshot(top_delta, &snapdata);
+	if (!err) {
+		mutex_lock(&plo->sysfs_mutex);
+		list_add(&delta->list, &plo->map.delta_list);
+		clear_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+		mutex_unlock(&plo->sysfs_mutex);
+	}
+	ploop_relax(plo);
+
+	if ((ctl.pctl_flags & PLOOP_FLAG_FS_SYNC) && bdev) {
+		/* Drop ctl_mutex to avoid reverse-order locking:
+		 * thaw_bdev() -> kill_sb() -> blkdev_put() -> bd_mutex */
+		plo->maintenance_type = PLOOP_MNTN_SNAPSHOT;
+		mutex_unlock(&plo->ctl_mutex);
+		thaw_bdev(bdev, sb);
+		mutex_lock(&plo->ctl_mutex);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+		bdput(bdev);
+	}
+
+	if (err)
+		goto out_close2;
+
+	return 0;
+
+out_close2:
+	kobject_del(&delta->kobj);
+out_close:
+	kobject_put(&plo->kobj);
+	delta->ops->stop(delta);
+out_destroy:
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	return err;
+}
+
+static void renumber_deltas(struct ploop_device * plo)
+{
+	struct ploop_delta * delta;
+	int level = 0;
+
+	list_for_each_entry_reverse(delta, &plo->map.delta_list, list) {
+		delta->level = level++;
+	}
+
+	if (level == 1) {
+		delta = ploop_top_delta(plo);
+		if (delta->level == 0 &&
+		    (delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL))
+			set_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+	}
+}
+
+static void rename_deltas(struct ploop_device * plo, int level)
+{
+	struct ploop_delta * delta;
+
+	list_for_each_entry_reverse(delta, &plo->map.delta_list, list) {
+		int err;
+
+		if (delta->level < level)
+			continue;
+#if 0
+		/* Oops, kobject_rename() is not exported! */
+		sprintf(nname, "%d", delta->level);
+		err = kobject_rename(&delta->kobj, nname);
+#else
+		kobject_del(&delta->kobj);
+		err = KOBJECT_ADD(&delta->kobj, &plo->kobj,
+				  "%d", delta->level);
+#endif
+		if (err)
+			printk(KERN_WARNING "rename_deltas: %d %d %d\n",
+			       err, level, delta->level);
+	}
+}
+
+/* Delete delta. Obviously, removing an arbitrary delta will destroy
+ * all the data unless this delta is empty, its data are completely
+ * covered by a higher delta, or a lower delta contains a whole copy
+ * of the delta being deleted. The driver does not check this.
+ *
+ * Some cases, e.g. removing a writable top delta, are never valid,
+ * because the caller has no way to ensure that new data do not emerge.
+ * Nevertheless, we do _NOT_ prohibit this operation, assuming that
+ * the caller has some knowledge which we cannot comprehend.
+ * E.g. the virtual machine using the device was stopped, the device
+ * was synced and the data were copied to a lower delta. Still, this
+ * is a bad idea and should be a different ioctl.
+ */
+
+static int ploop_del_delta(struct ploop_device * plo, unsigned long arg)
+{
+	__u32 level;
+	struct ploop_delta * delta, * next;
+
+	if (copy_from_user(&level, (void*)arg, 4))
+		return -EFAULT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	if (level == 0 && test_bit(PLOOP_S_RUNNING, &plo->state)) {
+		printk(KERN_INFO "Can't del base delta on running ploop%d\n",
+		       plo->index);
+		return -EBUSY;
+	}
+
+	delta = find_delta(plo, level);
+
+	if (delta == NULL)
+		return -ENOENT;
+
+	kobject_del(&delta->kobj);
+	kobject_put(&plo->kobj);
+
+	ploop_quiesce(plo);
+	next = list_entry(delta->list.next, struct ploop_delta, list);
+	list_del(&delta->list);
+	if (list_empty(&plo->map.delta_list))
+		plo->cookie[0] = 0;
+	if (level != 0)
+		next->ops->refresh(next);
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		ploop_map_remove_delta(&plo->map, level);
+	renumber_deltas(plo);
+	ploop_relax(plo);
+	rename_deltas(plo, level);
+
+	delta->ops->stop(delta);
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	BUG_ON(test_bit(PLOOP_S_RUNNING, &plo->state) &&
+	       list_empty(&plo->map.delta_list));
+	return 0;
+}
+
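+/* maintenance_cnt starts at 1 so that the completion cannot fire while
+ * requests are still being queued; each injected merge request holds one
+ * reference, and the final atomic_dec_and_test() drops the initial one,
+ * completing maintenance_comp immediately if nothing was queued.
+ */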
+static void ploop_merge_process(struct ploop_device * plo)
+{
+	int num_reqs;
+
+	spin_lock_irq(&plo->lock);
+
+	atomic_set(&plo->maintenance_cnt, 1);
+	plo->merge_ptr = 0;
+
+	init_completion(&plo->maintenance_comp);
+
+	num_reqs = plo->tune.fsync_max;
+	if (num_reqs > plo->tune.max_requests/2)
+		num_reqs = plo->tune.max_requests/2;
+	if (num_reqs < 1)
+		num_reqs = 1;
+
+	for (; num_reqs; num_reqs--) {
+		struct ploop_request * preq;
+
+		preq = ploop_alloc_request(plo);
+
+		preq->bl.tail = preq->bl.head = NULL;
+		preq->req_cluster = ~0U;
+		preq->req_size = 0;
+		preq->req_rw = WRITE_SYNC;
+		preq->eng_state = PLOOP_E_ENTRY;
+		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_MERGE);
+		preq->error = 0;
+		preq->tstamp = jiffies;
+		preq->iblock = 0;
+		preq->prealloc_size = 0;
+
+		atomic_inc(&plo->maintenance_cnt);
+
+		ploop_entry_add(plo, preq);
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		complete(&plo->maintenance_comp);
+
+	spin_unlock_irq(&plo->lock);
+}
+
+int ploop_maintenance_wait(struct ploop_device * plo)
+{
+	int err;
+
+	mutex_unlock(&plo->ctl_mutex);
+
+	err = wait_for_completion_interruptible(&plo->maintenance_comp);
+
+	mutex_lock(&plo->ctl_mutex);
+
+	return atomic_read(&plo->maintenance_cnt) ? err : 0;
+}
+
+static void ploop_update_fmt_version(struct ploop_device * plo)
+{
+	struct ploop_delta * delta = ploop_top_delta(plo);
+
+	if (delta->level == 0 &&
+	    (delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL)) {
+		ploop_map_destroy(&plo->map);
+		set_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+		plo->fmt_version = PLOOP_FMT_UNDEFINED;
+	}
+}
+
+static void ploop_merge_cleanup(struct ploop_device * plo,
+				struct ploop_map * map,
+				struct ploop_delta * delta, int err)
+{
+	ploop_quiesce(plo);
+	mutex_lock(&plo->sysfs_mutex);
+	list_del(&delta->list);
+
+	if (err)
+		list_add(&delta->list, &plo->map.delta_list);
+	else
+		ploop_update_fmt_version(plo);
+
+	plo->trans_map = NULL;
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	mutex_unlock(&plo->sysfs_mutex);
+	ploop_map_destroy(map);
+	ploop_relax(plo);
+}
+
+static int ploop_merge(struct ploop_device * plo)
+{
+	int err;
+	struct ploop_map * map;
+	struct ploop_delta * delta, * next;
+	struct ploop_snapdata sd;
+
+	if (plo->maintenance_type == PLOOP_MNTN_MERGE)
+		goto already;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	BUG_ON (plo->trans_map);
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	delta = ploop_top_delta(plo);
+	if (delta->level == 0)
+		return -ENOENT;
+
+	map = kzalloc(sizeof(struct ploop_map), GFP_KERNEL);
+	if (map == NULL)
+		return -ENOMEM;
+
+	map_init(plo, map);
+	ploop_map_start(map, plo->bd_size);
+
+	next = list_entry(delta->list.next, struct ploop_delta, list);
+
+	err = next->ops->prepare_merge(next, &sd);
+	if (err) {
+		printk(KERN_WARNING "prepare_merge for ploop%d failed (%d)\n",
+		       plo->index, err);
+		goto out;
+	}
+
+	ploop_quiesce(plo);
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		ploop_map_destroy(&plo->map);
+
+	err = next->ops->start_merge(next, &sd);
+
+	if (!err) {
+		mutex_lock(&plo->sysfs_mutex);
+		list_del(&delta->list);
+		list_add(&delta->list, &map->delta_list);
+		delta->level = 0;
+		plo->trans_map = map;
+		plo->maintenance_type = PLOOP_MNTN_MERGE;
+		mutex_unlock(&plo->sysfs_mutex);
+	} else {
+		/* All transient obstacles must be resolved in
+		 * prepare_merge(); a failed start_merge() means
+		 * the device must be aborted.
+		 */
+		printk(KERN_WARNING "start_merge for ploop%d failed (%d)\n",
+		       plo->index, err);
+		set_bit(PLOOP_S_ABORT, &plo->state);
+	}
+
+	ploop_relax(plo);
+
+	if (err)
+		goto out;
+
+	ploop_merge_process(plo);
+
+already:
+	err = ploop_maintenance_wait(plo);
+	if (err)
+		return err;
+
+	BUG_ON(atomic_read(&plo->maintenance_cnt));
+
+	if (plo->maintenance_type != PLOOP_MNTN_MERGE)
+		return -EALREADY;
+
+	map = plo->trans_map;
+	BUG_ON (!map);
+
+	delta = map_top_delta(plo->trans_map);
+
+	if (test_bit(PLOOP_S_ABORT, &plo->state)) {
+		printk(KERN_WARNING "merge for ploop%d failed (state ABORT)\n",
+		       plo->index);
+		err = -EIO;
+	}
+
+	ploop_merge_cleanup(plo, map, delta, err);
+
+	if (!err) {
+		kobject_del(&delta->kobj);
+		kobject_put(&plo->kobj);
+
+		delta->ops->stop(delta);
+		delta->ops->destroy(delta);
+		kobject_put(&delta->kobj);
+	}
+out:
+	kfree(map);
+	return err;
+}
+
+static int ploop_truncate(struct ploop_device * plo, unsigned long arg)
+{
+	int err;
+	struct ploop_truncate_ctl ctl;
+	struct ploop_delta * delta;
+	struct file * file;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_truncate_ctl)))
+		return -EFAULT;
+
+	if (ctl.fd < 0)
+		return -EBADF;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	delta = find_delta(plo, ctl.level);
+	if (delta == NULL)
+		return -ENOENT;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY))
+		return -EBUSY;
+
+	if (delta->ops->truncate == NULL)
+		return -EOPNOTSUPP;
+
+	file = fget(ctl.fd);
+	if (file == NULL)
+		return -EBADF;
+
+	ploop_quiesce(plo);
+
+	ploop_map_destroy(&plo->map);
+
+	err = delta->ops->truncate(delta, file, ctl.alloc_head);
+	if (!err)
+		delta->io.prealloced_size = 0;
+
+	ploop_relax(plo);
+
+	fput(file);
+
+	return err;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+static int ploop_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
+				sector_t *error_sector)
+{
+	/* Should queue barrier request on ploop and wait for completion.
+	 * The interface is broken and obsolete.
+	 */
+	return -EOPNOTSUPP;
+}
+#endif
+
+#define FUSE_SUPER_MAGIC 0x65735546
+#define IS_PSTORAGE(sb) (sb->s_magic == FUSE_SUPER_MAGIC && \
+			 !strcmp(sb->s_subtype, "pstorage"))
+
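+/* Backing-dev "almost full" callback: report ENOSPC-like pressure to the
+ * container when free space on the host filesystem backing the top delta
+ * drops below the root or user threshold. Skipped for pstorage, for
+ * filesystems with a trivial statfs, and for preallocated top deltas.
+ */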
+static int ploop_bd_full(struct backing_dev_info *bdi, long long nr, int root)
+{
+	struct ploop_device *plo      = bdi->congested_data;
+	u64		     reserved = 0;
+	int		     rc	      = 0;
+
+	if (root) {
+		if (!plo->tune.disable_root_threshold)
+			reserved = (u64)root_threshold * 1024;
+	} else {
+		if (!plo->tune.disable_user_threshold)
+			reserved = (u64)user_threshold * 1024;
+	}
+
+	if (reserved) {
+		struct kstatfs buf;
+		int	       ret;
+
+		struct ploop_delta *top_delta;
+		struct file	   *file;
+		struct super_block *sb;
+		void		   *jctx = current->journal_info;
+
+		mutex_lock(&plo->sysfs_mutex);
+		top_delta = ploop_top_delta(plo);
+		file	  = top_delta->io.files.file;
+		sb	  = F_DENTRY(file)->d_inode->i_sb;
+
+		/* bd_full can be unsupported or not needed */
+		if (IS_PSTORAGE(sb) || sb->s_op->statfs == simple_statfs ||
+		    top_delta->flags & PLOOP_FMT_PREALLOCATED) {
+			mutex_unlock(&plo->sysfs_mutex);
+			return 0;
+		}
+
+		get_file(file);
+		mutex_unlock(&plo->sysfs_mutex);
+
+		current->journal_info = NULL;
+		ret = sb->s_op->statfs(F_DENTRY(file), &buf);
+		if (ret || buf.f_bfree * buf.f_bsize < reserved + nr) {
+			static unsigned long full_warn_time;
+
+			if (printk_timed_ratelimit(&full_warn_time, 60*60*HZ))
+				printk(KERN_WARNING
+				       "ploop%d: host disk is almost full "
+				       "(%llu < %llu); CT sees -ENOSPC!\n",
+				       plo->index, buf.f_bfree * buf.f_bsize,
+				       reserved + nr);
+
+			rc = 1;
+		}
+
+		fput(file);
+		current->journal_info = jctx;
+	}
+
+	return rc;
+}
+
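+/* PLOOP_IOC_START: preallocate the request pool, start all deltas from
+ * base to top, wire up the block queue callbacks and spawn the per-device
+ * service thread.
+ */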
+static int ploop_start(struct ploop_device * plo, struct block_device *bdev)
+{
+	int err;
+	struct ploop_delta * top_delta, * delta;
+	int i;
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EBUSY;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	for (i = 0; i < plo->tune.max_requests; i++) {
+		struct ploop_request * preq;
+		preq = kzalloc(sizeof(struct ploop_request), GFP_KERNEL);
+		if (preq == NULL)
+			break;
+
+		preq->plo = plo;
+		INIT_LIST_HEAD(&preq->delay_list);
+		list_add(&preq->list, &plo->free_list);
+	}
+
+	list_for_each_entry_reverse(delta, &plo->map.delta_list, list) {
+		err = delta->ops->start(delta);
+		if (err)
+			return err;
+	}
+
+	ploop_map_start(&plo->map, plo->bd_size);
+
+	top_delta = ploop_top_delta(plo);
+
+	if (top_delta->level == 0 &&
+	    (top_delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL))
+		set_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+
+	/* Deltas are ready. Enable block device. */
+	set_device_ro(bdev, (top_delta->flags & PLOOP_FMT_RDONLY) != 0);
+
+	blk_queue_make_request(plo->queue, ploop_make_request);
+	plo->queue->queuedata = plo;
+	plo->queue->unplug_fn = ploop_unplug;
+	plo->queue->backing_dev_info.congested_fn = ploop_congested;
+	plo->queue->backing_dev_info.congested_fn2 = ploop_congested2;
+	plo->queue->backing_dev_info.bd_full_fn = ploop_bd_full;
+	plo->queue->backing_dev_info.congested_data = plo;
+
+	blk_queue_merge_bvec(plo->queue, ploop_merge_bvec);
+	blk_queue_flush(plo->queue, REQ_FLUSH);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+	blk_queue_issue_flush_fn(plo->queue, ploop_issue_flush_fn);
+#endif
+
+	if (top_delta->io.ops->queue_settings)
+		top_delta->io.ops->queue_settings(&top_delta->io, plo->queue);
+
+	blk_queue_max_discard_sectors(plo->queue, INT_MAX);
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, plo->queue);
+
+	set_capacity(plo->disk, plo->bd_size);
+	bd_set_size(bdev, (loff_t)plo->bd_size << 9);
+	set_blocksize(bdev, PAGE_SIZE);
+
+	plo->thread = kthread_create(ploop_thread, plo, "ploop%d",
+				     plo->index);
+	if (IS_ERR(plo->thread)) {
+		err = PTR_ERR(plo->thread);
+		goto out_err;
+	}
+
+	wake_up_process(plo->thread);
+	set_bit(PLOOP_S_RUNNING, &plo->state);
+	BUG_ON(list_empty(&plo->map.delta_list));
+	return 0;
+
+out_err:
+	plo->thread = NULL;
+	set_capacity(plo->disk, 0);
+	bd_set_size(bdev, 0);
+	return err;
+}
+
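+/* PLOOP_IOC_STOP: refuse if the device is held open or under maintenance,
+ * drain the queue and stop the service thread, then release the request
+ * pool and per-delta resources.
+ */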
+static int ploop_stop(struct ploop_device * plo, struct block_device *bdev)
+{
+	int p;
+	struct ploop_delta * delta;
+	int cnt;
+
+	if (bdev != bdev->bd_contains) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (wrong bdev)\n",
+			       plo->index);
+		return -ENODEV;
+	}
+
+	if (bdev->bd_contains->bd_holders) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (holders=%d)\n",
+			       plo->index, bdev->bd_contains->bd_holders);
+		return -EBUSY;
+	}
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EINVAL;
+
+	if (list_empty(&plo->map.delta_list)) {
+		printk(KERN_INFO "stop ploop%d failed (no deltas)\n",
+		       plo->index);
+		return -ENOENT;
+	}
+
+	cnt = atomic_read(&plo->open_count);
+	if (cnt > 1) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (cnt=%d)\n",
+			       plo->index, cnt);
+		return -EBUSY;
+	}
+
+	cnt = atomic_read(&plo->maintenance_cnt);
+	if (plo->maintenance_type != PLOOP_MNTN_OFF && cnt) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed "
+			       "(type=%d cnt=%d)\n",
+			       plo->index, plo->maintenance_type, cnt);
+		return -EBUSY;
+	}
+
+	for (p = plo->disk->minors - 1; p > 0; p--)
+		invalidate_partition(plo->disk, p);
+	invalidate_partition(plo->disk, 0);
+
+	clear_bit(PLOOP_S_RUNNING, &plo->state);
+
+	del_timer_sync(&plo->mitigation_timer);
+	del_timer_sync(&plo->freeze_timer);
+
+	/* This will wait for queue drain */
+	kthread_stop(plo->thread);
+	plo->thread = NULL;
+
+	/* queue drained, no more ENOSPC */
+	spin_lock_irq(&plo->lock);
+	if (waitqueue_active(&plo->event_waitq))
+		wake_up_interruptible(&plo->event_waitq);
+	spin_unlock_irq(&plo->lock);
+
+	BUG_ON(plo->entry_qlen);
+	BUG_ON(plo->active_reqs);
+	BUG_ON(plo->barrier_reqs);
+	BUG_ON(plo->fastpath_reqs);
+	BUG_ON(plo->read_sync_reqs);
+
+	list_for_each_entry(delta, &plo->map.delta_list, list) {
+		delta->ops->stop(delta);
+	}
+
+	set_capacity(plo->disk, 0);
+	bd_set_size(bdev, 0);
+
+	if (plo->cached_bio) {
+		bio_put(plo->cached_bio);
+		plo->cached_bio = NULL;
+	}
+
+	while (!list_empty(&plo->free_list)) {
+		struct ploop_request * preq;
+
+		preq = list_first_entry(&plo->free_list, struct ploop_request, list);
+		list_del(&preq->list);
+		kfree(preq);
+	}
+
+	ploop_map_destroy(&plo->map);
+	if (plo->trans_map)
+		ploop_map_destroy(plo->trans_map);
+
+	return 0;
+}
+
+static int ploop_sync(struct ploop_device * plo, struct block_device *bdev)
+{
+	struct ploop_delta * delta;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	delta = ploop_top_delta(plo);
+
+	if (delta->ops->sync == NULL)
+		return 0;
+
+	return delta->ops->sync(delta);
+}
+
+static void destroy_deltas(struct ploop_device * plo, struct ploop_map * map)
+{
+	while (!list_empty(&map->delta_list)) {
+		struct ploop_delta * delta;
+		delta = list_entry(map->delta_list.next, struct ploop_delta, list);
+
+		mutex_lock(&plo->sysfs_mutex);
+		list_del(&delta->list);
+		mutex_unlock(&plo->sysfs_mutex);
+
+		kobject_del(&delta->kobj);
+		kobject_put(&plo->kobj);
+
+		delta->ops->destroy(delta);
+		kobject_put(&delta->kobj);
+	}
+
+	plo->cookie[0] = 0;
+}
+
+static int ploop_clear(struct ploop_device * plo, struct block_device * bdev)
+{
+	int cnt;
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state)) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "clear ploop%d failed (RUNNING)\n",
+			       plo->index);
+		return -EBUSY;
+	}
+	if (plo->maintenance_type == PLOOP_MNTN_TRACK) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "clear ploop%d failed (TRACK)\n",
+			       plo->index);
+		return -EBUSY;
+	}
+	cnt = atomic_read(&plo->maintenance_cnt);
+	if (plo->maintenance_type != PLOOP_MNTN_OFF && cnt) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "clear ploop%d failed "
+			       "(type=%d cnt=%d)\n",
+			       plo->index, plo->maintenance_type, cnt);
+		return -EBUSY;
+	}
+
+	clear_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+	clear_bit(PLOOP_S_DISCARD, &plo->state);
+
+	destroy_deltas(plo, &plo->map);
+
+	if (plo->trans_map) {
+		struct ploop_map * map;
+		destroy_deltas(plo, plo->trans_map);
+		map = plo->trans_map;
+		plo->trans_map = NULL;
+		kfree(map);
+	}
+
+	ploop_fb_fini(plo->fbd, 0);
+
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	plo->bd_size = 0;
+	plo->state = (1 << PLOOP_S_CHANGED);
+	BUG_ON(test_bit(PLOOP_S_RUNNING, &plo->state));
+	return 0;
+}
+
+static int ploop_index_update_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_index_update_ctl ctl;
+	struct reloc_map *map;
+	int i;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (copy_from_user(&ctl, (void*)arg,
+			   sizeof(struct ploop_index_update_ctl)))
+		return -EFAULT;
+
+	if (!ctl.n_maps)
+		return 0;
+
+	map = kzalloc(sizeof(*map) * ctl.n_maps, GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	if (copy_from_user(map, (u8*)arg + sizeof(ctl),
+			   sizeof(*map) * ctl.n_maps)) {
+		kfree(map);
+		return -EFAULT;
+	}
+
+	ploop_quiesce(plo);
+
+	for (i = 0; i < ctl.n_maps; i++)
+		ploop_update_map(&plo->map, ctl.level,
+				 map[i].req_cluster, map[i].iblk);
+
+	ploop_relax(plo);
+
+	kfree(map);
+	return 0;
+}
+
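+/* Kick off grow-driven relocation: inject a single RELOC_A maintenance
+ * request that the engine uses to relocate data blocks out of the way of
+ * the grow operation (see ploop_grow() below). Completion is tracked via
+ * maintenance_cnt, as in ploop_merge_process().
+ */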
+static void ploop_relocate(struct ploop_device * plo)
+{
+	struct ploop_request * preq;
+
+	spin_lock_irq(&plo->lock);
+
+	atomic_set(&plo->maintenance_cnt, 1);
+	plo->grow_relocated = 0;
+
+	init_completion(&plo->maintenance_comp);
+
+	preq = ploop_alloc_request(plo);
+
+	preq->bl.tail = preq->bl.head = NULL;
+	preq->req_cluster = 0;
+	preq->req_size = 0;
+	preq->req_rw = WRITE_SYNC;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_A);
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+	preq->prealloc_size = 0;
+
+	atomic_inc(&plo->maintenance_cnt);
+
+	ploop_entry_add(plo, preq);
+
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+		wake_up_interruptible(&plo->waitq);
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		complete(&plo->maintenance_comp);
+
+	spin_unlock_irq(&plo->lock);
+}
+
+static int ploop_grow(struct ploop_device *plo, struct block_device *bdev,
+		      unsigned long arg)
+{
+	u64 new_size;
+	struct ploop_ctl ctl;
+	struct ploop_delta *delta = ploop_top_delta(plo);
+	int reloc = 0; /* 'relocation needed' flag */
+	int err;
+
+	if (!delta)
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_GROW)
+		goto already;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+
+	if (ctl.pctl_cluster_log != plo->cluster_log)
+		return -EINVAL;
+
+	if (ctl.pctl_flags & PLOOP_FLAG_CLUBLKS)
+		new_size = (u64)ctl.pctl_size << plo->cluster_log;
+	else
+		new_size = ctl.pctl_size;
+
+	if (plo->bd_size > new_size) /* online shrink not supported */
+		return -EINVAL;
+
+	if (plo->bd_size == new_size) /* nothing to do */
+		return 0;
+
+	if (!delta->ops->prepare_grow)
+		return -EINVAL;
+
+	ploop_quiesce(plo);
+	err = delta->ops->prepare_grow(delta, &new_size, &reloc);
+	if (err)
+		goto grow_failed;
+
+	plo->grow_new_size = new_size;
+
+	/* prepare_grow() succeeded, but more actions needed */
+	if (reloc) {
+		plo->maintenance_type = PLOOP_MNTN_GROW;
+		ploop_relax(plo);
+		ploop_relocate(plo);
+already:
+		err = ploop_maintenance_wait(plo);
+		if (err)
+			return err;
+
+		BUG_ON(atomic_read(&plo->maintenance_cnt));
+
+		if (plo->maintenance_type != PLOOP_MNTN_GROW)
+			return -EALREADY;
+
+		if (test_bit(PLOOP_S_ABORT, &plo->state)) {
+			plo->maintenance_type = PLOOP_MNTN_OFF;
+			return -EIO;
+		}
+
+		ploop_quiesce(plo);
+		new_size = plo->grow_new_size;
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+	}
+
+	/* Update bdev size and friends */
+	if (delta->ops->complete_grow) {
+		err = delta->ops->complete_grow(delta, new_size);
+		if (err)
+			goto grow_failed;
+	}
+
+	mutex_lock(&plo->sysfs_mutex);
+	plo->bd_size = new_size;
+	plo->map.max_index = (plo->bd_size + (1 << plo->cluster_log) - 1 )
+			     >> plo->cluster_log;
+
+	set_capacity(plo->disk, plo->bd_size);
+	bd_set_size(bdev, (loff_t)plo->bd_size << 9);
+
+	mutex_unlock(&plo->sysfs_mutex);
+grow_failed:
+	ploop_relax(plo);
+	return err;
+}
+
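+/* PLOOP_IOC_BALLOON: drive the balloon/discard state machine. Depending
+ * on the current maintenance type this freezes the discard state into
+ * FBLOADED, enters BALLOON mode (inflate), or leaves it; the resulting
+ * maintenance type is reported back to userspace in ctl.
+ */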
+static int ploop_balloon_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_balloon_ctl ctl;
+	struct ploop_delta *delta = ploop_top_delta(plo);
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (ctl.inflate && ctl.keep_intact)
+		return -EINVAL;
+
+	switch (plo->maintenance_type) {
+	case PLOOP_MNTN_DISCARD:
+		if (!test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+			break;
+
+		ploop_quiesce(plo);
+		clear_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+		plo->maintenance_type = PLOOP_MNTN_FBLOADED;
+		ploop_fb_lost_range_init(plo->fbd, delta->io.alloc_head);
+		ploop_relax(plo);
+		/* fall through */
+	case PLOOP_MNTN_FBLOADED:
+	case PLOOP_MNTN_RELOC:
+		BUG_ON (!plo->fbd);
+		ctl.alloc_head = ploop_fb_get_alloc_head(plo->fbd);
+		ctl.level      = ploop_fb_get_freezed_level(plo->fbd);
+		break;
+	case PLOOP_MNTN_OFF:
+		if (ctl.inflate) {
+			if (delta->ops->id != PLOOP_FMT_PLOOP1)
+				return -EOPNOTSUPP;
+
+			ploop_quiesce(plo);
+			plo->maintenance_type = PLOOP_MNTN_BALLOON;
+			ploop_relax(plo);
+		}
+		break;
+	case PLOOP_MNTN_BALLOON :
+		if (!ctl.inflate && !ctl.keep_intact) {
+			ploop_quiesce(plo);
+			plo->maintenance_type = PLOOP_MNTN_OFF;
+			ploop_relax(plo);
+		}
+	}
+	ctl.mntn_type = plo->maintenance_type;
+
+	return copy_to_user((void*)arg, &ctl, sizeof(ctl)) ? -EFAULT : 0;
+}
+
+static int ploop_freeblks_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_delta *delta;
+	struct ploop_freeblks_ctl ctl;
+	struct ploop_freeblks_ctl_extent *extents;
+	struct ploop_freeblks_desc *fbd;
+	int i;
+	int rc = 0;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_OFF)
+		return -EINVAL;
+	if (plo->maintenance_type != PLOOP_MNTN_BALLOON)
+		return -EBUSY;
+	BUG_ON (plo->fbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	extents = kzalloc(sizeof(*extents) * ctl.n_extents, GFP_KERNEL);
+	if (!extents)
+		return -ENOMEM;
+
+	delta = ploop_top_delta(plo);
+	if (delta->level != ctl.level) {
+		rc = -EINVAL;
+		goto free_extents;
+	}
+
+	if (copy_from_user(extents, (u8*)arg + sizeof(ctl),
+			   sizeof(*extents) * ctl.n_extents)) {
+		rc = -EFAULT;
+		goto free_extents;
+	}
+
+	fbd = ploop_fb_init(plo);
+	if (!fbd) {
+		rc = -ENOMEM;
+		goto free_extents;
+	}
+
+	for (i = 0; i < ctl.n_extents; i++) {
+		rc = ploop_fb_add_free_extent(fbd, extents[i].clu,
+					      extents[i].iblk, extents[i].len);
+		if (rc) {
+			ploop_fb_fini(fbd, rc);
+			goto free_extents;
+		}
+	}
+
+	ploop_quiesce(plo);
+
+	ctl.alloc_head = delta->io.alloc_head;
+	if (copy_to_user((void*)arg, &ctl, sizeof(ctl))) {
+		rc = -EFAULT;
+		ploop_fb_fini(fbd, rc);
+	} else {
+		iblock_t a_h = delta->io.alloc_head;
+		/* make fbd visible to ploop engine */
+		plo->fbd = fbd;
+		plo->maintenance_type = PLOOP_MNTN_FBLOADED;
+		BUG_ON (a_h != ctl.alloc_head); /* quiesce sanity */
+		ploop_fb_lost_range_init(fbd, a_h);
+		ploop_fb_set_freezed_level(fbd, delta->level);
+	}
+
+	ploop_relax(plo);
+free_extents:
+	kfree(extents);
+	return rc;
+}
+
+static int ploop_fbget_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_freeblks_ctl ctl;
+	int rc = 0;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_DISCARD) {
+		if (!test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+			return -EINVAL;
+	} else if (plo->maintenance_type != PLOOP_MNTN_FBLOADED)
+		return -EINVAL;
+	BUG_ON (!plo->fbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	ploop_quiesce(plo);
+	rc = ploop_fb_copy_freeblks_to_user(plo->fbd, (void*)arg, &ctl);
+	ploop_relax(plo);
+
+	return rc;
+}
+
+static int ploop_fbfilter_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	int rc = 0;
+
+	if (plo->maintenance_type != PLOOP_MNTN_DISCARD ||
+	    !test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+		return -EINVAL;
+
+	BUG_ON (!plo->fbd);
+
+	ploop_quiesce(plo);
+	rc = ploop_fb_filter_freeblks(plo->fbd, arg);
+	ploop_relax(plo);
+
+	return rc;
+}
+
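+/* Inject a batch of RELOC_S maintenance requests (bounded by fsync_max
+ * and half of max_requests) that move data off the relocatable blocks
+ * reported via PLOOP_IOC_RELOCBLKS. Completion is tracked by
+ * maintenance_cnt exactly as in ploop_merge_process().
+ */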
+static void ploop_relocblks_process(struct ploop_device *plo)
+{
+	int num_reqs;
+	struct ploop_request *preq;
+
+	num_reqs = plo->tune.fsync_max;
+	if (num_reqs > plo->tune.max_requests/2)
+		num_reqs = plo->tune.max_requests/2;
+	if (num_reqs < 1)
+		num_reqs = 1;
+
+	spin_lock_irq(&plo->lock);
+
+	atomic_set(&plo->maintenance_cnt, 1);
+
+	init_completion(&plo->maintenance_comp);
+
+	for (; num_reqs; num_reqs--) {
+		preq = ploop_alloc_request(plo);
+
+		preq->bl.tail = preq->bl.head = NULL;
+		preq->req_cluster = ~0U; /* uninitialized */
+		preq->req_size = 0;
+		preq->req_rw = WRITE_SYNC;
+		preq->eng_state = PLOOP_E_ENTRY;
+		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_S);
+		preq->error = 0;
+		preq->tstamp = jiffies;
+		preq->iblock = 0;
+		preq->prealloc_size = 0;
+
+		atomic_inc(&plo->maintenance_cnt);
+
+		ploop_entry_add(plo, preq);
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		complete(&plo->maintenance_comp);
+
+	spin_unlock_irq(&plo->lock);
+}
+
+static int release_fbd(struct ploop_device *plo,
+		       struct ploop_relocblks_ctl_extent *e,
+		       int err)
+{
+	kfree(e);
+
+	clear_bit(PLOOP_S_DISCARD, &plo->state);
+
+	ploop_quiesce(plo);
+	ploop_fb_fini(plo->fbd, err);
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	ploop_relax(plo);
+
+	return err;
+}
+
+static void ploop_discard_restart(struct ploop_device *plo, int err)
+{
+	if (!err && test_bit(PLOOP_S_DISCARD, &plo->state)) {
+		ploop_fb_reinit(plo->fbd, 0);
+		atomic_set(&plo->maintenance_cnt, 0);
+		init_completion(&plo->maintenance_comp);
+		plo->maintenance_type = PLOOP_MNTN_DISCARD;
+	} else {
+		clear_bit(PLOOP_S_DISCARD, &plo->state);
+		ploop_fb_fini(plo->fbd, err);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+	}
+}
+
+static int ploop_fbdrop_ioc(struct ploop_device *plo)
+{
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_DISCARD) {
+		if (!test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+			return -EINVAL;
+	} else if (plo->maintenance_type != PLOOP_MNTN_FBLOADED)
+		return -EINVAL;
+	BUG_ON (!plo->fbd);
+
+	ploop_quiesce(plo);
+	ploop_discard_restart(plo, 0);
+	ploop_relax(plo);
+
+	return 0;
+}
+
+static int ploop_relocblks_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_delta *delta = ploop_top_delta(plo);
+	struct ploop_relocblks_ctl ctl;
+	struct ploop_relocblks_ctl_extent *extents = NULL;
+	struct ploop_freeblks_desc *fbd = plo->fbd;
+	int i;
+	int err = 0;
+	int n_free;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (!fbd || (plo->maintenance_type != PLOOP_MNTN_FBLOADED &&
+		     plo->maintenance_type != PLOOP_MNTN_RELOC))
+		return -EINVAL;
+
+	BUG_ON(test_bit(PLOOP_S_DISCARD_LOADED, &plo->state));
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (delta->level != ctl.level ||
+	    ploop_fb_get_freezed_level(plo->fbd) != ctl.level ||
+	    ploop_fb_get_alloc_head(plo->fbd) != ctl.alloc_head) {
+		return -EINVAL;
+	}
+
+	if (plo->maintenance_type == PLOOP_MNTN_RELOC)
+		goto already;
+
+	if (ctl.n_extents) {
+		extents = kzalloc(sizeof(*extents) * ctl.n_extents,
+				GFP_KERNEL);
+		if (!extents)
+			return release_fbd(plo, extents, -ENOMEM);
+
+		if (copy_from_user(extents, (u8*)arg + sizeof(ctl),
+					sizeof(*extents) * ctl.n_extents))
+			return release_fbd(plo, extents, -EFAULT);
+
+		for (i = 0; i < ctl.n_extents; i++) {
+			err = ploop_fb_add_reloc_extent(fbd, extents[i].clu,
+					extents[i].iblk,
+					extents[i].len,
+					extents[i].free);
+			if (err)
+				return release_fbd(plo, extents, err);
+		}
+
+		kfree(extents);
+		extents = NULL;
+	}
+
+	ploop_quiesce(plo);
+
+	/* alloc_head must never decrease */
+	BUG_ON (delta->io.alloc_head < ploop_fb_get_alloc_head(plo->fbd));
+	n_free = ploop_fb_get_n_free(plo->fbd);
+
+	/*
+	 * Before relocation starts, the freeblks engine can provide
+	 * only free blocks.
+	 */
+	BUG_ON (delta->io.alloc_head > ploop_fb_get_alloc_head(plo->fbd) &&
+		n_free);
+	ploop_fb_relocation_start(plo->fbd, ctl.n_scanned);
+
+	if (!n_free || !ctl.n_extents)
+		goto truncate;
+
+	plo->maintenance_type = PLOOP_MNTN_RELOC;
+
+	ploop_relax(plo);
+
+	ploop_relocblks_process(plo);
+already:
+	err = ploop_maintenance_wait(plo);
+	if (err)
+		return err;
+
+	BUG_ON(atomic_read(&plo->maintenance_cnt));
+
+	if (plo->maintenance_type != PLOOP_MNTN_RELOC)
+		return -EALREADY;
+
+	fbd = plo->fbd;
+	BUG_ON (!fbd);
+
+	if (test_bit(PLOOP_S_ABORT, &plo->state)) {
+		clear_bit(PLOOP_S_DISCARD, &plo->state);
+
+		ploop_fb_fini(plo->fbd, -EIO);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+		return -EIO;
+	}
+
+	if (ploop_fb_get_n_relocated(fbd) != ploop_fb_get_n_relocating(fbd))
+		return release_fbd(plo, extents, -EIO);
+
+	/* time to truncate */
+	ploop_quiesce(plo);
+truncate:
+	if (ploop_fb_get_lost_range_len(plo->fbd) != 0) {
+		BUG_ON (delta->io.alloc_head >
+			ploop_fb_get_alloc_head(plo->fbd));
+		err = delta->ops->truncate(delta, NULL,
+					   ploop_fb_get_first_lost_iblk(plo->fbd));
+		if (!err) {
+			delta->io.prealloced_size = 0;
+			ctl.alloc_head = ploop_fb_get_lost_range_len(plo->fbd);
+			err = copy_to_user((void*)arg, &ctl, sizeof(ctl)) ? -EFAULT : 0;
+		}
+	} else {
+		ctl.alloc_head = 0;
+		err = copy_to_user((void*)arg, &ctl, sizeof(ctl)) ? -EFAULT : 0;
+	}
+
+	ploop_discard_restart(plo, err);
+
+	ploop_relax(plo);
+	return err;
+}
+
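+/* PLOOP_IOC_GETDEVICE: scan devices in index order and report the minor
+ * of the first unused slot (either a gap in the tree or an existing
+ * device with no deltas configured).
+ */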
+static int ploop_getdevice_ioc(unsigned long arg)
+{
+	int err;
+	int index = 0;
+	struct rb_node *n;
+	struct ploop_getdevice_ctl ctl = {};
+
+	mutex_lock(&ploop_devices_mutex);
+	for (n = rb_first(&ploop_devices_tree); n; n = rb_next(n), index++) {
+		struct ploop_device *plo;
+		plo = rb_entry(n, struct ploop_device, link);
+		if (plo->index != index || list_empty(&plo->map.delta_list))
+			break;
+	}
+	mutex_unlock(&ploop_devices_mutex);
+
+	ctl.minor = index << PLOOP_PART_SHIFT;
+	if (ctl.minor & ~MINORMASK)
+		return -ERANGE;
+	err = copy_to_user((void*)arg, &ctl, sizeof(ctl)) ? -EFAULT : 0;
+	return err;
+}
+
+static int ploop_ioctl(struct block_device *bdev, fmode_t fmode, unsigned int cmd,
+		       unsigned long arg)
+{
+	struct ploop_device *plo = bdev->bd_disk->private_data;
+	int err = -EINVAL;
+
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	mutex_lock(&plo->ctl_mutex);
+
+	if (plo->maintenance_type == PLOOP_MNTN_SNAPSHOT) {
+		mutex_unlock(&plo->ctl_mutex);
+		return -EBUSY;
+	}
+
+	switch(cmd) {
+	case PLOOP_IOC_ADD_DELTA:
+		err = ploop_add_delta(plo, arg);
+		break;
+	case PLOOP_IOC_DEL_DELTA:
+		err = ploop_del_delta(plo, arg);
+		break;
+	case PLOOP_IOC_REPLACE_DELTA:
+		err = ploop_replace_delta(plo, arg);
+		break;
+	case PLOOP_IOC_SNAPSHOT:
+		err = ploop_snapshot(plo, arg, bdev);
+		break;
+	case PLOOP_IOC_CLEAR:
+		err = ploop_clear(plo, bdev);
+		break;
+	case PLOOP_IOC_STOP:
+		err = ploop_stop(plo, bdev);
+		break;
+	case PLOOP_IOC_START:
+		err = ploop_start(plo, bdev);
+		break;
+	case PLOOP_IOC_SYNC:
+		err = ploop_sync(plo, bdev);
+		break;
+
+	case PLOOP_IOC_TRACK_INIT:
+		err = ploop_tracker_init(plo, arg);
+		break;
+	case PLOOP_IOC_TRACK_SETPOS:
+		err = ploop_tracker_setpos(plo, arg);
+		break;
+	case PLOOP_IOC_TRACK_STOP:
+		err = ploop_tracker_stop(plo, 0);
+		break;
+	case PLOOP_IOC_TRACK_ABORT:
+		err = ploop_tracker_stop(plo, 1);
+		break;
+	case PLOOP_IOC_TRACK_READ:
+		err = ploop_tracker_read(plo, arg);
+		break;
+
+	case PLOOP_IOC_MERGE:
+		err = ploop_merge(plo);
+		break;
+	case PLOOP_IOC_TRUNCATE:
+		err = ploop_truncate(plo, arg);
+		break;
+	case PLOOP_IOC_UPDATE_INDEX:
+		err = ploop_index_update_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_GROW:
+		err = ploop_grow(plo, bdev, arg);
+		break;
+	case PLOOP_IOC_BALLOON:
+		err = ploop_balloon_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FREEBLKS:
+		err = ploop_freeblks_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FBGET:
+		err = ploop_fbget_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FBFILTER:
+		err = ploop_fbfilter_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FBDROP:
+		err = ploop_fbdrop_ioc(plo);
+		break;
+	case PLOOP_IOC_RELOCBLKS:
+		err = ploop_relocblks_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_GETDEVICE:
+		err = ploop_getdevice_ioc(arg);
+		break;
+
+	case PLOOP_IOC_DISCARD_INIT:
+		err = ploop_discard_init_ioc(plo);
+		break;
+	case PLOOP_IOC_DISCARD_FINI:
+		err = ploop_discard_fini_ioc(plo);
+		break;
+	case PLOOP_IOC_DISCARD_WAIT:
+		err = ploop_discard_wait_ioc(plo);
+		break;
+	case PLOOP_IOC_MAX_DELTA_SIZE:
+		err = ploop_set_max_delta_size(plo, arg);
+		break;
+	default:
+		err = -EINVAL;
+	}
+	mutex_unlock(&plo->ctl_mutex);
+	return err;
+}
+
+static int ploop_media_changed(struct gendisk *disk)
+{
+	struct ploop_device *plo = disk->private_data;
+
+	return test_bit(PLOOP_S_CHANGED, &plo->state);
+}
+
+static int ploop_revalidate(struct gendisk *disk)
+{
+	struct ploop_device *plo = disk->private_data;
+
+	clear_bit(PLOOP_S_CHANGED, &plo->state);
+	return 0;
+}
+
+static struct block_device_operations ploop_dev_fops = {
+	.owner =		THIS_MODULE,
+	.open =			ploop_open,
+	.release =		ploop_release,
+	.ioctl =		ploop_ioctl,
+	.media_changed =	ploop_media_changed,
+	.revalidate_disk =	ploop_revalidate,
+};
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(PLOOP_DEVICE_MAJOR);
+
+atomic_t plo_count = ATOMIC_INIT(0);
+
+static struct sysfs_ops ploop_sysfs_ops = { };
+
+static void ploop_obj_release(struct kobject *kobj)
+{
+	struct ploop_device *plo = container_of(kobj, struct ploop_device, kobj);
+	kfree(plo);
+	atomic_dec(&plo_count);
+}
+
+static struct kobj_type ploop_ktype = {
+	.sysfs_ops	= &ploop_sysfs_ops,
+	.release	= ploop_obj_release,
+};
+
+static struct ploop_device *__ploop_dev_alloc(int index)
+{
+	struct ploop_device *plo;
+	struct gendisk *dk;
+
+	plo = kzalloc(sizeof(*plo), GFP_KERNEL);
+	if(!plo)
+		goto out;
+
+	plo->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!plo->queue)
+		goto out_mem;
+
+	dk = plo->disk = alloc_disk(PLOOP_PART_MAX);
+	if (!plo->disk)
+		goto out_queue;
+
+	spin_lock_init(&plo->lock);
+	spin_lock_init(&plo->dummy_lock);
+	plo->queue->queue_lock = &plo->dummy_lock;
+	mutex_init(&plo->ctl_mutex);
+	mutex_init(&plo->sysfs_mutex);
+	plo->index = index;
+	plo->state = 0;
+	atomic_set(&plo->open_count, 0);
+	init_timer(&plo->mitigation_timer);
+	plo->mitigation_timer.function = mitigation_timeout;
+	plo->mitigation_timer.data = (unsigned long)plo;
+	init_timer(&plo->freeze_timer);
+	plo->freeze_timer.function = freeze_timeout;
+	plo->freeze_timer.data = (unsigned long)plo;
+	INIT_LIST_HEAD(&plo->entry_queue);
+	plo->entry_tree[0] = plo->entry_tree[1] = RB_ROOT;
+	plo->lockout_tree = RB_ROOT;
+	INIT_LIST_HEAD(&plo->ready_queue);
+	INIT_LIST_HEAD(&plo->free_list);
+	init_waitqueue_head(&plo->waitq);
+	init_waitqueue_head(&plo->req_waitq);
+	init_waitqueue_head(&plo->freeze_waitq);
+	init_waitqueue_head(&plo->event_waitq);
+	plo->tune = DEFAULT_PLOOP_TUNE;
+	map_init(plo, &plo->map);
+	track_init(plo);
+	KOBJECT_INIT(&plo->kobj, &ploop_ktype);
+	atomic_inc(&plo_count);
+	bio_list_init(&plo->bio_discard_list);
+
+	dk->major		= ploop_major;
+	dk->first_minor		= index << PLOOP_PART_SHIFT;
+	dk->minors		= PLOOP_PART_MAX;
+	dk->fops		= &ploop_dev_fops;
+	dk->private_data	= plo;
+	dk->queue		= plo->queue;
+	snprintf(dk->disk_name, sizeof(dk->disk_name), "ploop%d", index);
+	return plo;
+
+out_queue:
+	blk_cleanup_queue(plo->queue);
+out_mem:
+	kfree(plo);
+out:
+	return NULL;
+}
+
+static void ploop_dev_del(struct ploop_device *plo)
+{
+	ploop_tracker_destroy(plo, 1);
+	ploop_sysfs_uninit(plo);
+	del_gendisk(plo->disk);
+	blk_cleanup_queue(plo->queue);
+	put_disk(plo->disk);
+	rb_erase(&plo->link, &ploop_devices_tree);
+	ploop_fb_fini(plo->fbd, 0);
+	kobject_put(&plo->kobj);
+}
+
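+/* Insert plo into the global rbtree keyed by device index; the caller
+ * must hold ploop_devices_mutex. Duplicate indices are a bug.
+ */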
+static void ploop_dev_insert(struct ploop_device *plo)
+{
+	struct rb_node ** p;
+	struct rb_node *parent = NULL;
+	struct ploop_device * pl;
+
+	p = &ploop_devices_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		pl = rb_entry(parent, struct ploop_device, link);
+		BUG_ON (plo->index == pl->index);
+
+		if (plo->index < pl->index)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&plo->link, parent, p);
+	rb_insert_color(&plo->link, &ploop_devices_tree);
+}
+
+static struct ploop_device *ploop_dev_search(int index)
+{
+	struct rb_node *n = ploop_devices_tree.rb_node;
+
+	while(n) {
+		struct ploop_device *plo;
+		plo = rb_entry(n, struct ploop_device, link);
+
+		if (index < plo->index)
+			n = n->rb_left;
+		else if (index > plo->index)
+			n = n->rb_right;
+		else
+			return plo;
+	}
+
+	return NULL;
+}
+
+static struct ploop_device *ploop_dev_init(int index)
+{
+	struct ploop_device *plo = ploop_dev_search(index);
+
+	if (plo)
+		return plo;
+
+	plo = __ploop_dev_alloc(index);
+	if (plo) {
+		add_disk(plo->disk);
+		ploop_sysfs_init(plo);
+		ploop_dev_insert(plo);
+	}
+	return plo;
+}
+
+static struct kobject *ploop_dev_probe(dev_t dev, int *part, void *data)
+{
+	struct kobject *kobj;
+	struct ploop_device *plo;
+
+	*part = dev & (PLOOP_PART_MAX - 1);
+	mutex_lock(&ploop_devices_mutex);
+	plo = ploop_dev_init((dev & MINORMASK) >> PLOOP_PART_SHIFT);
+	if (!plo)
+		kobj = ERR_PTR(-ENOMEM);
+	else
+		kobj = get_disk(plo->disk);
+	mutex_unlock(&ploop_devices_mutex);
+
+	return kobj;
+}
+
+/* Functions to service /proc/vz/ploop_minor */
+
+static int ploop_minor_show(struct seq_file *m, void *v)
+{
+	struct ploop_device *plo = m->private;
+	seq_printf(m, "%d\n", plo->index << PLOOP_PART_SHIFT);
+	return 0;
+}
+
+/* Returns random index from 10000 - 65535 range */
+static unsigned ploop_random_index(void)
+{
+	unsigned int n;
+
+	get_random_bytes(&n, sizeof(n));
+
+	return 10000 + n % (65536 - 10000);
+}
+
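+/* Find (or create) a free ploop device and report its minor via seq_file.
+ * The device stays PLOOP_S_LOCKED until the file is closed, so concurrent
+ * readers cannot be handed the same minor. If no existing device is free,
+ * the search for a new index starts at a random point to reduce
+ * collisions between racing userspace tools.
+ */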
+static int ploop_minor_open(struct inode *inode, struct file *file)
+{
+	int index = 0;
+	struct rb_node *n;
+	struct ploop_device *plo = NULL;
+	int found = 0;
+	int ret;
+
+	mutex_lock(&ploop_devices_mutex);
+	for (n = rb_first(&ploop_devices_tree); n; n = rb_next(n)) {
+		plo = rb_entry(n, struct ploop_device, link);
+		if (list_empty(&plo->map.delta_list) &&
+		    !test_bit(PLOOP_S_LOCKED, &plo->locking_state)) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		int i = 0;
+
+		index = ploop_random_index();
+		plo = ploop_dev_search(index);
+
+		while (plo) {
+			for (n = &plo->link; n; n = rb_next(n), index++) {
+				plo = rb_entry(n, struct ploop_device, link);
+				if (plo->index != index ||
+				    (list_empty(&plo->map.delta_list) &&
+				     !test_bit(PLOOP_S_LOCKED, &plo->locking_state)))
+					break;
+			}
+
+			BUG_ON (plo->index == index);
+
+			/* not more than two iterations */
+			if (i++ == 2)
+				break;
+
+			if ((index << PLOOP_PART_SHIFT) & ~MINORMASK) {
+				index = 0;
+				plo = ploop_dev_search(index);
+			} else
+				plo = NULL;
+		}
+
+		if ((index << PLOOP_PART_SHIFT) & ~MINORMASK) {
+			mutex_unlock(&ploop_devices_mutex);
+			return -ERANGE;
+		}
+
+		plo = __ploop_dev_alloc(index);
+		if (!plo) {
+			mutex_unlock(&ploop_devices_mutex);
+			return -ENOMEM;
+		}
+
+		add_disk(plo->disk);
+		ploop_sysfs_init(plo);
+		ploop_dev_insert(plo);
+	}
+	set_bit(PLOOP_S_LOCKED, &plo->locking_state);
+	mutex_unlock(&ploop_devices_mutex);
+
+	ret = single_open(file, ploop_minor_show, plo);
+	if (ret)
+		clear_bit(PLOOP_S_LOCKED, &plo->locking_state);
+	return ret;
+}
+
+static int ploop_minor_release(struct inode *inode, struct file *filp)
+{
+	struct ploop_device *plo = ((struct seq_file *)filp->private_data)->private;
+	clear_bit(PLOOP_S_LOCKED, &plo->locking_state);
+	return single_release(inode, filp);
+}
+
+static const struct file_operations proc_ploop_minor = {
+	.owner          = THIS_MODULE,
+	.open		= ploop_minor_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= ploop_minor_release,
+};
+
+module_param(ploop_max, int, 0);
+MODULE_PARM_DESC(ploop_max, "Maximum number of ploop devices");
+module_param(ploop_major, int, 0);
+MODULE_PARM_DESC(ploop_major, "Major number of ploop device");
+module_param(max_map_pages, int, 0644);
+MODULE_PARM_DESC(max_map_pages, "Maximum number of pages taken by the map cache");
+module_param(root_threshold, long, 0644);
+MODULE_PARM_DESC(root_threshold, "Disk space reserved for root (in kilobytes)");
+module_param(user_threshold, long, 0644);
+MODULE_PARM_DESC(user_threshold, "Disk space reserved for user (in kilobytes)");
+module_param(large_disk_support, int, 0444);
+MODULE_PARM_DESC(large_disk_support, "Support for large disks (>2TB)");
+
+static int __init ploop_mod_init(void)
+{
+	int err;
+
+	/* _XXX_ should be estimated from available ram */
+	if (max_map_pages == 0)
+		max_map_pages = 1024;
+
+	err = ploop_map_init();
+	if (err)
+		goto out_err;
+
+	/* err is 0 here, so a bare goto would report failure as success;
+	 * assume -EBUSY, the usual register_blkdev() failure */
+	err = -EBUSY;
+	if (register_blkdev(ploop_major, "ploop"))
+		goto out_err;
+
+	blk_register_region(MKDEV(ploop_major, 0), ploop_max,
+			THIS_MODULE, ploop_dev_probe, NULL, NULL);
+
+	if (!proc_create("ploop_minor", 0440,
+			 proc_vz_dir, &proc_ploop_minor))
+		goto out_err2;
+
+	printk(KERN_INFO "ploop_dev: module loaded\n");
+	return 0;
+
+out_err2:
+	err = -ENOMEM;
+	blk_unregister_region(MKDEV(ploop_major, 0), ploop_max);
+	unregister_blkdev(ploop_major, "ploop");
+out_err:
+	ploop_map_exit();
+	return err;
+}
+
+static void __exit ploop_mod_exit(void)
+{
+	struct rb_node * n;
+
+	remove_proc_entry("ploop_minor", proc_vz_dir);
+	while ((n = rb_first(&ploop_devices_tree)) != NULL)
+		ploop_dev_del(rb_entry(n, struct ploop_device, link));
+	blk_unregister_region(MKDEV(ploop_major, 0), ploop_max);
+	unregister_blkdev(ploop_major, "ploop");
+	ploop_map_exit();
+	WARN_ON(atomic_read(&plo_count));
+}
+module_init(ploop_mod_init);
+module_exit(ploop_mod_exit);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/discard.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/discard.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/discard.c	2015-01-21 12:02:55.442902001 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/discard.c	2015-01-21 12:02:55.851891146 +0300
@@ -0,0 +1,108 @@
+#include <linux/module.h>
+#include <linux/bio.h>
+
+#include <linux/ploop/ploop.h>
+#include "discard.h"
+#include "freeblks.h"
+
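+/* PLOOP_IOC_DISCARD_INIT: switch the device into discard maintenance
+ * mode. Allocates a freeblks descriptor (fbd) that the discard machinery
+ * uses to track freed clusters; only the ploop1 top-delta format
+ * supports this.
+ */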
+int ploop_discard_init_ioc(struct ploop_device *plo)
+{
+	struct ploop_freeblks_desc *fbd;
+	struct ploop_delta *delta = ploop_top_delta(plo);
+
+	if (delta == NULL)
+		return -EINVAL;
+
+	if (delta->ops->id != PLOOP_FMT_PLOOP1)
+		return -EOPNOTSUPP;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	fbd = ploop_fb_init(plo);
+	if (!fbd)
+		return -ENOMEM;
+
+	ploop_quiesce(plo);
+
+	ploop_fb_set_freezed_level(fbd, delta->level);
+
+	plo->fbd = fbd;
+
+	atomic_set(&plo->maintenance_cnt, 0);
+	init_completion(&plo->maintenance_comp);
+	plo->maintenance_type = PLOOP_MNTN_DISCARD;
+	set_bit(PLOOP_S_DISCARD, &plo->state);
+
+	ploop_relax(plo);
+
+	return 0;
+}
+
+int ploop_discard_fini_ioc(struct ploop_device *plo)
+{
+	int ret = 0;
+	struct ploop_request *preq, *tmp;
+	LIST_HEAD(drop_list);
+
+	if (!test_and_clear_bit(PLOOP_S_DISCARD, &plo->state))
+		return 0;
+
+	ploop_quiesce(plo);
+
+	spin_lock_irq(&plo->lock);
+	list_for_each_entry_safe(preq, tmp, &plo->entry_queue, list)
+		if (test_bit(PLOOP_REQ_DISCARD, &preq->state)) {
+			list_move(&preq->list, &drop_list);
+			ploop_entry_qlen_dec(preq);
+		}
+	spin_unlock_irq(&plo->lock);
+
+	if (!list_empty(&drop_list))
+		ploop_preq_drop(plo, &drop_list, 0);
+
+	if (plo->maintenance_type != PLOOP_MNTN_DISCARD) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ploop_fb_fini(plo->fbd, -EOPNOTSUPP);
+
+	clear_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	complete(&plo->maintenance_comp);
+
+out:
+	ploop_relax(plo);
+
+	return ret;
+}
+
+int ploop_discard_wait_ioc(struct ploop_device *plo)
+{
+	int err;
+
+	if (!test_bit(PLOOP_S_DISCARD, &plo->state))
+		return 0;
+
+	if (plo->maintenance_type == PLOOP_MNTN_FBLOADED)
+		return 1;
+
+	if (plo->maintenance_type != PLOOP_MNTN_DISCARD)
+		return -EINVAL;
+
+	err = ploop_maintenance_wait(plo);
+	if (err)
+		goto out;
+
+	/* maintenance_cnt is zero when there are no discard requests;
+	 * in that case ploop_maintenance_wait() returns 0 instead of
+	 * -ERESTARTSYS */
+	if (test_bit(PLOOP_S_DISCARD_LOADED, &plo->state)) {
+		err = 1;
+	} else if (signal_pending(current))
+		err = -ERESTARTSYS;
+out:
+	return err;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/discard.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/discard.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/discard.h	2015-01-21 12:02:55.442902001 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/discard.h	2015-01-21 12:02:55.442902001 +0300
@@ -0,0 +1,8 @@
+#ifndef _LINUX_PLOOP_DISCARD_H_
+#define _LINUX_PLOOP_DISCARD_H_
+
+extern int ploop_discard_init_ioc(struct ploop_device *plo);
+extern int ploop_discard_fini_ioc(struct ploop_device *plo);
+extern int ploop_discard_wait_ioc(struct ploop_device *plo);
+
+#endif // _LINUX_PLOOP_DISCARD_H_
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/events.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/events.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/events.h	2015-01-21 12:02:55.232907575 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/events.h	2015-01-21 12:02:55.427902399 +0300
@@ -0,0 +1,108 @@
+#if !defined(_TRACE_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENTS_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ploop
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#include <linux/ploop/ploop.h>
+
+#define PRINT_BI_RW(rw)	__print_flags(rw, "|",		\
+			{ 1 << BIO_RW,				"W"},	\
+			{ 1 << BIO_RW_FAILFAST_DEV,		"FD"},	\
+			{ 1 << BIO_RW_FAILFAST_TRANSPORT,	"FT"},	\
+			{ 1 << BIO_RW_FAILFAST_DRIVER,		"FDRV"},\
+			{ 1 << BIO_RW_AHEAD,			"A"},	\
+			{ 1 << BIO_RW_BARRIER,			"B"},	\
+			{ 1 << BIO_RW_SYNCIO,			"S"},	\
+			{ 1 << BIO_RW_UNPLUG,			"U"},	\
+			{ 1 << BIO_RW_META,			"M"},	\
+			{ 1 << BIO_RW_DISCARD,			"D"},	\
+			{ 1 << BIO_RW_NOIDLE,			"N"},	\
+			{ 1 << BIO_RW_FLUSH,			"F"},	\
+			{ 1 << BIO_RW_FUA,			"FUA"},	\
+			{ 1 << BIO_RW_THROTTLED,		"T"})
+
+#define PRINT_PREQ_STATE(state)					\
+			__print_flags(state, "|",		\
+			{ 1 << PLOOP_REQ_LOCKOUT,	"L"},	\
+			{ 1 << PLOOP_REQ_SYNC,		"S"},	\
+			{ 1 << PLOOP_REQ_BARRIER,	"B"},	\
+			{ 1 << PLOOP_REQ_UNSTABLE,	"U"},	\
+			{ 1 << PLOOP_REQ_TRACK,		"TRACK"},\
+			{ 1 << PLOOP_REQ_SORTED,	"SORT"},\
+			{ 1 << PLOOP_REQ_TRANS,		"T"},	\
+			{ 1 << PLOOP_REQ_MERGE,		"M"},	\
+			{ 1 << PLOOP_REQ_RELOC_A,	"RA"},	\
+			{ 1 << PLOOP_REQ_RELOC_S,	"RS"},	\
+			{ 1 << PLOOP_REQ_ZERO,		"Z"},	\
+			{ 1 << PLOOP_REQ_DISCARD,	"D"})
+
+#define PREQ_FORMAT "preq=0x%p cluster=0x%x iblock=0x%x size=0x%x eng_state=0x%lx state=%s rw=%s"
+
+#define PREQ_ARGS	__entry->preq,				\
+			__entry->clu,				\
+			__entry->iblk,				\
+			__entry->size,				\
+			__entry->eng_state,			\
+			PRINT_PREQ_STATE(__entry->state),	\
+			PRINT_BI_RW(__entry->rw)
+
+DECLARE_EVENT_CLASS(preq_template,
+	TP_PROTO(struct ploop_request *preq),
+
+	TP_ARGS(preq),
+
+	TP_STRUCT__entry(
+		__field(void *,		preq)
+		__field(cluster_t,	clu)
+		__field(iblock_t,	iblk)
+		__field(unsigned int,	size)
+		__field(unsigned long,	eng_state)
+		__field(unsigned long,	state)
+		__field(unsigned int,	rw)
+	),
+
+	TP_fast_assign(
+		__entry->preq		= preq;
+		__entry->clu		= preq->req_cluster;
+		__entry->iblk		= preq->iblock;
+		__entry->size		= preq->req_size;
+		__entry->eng_state	= preq->eng_state;
+		__entry->state		= preq->state;
+		__entry->rw		= preq->req_rw;
+	),
+
+	TP_printk(PREQ_FORMAT, PREQ_ARGS)
+);
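
DECLARE_EVENT_CLASS() above only defines a template; concrete tracepoints are
stamped out from it with DEFINE_EVENT(), which reuses the class's entry layout,
assignment and print format. For illustration only (the event name below is
hypothetical; the real instances live in the event headers consumed elsewhere
in this patch):

	/* Hypothetical instance of the preq_template class. */
	DEFINE_EVENT(preq_template, preq_submit,
		TP_PROTO(struct ploop_request *preq),
		TP_ARGS(preq)
	);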
+
+DECLARE_EVENT_CLASS(bio_template,
+	TP_PROTO(struct bio *bio),
+
+	TP_ARGS(bio),
+
+	TP_STRUCT__entry(
+		__field(void *,		bio)
+		__field(sector_t,	sector)
+		__field(unsigned int,	size)
+		__field(unsigned long,	rw)
+	),
+
+	TP_fast_assign(
+		__entry->bio		= bio;
+		__entry->sector		= bio->bi_sector;
+		__entry->size		= bio->bi_size;
+		__entry->rw		= bio->bi_rw;
+	),
+
+	TP_printk("bio=0x%p sector=0x%lx size=0x%x rw=%s",
+			__entry->bio,
+			__entry->sector,
+			__entry->size,
+			PRINT_BI_RW(__entry->rw)
+			)
+);
+
+#endif /* _TRACE_EVENTS_H */
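
As with any tracepoint header of this style, the file may be included multiple
times (note the TRACE_HEADER_MULTI_READ guard at the top); exactly one .c file
materializes the trace symbols by defining CREATE_TRACE_POINTS before the
include, the way io_direct.c does for io_direct_events.h further down in this
patch. A sketch of that usual consumer pattern (the include name here is
illustrative; the real consumers in this patch use their own wrapper headers):

	/* In exactly one compilation unit: */
	#define CREATE_TRACE_POINTS
	#include "events.h"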
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/fmt_ploop1.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/fmt_ploop1.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/fmt_ploop1.c	2015-01-21 12:02:54.709921458 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/fmt_ploop1.c	2015-01-21 12:02:57.809839173 +0300
@@ -0,0 +1,594 @@
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <linux/ploop/ploop.h>
+#include "ploop1_image.h"
+
+/* The implementation of the ploop1 (PVD) delta format, defined in ploop1_fmt.h
+ */
+
+#define INDEX_PER_PAGE	     (PAGE_SIZE  / 4)
+#define INDEX_PER_PAGE_SHIFT (PAGE_SHIFT - 2)
+
+struct ploop1_private
+{
+	struct page	*dyn_page;
+	u64		bd_size;
+	u32		alloc_head;
+	sector_t	l1_off;
+};
+
+int ploop1_map_index(struct ploop_delta * delta, unsigned long block, sector_t *sec)
+{
+	struct ploop1_private * ph = delta->priv;
+
+	if ((u64)block << delta->plo->cluster_log >= ph->bd_size)
+		return 0;
+
+	/*
+	 * ondisk_pageno == (block + off) >> INDEX_PER_PAGE_SHIFT
+	 * sec == ondisk_pageno << (PAGE_SHIFT - 9)
+	 * (8 sectors per page, and log(8) == PAGE_SHIFT - 9)
+	 */
+	*sec = ((block + PLOOP_MAP_OFFSET) >> INDEX_PER_PAGE_SHIFT) <<
+	       (PAGE_SHIFT - 9);
+	return 1;
+}
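
To make the shift arithmetic concrete (a worked example; PLOOP_MAP_OFFSET == 16
is an assumption based on the 64-byte PVD header occupying 16 u32 index slots,
not something stated in this hunk): with 4 KiB pages, INDEX_PER_PAGE = 1024 and
INDEX_PER_PAGE_SHIFT = 10, so each on-disk map page holds 1024 little-endian
u32 indices and spans 8 sectors. Block 1100 then maps to on-disk page
(1100 + 16) >> 10 = 1, i.e. *sec = 1 << (PAGE_SHIFT - 9) = 8.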
+
+static void
+ploop1_read_index(struct ploop_delta * delta, struct ploop_request * preq,
+		  struct page * page, sector_t sec)
+{
+	return delta->io.ops->read_page(&delta->io, preq, page, sec);
+}
+
+static void
+ploop1_destroy_priv(struct ploop_delta * delta)
+{
+	struct ploop1_private * ph = delta->priv;
+
+	if (ph == NULL)
+		return;
+
+	delta->priv = NULL;
+
+	if (ph->dyn_page)
+		put_page(ph->dyn_page);
+
+	kfree(ph);
+}
+
+static int ploop1_stop(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	if ((delta->flags & PLOOP_FMT_RDONLY) ||
+	    test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		return 0;
+
+	ph->alloc_head = delta->io.alloc_head;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	if (ph->alloc_head > (ph->l1_off >> delta->plo->cluster_log)) {
+		vh->m_Flags = le32_to_cpu(vh->m_Flags);
+		vh->m_Flags &= ~CIF_Empty;
+		vh->m_Flags = cpu_to_le32(vh->m_Flags);
+	}
+
+	vh->m_DiskInUse = 0;
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+ploop1_compose(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc)
+{
+	return ploop_io_init(delta, nchunks, pc);
+}
+
+static int
+ploop1_open(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop1_private * ph;
+	struct ploop_pvd_header *vh;
+	u64 i_size;
+	int version;
+
+	err = -ENOMEM;
+	ph = kzalloc(sizeof(struct ploop1_private), GFP_KERNEL);
+	if (ph == NULL)
+		return -ENOMEM;
+
+	delta->priv = ph;
+
+	ph->dyn_page = alloc_page(GFP_KERNEL);
+	if (ph->dyn_page == NULL)
+		goto out_err;
+
+	err = ploop_io_open(&delta->io);
+	if (err)
+		goto out_err;
+
+	/* IO engine is ready. */
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		goto out_err;
+
+	err = -EINVAL;
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	version = ploop1_version(vh);
+	if (version == -1 ||
+	    vh->m_Type	  != cpu_to_le32(PRL_IMAGE_COMPRESSED) ||
+	    vh->m_Sectors != cpu_to_le32(1 << delta->cluster_log))
+		goto out_err;
+
+	/* We don't support mixed configuration of V1 and V2 images */
+	if (delta->plo->fmt_version && delta->plo->fmt_version != version)
+		goto out_err;
+
+	ph->l1_off = le32_to_cpu(vh->m_FirstBlockOffset);
+
+	err = -EBUSY;
+	if (vh->m_DiskInUse)
+		goto out_err;
+
+	err = -EINVAL;
+	i_size = delta->io.ops->i_size_read(&delta->io);
+	ph->alloc_head = i_size >> (delta->cluster_log + 9);
+	if (!(le32_to_cpu(vh->m_Sectors) << 9) ||
+	    do_div(i_size, le32_to_cpu(vh->m_Sectors) << 9))
+		goto out_err;
+
+	ph->bd_size = get_SizeInSectors_from_le(vh, version);
+
+	if (delta->plo->bd_size > ph->bd_size)
+		goto out_err;
+	if (ph->bd_size & (le32_to_cpu(vh->m_Sectors) - 1))
+		goto out_err;
+	if (delta->plo->bd_size & (le32_to_cpu(vh->m_Sectors) - 1))
+		goto out_err;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY)) {
+		vh->m_DiskInUse = cpu_to_le32(SIGNATURE_DISK_IN_USE);
+		err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+		if (err)
+			goto out_err;
+	}
+
+	delta->io.alloc_head = ph->alloc_head;
+	delta->plo->bd_size = ph->bd_size;
+	delta->plo->fmt_version = version;
+
+	/* If i_size >= max_size, no more allocations needed */
+	if ((u64)ph->alloc_head << (delta->cluster_log + 9) >=
+	    ((u64)ph->bd_size + ph->l1_off) << 9)
+		delta->flags |= PLOOP_FMT_PREALLOCATED;
+
+	return 0;
+
+out_err:
+	ploop1_destroy_priv(delta);
+	return err;
+}
+
+static int
+ploop1_refresh(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	ph->bd_size = get_SizeInSectors_from_le(vh, delta->plo->fmt_version);
+
+	return 0;
+}
+
+/*
+ * This function gets a preq carrying a bio. The caller has checked that
+ * the bio is a write to a block which is not allocated in this delta.
+ * If the block is totally new, the bio may cover only a part of it;
+ * if the bio is a COW from a previous delta, the function gets a bio
+ * covering the whole cluster, read from the original delta.
+ *
+ * The task of this function is to allocate a new block in the image,
+ * copy the data there and update the index afterwards. A lot, huh?
+ */
+
+static void
+ploop1_allocate(struct ploop_delta * delta, struct ploop_request * preq,
+		struct bio_list * sbl, unsigned int size)
+{
+	if (delta->io.alloc_head >=
+			(delta->max_delta_size >> delta->cluster_log)) {
+		ploop_fail_request(preq, -E2BIG);
+		return;
+	}
+	delta->io.ops->submit_alloc(&delta->io, preq, sbl, size);
+}
+
+/* Call this when data write is complete */
+
+static void
+ploop1_allocate_complete(struct ploop_delta * delta, struct ploop_request * preq)
+{
+	ploop_index_update(preq);
+}
+
+static void
+ploop1_destroy(struct ploop_delta * delta)
+{
+	ploop_io_destroy(&delta->io);
+	ploop1_destroy_priv(delta);
+}
+
+static int
+ploop1_start(struct ploop_delta * delta)
+{
+	return 0;
+//	return delta->io.ops->start(&delta->io);
+}
+
+static int
+ploop1_sync(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		return 0;
+
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		return -EIO;
+
+	ph->alloc_head = delta->io.alloc_head;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	vh->m_DiskInUse = cpu_to_le32(SIGNATURE_DISK_IN_USE);
+
+	if (ph->alloc_head > (ph->l1_off >> delta->plo->cluster_log)) {
+		vh->m_Flags = le32_to_cpu(vh->m_Flags);
+		vh->m_Flags &= ~CIF_Empty;
+		vh->m_Flags = cpu_to_le32(vh->m_Flags);
+	}
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+ploop1_prepare_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	return delta->io.ops->prepare_snapshot(&delta->io, sd);
+}
+
+static int
+ploop1_complete_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err = 0;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		goto out;
+
+	err = -EIO;
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		goto out;
+
+	ph->alloc_head = delta->io.alloc_head;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		goto out;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	if (ph->alloc_head > (ph->l1_off >> delta->io.plo->cluster_log)) {
+		vh->m_Flags = le32_to_cpu(vh->m_Flags);
+		vh->m_Flags &= ~CIF_Empty;
+		vh->m_Flags = cpu_to_le32(vh->m_Flags);
+	}
+
+	vh->m_DiskInUse = 0;
+
+	/*
+	 * NB: we don't call ploop_update_map_hdr() here because the top
+	 * delta after snapshot completion should bear m_DiskInUse != 0.
+	 * Also, we rely on the fact that the new top delta (created while
+	 * snapshotting) has exactly the same PVD-header as the former top
+	 * delta. So, the first 64 bytes of the corresponding map_node page
+	 * remain valid.
+	 */
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->complete_snapshot(&delta->io, sd);
+	if (err)
+		goto out;
+
+	delta->flags |= PLOOP_FMT_RDONLY;
+	return 0;
+
+out:
+	if (sd->file) {
+		fput(sd->file);
+		sd->file = NULL;
+	}
+	return err;
+}
+
+static int
+ploop1_prepare_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	if (vh->m_DiskInUse)
+		return -EBUSY;
+
+	ph->alloc_head = delta->io.ops->i_size_read(&delta->io) >>
+			 (delta->io.plo->cluster_log + 9);
+	delta->io.alloc_head = ph->alloc_head;
+
+	err = delta->io.ops->prepare_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	delta->flags &= ~PLOOP_FMT_RDONLY;
+	return 0;
+}
+
+static int
+ploop1_start_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	err = delta->io.ops->start_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state)) {
+		printk(KERN_WARNING "ploop1_start_merge for ploop%d failed "
+		       "(state ABORT)\n", delta->plo->index);
+		return -EIO;
+	}
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	vh->m_DiskInUse = cpu_to_le32(SIGNATURE_DISK_IN_USE);
+
+	/* keep hdr in ph->dyn_page and in map_node in sync */
+	ploop_update_map_hdr(&delta->plo->map, (u8 *)vh, sizeof(*vh));
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	ph->bd_size = get_SizeInSectors_from_le(vh, delta->plo->fmt_version);
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int ploop1_truncate(struct ploop_delta * delta, struct file * file,
+			   __u32 alloc_head)
+{
+	struct ploop1_private * ph = delta->priv;
+
+	/*
+	 * Maybe we should call ploop1_refresh() here and re-read the
+	 * PVD-header from disk. This will become clear in the course of
+	 * porting ploop-shrink.c::shrink_in_place().
+	 */
+
+	ph->alloc_head = alloc_head;
+	delta->io.alloc_head = alloc_head;
+
+	return delta->io.ops->truncate(&delta->io,
+				       file ? file : delta->io.files.file,
+				       alloc_head);
+}
+
+static int
+ploop1_prepare_grow(struct ploop_delta * delta, u64 *new_size, int *reloc)
+{
+	struct ploop1_private * ph = delta->priv;
+	struct ploop_pvd_header *vh;
+	int idxs_per_iblk; /* # indices in one cluster-block */
+	iblock_t bdsize;   /* block-device size measured in cluster-blocks */
+	int n_present;     /* # cluster-blocks in L2-table (existent now) */
+	int n_needed;      /* # cluster-blocks in L2-table (for new_size) */
+	int n_alloced = 0; /* # cluster-blocks we can alloc right now */
+	int err;
+	iblock_t a_h = delta->io.alloc_head;
+	int	 log = delta->io.plo->cluster_log;
+
+	if (*new_size & ((1 << delta->cluster_log) - 1))
+		return -EINVAL;
+
+	if (*new_size > ploop1_max_size(1 << delta->plo->cluster_log,
+					delta->plo->fmt_version))
+		return -EFBIG;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	n_present  = le32_to_cpu(vh->m_FirstBlockOffset) >> log;
+	BUG_ON (!n_present);
+
+	bdsize = (*new_size + (1 << log) - 1) >> log;
+
+	idxs_per_iblk = (1 << (log + 9)) / sizeof(u32);
+	n_needed = (bdsize + PLOOP_MAP_OFFSET + idxs_per_iblk - 1) /
+		   idxs_per_iblk;
+
+	if (n_needed <= n_present)
+		return 0;
+
+	if (a_h < n_needed) {
+		n_alloced = n_needed - a_h;
+		err = delta->io.ops->alloc(&delta->io,
+					   (loff_t)a_h << (log + 9),
+					   (loff_t)(n_alloced) << (log + 9));
+		if (err)
+			return err;
+	}
+
+	*reloc = n_needed - n_present - n_alloced;
+	if (*reloc) {
+		/* Feeling an irresistible infatuation to relocate ... */
+		delta->io.plo->grow_start = n_present;
+		delta->io.plo->grow_end = n_needed - n_alloced - 1;
+	}
+
+	return 0;
+}
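
A worked instance of the n_needed arithmetic (all values assumed for
illustration, including PLOOP_MAP_OFFSET == 16): with cluster_log = 11
(1 MiB cluster-blocks), idxs_per_iblk = (1 << 20) / 4 = 262144 indices per
cluster-block. Growing to *new_size = 2 TiB = 4294967296 sectors gives
bdsize = 4294967296 >> 11 = 2097152, hence
n_needed = (2097152 + 16 + 262144 - 1) / 262144 = 9 L2-table cluster-blocks.
If n_present were, say, 1, the remaining 8 would have to be allocated past
alloc_head and/or claimed by relocating occupied data blocks.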
+
+static int ploop1_complete_grow(struct ploop_delta * delta, u64 new_size)
+{
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+	int err;
+	u32 vh_bsize; /* block size in sectors */
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	vh_bsize = le32_to_cpu(vh->m_Sectors);
+
+	if (vh_bsize != (1 << delta->io.plo->cluster_log)) {
+		printk("grow: vh->m_Sectors=%u != 1<<plo->cluster_log=%u\n",
+		       vh_bsize, 1 << delta->io.plo->cluster_log);
+		return -EINVAL;
+	}
+
+	generate_pvd_header(vh, new_size, vh_bsize, delta->plo->fmt_version);
+
+	vh->m_Type             = cpu_to_le32(vh->m_Type);
+	cpu_to_le_SizeInSectors(vh, delta->plo->fmt_version);
+	vh->m_Sectors          = cpu_to_le32(vh->m_Sectors);
+	vh->m_Heads            = cpu_to_le32(vh->m_Heads);
+	vh->m_Cylinders        = cpu_to_le32(vh->m_Cylinders);
+	vh->m_Size             = cpu_to_le32(vh->m_Size);
+	vh->m_FirstBlockOffset = cpu_to_le32(vh->m_FirstBlockOffset);
+
+	/* keep hdr in ph->dyn_page and in map_node in sync */
+	ploop_update_map_hdr(&delta->plo->map, (u8 *)vh, sizeof(*vh));
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	ph->bd_size = new_size;
+	ph->l1_off = le32_to_cpu(vh->m_FirstBlockOffset);
+
+	return 0;
+}
+
+static struct ploop_delta_ops ploop1_delta_ops =
+{
+	.id		=	PLOOP_FMT_PLOOP1,
+	.name		=	"ploop1",
+	.owner		=	THIS_MODULE,
+	.capability	=	PLOOP_FMT_CAP_WRITABLE | PLOOP_FMT_CAP_DELTA,
+
+	.map_index	=	ploop1_map_index,
+	.read_index	=	ploop1_read_index,
+
+	.allocate	=	ploop1_allocate,
+	.allocate_complete =	ploop1_allocate_complete,
+
+	.compose	=	ploop1_compose,
+	.open		=	ploop1_open,
+	.destroy	=	ploop1_destroy,
+	.start		=	ploop1_start,
+	.stop		=	ploop1_stop,
+	.refresh	=	ploop1_refresh,
+	.sync		=	ploop1_sync,
+	.prepare_snapshot =	ploop1_prepare_snapshot,
+	.complete_snapshot =	ploop1_complete_snapshot,
+	.prepare_merge	=	ploop1_prepare_merge,
+	.start_merge	=	ploop1_start_merge,
+	.truncate	=	ploop1_truncate,
+	.prepare_grow	=	ploop1_prepare_grow,
+	.complete_grow	=	ploop1_complete_grow,
+};
+
+static int __init pfmt_ploop1_mod_init(void)
+{
+	return ploop_register_format(&ploop1_delta_ops);
+}
+
+static void __exit pfmt_ploop1_mod_exit(void)
+{
+	ploop_unregister_format(&ploop1_delta_ops);
+}
+
+module_init(pfmt_ploop1_mod_init);
+module_exit(pfmt_ploop1_mod_exit);
+
+MODULE_LICENSE("GPL");
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/fmt_raw.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/fmt_raw.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/fmt_raw.c	2015-01-21 12:02:54.710921431 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/fmt_raw.c	2015-01-21 12:02:57.826838722 +0300
@@ -0,0 +1,260 @@
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <linux/ploop/ploop.h>
+
+/* An implementation of the raw linear image format.
+ *
+ * Right now it is not quite optimal, because we simulate a
+ * raw image as a ploop1-like image with dummy preallocated
+ * index tables. The only optimized case is a single raw
+ * image without any deltas on top. Probably, this is all
+ * that we need.
+ */
+
+static int raw_stop(struct ploop_delta * delta)
+{
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+raw_compose(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc)
+{
+	return ploop_io_init(delta, nchunks, pc);
+}
+
+static int
+raw_open(struct ploop_delta * delta)
+{
+	int err;
+	loff_t pos;
+	int cluster_log = list_empty(&delta->plo->map.delta_list) ?
+		delta->cluster_log : delta->plo->cluster_log;
+
+	err = ploop_io_open(&delta->io);
+	if (err)
+		return err;
+
+	if (delta->plo->bd_size) {
+		if (delta->plo->bd_size > (delta->io.ops->i_size_read(&delta->io) >> 9))
+			return -EINVAL;
+	} else {
+		delta->plo->bd_size = delta->io.ops->i_size_read(&delta->io) >> 9;
+	}
+
+	pos = delta->io.ops->i_size_read(&delta->io);
+	pos += (1 << (cluster_log + 9)) - 1;
+	delta->io.alloc_head = pos >> (cluster_log + 9);
+
+	/* no more allocations at all */
+	delta->flags |= PLOOP_FMT_PREALLOCATED;
+
+	return 0;
+}
+
+/*
+ * The sanity checks below assume that we can be called only by
+ * ploop_del_delta() or raw_start_merge(). Thus, there recently
+ * was a ploop1 delta above us. Adding a ploop1 delta on top
+ * of a raw delta is only supported if the raw delta is
+ * cluster-block aligned.
+ *
+ * Another assumption is that the size of the raw delta either
+ * was kept unchanged or was grown in user space while merging.
+ */
+static int
+raw_refresh(struct ploop_delta * delta)
+{
+	loff_t pos;
+
+	pos = delta->io.ops->i_size_read(&delta->io);
+	if (pos & ((1 << (delta->plo->cluster_log + 9)) - 1)) {
+		printk("raw delta is not aligned (%llu bytes)\n", pos);
+		return -EINVAL;
+	}
+	if ((pos >> (delta->plo->cluster_log + 9)) < delta->io.alloc_head) {
+		printk("raw delta was corrupted "
+		       "(old_size=%u new_size=%llu iblocks)\n",
+		       delta->io.alloc_head,
+		       pos >> (delta->plo->cluster_log + 9));
+		return -EINVAL;
+	}
+
+	delta->io.alloc_head = pos >> (delta->plo->cluster_log + 9);
+	return 0;
+}
+
+static void
+raw_allocate(struct ploop_delta * delta, struct ploop_request * preq,
+		struct bio_list * sbl, unsigned int size)
+{
+	delta->io.ops->submit_alloc(&delta->io, preq, sbl, size);
+}
+
+int raw_map_index(struct ploop_delta * delta, unsigned long index, sector_t *sec)
+{
+	*sec = index;
+	return 1;
+}
+
+static void
+raw_read_index(struct ploop_delta * delta, struct ploop_request * preq,
+	       struct page * page, sector_t sec)
+{
+	int i;
+	u32 * ptr = page_address(page);
+	int skip = (sec == 0) ? PLOOP_MAP_OFFSET : 0;
+
+	for (i = skip; i < PAGE_SIZE/4; i++) {
+		if ((sec << delta->plo->cluster_log) >=
+		    (delta->io.alloc_head << delta->plo->cluster_log)) {
+			ptr[i] = 0;
+			sec++;
+		} else if (sec == 0) {
+			/* ptr[i]==0 would be interpreted as "iblock not alloced" */
+			ptr[i] = PLOOP_ZERO_INDEX;
+			sec++;
+		} else {
+			ptr[i] = sec++ << ploop_map_log(delta->plo);
+		}
+	}
+
+	ploop_complete_io_state(preq);
+}
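
In other words, the index pages are synthesized rather than read. For the
first map page (sec == 0) the loop starts at i = PLOOP_MAP_OFFSET, leaving the
header slots alone; assuming PLOOP_MAP_OFFSET == 16 and ploop_map_log() == 0
for simplicity (both assumptions for this example), ptr[16] becomes
PLOOP_ZERO_INDEX standing for iblock 0, ptr[17] becomes 1, and so on, while
any iblock at or beyond alloc_head reads back as 0, i.e. "not allocated".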
+
+static void
+raw_destroy(struct ploop_delta * delta)
+{
+	ploop_io_destroy(&delta->io);
+}
+
+static int
+raw_start(struct ploop_delta * delta)
+{
+	return 0;
+//	return delta->io.ops->start(&delta->io);
+}
+
+static int
+raw_prepare_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	return delta->io.ops->prepare_snapshot(&delta->io, sd);
+}
+
+static int
+raw_complete_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err = 0;
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		goto out;
+
+	err = -EIO;
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		goto out;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->complete_snapshot(&delta->io, sd);
+	if (err)
+		goto out;
+
+	delta->flags |= PLOOP_FMT_RDONLY;
+	return 0;
+
+out:
+	if (sd->file) {
+		fput(sd->file);
+		sd->file = NULL;
+	}
+	return err;
+}
+
+static int
+raw_prepare_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+
+	err = delta->io.ops->prepare_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	delta->flags &= ~PLOOP_FMT_RDONLY;
+	return 0;
+}
+
+static int
+raw_start_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+
+	err = delta->io.ops->start_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state)) {
+		printk(KERN_WARNING "raw_start_merge for ploop%d failed "
+		       "(state ABORT)\n", delta->plo->index);
+		return -EIO;
+	}
+
+	err = raw_refresh(delta);
+	if (err)
+		return err;
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+
+static int
+raw_prepare_grow(struct ploop_delta * delta, u64 *new_size, int *reloc)
+{
+	*new_size = (*new_size + (PAGE_SIZE >> 9) - 1) &
+		    ~((PAGE_SIZE >> 9) - 1);
+	return delta->io.ops->alloc(&delta->io,
+				    delta->plo->bd_size << 9,
+				    (*new_size - delta->plo->bd_size) << 9);
+}
+
+static struct ploop_delta_ops raw_delta_ops =
+{
+	.id		=	PLOOP_FMT_RAW,
+	.name		=	"raw",
+	.owner		=	THIS_MODULE,
+	.capability	=	PLOOP_FMT_CAP_WRITABLE|PLOOP_FMT_CAP_IDENTICAL,
+
+	.map_index	=	raw_map_index,
+	.read_index	=	raw_read_index,
+
+	.allocate	=	raw_allocate,
+
+	.compose	=	raw_compose,
+	.open		=	raw_open,
+	.destroy	=	raw_destroy,
+	.start		=	raw_start,
+	.stop		=	raw_stop,
+	.refresh	=	raw_refresh,
+	.prepare_snapshot =	raw_prepare_snapshot,
+	.complete_snapshot =	raw_complete_snapshot,
+	.prepare_merge	=	raw_prepare_merge,
+	.start_merge	=	raw_start_merge,
+	.prepare_grow	=	raw_prepare_grow,
+};
+
+static int __init pfmt_raw_mod_init(void)
+{
+	return ploop_register_format(&raw_delta_ops);
+}
+
+static void __exit pfmt_raw_mod_exit(void)
+{
+	ploop_unregister_format(&raw_delta_ops);
+}
+
+module_init(pfmt_raw_mod_init);
+module_exit(pfmt_raw_mod_exit);
+
+MODULE_LICENSE("GPL");
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/freeblks.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/freeblks.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/freeblks.c	2015-01-21 12:02:54.893916573 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/freeblks.c	2015-01-21 12:02:57.839838377 +0300
@@ -0,0 +1,1092 @@
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+
+#include <trace/events/block.h>
+
+#include <linux/ploop/ploop.h>
+#include "freeblks.h"
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+struct ploop_freeblks_extent
+{
+	struct list_head list; /* List link */
+
+	cluster_t clu;
+	iblock_t  iblk;
+	u32	  len;
+};
+
+struct ploop_relocblks_extent
+{
+	struct list_head list; /* List link */
+
+	cluster_t clu;
+	iblock_t  iblk;
+	u32	  len;
+	u32	  free;	/* this extent is also present in freemap */
+};
+
+struct ploop_fextent_ptr {
+	struct ploop_freeblks_extent *ext;
+	u32 off;
+};
+
+struct ploop_rextent_ptr {
+	struct ploop_relocblks_extent *ext;
+	u32 off;
+};
+
+struct ploop_freeblks_desc {
+	struct ploop_device *plo;
+
+	int fbd_n_free;	       /* # free blocks remaining
+				  (i.e. "not re-used") */
+
+	/* fbd_ffb.ext->clu + fbd_ffb.off can be used as
+	 * 'clu of first free block to reuse' for WRITE ops */
+	struct ploop_fextent_ptr fbd_ffb; /* 'ffb' stands for
+					     'first free block' */
+
+	/* fbd_lfb.ext->clu + fbd_lfb.off can be used as
+	 * 'clu of first block to overwrite' (draining reloc range from end) */
+	struct ploop_fextent_ptr fbd_lfb; /* 'lfb' stands for
+					     'last free block for relocation'*/
+
+	/* fbd_reloc_extents[fbd->fbd_last_reloc_extent].clu +
+	 * fbd_last_reloc_off can be used as 'clu of first block to relocate'
+	 * (draining reloc range from end)
+	 * NB: ffb and lfb above deal with free_list, while lrb deals with
+	 * reloc_list! */
+	struct ploop_rextent_ptr fbd_lrb; /* 'lrb' stands for
+					     'last block to relocate' */
+
+	/* counters to trace the progress of relocation */
+	int fbd_n_relocated;  /* # blocks actually relocated */
+	int fbd_n_relocating; /* # blocks whose relocation was at
+				   least started */
+
+	/* lost_range: [fbd_first_lost_iblk ..
+	 *		fbd_first_lost_iblk + fbd_lost_range_len - 1] */
+	iblock_t fbd_first_lost_iblk;
+	int	 fbd_lost_range_len;
+	int	 fbd_lost_range_addon; /* :)) */
+
+	/* any reloc request resides there while it's "in progress" */
+	struct rb_root		reloc_tree;
+
+	/* list of ploop_request-s for PLOOP_REQ_ZERO ops: firstly zero index
+	 * for PLOOP_REQ_ZERO req_cluster, then schedule ordinary request
+	 * pinned to given PLOOP_REQ_ZERO request */
+	struct list_head	free_zero_list;
+
+	/* storage for free-block extents: list for now */
+	struct list_head	fbd_free_list;
+
+	/* storage for reloc-block extents: list for now */
+	struct list_head	fbd_reloc_list;
+
+	int	 fbd_freezed_level; /* for sanity - level on
+				     * PLOOP_IOC_FREEBLKS stage */
+
+	struct bio_list	fbd_dbl; /* dbl stands for 'discard bio list' */
+};
+
+int ploop_fb_get_n_relocated(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_n_relocated;
+}
+int ploop_fb_get_n_relocating(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_n_relocating;
+}
+int ploop_fb_get_n_free(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_n_free;
+}
+iblock_t ploop_fb_get_alloc_head(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_first_lost_iblk + fbd->fbd_lost_range_len;
+}
+int ploop_fb_get_lost_range_len(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lost_range_len;
+}
+iblock_t ploop_fb_get_first_lost_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_first_lost_iblk;
+}
+
+int ploop_fb_get_freezed_level(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_freezed_level;
+}
+void ploop_fb_set_freezed_level(struct ploop_freeblks_desc *fbd, int level)
+{
+	fbd->fbd_freezed_level = level;
+}
+
+void ploop_fb_add_reloc_req(struct ploop_freeblks_desc *fbd,
+			    struct ploop_request *preq)
+{
+	struct rb_node ** p;
+	struct rb_node *parent = NULL;
+	struct ploop_request * pr;
+
+	if (fbd == NULL)
+		return;
+
+	p = &fbd->reloc_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		pr = rb_entry(parent, struct ploop_request, reloc_link);
+		BUG_ON (preq->src_iblock == pr->src_iblock);
+
+		if (preq->src_iblock < pr->src_iblock)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&preq->reloc_link, parent, p);
+	rb_insert_color(&preq->reloc_link, &fbd->reloc_tree);
+}
+
+void ploop_fb_del_reloc_req(struct ploop_freeblks_desc *fbd,
+			    struct ploop_request *preq)
+{
+	BUG_ON (fbd == NULL);
+
+	rb_erase(&preq->reloc_link, &fbd->reloc_tree);
+}
+
+int ploop_fb_check_reloc_req(struct ploop_freeblks_desc *fbd,
+			     struct ploop_request *preq,
+			     unsigned long pin_state)
+{
+	struct rb_node *n;
+	struct ploop_request * p;
+
+	BUG_ON (fbd == NULL);
+	BUG_ON (preq->iblock == 0);
+	BUG_ON (preq->iblock >= fbd->fbd_first_lost_iblk);
+
+	n = fbd->reloc_tree.rb_node;
+	if (n == NULL)
+		return 0;
+
+	while (n) {
+		p = rb_entry(n, struct ploop_request, reloc_link);
+
+		if (preq->iblock < p->src_iblock)
+			n = n->rb_left;
+		else if (preq->iblock > p->src_iblock)
+			n = n->rb_right;
+		else {
+			spin_lock_irq(&fbd->plo->lock);
+			preq->eng_state = pin_state;
+			list_add_tail(&preq->list, &p->delay_list);
+			spin_unlock_irq(&fbd->plo->lock);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+int ploop_fb_copy_freeblks_to_user(struct ploop_freeblks_desc *fbd, void *arg,
+				   struct ploop_freeblks_ctl *ctl)
+{
+	int   rc = 0;
+	int   n	 = 0;
+	struct ploop_freeblks_extent	 *fextent;
+	struct ploop_freeblks_ctl_extent  cext;
+
+	list_for_each_entry(fextent, &fbd->fbd_free_list, list)
+		if (ctl->n_extents) {
+			int off = offsetof(struct ploop_freeblks_ctl,
+					   extents[n]);
+			if (n++ >= ctl->n_extents) {
+				rc = -ENOSPC;
+				break;
+			}
+
+			cext.clu  = fextent->clu;
+			cext.iblk = fextent->iblk;
+			cext.len  = fextent->len;
+
+			rc = copy_to_user((u8*)arg + off, &cext, sizeof(cext));
+			if (rc)
+				break;
+		} else {
+			n++;
+		}
+
+	if (!rc) {
+		ctl->n_extents = n;
+		rc = copy_to_user((void*)arg, ctl, sizeof(*ctl));
+	}
+
+	return rc;
+}
+
+int ploop_fb_filter_freeblks(struct ploop_freeblks_desc *fbd, unsigned long minlen)
+{
+	struct ploop_freeblks_extent *fextent, *n;
+
+	list_for_each_entry_safe(fextent, n, &fbd->fbd_free_list, list)
+		if (fextent->len < minlen) {
+			list_del(&fextent->list);
+			fbd->fbd_n_free -= fextent->len;
+			kfree(fextent);
+		}
+
+	if (list_empty(&fbd->fbd_free_list))
+		fbd->fbd_ffb.ext = NULL;
+	else
+		fbd->fbd_ffb.ext = list_entry(fbd->fbd_free_list.next,
+						struct ploop_freeblks_extent,
+						list);
+	fbd->fbd_ffb.off = 0;
+
+	return fbd->fbd_n_free;
+}
+
+struct ploop_request *
+ploop_fb_get_zero_request(struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_request * preq;
+
+	BUG_ON (fbd == NULL);
+	BUG_ON (list_empty(&fbd->free_zero_list));
+
+	preq = list_entry(fbd->free_zero_list.next,
+			  struct ploop_request, list);
+	list_del(&preq->list);
+	return preq;
+}
+
+void ploop_fb_put_zero_request(struct ploop_freeblks_desc *fbd,
+			       struct ploop_request *preq)
+{
+	list_add(&preq->list, &fbd->free_zero_list);
+}
+
+static iblock_t ffb_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_ffb.ext->iblk + fbd->fbd_ffb.off;
+}
+static cluster_t ffb_clu(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_ffb.ext->clu + fbd->fbd_ffb.off;
+}
+static iblock_t lfb_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lfb.ext->iblk + fbd->fbd_lfb.off;
+}
+static cluster_t lfb_clu(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lfb.ext->clu + fbd->fbd_lfb.off;
+}
+static iblock_t lrb_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lrb.ext->iblk + fbd->fbd_lrb.off;
+}
+
+static iblock_t get_first_reloc_iblk(struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_relocblks_extent *r_extent;
+
+	BUG_ON (list_empty(&fbd->fbd_reloc_list));
+	r_extent = list_entry(fbd->fbd_reloc_list.next,
+			      struct ploop_relocblks_extent, list);
+	return r_extent->iblk;
+}
+
+static void advance_ffb_simple(struct ploop_freeblks_desc *fbd)
+{
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+
+	if (fbd->fbd_ffb.off < fbd->fbd_ffb.ext->len - 1) {
+		fbd->fbd_ffb.off++;
+	} else {
+		if (fbd->fbd_ffb.ext->list.next == &fbd->fbd_free_list)
+			fbd->fbd_ffb.ext = NULL;
+		else
+			fbd->fbd_ffb.ext = list_entry(fbd->fbd_ffb.ext->list.next,
+						      struct ploop_freeblks_extent,
+						      list);
+		fbd->fbd_ffb.off = 0;
+	}
+
+	if (fbd->fbd_ffb.ext != NULL &&
+	    ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk) {
+		/* invalidate ffb */
+		fbd->fbd_ffb.ext = NULL;
+		fbd->fbd_ffb.off = 0;
+	}
+}
+
+static void advance_lrb(struct ploop_freeblks_desc *fbd)
+{
+	iblock_t skip = 0;
+	BUG_ON (fbd->fbd_lrb.ext == NULL);
+
+	if (likely(fbd->fbd_lrb.off)) {
+		fbd->fbd_lrb.off--;
+	} else {
+		struct ploop_relocblks_extent *r_extent = fbd->fbd_lrb.ext;
+		/* here 'skip' means: [new_lrb_ext]<--skip-->[r_extent] */
+
+		if (fbd->fbd_lrb.ext->list.prev == &fbd->fbd_reloc_list) {
+			BUG_ON (fbd->fbd_lost_range_addon < 0);
+			skip = fbd->fbd_lost_range_addon;
+			fbd->fbd_lrb.ext = NULL;
+		} else {
+			fbd->fbd_lrb.ext = list_entry(fbd->fbd_lrb.ext->list.prev,
+						      struct ploop_relocblks_extent,
+						      list);
+			fbd->fbd_lrb.off = fbd->fbd_lrb.ext->len - 1;
+			BUG_ON (r_extent->iblk < fbd->fbd_lrb.ext->iblk +
+						 fbd->fbd_lrb.ext->len);
+			skip = r_extent->iblk - (fbd->fbd_lrb.ext->iblk +
+						 fbd->fbd_lrb.ext->len);
+		}
+	}
+
+	fbd->fbd_first_lost_iblk -= 1 + skip;
+	fbd->fbd_lost_range_len	 += 1 + skip;
+
+	if (fbd->fbd_ffb.ext != NULL &&
+	    ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk) {
+		/* invalidate ffb */
+		fbd->fbd_ffb.ext = NULL;
+		fbd->fbd_ffb.off = 0;
+	}
+
+	BUG_ON(fbd->fbd_n_free <= 0);
+	fbd->fbd_n_free--;
+}
+
+static int split_fb_extent(struct ploop_freeblks_extent *extent, u32 *off_p,
+			   struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_freeblks_extent *new_extent;
+
+	new_extent = kzalloc(sizeof(*new_extent), GFP_KERNEL);
+	if (new_extent == NULL) {
+		printk("Can't allocate new freeblks extent for splittig!\n");
+		return -ENOMEM;
+	}
+
+	new_extent->clu	 = extent->clu	+ *off_p + 1;
+	new_extent->iblk = extent->iblk + *off_p + 1;
+	new_extent->len	 = extent->len	- *off_p - 1;
+
+	extent->len  = *off_p;
+
+	list_add(&new_extent->list, &extent->list);
+
+	(*off_p)--;
+	return 0;
+}
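
A worked split (values illustrative): splitting {clu=100, iblk=200, len=10}
at *off_p == 4 shrinks the original to {clu=100, iblk=200, len=4}, inserts
{clu=105, iblk=205, len=5} right after it, and leaves clu=104/iblk=204, the
block being consumed, in neither extent; *off_p drops to 3, pointing at the
last block of the shrunken left extent.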
+
+static int advance_lfb_left(struct ploop_freeblks_desc *fbd)
+{
+	int rc = 0;
+	struct ploop_freeblks_extent *lfb_ext = fbd->fbd_lfb.ext;
+
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+	BUG_ON (lfb_ext == NULL);
+	BUG_ON (ffb_iblk(fbd) > lfb_iblk(fbd));
+
+	if (ffb_iblk(fbd) == lfb_iblk(fbd)) {
+		/* invalidate lfb */
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+		advance_ffb_simple(fbd);
+		return 0;
+	}
+
+	if (fbd->fbd_lfb.off) {
+		if (fbd->fbd_lfb.off == lfb_ext->len - 1) {
+			lfb_ext->len--;
+			fbd->fbd_lfb.off--;
+		} else {
+			rc = split_fb_extent(lfb_ext, &fbd->fbd_lfb.off, fbd);
+		}
+	} else {
+		BUG_ON (lfb_ext->list.prev == &fbd->fbd_free_list);
+		BUG_ON (lfb_ext == fbd->fbd_ffb.ext);
+
+		lfb_ext->clu++;
+		lfb_ext->iblk++;
+		lfb_ext->len--;
+
+		fbd->fbd_lfb.ext = list_entry(lfb_ext->list.prev,
+					      struct ploop_freeblks_extent,
+					      list);
+		fbd->fbd_lfb.off = fbd->fbd_lfb.ext->len - 1;
+
+		if (lfb_ext->len == 0) {
+			list_del(&lfb_ext->list);
+			kfree(lfb_ext);
+		}
+	}
+
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+	BUG_ON (fbd->fbd_lfb.ext == NULL);
+	BUG_ON (lfb_iblk(fbd) < ffb_iblk(fbd));
+	return rc;
+}
+
+int ploop_fb_get_reloc_block(struct ploop_freeblks_desc *fbd,
+			     cluster_t *from_clu_p, iblock_t *from_iblk_p,
+			     cluster_t *to_clu_p, iblock_t *to_iblk_p,
+			     u32 *free_p)
+{
+	cluster_t from_clu, to_clu;
+	iblock_t  from_iblk, to_iblk;
+	u32 free;
+	struct ploop_relocblks_extent *r_extent = fbd->fbd_lrb.ext;
+
+	if (!fbd)
+		return -1;
+
+	/* whole range is drained? */
+	if (r_extent == NULL)
+		return -1;
+
+	BUG_ON (fbd->fbd_lrb.off >= r_extent->len);
+
+	from_clu  = r_extent->clu  + fbd->fbd_lrb.off;
+	from_iblk = r_extent->iblk + fbd->fbd_lrb.off;
+	free	  = r_extent->free;
+
+	/* from_iblk is in the range to relocate, but it's marked as free.
+	 * This means that we only need to zero its index; no actual
+	 * relocation is needed. Such an operation doesn't consume the
+	 * free block that fbd_lfb refers to */
+	if (free) {
+		/* The block we're going to zero-index was already re-used? */
+		if (fbd->fbd_ffb.ext == NULL || ffb_iblk(fbd) > from_iblk)
+			return -1;
+
+		BUG_ON (fbd->fbd_ffb.off  >= fbd->fbd_ffb.ext->len);
+
+		to_iblk = ~0U;
+		to_clu	= ~0U;
+	} else {
+		/* run out of free blocks which can be used as destination
+		 * for relocation ? */
+		if (fbd->fbd_lfb.ext == NULL)
+			return -1;
+
+		BUG_ON (fbd->fbd_ffb.ext == NULL);
+		BUG_ON (fbd->fbd_ffb.off  >= fbd->fbd_ffb.ext->len);
+		BUG_ON (fbd->fbd_lfb.off  >= fbd->fbd_lfb.ext->len);
+		BUG_ON (ffb_iblk(fbd) > lfb_iblk(fbd));
+
+		to_clu	= lfb_clu(fbd);
+		to_iblk = lfb_iblk(fbd);
+
+		if (advance_lfb_left(fbd)) {
+			/* Error implies stopping relocation */
+			fbd->fbd_lrb.ext = NULL;
+			fbd->fbd_lrb.off = 0;
+			return -1;
+		}
+	}
+
+	/* consume one block from the end of reloc list */
+	advance_lrb(fbd);
+
+	fbd->fbd_n_relocating++;
+
+	*from_clu_p  = from_clu;
+	*from_iblk_p = from_iblk;
+	*to_clu_p    = to_clu;
+	*to_iblk_p   = to_iblk;
+	*free_p	     = free;
+	return 0;
+}
+
+void ploop_fb_relocate_req_completed(struct ploop_freeblks_desc *fbd)
+{
+	fbd->fbd_n_relocated++;
+}
+
+static void advance_lfb_right(struct ploop_freeblks_desc *fbd)
+{
+	iblock_t iblk = get_first_reloc_iblk(fbd);
+
+	if (fbd->fbd_lfb.off < fbd->fbd_lfb.ext->len - 1) {
+		if (fbd->fbd_lfb.ext->iblk + fbd->fbd_lfb.off + 1 < iblk) {
+			fbd->fbd_lfb.off++;
+		}
+	} else if (fbd->fbd_lfb.ext->list.next != &fbd->fbd_free_list) {
+		struct ploop_freeblks_extent *f_extent;
+		f_extent = list_entry(fbd->fbd_lfb.ext->list.next,
+				      struct ploop_freeblks_extent,
+				      list);
+		if (f_extent->iblk < iblk) {
+			fbd->fbd_lfb.ext = f_extent;
+			fbd->fbd_lfb.off = 0;
+		}
+	}
+
+	/* invalidating ffb always implies invalidating lfb */
+	BUG_ON (fbd->fbd_ffb.ext == NULL && fbd->fbd_lfb.ext != NULL);
+
+	/* caller has just advanced ffb, but we must keep lfb intact
+	 * if the next free block (the one following lfb) is in reloc-range */
+	if (fbd->fbd_ffb.ext != NULL && fbd->fbd_lfb.ext != NULL &&
+	    lfb_iblk(fbd) < ffb_iblk(fbd)) {
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+	}
+}
+
+static void trim_reloc_list_one_blk(struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_relocblks_extent *r_extent_first;
+	iblock_t iblk = lrb_iblk(fbd);
+	int invalidate = 0;
+
+	BUG_ON (list_empty(&fbd->fbd_reloc_list));
+	r_extent_first = list_entry(fbd->fbd_reloc_list.next,
+				    struct ploop_relocblks_extent, list);
+
+	if (r_extent_first->len > 1) {
+		fbd->fbd_lost_range_addon = 0;
+		r_extent_first->iblk++;
+		r_extent_first->clu++;
+		r_extent_first->len--;
+		if (iblk < r_extent_first->iblk) {
+			invalidate = 1;
+		} else if (r_extent_first == fbd->fbd_lrb.ext) {
+			BUG_ON (fbd->fbd_lrb.off == 0);
+			fbd->fbd_lrb.off--;
+		}
+	} else {
+		if (r_extent_first == fbd->fbd_lrb.ext) {
+			invalidate = 1;
+		} else {
+			struct ploop_relocblks_extent *r_extent;
+			BUG_ON (r_extent_first->list.next ==
+				&fbd->fbd_reloc_list);
+			r_extent = list_entry(r_extent_first->list.next,
+					      struct ploop_relocblks_extent,
+					      list);
+			fbd->fbd_lost_range_addon = r_extent->iblk -
+				(r_extent_first->iblk + r_extent_first->len);
+		}
+		list_del(&r_extent_first->list);
+		kfree(r_extent_first);
+	}
+
+	if (invalidate) {
+		/* invalidate both lfb and lrb */
+		fbd->fbd_lrb.ext = NULL;
+		fbd->fbd_lrb.off = 0;
+		if (fbd->fbd_lfb.ext != NULL) {
+			fbd->fbd_lfb.ext = NULL;
+			fbd->fbd_lfb.off = 0;
+		}
+	}
+}
+
+static void advance_ffb(struct ploop_freeblks_desc *fbd)
+{
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+	BUG_ON (fbd->fbd_lfb.ext != NULL && ffb_iblk(fbd) > lfb_iblk(fbd));
+
+	if (fbd->fbd_ffb.off < fbd->fbd_ffb.ext->len - 1) {
+		fbd->fbd_ffb.off++;
+	} else {
+		if (fbd->fbd_ffb.ext->list.next == &fbd->fbd_free_list) {
+			BUG_ON (fbd->fbd_lfb.ext != NULL &&
+				ffb_iblk(fbd) != lfb_iblk(fbd));
+			fbd->fbd_ffb.ext = NULL;
+		} else {
+			fbd->fbd_ffb.ext = list_entry(fbd->fbd_ffb.ext->list.next,
+						      struct ploop_freeblks_extent,
+						      list);
+		}
+		fbd->fbd_ffb.off = 0;
+	}
+
+	if (fbd->fbd_ffb.ext == NULL && fbd->fbd_lfb.ext != NULL) {
+		/* invalidate lfb */
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+		return;
+	}
+
+	if (fbd->fbd_ffb.ext != NULL &&
+	    ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk) {
+		/* invalidate both ffb and lfb */
+		fbd->fbd_ffb.ext = NULL;
+		fbd->fbd_ffb.off = 0;
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+	}
+
+	/* nothing to do anymore if relocation process is completed */
+	if (fbd->fbd_lrb.ext == NULL)
+		return;
+
+	trim_reloc_list_one_blk(fbd);
+
+	/* trim could invalidate both lrb and lfb */
+	if (fbd->fbd_lrb.ext == NULL || fbd->fbd_lfb.ext == NULL)
+		return;
+
+	advance_lfb_right(fbd);
+}
+
+int ploop_fb_get_free_block(struct ploop_freeblks_desc *fbd,
+			    cluster_t *clu, iblock_t *iblk)
+{
+	if (!fbd)
+		return -1;
+
+	if (fbd->fbd_ffb.ext == NULL) {
+		BUG_ON (fbd->fbd_lfb.ext != NULL);
+		BUG_ON (fbd->fbd_lost_range_len < 0);
+
+		if (fbd->fbd_lost_range_len == 0)
+			return -1;
+
+		*iblk = fbd->fbd_first_lost_iblk++;
+		fbd->fbd_lost_range_len--;
+
+		if (fbd->fbd_lrb.ext != NULL) {
+			/* stop relocation process */
+			fbd->fbd_lrb.ext = NULL;
+			fbd->fbd_lrb.off = 0;
+		}
+
+		return 0;
+	}
+
+	BUG_ON (ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk);
+	BUG_ON (fbd->fbd_n_free <= 0);
+
+	*clu = ffb_clu(fbd);
+	fbd->fbd_n_free--;
+
+	if (fbd->plo->maintenance_type == PLOOP_MNTN_RELOC)
+		advance_ffb(fbd);
+	else
+		advance_ffb_simple(fbd);
+
+	BUG_ON (fbd->fbd_ffb.ext == NULL && fbd->fbd_n_free != 0);
+	BUG_ON (fbd->fbd_ffb.ext != NULL && fbd->fbd_n_free == 0);
+
+	return 1;
+}
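
The tri-state return convention is easy to misread: -1 means nothing is
available, 0 means *iblk was carved from the lost range, and 1 means *clu
names a free block to reuse. A hedged caller sketch (names and the comments
on each branch are assumptions, not code from this patch):

	/* Illustrative only; locking and error handling elided. */
	cluster_t clu;
	iblock_t iblk;

	switch (ploop_fb_get_free_block(plo->fbd, &clu, &iblk)) {
	case -1:	/* no reusable block: allocate past alloc_head */
		break;
	case 0:		/* iblk is valid, taken from the lost range */
		break;
	case 1:		/* clu is valid: reuse the free block at that cluster */
		break;
	}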
+
+static void fbd_complete_bio(struct ploop_freeblks_desc *fbd, int err)
+{
+	struct ploop_device *plo = fbd->plo;
+	unsigned int nr_completed = 0;
+
+	while (fbd->fbd_dbl.head) {
+		struct bio * bio = fbd->fbd_dbl.head;
+		fbd->fbd_dbl.head = bio->bi_next;
+		bio->bi_next = NULL;
+		BIO_ENDIO(plo->queue, bio, err);
+		nr_completed++;
+	}
+	fbd->fbd_dbl.tail = NULL;
+
+	spin_lock_irq(&plo->lock);
+	plo->bio_total -= nr_completed;
+	if (!bio_list_empty(&plo->bio_discard_list) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+void ploop_fb_reinit(struct ploop_freeblks_desc *fbd, int err)
+{
+	fbd_complete_bio(fbd, err);
+
+	while (!list_empty(&fbd->fbd_free_list)) {
+		struct ploop_freeblks_extent *fblk_extent;
+
+		fblk_extent = list_first_entry(&fbd->fbd_free_list,
+					       struct ploop_freeblks_extent,
+					       list);
+		list_del(&fblk_extent->list);
+		kfree(fblk_extent);
+	}
+
+	while (!list_empty(&fbd->fbd_reloc_list)) {
+		struct ploop_relocblks_extent *rblk_extent;
+
+		rblk_extent = list_first_entry(&fbd->fbd_reloc_list,
+					       struct ploop_relocblks_extent,
+					       list);
+		list_del(&rblk_extent->list);
+		kfree(rblk_extent);
+	}
+
+	fbd->fbd_n_free = 0;
+	fbd->fbd_ffb.ext = NULL;
+	fbd->fbd_lfb.ext = NULL;
+	fbd->fbd_lrb.ext = NULL;
+	fbd->fbd_ffb.off = 0;
+	fbd->fbd_lfb.off = 0;
+	fbd->fbd_lrb.off = 0;
+	fbd->fbd_n_relocated = fbd->fbd_n_relocating = 0;
+	fbd->fbd_lost_range_len = 0;
+	fbd->fbd_lost_range_addon = 0;
+
+	BUG_ON(!RB_EMPTY_ROOT(&fbd->reloc_tree));
+}
+
+struct ploop_freeblks_desc *ploop_fb_init(struct ploop_device *plo)
+{
+	struct ploop_freeblks_desc *fbd;
+	int i;
+
+	fbd = kmalloc(sizeof(struct ploop_freeblks_desc), GFP_KERNEL);
+	if (fbd == NULL)
+		return NULL;
+
+	fbd->fbd_dbl.tail = fbd->fbd_dbl.head = NULL;
+	INIT_LIST_HEAD(&fbd->fbd_free_list);
+	INIT_LIST_HEAD(&fbd->fbd_reloc_list);
+	fbd->reloc_tree = RB_ROOT;
+	fbd->fbd_freezed_level = -1;
+
+	fbd->plo = plo;
+
+	ploop_fb_reinit(fbd, 0);
+
+	INIT_LIST_HEAD(&fbd->free_zero_list);
+	for (i = 0; i < plo->tune.max_requests; i++) {
+		struct ploop_request * preq;
+		preq = kzalloc(sizeof(struct ploop_request), GFP_KERNEL);
+		if (preq == NULL)
+			goto fb_init_failed;
+
+		preq->plo = plo;
+		INIT_LIST_HEAD(&preq->delay_list);
+		list_add(&preq->list, &fbd->free_zero_list);
+	}
+
+	return fbd;
+
+fb_init_failed:
+	ploop_fb_fini(fbd, -ENOMEM);
+	return NULL;
+}
+
+void ploop_fb_fini(struct ploop_freeblks_desc *fbd, int err)
+{
+	struct ploop_device *plo;
+
+	if (fbd == NULL)
+		return;
+
+	plo = fbd->plo;
+	BUG_ON (plo == NULL);
+
+	fbd_complete_bio(fbd, err);
+
+	while (!list_empty(&fbd->fbd_free_list)) {
+		struct ploop_freeblks_extent *fblk_extent;
+
+		fblk_extent = list_first_entry(&fbd->fbd_free_list,
+					       struct ploop_freeblks_extent,
+					       list);
+		list_del(&fblk_extent->list);
+		kfree(fblk_extent);
+	}
+
+	while (!list_empty(&fbd->fbd_reloc_list)) {
+		struct ploop_relocblks_extent *rblk_extent;
+
+		rblk_extent = list_first_entry(&fbd->fbd_reloc_list,
+					       struct ploop_relocblks_extent,
+					       list);
+		list_del(&rblk_extent->list);
+		kfree(rblk_extent);
+	}
+
+	while (!list_empty(&fbd->free_zero_list)) {
+		struct ploop_request * preq;
+
+		preq = list_first_entry(&fbd->free_zero_list,
+					struct ploop_request,
+					list);
+		list_del(&preq->list);
+		kfree(preq);
+	}
+
+	kfree(fbd);
+	plo->fbd = NULL;
+}
+
+int ploop_fb_add_free_extent(struct ploop_freeblks_desc *fbd,
+			     cluster_t clu, iblock_t iblk, u32 len)
+{
+	struct ploop_freeblks_extent *fblk_extent;
+	struct ploop_freeblks_extent *ex;
+
+	if (len == 0) {
+		printk("ploop_fb_add_free_extent(): empty extent! (%u/%u)\n",
+		       clu, iblk);
+		return 0;
+	}
+
+	list_for_each_entry_reverse(ex, &fbd->fbd_free_list, list)
+		if (ex->iblk < iblk)
+			break;
+
+	if (ex->list.next != &fbd->fbd_free_list) {
+		struct ploop_freeblks_extent *tmp;
+		tmp = list_entry(ex->list.next, struct ploop_freeblks_extent, list);
+
+		if (iblk + len > tmp->iblk) {
+			printk("ploop_fb_add_free_extent(): intersected extents");
+			return -EINVAL;
+		}
+	}
+
+	if (&ex->list != &fbd->fbd_free_list) {
+		if (ex->iblk + ex->len > iblk) {
+			printk("ploop_fb_add_free_extent(): intersected extents");
+			return -EINVAL;
+		}
+	}
+
+	fblk_extent = kzalloc(sizeof(*fblk_extent), GFP_KERNEL);
+	if (fblk_extent == NULL)
+		return -ENOMEM;
+
+	fblk_extent->clu  = clu;
+	fblk_extent->iblk = iblk;
+	fblk_extent->len  = len;
+
+	list_add(&fblk_extent->list, &ex->list);
+
+	fbd->fbd_n_free	 += len;
+
+	fbd->fbd_ffb.ext = list_entry(fbd->fbd_free_list.next, struct ploop_freeblks_extent, list);
+	fbd->fbd_ffb.off = 0;
+
+	return 0;
+}
+
+int ploop_fb_add_reloc_extent(struct ploop_freeblks_desc *fbd,
+			      cluster_t clu, iblock_t iblk, u32 len, u32 free)
+{
+	struct ploop_relocblks_extent *rblk_extent;
+
+	if (len == 0) {
+		printk("ploop_fb_add_reloc_extent(): empty extent! (%u/%u)\n",
+		       clu, iblk);
+		return 0;
+	}
+
+	if (!list_empty(&fbd->fbd_reloc_list)) {
+		rblk_extent = list_entry(fbd->fbd_reloc_list.prev,
+					 struct ploop_relocblks_extent, list);
+		if (rblk_extent->iblk + rblk_extent->len > iblk) {
+			printk("ploop_fb_add_reloc_extent(): extents should be sorted");
+			return -EINVAL;
+		}
+
+		if (rblk_extent->list.next != &fbd->fbd_reloc_list) {
+			rblk_extent = list_entry(rblk_extent->list.next,
+					 struct ploop_relocblks_extent, list);
+			if (iblk + len > rblk_extent->iblk) {
+				printk("ploop_fb_add_reloc_extent(): intersected extents");
+				return -EINVAL;
+			}
+		}
+	}
+
+	rblk_extent = kzalloc(sizeof(*rblk_extent), GFP_KERNEL);
+	if (rblk_extent == NULL)
+		return -ENOMEM;
+
+	rblk_extent->clu  = clu;
+	rblk_extent->iblk = iblk;
+	rblk_extent->len  = len;
+	rblk_extent->free = free;
+
+	list_add_tail(&rblk_extent->list, &fbd->fbd_reloc_list);
+
+	return 0;
+}
+
+void ploop_fb_lost_range_init(struct ploop_freeblks_desc *fbd,
+			      iblock_t first_lost_iblk)
+{
+	fbd->fbd_first_lost_iblk = first_lost_iblk;
+	fbd->fbd_lost_range_len = 0;
+}
+
+void ploop_fb_relocation_start(struct ploop_freeblks_desc *fbd,
+			       __u32 n_scanned)
+{
+	iblock_t a_h = fbd->fbd_first_lost_iblk;
+	iblock_t new_a_h; /* where a_h will be after relocation
+			     if no WRITEs intervene */
+	struct ploop_relocblks_extent *r_extent;
+	struct ploop_relocblks_extent *r_extent_first;
+	int n_free = fbd->fbd_n_free;
+	u32 l;
+	struct ploop_freeblks_extent *fextent;
+
+	BUG_ON(fbd->fbd_lost_range_len != 0);
+	if (list_empty(&fbd->fbd_reloc_list)) {
+		fbd->fbd_first_lost_iblk -= n_scanned;
+		fbd->fbd_lost_range_len	 += n_scanned;
+		return;
+	}
+
+	r_extent_first = list_entry(fbd->fbd_reloc_list.next,
+				    struct ploop_relocblks_extent, list);
+	r_extent = list_entry(fbd->fbd_reloc_list.prev,
+			      struct ploop_relocblks_extent, list);
+	new_a_h = r_extent->iblk + r_extent->len;
+
+	BUG_ON(fbd->fbd_first_lost_iblk < new_a_h);
+	fbd->fbd_lost_range_len = fbd->fbd_first_lost_iblk - new_a_h;
+	fbd->fbd_first_lost_iblk = new_a_h;
+
+	if (!n_free)
+		return;
+
+	while (1) {
+		l = MIN(n_free, r_extent->len);
+
+		n_free	-= l;
+		new_a_h -= l;
+
+		if (!n_free)
+			break;
+
+		if (r_extent->list.prev == &fbd->fbd_reloc_list) {
+			r_extent = NULL;
+			break;
+		} else {
+			r_extent = list_entry(r_extent->list.prev,
+					      struct ploop_relocblks_extent,
+					      list);
+		}
+		/* skip lost blocks */
+		new_a_h = r_extent->iblk + r_extent->len;
+	}
+
+	l = 0;
+
+	/* ploop-balloon scanned exactly range [a_h - n_scanned .. a_h - 1] */
+	if (n_free) {
+		l = r_extent_first->iblk - (a_h - n_scanned);
+	} else if (r_extent->iblk == new_a_h) {
+		if (r_extent == r_extent_first) {
+			l = r_extent->iblk - (a_h - n_scanned);
+		} else {
+			struct ploop_relocblks_extent *r_extent_prev;
+
+			BUG_ON (r_extent->list.prev == &fbd->fbd_reloc_list);
+			r_extent_prev = list_entry(r_extent->list.prev,
+						   struct ploop_relocblks_extent,
+						   list);
+			l = r_extent->iblk - (r_extent_prev->iblk +
+					      r_extent_prev->len);
+		}
+	}
+
+	new_a_h -= l;
+
+	/* let's trim reloc_list a bit based on new_a_h */
+	while (r_extent_first->iblk < new_a_h) {
+
+		if (r_extent_first->iblk + r_extent_first->len > new_a_h) {
+			l = new_a_h - r_extent_first->iblk;
+			r_extent_first->iblk += l;
+			r_extent_first->clu  += l;
+			r_extent_first->len  -= l;
+			break;
+		}
+
+		if (r_extent_first->list.next == &fbd->fbd_reloc_list) {
+			list_del(&r_extent_first->list);
+			kfree(r_extent_first);
+			break;
+		}
+
+		list_del(&r_extent_first->list);
+		kfree(r_extent_first);
+		r_extent_first = list_entry(fbd->fbd_reloc_list.next,
+					    struct ploop_relocblks_extent,
+					    list);
+	}
+
+	if (!list_empty(&fbd->fbd_reloc_list)) {
+		fbd->fbd_lrb.ext = list_entry(fbd->fbd_reloc_list.prev,
+					      struct ploop_relocblks_extent,
+					      list);
+		fbd->fbd_lrb.off = fbd->fbd_lrb.ext->len - 1;
+
+		fbd->fbd_lost_range_addon = r_extent_first->iblk - new_a_h;
+	}
+
+	/* new_a_h is calculated. Now let's find the "last free block" position */
+	if (ffb_iblk(fbd) < new_a_h) {
+		list_for_each_entry_reverse(fextent, &fbd->fbd_free_list, list)
+			if (fextent->iblk < new_a_h)
+				break;
+
+		BUG_ON(&fextent->list == &fbd->fbd_free_list);
+	} else
+		fextent = NULL;
+
+	fbd->fbd_lfb.ext = fextent; /* NULL means
+				       "no free blocks for relocation" */
+	if (fextent != NULL)
+		fbd->fbd_lfb.off = MIN(new_a_h - fextent->iblk,
+				       fextent->len) - 1;
+}
+
+int ploop_discard_add_bio(struct ploop_freeblks_desc *fbd, struct bio *bio)
+{
+	struct ploop_device *plo;
+
+	if (!fbd)
+		return -EOPNOTSUPP;
+
+	plo = fbd->plo;
+
+	if (!test_bit(PLOOP_S_DISCARD, &plo->state))
+		return -EOPNOTSUPP;
+	if (fbd->plo->maintenance_type != PLOOP_MNTN_DISCARD)
+		return -EBUSY;
+	/* only one request can be processed at a time */
+	if (fbd->fbd_dbl.head)
+		return -EBUSY;
+
+	fbd->fbd_dbl.head = fbd->fbd_dbl.tail = bio;
+
+	return 0;
+}
+
+int ploop_discard_is_inprogress(struct ploop_freeblks_desc *fbd)
+{
+	return fbd && fbd->fbd_dbl.head != NULL;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/freeblks.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/freeblks.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/freeblks.h	2015-01-21 12:02:54.893916573 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/freeblks.h	2015-01-21 12:02:55.540899398 +0300
@@ -0,0 +1,51 @@
+#ifndef __FREEBLKS_H__
+#define __FREEBLKS_H__
+
+/* freeblks API - in-kernel balloon support */
+
+/* init/fini stuff */
+struct ploop_freeblks_desc *ploop_fb_init(struct ploop_device *plo);
+void ploop_fb_fini(struct ploop_freeblks_desc *fbd, int err);
+void ploop_fb_reinit(struct ploop_freeblks_desc *fbd, int err);
+int ploop_fb_add_free_extent(struct ploop_freeblks_desc *fbd, cluster_t clu, iblock_t iblk, u32 len);
+int ploop_fb_add_reloc_extent(struct ploop_freeblks_desc *fbd, cluster_t clu, iblock_t iblk, u32 len, u32 free);
+void ploop_fb_lost_range_init(struct ploop_freeblks_desc *fbd, iblock_t first_lost_iblk);
+void ploop_fb_relocation_start(struct ploop_freeblks_desc *fbd, __u32 n_scanned);
+int ploop_discard_add_bio(struct ploop_freeblks_desc *fbd, struct bio *bio);
+int ploop_discard_is_inprogress(struct ploop_freeblks_desc *fbd);
+
+/* avoid direct access to freeblks internals */
+int ploop_fb_get_n_relocated(struct ploop_freeblks_desc *fbd);
+int ploop_fb_get_n_relocating(struct ploop_freeblks_desc *fbd);
+int ploop_fb_get_n_free(struct ploop_freeblks_desc *fbd);
+iblock_t ploop_fb_get_alloc_head(struct ploop_freeblks_desc *fbd);
+int ploop_fb_get_lost_range_len(struct ploop_freeblks_desc *fbd);
+iblock_t ploop_fb_get_first_lost_iblk(struct ploop_freeblks_desc *fbd);
+
+/* get/set freezed level (for sanity checks) */
+int ploop_fb_get_freezed_level(struct ploop_freeblks_desc *fbd);
+void ploop_fb_set_freezed_level(struct ploop_freeblks_desc *fbd, int level);
+
+/* maintain rb-tree of "in progress" relocation requests */
+void ploop_fb_add_reloc_req(struct ploop_freeblks_desc *fbd, struct ploop_request *preq);
+void ploop_fb_del_reloc_req(struct ploop_freeblks_desc *fbd, struct ploop_request *preq);
+int ploop_fb_check_reloc_req(struct ploop_freeblks_desc *fbd, struct ploop_request *preq, unsigned long pin_state);
+
+/* helper for ioctl(PLOOP_IOC_FBGET) */
+int ploop_fb_copy_freeblks_to_user(struct ploop_freeblks_desc *fbd, void *arg,
+				   struct ploop_freeblks_ctl *ctl);
+int ploop_fb_filter_freeblks(struct ploop_freeblks_desc *fbd, unsigned long minlen);
+
+/* get/put "zero index" request */
+struct ploop_request *ploop_fb_get_zero_request(struct ploop_freeblks_desc *fbd);
+void ploop_fb_put_zero_request(struct ploop_freeblks_desc *fbd, struct ploop_request *preq);
+
+/* get/put block to relocate */
+int ploop_fb_get_reloc_block(struct ploop_freeblks_desc *fbd, cluster_t *from_clu, iblock_t *from_iblk,
+			     cluster_t *to_clu, iblock_t *to_iblk, u32 *free);
+void ploop_fb_relocate_req_completed(struct ploop_freeblks_desc *fbd);
+
+/* get free block to reuse */
+int ploop_fb_get_free_block(struct ploop_freeblks_desc *fbd, cluster_t *clu, iblock_t *iblk);
+
+#endif
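
Taken together, the API implies a fixed call order during a balloon/relocation
cycle. The sketch below is an assumption distilled from the helpers above;
locking, the ioctl plumbing and all error paths are omitted.

	/* Hedged kernel-side sketch of one relocation cycle. */
	struct ploop_freeblks_desc *fbd;
	cluster_t from_clu, to_clu;
	iblock_t from_iblk, to_iblk;
	u32 free;

	fbd = ploop_fb_init(plo);
	ploop_fb_lost_range_init(fbd, alloc_head);
	/* feed extents, sorted by iblk: */
	ploop_fb_add_free_extent(fbd, clu, iblk, len);
	ploop_fb_add_reloc_extent(fbd, clu, iblk, len, 0);

	ploop_fb_relocation_start(fbd, n_scanned);
	while (!ploop_fb_get_reloc_block(fbd, &from_clu, &from_iblk,
					 &to_clu, &to_iblk, &free)) {
		/* move one block, then: */
		ploop_fb_relocate_req_completed(fbd);
	}

	ploop_fb_fini(fbd, 0);	/* or ploop_fb_reinit() to start over */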
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io.c	2015-01-21 12:02:54.710921431 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io.c	2015-01-21 12:02:55.738894145 +0300
@@ -0,0 +1,142 @@
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <linux/ploop/ploop.h>
+#include <linux/ploop/ploop_if.h>
+
+/* Generic IO routines. */
+
+static LIST_HEAD(ploop_ios);
+static DEFINE_MUTEX(ploop_ios_mutex);
+
+int ploop_register_io(struct ploop_io_ops * ops)
+{
+	mutex_lock(&ploop_ios_mutex);
+	list_add(&ops->list, &ploop_ios);
+	mutex_unlock(&ploop_ios_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ploop_register_io);
+
+void ploop_unregister_io(struct ploop_io_ops * ops)
+{
+	mutex_lock(&ploop_ios_mutex);
+	list_del(&ops->list);
+	mutex_unlock(&ploop_ios_mutex);
+}
+EXPORT_SYMBOL(ploop_unregister_io);
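
An io backend plugs in through this pair, mirroring the format registration at
the bottom of fmt_ploop1.c. A minimal module skeleton follows; only the
callbacks this file actually invokes are shown, and the PLOOP_IO_DIRECT id and
the my_* helpers are placeholders, not names from this patch.

	/* Sketch of an io backend module. */
	static struct ploop_io_ops my_io_ops = {
		.id	    = PLOOP_IO_DIRECT,	/* placeholder id */
		.owner	    = THIS_MODULE,
		.autodetect = my_autodetect,	/* 0 if io->files.file fits */
		.init	    = my_init,
		.open	    = my_open,
		.destroy    = my_destroy,
		.f_mode	    = my_f_mode,
	};

	static int __init my_io_mod_init(void)
	{
		return ploop_register_io(&my_io_ops);
	}

	static void __exit my_io_mod_exit(void)
	{
		ploop_unregister_io(&my_io_ops);
	}

	module_init(my_io_mod_init);
	module_exit(my_io_mod_exit);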
+
+static struct ploop_io_ops * ploop_io_get(struct ploop_io *io, unsigned int id)
+{
+	struct ploop_io_ops * ops;
+
+	mutex_lock(&ploop_ios_mutex);
+	list_for_each_entry(ops, &ploop_ios, list) {
+		if ((id == ops->id || id == PLOOP_IO_AUTO) &&
+		    !ops->autodetect(io) && try_module_get(ops->owner)) {
+			mutex_unlock(&ploop_ios_mutex);
+			return ops;
+		}
+	}
+	mutex_unlock(&ploop_ios_mutex);
+	return NULL;
+}
+
+void ploop_io_put(struct ploop_io_ops * ops)
+{
+	module_put(ops->owner);
+}
+
+int
+ploop_io_init(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc)
+{
+	int err;
+
+	if (nchunks != 1)
+		return -EINVAL;
+
+	if (pc[0].pctl_offset ||
+	    pc[0].pctl_start ||
+	    pc[0].pctl_len)
+		return -EINVAL;
+
+	memset(&delta->io, 0, sizeof(struct ploop_io));
+	delta->io.plo = delta->plo;
+	delta->io.n_chunks = 1;
+
+	err = -EBADF;
+	delta->io.files.file = fget(pc[0].pctl_fd);
+	if (!delta->io.files.file)
+		goto out_err;
+
+	err = -EOPNOTSUPP;
+	delta->io.ops = ploop_io_get(&delta->io, pc[0].pctl_type);
+	if (delta->io.ops == NULL)
+		goto out_err;
+
+	err = delta->io.ops->init(&delta->io);
+	if (err)
+		goto out_err;
+
+	return 0;
+
+out_err:
+	if (delta->io.files.file)
+		fput(delta->io.files.file);
+	delta->io.files.file = NULL;
+	if (delta->io.ops)
+		ploop_io_put(delta->io.ops);
+	delta->io.ops = NULL;
+	return err;
+}
+EXPORT_SYMBOL(ploop_io_init);
+
+int ploop_io_open(struct ploop_io * io)
+{
+	struct file * file;
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+
+	if ((file = io->files.file) == NULL)
+		return -EBADF;
+
+	if ((delta->flags & PLOOP_FMT_RDONLY) &&
+	    (io->ops->f_mode(io) & FMODE_WRITE))
+		return -EINVAL;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY) &&
+	    !(io->ops->f_mode(io) & FMODE_WRITE))
+		return -EINVAL;
+
+	return io->ops->open(io);
+}
+EXPORT_SYMBOL(ploop_io_open);
+
+void ploop_io_destroy(struct ploop_io * io)
+{
+	if (io->ops) {
+		io->ops->destroy(io);
+		ploop_io_put(io->ops);
+		io->ops = NULL;
+	}
+}
+EXPORT_SYMBOL(ploop_io_destroy);
+
+void ploop_io_report_fn(struct file * file, char * msg)
+{
+	char *fn = "?";
+	char *path;
+
+	path = (char *)__get_free_page(GFP_KERNEL);
+	if (path) {
+		fn = d_path(&file->f_path, path, PAGE_SIZE);
+		if (IS_ERR(fn))
+			fn = "?";
+	}
+
+	printk("%s: %s\n", msg, fn);
+
+	if (path)
+		free_page((unsigned long)path);
+}
+EXPORT_SYMBOL(ploop_io_report_fn);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_direct.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_direct.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_direct.c	2015-01-21 12:02:54.711921404 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_direct.c	2015-01-21 12:02:57.824838774 +0300
@@ -0,0 +1,1915 @@
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/mount.h>
+#include <linux/buffer_head.h>
+#include <linux/falloc.h>
+#include <linux/magic.h>
+
+#include <linux/ploop/ploop.h>
+#include <linux/ploop/ploop_if.h>
+#include <linux/ploop/compat.h>
+#include "ploop_events.h"
+#include "io_direct_map.h"
+
+#define CREATE_TRACE_POINTS
+#include "io_direct_events.h"
+
+/* from fs/ext4/ext4.h */
+#define EXT4_EXTENTS_FL			0x00080000
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define PLOOP_MAX_PREALLOC(plo) (128 * 1024 * 1024) /* 128MB */
+
+#define PLOOP_MAX_EXTENT_MAP (64 * 1024 * 1024)    /* 64MB */
+int max_extent_map_pages __read_mostly;
+int min_extent_map_entries __read_mostly;
+
+/* total sum of m->size for all ploop_mapping structs */
+atomic_long_t ploop_io_images_size = ATOMIC_LONG_INIT(0);
+
+/* Direct IO from/to file.
+ *
+ * Holes in image file are not allowed.
+ */
+
+static inline sector_t
+dio_isec_to_phys(struct extent_map * em, sector_t isec)
+{
+	return (isec - em->start) + em->block_start;
+}
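+
+/* Editorial note: an extent_map records that image sectors [start, end)
+ * live contiguously at physical sector block_start onward, so the
+ * translation above is a constant shift within the extent; e.g. an extent
+ * {start=100, end=200, block_start=5000} maps isec 150 to sector 5050.
+ */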
+
+DEFINE_BIO_CB(dio_endio_async)
+{
+	struct ploop_request * preq = bio->bi_private;
+
+	if (!err && !bio_flagged(bio, BIO_UPTODATE))
+		err = -EIO;
+	if (err)
+		ploop_set_error(preq, err);
+
+	ploop_complete_io_request(preq);
+
+	bio_put(bio);
+}
+END_BIO_CB(dio_endio_async)
+
+/* Cursor for walking the payload of a chain of bios. */
+struct bio_list_walk
+{
+	struct bio * cur;	/* current bio in the list */
+	int idx;		/* index into cur->bi_io_vec */
+	int bv_off;		/* byte offset within the current bio_vec */
+};
+
+static int cached_submit(struct ploop_io *io, iblock_t iblk,
+	      struct ploop_request * preq,
+	      struct bio_list * sbl, unsigned int size);
+
+static void
+dio_submit(struct ploop_io *io, struct ploop_request * preq,
+	   unsigned long rw,
+	   struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	struct bio_list bl;
+	struct bio * bio = NULL;
+	struct extent_map * em;
+	sector_t sec, nsec;
+	int err;
+	struct bio_list_walk bw;
+	int preflush;
+	int postfua = 0;
+	int write = !!(rw & (1<<BIO_RW));
+
+	trace_submit(preq);
+
+	preflush = !!(rw & BIO_FLUSH);
+	rw &= ~BIO_FLUSH;
+
+	/* In case of eng_state != COMPLETE, we'll do FUA in
+	 * ploop_index_update(). Otherwise, we should mark the
+	 * last bio as FUA here. */
+	if (rw & BIO_FUA) {
+		rw &= ~BIO_FUA;
+		if (preq->eng_state == PLOOP_E_COMPLETE)
+			postfua = 1;
+	}
+
+	bio_list_init(&bl);
+
+	if (iblk == PLOOP_ZERO_INDEX)
+		iblk = 0;
+
+	if ((rw & (1<<BIO_RW)) &&
+	    !(io->files.file->f_mode & FMODE_WRITE)) {
+		err = -EBADF;
+		goto out;
+	}
+
+	sec = sbl->head->bi_sector;
+	sec = ((sector_t)iblk << preq->plo->cluster_log) | (sec & ((1<<preq->plo->cluster_log) - 1));
+
+	em = extent_lookup_create(io, sec, size);
+	if (IS_ERR(em))
+		goto out_em_err;
+
+	if (write && em->block_start == BLOCK_UNINIT) {
+		sector_t end = (sector_t)(iblk + 1) << preq->plo->cluster_log;
+		sec = (sector_t)iblk << preq->plo->cluster_log;
+
+		if (em->start <= sec)
+			sec = em->end;
+		extent_put(em);
+
+		while (sec < end) {
+			em = extent_lookup_create(io, sec, end - sec);
+			if (IS_ERR(em))
+				goto out_em_err;
+			if (em->block_start != BLOCK_UNINIT)
+				goto write_unint_fail;
+
+			sec = em->end;
+			extent_put(em);
+		}
+
+		goto write_unint;
+	}
+
+	ploop_prepare_io_request(preq);
+	if (rw & (1<<BIO_RW))
+		ploop_prepare_tracker(preq, sec);
+
+	bw.cur = sbl->head;
+	bw.idx = 0;
+	bw.bv_off = 0;
+	BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+
+	bio = NULL;
+
+	while (size > 0) {
+		struct bio_vec * bv;
+		int copy;
+
+		bv = bw.cur->bi_io_vec + bw.idx;
+
+		if (bw.bv_off >= bv->bv_len) {
+			bw.idx++;
+			bv++;
+			bw.bv_off = 0;
+			if (bw.idx >= bw.cur->bi_vcnt) {
+				bw.cur = bw.cur->bi_next;
+				bw.idx = 0;
+				bv = bw.cur->bi_io_vec;
+			}
+			BUG_ON(bv->bv_len & 511);
+		}
+
+		if (sec >= em->end) {
+			extent_put(em);
+			em = extent_lookup_create(io, sec, size);
+			if (IS_ERR(em))
+				goto out_em_err;
+			if (write && em->block_start == BLOCK_UNINIT)
+				goto write_unint_fail;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (em->block_start != BLOCK_UNINIT &&
+		     (bio == NULL ||
+		     bio->bi_sector + (bio->bi_size>>9) != nsec)) {
+
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = bv->bv_len - bw.bv_off;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+
+		if (em->block_start == BLOCK_UNINIT) {
+			void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+			memset(kaddr + bv->bv_offset + bw.bv_off, 0, copy);
+			kunmap_atomic(kaddr, KM_USER0);
+		} else if (bio_add_page(bio, bv->bv_page, copy,
+				 bv->bv_offset + bw.bv_off) != copy) {
+			/* Oops, this chunk does not fit. Flush and start
+			 * a fresh bio.
+			 */
+			goto flush_bio;
+		}
+
+		bw.bv_off += copy;
+		size -= copy >> 9;
+		sec += copy >> 9;
+	}
+	extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		unsigned long rw2 = rw;
+
+		bl.head = b->bi_next;
+		atomic_inc(&preq->io_count);
+		b->bi_next = NULL;
+		b->bi_private = preq;
+		b->bi_end_io = dio_endio_async;
+
+		if (unlikely(preflush)) {
+			rw2 |= BIO_FLUSH;
+			preflush = 0;
+		}
+		if (unlikely(postfua && !bl.head))
+			rw2 |= BIO_FUA;
+
+		ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
+		submit_bio(rw2 & ~(bl.head ? (1 << BIO_RW_UNPLUG) : 0), b);
+	}
+
+	ploop_complete_io_request(preq);
+	return;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+write_unint:
+	spin_lock_irq(&preq->plo->lock);
+	ploop_add_lockout(preq, 0);
+	spin_unlock_irq(&preq->plo->lock);
+
+	err = cached_submit(io, iblk, preq, sbl, size);
+	goto out;
+
+write_unint_fail:
+	extent_put(em);
+	err = -EIO;
+	ploop_msg_once(io->plo, "A part of the cluster is in an uninitialized extent.");
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+
+	if (err)
+		ploop_fail_request(preq, err);
+}
+
+struct bio_iter {
+	struct bio     *bio;  /* traverses sbl */
+	struct bio_vec *bv;   /* traverses bio->bi_io_vec */
+	int             off;  /* offset in bv payload:
+			       * 0 <= off < bv->bv_len */
+};
+
+static inline void bio_iter_init(struct bio_iter *biter, struct bio_list *sbl)
+{
+	biter->bio  = sbl->head;
+	biter->bv   = biter->bio->bi_io_vec;
+	biter->off  = 0;
+}
+
+static inline void bio_iter_advance(struct bio_iter *biter, int len)
+{
+	if (biter->bv->bv_len - biter->off > len) {
+		biter->off += len;
+		return;
+	}
+
+	BUG_ON (biter->bv->bv_len - biter->off != len);
+
+	biter->bv++;
+	biter->off = 0;
+
+	if (biter->bv - biter->bio->bi_io_vec < biter->bio->bi_vcnt)
+		return;
+
+	biter->bio = biter->bio->bi_next;
+	if (biter->bio)
+		biter->bv = biter->bio->bi_io_vec;
+}
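+
+/* Illustrative usage of bio_iter (editorial note; "consume" is a
+ * hypothetical callback, and the sketch assumes lowmem pages so that
+ * page_address() is valid):
+ *
+ *	struct bio_iter it;
+ *
+ *	bio_iter_init(&it, sbl);
+ *	while (it.bio) {
+ *		int chunk = it.bv->bv_len - it.off;
+ *		consume(page_address(it.bv->bv_page) +
+ *			it.bv->bv_offset + it.off, chunk);
+ *		bio_iter_advance(&it, chunk);
+ *	}
+ */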
+
+static void bcopy_from_blist(struct page *page, int dst_off, /* dst */
+			     struct bio_iter *biter,         /* src */
+			     int copy_len)                   /* len */
+{
+	u8 *kdst = kmap_atomic(page, KM_USER0);
+
+	while (copy_len > 0) {
+		u8 *ksrc;
+		int copy = MIN(copy_len, biter->bv->bv_len - biter->off);
+
+		ksrc = kmap_atomic(biter->bv->bv_page, KM_USER1);
+		memcpy(kdst + dst_off,
+		       ksrc + biter->bv->bv_offset + biter->off,
+		       copy);
+		kunmap_atomic(ksrc, KM_USER1);
+
+		copy_len -= copy;
+		dst_off  += copy;
+		bio_iter_advance(biter, copy);
+		BUG_ON (copy_len && !biter->bio);
+	}
+
+	kunmap_atomic(kdst, KM_USER0);
+}
+
+static inline void bzero_page(struct page *page)
+{
+	void *kaddr = kmap_atomic(page, KM_USER0);
+
+	memset(kaddr, 0, PAGE_SIZE);
+
+	kunmap_atomic(kaddr, KM_USER0);
+}
+
+
+static int
+cached_submit(struct ploop_io *io, iblock_t iblk, struct ploop_request * preq,
+	      struct bio_list * sbl, unsigned int size)
+{
+	struct ploop_device * plo = preq->plo;
+	int err = 0;
+	loff_t pos, end_pos, start, end;
+	loff_t clu_siz = 1 << (plo->cluster_log + 9);
+	struct bio_iter biter;
+	loff_t new_size;
+
+	trace_cached_submit(preq);
+
+	pos = (loff_t)iblk << (plo->cluster_log + 9);
+	end_pos = pos + clu_siz;
+	sb_start_write(io->files.inode->i_sb);
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,24)
+	if (end_pos > i_size_read(io->files.inode) &&
+	    io->files.inode->i_op->fallocate &&
+	    io->files.flags & EXT4_EXTENTS_FL) {
+		if (unlikely(io->prealloced_size < clu_siz)) {
+			loff_t prealloc = end_pos;
+			if (prealloc > PLOOP_MAX_PREALLOC(plo))
+				prealloc = PLOOP_MAX_PREALLOC(plo);
+try_again:
+			err = io->files.inode->i_op->fallocate(io->files.inode, 0,
+							       pos, prealloc);
+			if (err) {
+				if (err == -ENOSPC && prealloc != clu_siz) {
+					prealloc = clu_siz;
+					goto try_again;
+				} else {
+					sb_end_write(io->files.inode->i_sb);
+					return err;
+				}
+			}
+
+			io->prealloced_size = prealloc;
+		}
+
+		io->prealloced_size -= clu_siz;
+	}
+#endif
+
+	bio_iter_init(&biter, sbl);
+	mutex_lock(&io->files.inode->i_mutex);
+
+	start = pos + ((sbl->head->bi_sector & ((1<<plo->cluster_log)-1)) << 9);
+	end = start + (size << 9);
+	ploop_prepare_tracker(preq, start>>9);
+
+	while (pos < end_pos) {
+		struct page * page;
+		void * fsdata;
+
+		err = pagecache_write_begin(io->files.file, io->files.mapping,
+					    pos, PAGE_CACHE_SIZE, 0,
+					    &page, &fsdata);
+		if (err)
+			break;
+
+		if (pos < start || pos + PAGE_CACHE_SIZE > end)
+			bzero_page(page);
+
+		if (pos < end && pos + PAGE_CACHE_SIZE > start) {
+			int dst_off = 0;
+			int copy_len = PAGE_CACHE_SIZE;
+
+			if (pos < start) {
+				dst_off = start - pos;
+				copy_len -= dst_off;
+				if (pos + PAGE_CACHE_SIZE > end)
+					copy_len = end - start;
+			} else {
+				if (pos + PAGE_CACHE_SIZE > end)
+					copy_len = end - pos;
+			}
+
+			bcopy_from_blist(page, dst_off, &biter, copy_len);
+		}
+
+		err = pagecache_write_end(io->files.file, io->files.mapping,
+					  pos, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE,
+					  page, &fsdata);
+		if (err != PAGE_CACHE_SIZE) {
+			if (err >= 0)
+				err = -EIO;
+			break;
+		}
+		err = 0;
+
+		pos += PAGE_CACHE_SIZE;
+	}
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	new_size = i_size_read(io->files.inode);
+	atomic_long_add(new_size - *io->size_ptr, &ploop_io_images_size);
+	*io->size_ptr = new_size;
+
+	if (!err)
+		err = filemap_fdatawrite(io->files.mapping);
+
+	if (!err) {
+		spin_lock_irq(&plo->lock);
+		ploop_acc_flush_skip_locked(plo, preq->req_rw);
+		preq->iblock = iblk;
+		list_add_tail(&preq->list, &io->fsync_queue);
+		plo->st.bio_syncwait++;
+		if ((test_bit(PLOOP_REQ_SYNC, &preq->state) ||
+		     ++io->fsync_qlen >= plo->tune.fsync_max) &&
+		    waitqueue_active(&io->fsync_waitq))
+			wake_up_interruptible(&io->fsync_waitq);
+		else if (!timer_pending(&io->fsync_timer))
+			mod_timer(&io->fsync_timer, jiffies + plo->tune.fsync_delay);
+		spin_unlock_irq(&plo->lock);
+	}
+	sb_end_write(io->files.inode->i_sb);
+	return err;
+}
+
+/* Submit the whole cluster. If preq contains only partial data
+ * within the cluster, pad the rest of the cluster with zeros.
+ */
+static void
+dio_submit_pad(struct ploop_io *io, struct ploop_request * preq,
+	       struct bio_list * sbl, unsigned int size,
+	       struct extent_map *em)
+{
+	struct bio_list bl;
+	struct bio * bio = NULL;
+	sector_t sec, end_sec, nsec, start, end;
+	struct bio_list_walk bw;
+	int err;
+	int preflush = !!(preq->req_rw & BIO_FLUSH);
+
+	bio_list_init(&bl);
+
+	/* sec..end_sec is the range which we are going to write */
+	sec = (sector_t)preq->iblock << preq->plo->cluster_log;
+	end_sec = sec + (1 << preq->plo->cluster_log);
+
+	/* start..end is the data that we have. The rest must be zero-padded. */
+	start = sec + (sbl->head->bi_sector & ((1<<preq->plo->cluster_log) - 1));
+	end = start + size;
+
+	if (IS_ERR(em))
+		goto out_em_err;
+
+#if 1
+	/* GCC, shut up! */
+	bw.cur = sbl->head;
+	bw.idx = 0;
+	bw.bv_off = 0;
+	BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+#endif
+
+	ploop_prepare_io_request(preq);
+	ploop_prepare_tracker(preq, start);
+
+	bio = NULL;
+
+	while (sec < end_sec) {
+		struct page * page;
+		unsigned int poff, plen;
+
+		if (sec < start) {
+			page = ZERO_PAGE(0);
+			poff = 0;
+			plen = start - sec;
+			if (plen > (PAGE_SIZE>>9))
+				plen = (PAGE_SIZE>>9);
+		} else if (sec >= end) {
+			page = ZERO_PAGE(0);
+			poff = 0;
+			plen = end_sec - sec;
+			if (plen > (PAGE_SIZE>>9))
+				plen = (PAGE_SIZE>>9);
+		} else {
+			/* sec >= start && sec < end */
+			struct bio_vec * bv;
+
+			if (sec == start) {
+				bw.cur = sbl->head;
+				bw.idx = 0;
+				bw.bv_off = 0;
+				BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+			}
+			bv = bw.cur->bi_io_vec + bw.idx;
+
+			if (bw.bv_off >= bv->bv_len) {
+				bw.idx++;
+				bv++;
+				bw.bv_off = 0;
+				if (bw.idx >= bw.cur->bi_vcnt) {
+					bw.cur = bw.cur->bi_next;
+					bw.idx = 0;
+					bw.bv_off = 0;
+					bv = bw.cur->bi_io_vec;
+				}
+				BUG_ON(bv->bv_len & 511);
+			}
+
+			page = bv->bv_page;
+			poff = bv->bv_offset + bw.bv_off;
+			plen = (bv->bv_len - bw.bv_off) >> 9;
+		}
+
+		if (sec >= em->end) {
+			extent_put(em);
+			em = extent_lookup_create(io, sec, end_sec - sec);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		if (plen > em->end - sec)
+			plen = em->end - sec;
+
+		if (bio_add_page(bio, page, plen<<9, poff) != (plen<<9)) {
+			/* Oops, this chunk does not fit. Flush and start
+			 * a new bio.
+			 */
+			goto flush_bio;
+		}
+
+		bw.bv_off += (plen<<9);
+		BUG_ON(plen == 0);
+		sec += plen;
+	}
+	extent_put(em);
+
+	while (bl.head) {
+		unsigned long rw;
+		struct bio * b = bl.head;
+
+		bl.head = b->bi_next;
+		atomic_inc(&preq->io_count);
+		b->bi_next = NULL;
+		b->bi_private = preq;
+		b->bi_end_io = dio_endio_async;
+
+		rw = sbl->head->bi_rw | WRITE;
+		if (unlikely(preflush)) {
+			rw |= BIO_FLUSH;
+			preflush = 0;
+		}
+		ploop_acc_ff_out(preq->plo, rw | b->bi_rw);
+		submit_bio(rw & ~(bl.head ? (1 << BIO_RW_UNPLUG) : 0), b);
+	}
+
+	ploop_complete_io_request(preq);
+	return;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	ploop_fail_request(preq, err);
+}
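+
+/* Worked example for dio_submit_pad (editorial note): with cluster_log = 11
+ * (2048-sector, i.e. 1MB, clusters), a preq for iblock 3 whose data starts
+ * at in-cluster sector 100 with size 16 gives sec = 3 << 11 = 6144,
+ * end_sec = 8192, start = 6244 and end = 6260; sectors 6144..6243 and
+ * 6260..8191 are then filled from ZERO_PAGE(0).
+ */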
+
+static struct extent_map * dio_fallocate(struct ploop_io *io, u32 iblk, int nr)
+{
+	struct extent_map * em;
+	mutex_lock(&io->files.inode->i_mutex);
+	em = map_extent_get_block(io,
+				  io->files.mapping,
+				  (sector_t)iblk << io->plo->cluster_log,
+				  1 << io->plo->cluster_log,
+				  1, mapping_gfp_mask(io->files.mapping),
+				  NULL);
+	mutex_unlock(&io->files.inode->i_mutex);
+	return em;
+}
+
+
+static void
+dio_submit_alloc(struct ploop_io *io, struct ploop_request * preq,
+		 struct bio_list * sbl, unsigned int size)
+{
+	int err;
+	iblock_t iblk = io->alloc_head++;
+
+	trace_submit_alloc(preq);
+
+	if (!(io->files.file->f_mode & FMODE_WRITE)) {
+		ploop_fail_request(preq, -EBADF);
+		return;
+	}
+
+	/* io->fallocate is not a "posix" fallocate()!
+	 *
+	 * We require the backing fs to give us _uninitialized_ blocks,
+	 * otherwise it does not make sense to go this way.
+	 *
+	 * IMPORTANT: the file _grows_, and dio_submit_alloc() cannot
+	 * complete requests until i_size is committed to disk.
+	 * Read this as: there is no hope of doing this optimally;
+	 * Linux updates i_size synchronously even when O_DIRECT AIO
+	 * is requested. Even in PCSS we have to update i_size synchronously.
+	 * Obviously, we will expand the file by larger pieces
+	 * and take some measures to avoid initialization of the blocks
+	 * while also avoiding leakage of uninitialized data
+	 * to users of our device.
+	 */
+	if (io->files.em_tree->_get_extent) {
+		struct extent_map * em;
+
+		em = dio_fallocate(io, iblk, 1);
+		if (unlikely(IS_ERR(em))) {
+			ploop_fail_request(preq, PTR_ERR(em));
+			return;
+		}
+
+		preq->iblock = iblk;
+		preq->eng_state = PLOOP_E_DATA_WBI;
+
+		dio_submit_pad(io, preq, sbl, size, em);
+		return;
+	}
+
+	err = cached_submit(io, iblk, preq, sbl, size);
+	if (err) {
+		if (err == -ENOSPC)
+			io->alloc_head--;
+		ploop_fail_request(preq, err);
+	}
+	preq->eng_state = PLOOP_E_DATA_WBI;
+}
+
+/* When the backing fs does not export any method to allocate new blocks
+ * without initialization, we fall back to a cached write with a subsequent
+ * fsync. Obviously, this is going to be utterly inefficient.
+ *
+ * Here is a workaround. We start writeback, but instead of calling fsync()
+ * immediately, we start a timer, which wakes up the ploop_sync thread.
+ *
+ * Requests are queued to ploop_sync, and when the timer expires or we
+ * have a lot of requests scheduled for sync, the thread calls the
+ * real fsync.
+ *
+ * Still not sure this is an improvement. :-)
+ */
+
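+/* Editorial note: cached_submit() above feeds this thread. Requests are
+ * appended to io->fsync_queue, and the thread is woken either immediately
+ * (for a PLOOP_REQ_SYNC request, or when fsync_qlen reaches
+ * plo->tune.fsync_max) or by io->fsync_timer after plo->tune.fsync_delay
+ * jiffies, so fsyncs are batched across many requests.
+ */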
+static int dio_fsync_thread(void * data)
+{
+	struct ploop_io * io = data;
+	struct ploop_device * plo = io->plo;
+
+	set_user_nice(current, -20);
+
+	spin_lock_irq(&plo->lock);
+	while (!kthread_should_stop() || !list_empty(&io->fsync_queue)) {
+		int err;
+		LIST_HEAD(list);
+
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&io->fsync_waitq, &_wait, TASK_INTERRUPTIBLE);
+			if (!list_empty(&io->fsync_queue) ||
+			    kthread_should_stop())
+				break;
+
+			spin_unlock_irq(&plo->lock);
+			schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&io->fsync_waitq, &_wait);
+
+		if (list_empty(&io->fsync_queue) && kthread_should_stop())
+			break;
+
+		INIT_LIST_HEAD(&list);
+		list_splice_init(&io->fsync_queue, &list);
+		spin_unlock_irq(&plo->lock);
+
+		/* filemap_fdatawrite() has already been done */
+		filemap_fdatawait(io->files.mapping);
+
+		err = 0;
+		mutex_lock(&io->files.inode->i_mutex);
+		if (io->files.file->f_op->fsync)
+			err = io->files.file->f_op->FOP_FSYNC(io->files.file,
+							      0);
+		mutex_unlock(&io->files.inode->i_mutex);
+
+		/* Do we need to invalidate page cache? Not really,
+		 * because we use it only to create full new pages,
+		 * which we overwrite completely. Probably, we should
+		 * invalidate in a non-blocking way to reclaim memory
+		 * faster than it happens with normal LRU logic.
+		 */
+
+		spin_lock_irq(&plo->lock);
+
+		while (!list_empty(&list)) {
+			struct ploop_request * preq;
+			preq = list_entry(list.next, struct ploop_request, list);
+			list_del(&preq->list);
+			if (err)
+				ploop_set_error(preq, err);
+			list_add_tail(&preq->list, &plo->ready_queue);
+			io->fsync_qlen--;
+		}
+		plo->st.bio_fsync++;
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+	spin_unlock_irq(&plo->lock);
+	return 0;
+}
+
+static int dio_fsync(struct file * file)
+{
+	int err, ret;
+	struct address_space *mapping = file->f_mapping;
+
+	ret = filemap_write_and_wait(mapping);
+	mutex_lock(&mapping->host->i_mutex);
+	err = 0;
+	if (file->f_op && file->f_op->fsync) {
+		err = file->f_op->FOP_FSYNC(file, 0);
+		if (!ret)
+			ret = err;
+	}
+	mutex_unlock(&mapping->host->i_mutex);
+	return ret;
+}
+
+/* Invalidate the page cache. It is called with the inode mutex taken,
+ * and the mapping must already be synced. If some dirty pages remain,
+ * it will fail.
+ *
+ * The retry with fs freeze is required to work around a race (bug?)
+ * in ext3, where some blocks can be held by an uncommitted transaction.
+ * The procedure is dangerous: no mutexes should be held, and ploop
+ * must not be quiesced.
+ */
+
+static int dio_invalidate_cache(struct address_space * mapping,
+				struct block_device * bdev)
+{
+	int err;
+	int attempt2 = 0;
+
+retry:
+	err = invalidate_inode_pages2(mapping);
+	if (err) {
+		printk("PLOOP: failed to invalidate page cache %d/%d\n", err, attempt2);
+		if (attempt2)
+			return err;
+		attempt2 = 1;
+
+		mutex_unlock(&mapping->host->i_mutex);
+		thaw_bdev(bdev, freeze_bdev(bdev));
+		mutex_lock(&mapping->host->i_mutex);
+		goto retry;
+	}
+	return err;
+}
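+
+/* Editorial note: the thaw_bdev(bdev, freeze_bdev(bdev)) idiom above forces
+ * the filesystem to sync and commit its running transaction (freeze blocks
+ * writers and flushes; thaw immediately resumes them), which releases the
+ * buffers that pinned the pages invalidate_inode_pages2() could not drop
+ * on the first pass.
+ */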
+
+static int dio_truncate(struct ploop_io *, struct file *, __u32);
+
+static int dio_release_prealloced(struct ploop_io * io)
+{
+	int ret;
+
+	if (!io->prealloced_size)
+		return 0;
+
+	ret = dio_truncate(io, io->files.file, io->alloc_head);
+	if (ret)
+		printk("Can't release %llu prealloced bytes: "
+		       "truncate to %llu failed (%d)\n",
+		       io->prealloced_size,
+		       (loff_t)io->alloc_head << (io->plo->cluster_log + 9),
+		       ret);
+	else
+		io->prealloced_size = 0;
+
+	return ret;
+}
+
+static void dio_destroy(struct ploop_io * io)
+{
+	if (io->files.file) {
+		struct file * file;
+		struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+
+		(void)dio_release_prealloced(io);
+
+		if (io->files.em_tree) {
+			io->files.em_tree = NULL;
+			mutex_lock(&io->files.inode->i_mutex);
+			ploop_dio_close(io, delta->flags & PLOOP_FMT_RDONLY);
+			(void)dio_invalidate_cache(io->files.mapping, io->files.bdev);
+			mutex_unlock(&io->files.inode->i_mutex);
+		}
+
+		del_timer_sync(&io->fsync_timer);
+
+		if (io->fsync_thread) {
+			kthread_stop(io->fsync_thread);
+			io->fsync_thread = NULL;
+		}
+
+		file = io->files.file;
+		mutex_lock(&delta->plo->sysfs_mutex);
+		io->files.file = NULL;
+		mutex_unlock(&delta->plo->sysfs_mutex);
+		if (!(delta->flags & PLOOP_FMT_RDONLY))
+			file_update_time(file);
+		fput(file);
+	}
+}
+
+static int dio_sync(struct ploop_io * io)
+{
+	struct file * file = io->files.file;
+
+	if (file)
+		dio_fsync(file);
+	return 0;
+}
+
+static int dio_stop(struct ploop_io * io)
+{
+	struct file * file = io->files.file;
+
+	if (file) {
+		dio_fsync(file);
+	}
+	return 0;
+}
+
+static int dio_open(struct ploop_io * io)
+{
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+	int err = 0;
+	struct file * file = io->files.file;
+	struct extent_map_tree * em_tree;
+
+	if (file == NULL)
+		return -EBADF;
+
+	io->files.mapping = file->f_mapping;
+	io->files.inode = io->files.mapping->host;
+	io->files.bdev = io->files.inode->i_sb->s_bdev;
+
+	dio_fsync(file);
+
+	mutex_lock(&io->files.inode->i_mutex);
+	em_tree = ploop_dio_open(io, (delta->flags & PLOOP_FMT_RDONLY));
+	err = PTR_ERR(em_tree);
+	if (IS_ERR(em_tree))
+		goto out;
+
+	io->files.em_tree = em_tree;
+
+	err = dio_invalidate_cache(io->files.mapping, io->files.bdev);
+	if (err) {
+		io->files.em_tree = NULL;
+		ploop_dio_close(io, 0);
+		goto out;
+	}
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY) && !io->files.em_tree->_get_extent) {
+		io->fsync_thread = kthread_create(dio_fsync_thread,
+						  io, "ploop_fsync%d",
+						  delta->plo->index);
+		/* kthread_create() returns ERR_PTR(), never NULL */
+		if (IS_ERR(io->fsync_thread)) {
+			err = PTR_ERR(io->fsync_thread);
+			io->fsync_thread = NULL;
+			io->files.em_tree = NULL;
+			ploop_dio_close(io, 0);
+			goto out;
+		}
+		wake_up_process(io->fsync_thread);
+	}
+
+out:
+	mutex_unlock(&io->files.inode->i_mutex);
+	return err;
+}
+
+void fsync_timeout(unsigned long data)
+{
+	struct ploop_io * io = (void*)data;
+
+	wake_up_interruptible(&io->fsync_waitq);
+}
+
+static int
+dio_init(struct ploop_io * io)
+{
+	INIT_LIST_HEAD(&io->fsync_queue);
+	init_waitqueue_head(&io->fsync_waitq);
+	init_timer(&io->fsync_timer);
+	io->fsync_timer.function = fsync_timeout;
+	io->fsync_timer.data = (unsigned long)io;
+
+	return 0;
+}
+
+struct dio_comp
+{
+	struct completion comp;
+	atomic_t count;
+	int error;
+};
+
+DEFINE_BIO_CB(dio_endio_sync)
+{
+	struct dio_comp * comp = bio->bi_private;
+
+	if (!err && !bio_flagged(bio, BIO_UPTODATE))
+		err = -EIO;
+	if (err && !comp->error)
+		comp->error = err;
+
+	if (atomic_dec_and_test(&comp->count))
+		complete(&comp->comp);
+
+	bio_put(bio);
+}
+END_BIO_CB(dio_endio_sync)
+
+static int
+dio_sync_io(struct ploop_io * io, int rw, struct page * page,
+	    unsigned int len, unsigned int off, sector_t sec)
+{
+	struct bio_list bl;
+	struct bio * bio;
+	struct dio_comp comp;
+	struct extent_map * em;
+	sector_t nsec;
+	int err;
+
+	BUG_ON(len & 511);
+	BUG_ON(off & 511);
+
+	bio_list_init(&bl);
+	bio = NULL;
+	em = NULL;
+
+	init_completion(&comp.comp);
+	atomic_set(&comp.count, 1);
+	comp.error = 0;
+
+	while (len > 0) {
+		int copy;
+
+		if (!em || sec >= em->end) {
+			if (em)
+				extent_put(em);
+			em = extent_lookup_create(io, sec, len>>9);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = len;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+		if (bio_add_page(bio, page, copy, off) != copy) {
+			/* Oops. */
+			goto flush_bio;
+		}
+
+		off += copy;
+		len -= copy;
+		sec += copy >> 9;
+	}
+
+	if (em)
+		extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+
+		b->bi_next = NULL;
+		b->bi_end_io = dio_endio_sync;
+		b->bi_private = &comp;
+		atomic_inc(&comp.count);
+		submit_bio(rw & ~(bl.head ? (1<<BIO_RW_UNPLUG) : 0), b);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	return comp.error;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	return err;
+}
+
+static int
+dio_sync_read(struct ploop_io * io, struct page * page, unsigned int len,
+	      unsigned int off, sector_t pos)
+{
+	return dio_sync_io(io, READ_SYNC, page, len, off, pos);
+}
+
+static int
+dio_sync_write(struct ploop_io * io, struct page * page, unsigned int len,
+	       unsigned int off, sector_t sec)
+{
+	int err;
+
+	if (!(io->files.file->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	err = dio_sync_io(io, WRITE_SYNC, page, len, off, sec);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return err;
+}
+
+static int
+dio_sync_iovec(struct ploop_io * io, int rw, struct page ** pvec,
+	       unsigned int nr, sector_t sec)
+{
+	struct bio_list bl;
+	struct bio * bio;
+	struct dio_comp comp;
+	unsigned int len = PAGE_SIZE * nr;
+	unsigned int off;
+	struct extent_map * em;
+	int err;
+	sector_t nsec;
+
+	bio_list_init(&bl);
+	bio = NULL;
+	em = NULL;
+	off = 0;
+
+	init_completion(&comp.comp);
+	atomic_set(&comp.count, 1);
+	comp.error = 0;
+
+	while (len > 0) {
+		int copy;
+
+		if (!em || sec >= em->end) {
+			if (em)
+				extent_put(em);
+			em = extent_lookup_create(io, sec, len>>9);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = len;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+		if (off/PAGE_SIZE != (off + copy + 1)/PAGE_SIZE)
+			copy = PAGE_SIZE - (off & (PAGE_SIZE-1));
+		if (bio_add_page(bio, pvec[off/PAGE_SIZE], copy,
+				 off & (PAGE_SIZE-1) ) != copy) {
+			/* Oops. */
+			goto flush_bio;
+		}
+
+		off += copy;
+		len -= copy;
+		sec += copy >> 9;
+	}
+
+	if (em)
+		extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+
+		b->bi_next = NULL;
+		b->bi_end_io = dio_endio_sync;
+		b->bi_private = &comp;
+		atomic_inc(&comp.count);
+		submit_bio(rw & ~(bl.head ? (1<<BIO_RW_UNPLUG) : 0), b);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	return comp.error;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	return err;
+}
+
+static int
+dio_sync_readvec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		 sector_t sec)
+{
+	return dio_sync_iovec(io, READ_SYNC, pvec, nr, sec);
+}
+
+static int
+dio_sync_writevec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		  sector_t sec)
+{
+	int err;
+
+	if (!(io->files.file->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	err = dio_sync_iovec(io, WRITE_SYNC, pvec, nr, sec);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return err;
+}
+
+/*
+ * Allocate and zero a new block in the file. Do it through the page cache.
+ * It is assumed there is no point in optimizing this: it is used
+ * (for the ploop1 format) only for allocation of index clusters. Another
+ * use case is growing a raw delta, but this is assumed to be rare.
+ */
+static int dio_alloc_sync(struct ploop_io * io, loff_t pos, loff_t len)
+{
+	int err;
+	int ret;
+	struct page *pad = NULL;
+	int pad_len = pos & (PAGE_CACHE_SIZE - 1);
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,24)
+	if (pos + len > i_size_read(io->files.inode) &&
+	    io->files.inode->i_op->fallocate) {
+		err = io->files.inode->i_op->fallocate(io->files.inode, 0,
+						       pos, len);
+		if (err)
+			return err;
+	}
+#endif
+
+	if (pad_len) {
+		BUILD_BUG_ON(PAGE_SIZE != PAGE_CACHE_SIZE);
+
+		pad = alloc_page(GFP_NOFS);
+		if (pad == NULL)
+			return -ENOMEM;
+
+		len += pad_len;
+		pos -= pad_len;
+
+		err = dio_sync_read(io, pad, pad_len, 0, pos >> 9);
+		if (err) {
+			put_page(pad);
+			return err;
+		}
+	}
+
+	mutex_lock(&io->files.inode->i_mutex);
+
+	err = 0;
+
+	while (len > 0) {
+		struct page *page;
+		void *fsdata;
+
+		ret = pagecache_write_begin(io->files.file, io->files.mapping,
+					    pos, PAGE_CACHE_SIZE, 0,
+					    &page, &fsdata);
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
+
+		bzero_page(page);
+
+		if (pad) {
+			memcpy(page_address(page), page_address(pad), pad_len);
+			put_page(pad);
+			pad = NULL;
+		}
+
+		ret = pagecache_write_end(io->files.file, io->files.mapping,
+					  pos, PAGE_CACHE_SIZE,
+					  PAGE_CACHE_SIZE, page, fsdata);
+		if (ret < 0 || ret != PAGE_CACHE_SIZE) {
+			err = ret;
+			goto fail;
+		}
+
+		len -= PAGE_CACHE_SIZE;
+		pos += PAGE_CACHE_SIZE;
+	}
+
+	err = filemap_fdatawrite(io->files.mapping);
+	if (err)
+		goto fail;
+
+	if (io->files.file->f_op && io->files.file->f_op->fsync) {
+		err = io->files.file->f_op->FOP_FSYNC(io->files.file, 0);
+		if (err)
+			goto fail;
+	}
+	err = filemap_fdatawait(io->files.mapping);
+
+fail:
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (pad)
+		put_page(pad);
+
+	if (!err)
+		io->alloc_head = pos >> (io->plo->cluster_log + 9);
+
+	return err;
+}
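+
+/* Editorial note on the "pad" page above: it preserves existing data when
+ * pos is not page-aligned. E.g. pos = 0x1200, len = 0x1000 is turned into
+ * pos = 0x1000, len = 0x1200, with the first 0x200 bytes re-read from disk
+ * into "pad", so rewriting the full first page does not clobber them.
+ */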
+
+static void
+dio_io_page(struct ploop_io * io, unsigned long rw,
+	    struct ploop_request * preq, struct page * page,
+	    sector_t sec)
+{
+	struct bio_list bl;
+	struct bio * bio;
+	unsigned int len;
+	struct extent_map * em;
+	sector_t nsec;
+	int err;
+	int off;
+	int postfua;
+
+	postfua = !!(rw & BIO_FUA);
+	rw &= ~BIO_FUA;
+
+	bio_list_init(&bl);
+	bio = NULL;
+	em = NULL;
+	off = 0;
+
+	ploop_prepare_io_request(preq);
+	if (rw & (1 << BIO_RW))
+		ploop_prepare_tracker(preq, sec);
+
+	len = PAGE_SIZE;
+
+	while (len > 0) {
+		int copy;
+
+		if (!em || sec >= em->end) {
+			if (em)
+				extent_put(em);
+			em = extent_lookup_create(io, sec, len>>9);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = len;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+		if (bio_add_page(bio, page, copy, off) != copy) {
+			/* Oops. */
+			goto flush_bio;
+		}
+
+		off += copy;
+		len -= copy;
+		sec += copy >> 9;
+	}
+
+	if (em)
+		extent_put(em);
+
+	while (bl.head) {
+		unsigned long rw2 = rw;
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+
+		if (unlikely(postfua && !bl.head))
+			rw2 |= BIO_FUA;
+
+		b->bi_next = NULL;
+		b->bi_end_io = dio_endio_async;
+		b->bi_private = preq;
+		atomic_inc(&preq->io_count);
+		ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
+		submit_bio(rw2 | (bl.head ? 0 : (1<<BIO_RW_UNPLUG)), b);
+	}
+
+	ploop_complete_io_request(preq);
+	return;
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	ploop_fail_request(preq, err);
+}
+
+static void
+dio_read_page(struct ploop_io * io, struct ploop_request * preq,
+	      struct page * page, sector_t sec)
+{
+	dio_io_page(io, READ | (1 << BIO_RW_SYNCIO), preq, page, sec);
+}
+
+static void
+dio_write_page(struct ploop_io * io, struct ploop_request * preq,
+	       struct page * page, sector_t sec, int fua)
+{
+	if (!(io->files.file->f_mode & FMODE_WRITE)) {
+		ploop_fail_request(preq, -EBADF);
+		return;
+	}
+
+	dio_io_page(io, WRITE | (fua ? BIO_FUA : 0) | (1 << BIO_RW_SYNCIO),
+		    preq, page, sec);
+}
+
+static int
+dio_fastmap(struct ploop_io * io, struct bio * orig_bio,
+	    struct bio * bio, sector_t isec)
+{
+	struct request_queue * q;
+	struct extent_map * em;
+	int i;
+	struct bvec_merge_data bm_data;
+
+	if (orig_bio->bi_size == 0) {
+		bio->bi_vcnt   = 0;
+		bio->bi_sector = 0;
+		bio->bi_size   = 0;
+		bio->bi_idx    = 0;
+
+		bio->bi_rw   = orig_bio->bi_rw;
+		bio->bi_bdev = io->files.bdev;
+		return 0;
+	}
+
+	em = extent_lookup(io->files.em_tree, isec);
+
+	if (em == NULL) {
+		io->plo->st.fast_neg_noem++;
+		return 1;
+	}
+
+	if (isec + (orig_bio->bi_size>>9) > em->end) {
+		io->plo->st.fast_neg_shortem++;
+		extent_put(em);
+		return 1;
+	}
+
+	BUG_ON(bio->bi_max_vecs < orig_bio->bi_vcnt);
+
+	memcpy(bio->bi_io_vec, orig_bio->bi_io_vec,
+	       orig_bio->bi_vcnt * sizeof(struct bio_vec));
+
+	bio->bi_sector = dio_isec_to_phys(em, isec);
+	extent_put(em);
+
+	bio->bi_bdev = io->files.bdev;
+	bio->bi_rw = orig_bio->bi_rw;
+	bio->bi_vcnt = orig_bio->bi_vcnt;
+	bio->bi_size = orig_bio->bi_size;
+	bio->bi_idx = orig_bio->bi_idx;
+
+	q = bdev_get_queue(bio->bi_bdev);
+
+	if (q->merge_bvec_fn == NULL)
+		return 0;
+
+	bio->bi_size = 0;
+	bio->bi_vcnt = 0;
+
+	bm_data.bi_bdev = bio->bi_bdev;
+	bm_data.bi_sector = bio->bi_sector;
+	bm_data.bi_size = 0;
+	bm_data.bi_rw = bio->bi_rw;
+
+	for (i = 0; i < orig_bio->bi_vcnt; i++) {
+		struct bio_vec * bv = &bio->bi_io_vec[i];
+		if (q->merge_bvec_fn(q, &bm_data, bv) < bv->bv_len) {
+			io->plo->st.fast_neg_backing++;
+			return 1;
+		}
+		bio->bi_size += bv->bv_len;
+		bm_data.bi_size = bio->bi_size;
+		bio->bi_vcnt++;
+	}
+	return 0;
+}
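+
+/* Editorial note on the fastmap path above: when the whole original bio
+ * fits inside a single cached extent, the bio is cloned and pointed
+ * straight at the backing block device. Any miss (no extent, an extent too
+ * short, or a merge_bvec_fn refusal) bumps the matching st.fast_neg_*
+ * counter and returns 1 so the caller falls back to the slow path.
+ */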
+
+/* Merge is disabled _only_ if we _have_ a resolved mapping and
+ * we are sure the bio is going to be split in any case due to
+ * file-level fragmentation.
+ */
+static int
+dio_disable_merge(struct ploop_io * io, sector_t isector, unsigned int len)
+{
+	int ret = 0;
+	struct extent_map * em;
+
+	em = extent_lookup(io->files.em_tree, isector);
+	if (em) {
+		if (isector + len > em->end)
+			ret = 1;
+		extent_put(em);
+	}
+	return ret;
+}
+
+static int dio_prepare_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	int err;
+	struct file * file = io->files.file;
+
+	file = dentry_open(dget(F_DENTRY(file)), mntget(F_MNT(file)), O_RDONLY|O_LARGEFILE, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host ||
+	    io->files.bdev != file->f_mapping->host->i_sb->s_bdev) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	dio_fsync(file);
+
+	mutex_lock(&io->files.inode->i_mutex);
+	err = dio_invalidate_cache(io->files.mapping, io->files.bdev);
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (err) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static void dio_trim_prealloc(struct ploop_io * io, struct file * file)
+{
+	loff_t size;
+
+	size = (loff_t)io->alloc_head << (io->plo->cluster_log + 9);
+
+	if (size < i_size_read(io->files.inode)) {
+		struct iattr iattr;
+		/* Shouldn't happen because of FALLOC_FL_KEEP_SIZE */
+		printk(KERN_WARNING "ploop%d: dio prealloc: %lld < %lld",
+		       io->plo->index, size, i_size_read(io->files.inode));
+		iattr.ia_size = size;
+		iattr.ia_valid = ATTR_SIZE;
+		notify_change(F_DENTRY(file), &iattr);
+
+		if (file->f_op->fsync)
+			file->f_op->FOP_FSYNC(file, 0);
+
+		trim_extent_mappings(io->files.em_tree, size >> 9);
+	}
+}
+
+static int dio_complete_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	int ret;
+
+	ret = dio_release_prealloced(io);
+	if (ret)
+		return ret;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	mutex_lock(&io->files.inode->i_mutex);
+	ploop_dio_downgrade(io->files.mapping);
+	dio_trim_prealloc(io, file);
+	(void)invalidate_inode_pages2(io->files.mapping);
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (io->fsync_thread) {
+		kthread_stop(io->fsync_thread);
+		io->fsync_thread = NULL;
+	}
+
+	fput(file);
+	return 0;
+}
+
+static int dio_prepare_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	int err;
+	struct file * file = io->files.file;
+
+	file = dentry_open(dget(F_DENTRY(file)), mntget(F_MNT(file)), O_RDWR|O_LARGEFILE, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host ||
+	    io->files.bdev != file->f_mapping->host->i_sb->s_bdev) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	dio_fsync(file);
+
+	mutex_lock(&io->files.inode->i_mutex);
+
+	err = dio_invalidate_cache(io->files.mapping, io->files.bdev);
+	if (err) {
+		mutex_unlock(&io->files.inode->i_mutex);
+		fput(file);
+		return err;
+	}
+
+	err = ploop_dio_upgrade(io);
+	if (err) {
+		mutex_unlock(&io->files.inode->i_mutex);
+		fput(file);
+		return err;
+	}
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (!io->files.em_tree->_get_extent) {
+		io->fsync_thread = kthread_create(dio_fsync_thread,
+						  io, "ploop_fsync%d",
+						  io->plo->index);
+		/* kthread_create() returns ERR_PTR(), never NULL */
+		if (IS_ERR(io->fsync_thread)) {
+			err = PTR_ERR(io->fsync_thread);
+			io->fsync_thread = NULL;
+			fput(file);
+			return err;
+		}
+		wake_up_process(io->fsync_thread);
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int dio_truncate(struct ploop_io * io, struct file * file,
+			__u32 alloc_head)
+{
+	int err;
+	struct iattr newattrs;
+	loff_t new_size;
+
+	if (file->f_mapping != io->files.mapping)
+		return -EINVAL;
+
+	newattrs.ia_size = (u64)alloc_head << (io->plo->cluster_log + 9);
+	newattrs.ia_valid = ATTR_SIZE;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	if (io->files.em_tree)
+		trim_extent_mappings(io->files.em_tree, newattrs.ia_size>>9);
+	io->files.inode->i_flags &= ~S_SWAPFILE;
+	err = notify_change(F_DENTRY(file), &newattrs);
+	io->files.inode->i_flags |= S_SWAPFILE;
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	new_size = i_size_read(io->files.inode);
+	atomic_long_sub(*io->size_ptr - new_size, &ploop_io_images_size);
+	*io->size_ptr = new_size;
+
+	if (!err)
+		err = dio_fsync(file);
+
+	return err;
+}
+
+static int dio_start_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	fput(file);
+	return 0;
+}
+
+static void dio_unplug(struct ploop_io * io)
+{
+	struct request_queue *bq;
+
+	bq = bdev_get_queue(io->files.bdev);
+
+	if (bq->unplug_fn)
+		bq->unplug_fn(bq);
+}
+
+static int dio_congested(struct ploop_io * io, int bits)
+{
+	struct request_queue *bq;
+
+	bq = bdev_get_queue(io->files.bdev);
+
+	return bdi_congested(&bq->backing_dev_info, bits);
+}
+
+static void dio_queue_settings(struct ploop_io * io, struct request_queue * q)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)
+	q->max_sectors = 0;
+	q->max_hw_sectors = 0;
+#endif
+	blk_queue_stack_limits(q, bdev_get_queue(io->files.bdev));
+}
+
+static void dio_issue_flush(struct ploop_io * io, struct ploop_request *preq)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_NOFS, 0);
+	if (unlikely(!bio)) {
+		ploop_fail_request(preq, -ENOMEM);
+		return;
+	}
+
+	ploop_prepare_io_request(preq);
+	bio->bi_end_io = dio_endio_async;
+	bio->bi_bdev = io->files.bdev;
+	bio->bi_private = preq;
+
+	atomic_inc(&preq->io_count);
+	preq->eng_state = PLOOP_E_COMPLETE;
+	ploop_acc_ff_out(io->plo, preq->req_rw | bio->bi_rw);
+	submit_bio(preq->req_rw, bio);
+	ploop_complete_io_request(preq);
+}
+
+static int dio_dump(struct ploop_io * io)
+{
+	extern void dump_extent_map(struct extent_map_tree *tree);
+
+	if (io->files.em_tree) {
+		dump_extent_map(io->files.em_tree);
+		return 0;
+	}
+	return -1;
+}
+
+static int dio_autodetect(struct ploop_io * io)
+{
+	struct file  * file  = io->files.file;
+	struct inode * inode = file->f_mapping->host;
+	char         * s_id  = inode->i_sb->s_id;
+
+	int err;
+	mm_segment_t fs;
+	unsigned int flags;
+
+	if (inode->i_sb->s_magic != EXT4_SUPER_MAGIC)
+		return -1; /* not mine */
+
+	if (inode->i_sb->s_bdev == NULL) {
+		printk("File on FS EXT(%s) without backing device\n", s_id);
+		return -1;
+	}
+
+	if (!inode->i_op->fallocate)
+		ploop_io_report_fn(file, KERN_WARNING
+					"File on FS w/o fallocate");
+
+	if (!file->f_op->unlocked_ioctl) {
+		printk("Cannot run on EXT4(%s): no unlocked_ioctl\n", s_id);
+		return -1;
+	}
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	flags = 0;
+	err = file->f_op->unlocked_ioctl(file, FS_IOC_GETFLAGS, (long)&flags);
+	set_fs(fs);
+
+	if (err != 0) {
+		printk("Cannot run on EXT4(%s): failed FS_IOC_GETFLAGS (%d)\n",
+		       s_id, err);
+		return -1;
+	}
+
+	io->files.flags = flags;
+	if (!(flags & EXT4_EXTENTS_FL))
+		ploop_io_report_fn(file, KERN_WARNING "File w/o extents");
+
+	return 0;
+}
+
+static struct ploop_io_ops ploop_io_ops_direct =
+{
+	.id		=	PLOOP_IO_DIRECT,
+	.name		=	"direct",
+	.owner		=	THIS_MODULE,
+
+	.unplug		=	dio_unplug,
+	.congested	=	dio_congested,
+
+	.alloc		=	dio_alloc_sync,
+	.submit		=	dio_submit,
+	.submit_alloc	=	dio_submit_alloc,
+	.disable_merge	=	dio_disable_merge,
+	.fastmap	=	dio_fastmap,
+	.read_page	=	dio_read_page,
+	.write_page	=	dio_write_page,
+	.sync_read	=	dio_sync_read,
+	.sync_write	=	dio_sync_write,
+	.sync_readvec	=	dio_sync_readvec,
+	.sync_writevec	=	dio_sync_writevec,
+
+	.init		=	dio_init,
+	.destroy	=	dio_destroy,
+	.open		=	dio_open,
+	.sync		=	dio_sync,
+	.stop		=	dio_stop,
+	.prepare_snapshot =	dio_prepare_snapshot,
+	.complete_snapshot =	dio_complete_snapshot,
+	.prepare_merge  =	dio_prepare_merge,
+	.start_merge	=	dio_start_merge,
+	.truncate	=	dio_truncate,
+
+	.queue_settings	=	dio_queue_settings,
+	.issue_flush	=	dio_issue_flush,
+
+	.dump		=	dio_dump,
+
+	.i_size_read	=	generic_i_size_read,
+	.f_mode		=	generic_f_mode,
+
+	.autodetect     =       dio_autodetect,
+};
+
+module_param(max_extent_map_pages, int, 0644);
+MODULE_PARM_DESC(max_extent_map_pages, "Maximal amount of pages taken by all extent map caches");
+module_param(min_extent_map_entries, int, 0644);
+MODULE_PARM_DESC(min_extent_map_entries, "Minimal amount of entries in a single extent map cache");
+
+static int __init pio_direct_mod_init(void)
+{
+	int err;
+
+	if (max_extent_map_pages == 0)
+		max_extent_map_pages = PLOOP_MAX_EXTENT_MAP >> PAGE_SHIFT;
+
+	if (min_extent_map_entries == 0)
+		min_extent_map_entries = 64;
+
+	err = extent_map_init();
+	if (!err) {
+		err = ploop_register_io(&ploop_io_ops_direct);
+		if (err)
+			extent_map_exit();
+	}
+
+	return err;
+}
+
+static void __exit pio_direct_mod_exit(void)
+{
+	ploop_unregister_io(&ploop_io_ops_direct);
+	extent_map_exit();
+	BUG_ON(atomic_long_read(&ploop_io_images_size));
+}
+
+module_init(pio_direct_mod_init);
+module_exit(pio_direct_mod_exit);
+
+MODULE_LICENSE("GPL");
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_direct_events.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_direct_events.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_direct_events.h	2015-01-21 12:02:55.232907575 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_direct_events.h	2015-01-21 12:02:55.358904231 +0300
@@ -0,0 +1,42 @@
+#if !defined(_TRACE_IO_DIRECT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IO_DIRECT_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ploop
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#include <linux/ploop/ploop.h>
+#include "io_direct_map.h"
+
+TRACE_EVENT(add_extent_mapping,
+	TP_PROTO(struct extent_map *em),
+
+	TP_ARGS(em),
+
+	TP_STRUCT__entry(
+		__field(sector_t,  start)
+		__field(sector_t,  end)
+		__field(sector_t,  bstart)
+	),
+
+	TP_fast_assign(
+		__entry->start	= em->start;
+		__entry->end	= em->end;
+		__entry->bstart	= em->block_start;
+	),
+
+	TP_printk("start=0x%lx end=0x%lx block_start=0x%lx",
+			__entry->start, __entry->end, __entry->bstart)
+);
+
+#endif /* _TRACE_IO_DIRECT_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE io_direct_events
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_direct_map.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_direct_map.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_direct_map.c	2015-01-21 12:02:54.711921404 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_direct_map.c	2015-01-21 12:02:57.816838987 +0300
@@ -0,0 +1,856 @@
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/ploop/ploop_if.h>
+#include "io_direct_events.h"
+#include "io_direct_map.h"
+
+/* The part of io_direct shared between all the devices.
+ * No way is this code good, but it is the best we can do
+ * without modifying the core.
+ *
+ * Keep track of images opened by ploop. Maintain shared extent
+ * maps for shared images, which are opened read-only. Top-level
+ * deltas, which are opened for write, are opened exclusively.
+ *
+ * Also take care of setting/clearing S_SWAPFILE and of setting
+ * the mapping gfp mask to GFP_NOFS.
+ */
+
+struct ploop_mapping
+{
+	struct list_head	list;
+	struct address_space	* mapping;
+	int			readers;
+	unsigned long		saved_gfp_mask;
+	loff_t			size;
+
+	struct extent_map_tree	extent_root;
+};
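+
+/* Editorial note: "readers" implements a tiny shared/exclusive lock:
+ *	readers == 0	mapping unused (entry about to be freed)
+ *	readers  > 0	number of read-only openers sharing the extent tree
+ *	readers == -1	a single read-write opener holds it exclusively
+ * ploop_dio_open() below returns -ETXTBSY or -EBUSY on conflicting opens.
+ */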
+
+static LIST_HEAD(ploop_mappings);
+static DEFINE_SPINLOCK(ploop_mappings_lock);
+
+/* total number of extent_map structures */
+static atomic_t ploop_extent_maps_count = ATOMIC_INIT(0);
+
+static void extent_map_tree_init(struct extent_map_tree *tree);
+static int drop_extent_map(struct extent_map_tree *tree);
+static int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+
+extern atomic_long_t ploop_io_images_size;
+
+/*
+ * ploop_dio_* functions must be called with i_mutex taken.
+ */
+
+struct extent_map_tree *
+ploop_dio_open(struct ploop_io * io, int rdonly)
+{
+	int err;
+	struct ploop_mapping *m, *pm;
+	struct file * file = io->files.file;
+	struct address_space * mapping = file->f_mapping;
+
+	pm = kzalloc(sizeof(struct ploop_mapping), GFP_KERNEL);
+
+	err = 0;
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				if (m->readers < 0)
+					err = -ETXTBSY;
+				else
+					m->readers++;
+			} else {
+				if (m->readers)
+					err = -EBUSY;
+				else
+					m->readers = -1;
+			}
+
+out_unlock:
+			spin_unlock(&ploop_mappings_lock);
+			if (pm)
+				kfree(pm);
+			if (!err)
+				io->size_ptr = &m->size;
+			return err ? ERR_PTR(err) : &m->extent_root;
+		}
+	}
+
+	if (pm == NULL) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	if (mapping->host->i_flags & S_SWAPFILE) {
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
+	pm->mapping = mapping;
+	extent_map_tree_init(&pm->extent_root);
+	pm->extent_root.mapping = mapping;
+	pm->readers = rdonly ? 1 : -1;
+	list_add(&pm->list, &ploop_mappings);
+	mapping->host->i_flags |= S_SWAPFILE;
+	io->size_ptr = &pm->size;
+	*io->size_ptr = i_size_read(mapping->host);
+	atomic_long_add(*io->size_ptr, &ploop_io_images_size);
+
+	pm->saved_gfp_mask = mapping_gfp_mask(mapping);
+	mapping_set_gfp_mask(mapping,
+			     pm->saved_gfp_mask & ~__GFP_FS);
+
+	spin_unlock(&ploop_mappings_lock);
+
+	if (strcmp(mapping->host->i_sb->s_type->name, "pcss") == 0) {
+		struct ploop_xops xops;
+		if (file->f_op->unlocked_ioctl) {
+			mm_segment_t fs = get_fs();
+
+			set_fs(KERNEL_DS);
+			xops.magic = 0;
+			err = file->f_op->unlocked_ioctl(file, PLOOP_IOC_INTERNAL, (long)&xops);
+			set_fs(fs);
+			if (err == 0 && xops.magic == PLOOP_INTERNAL_MAGIC)
+				pm->extent_root._get_extent = xops.get_extent;
+		}
+	}
+	return &pm->extent_root;
+}
+
+int
+ploop_dio_close(struct ploop_io * io, int rdonly)
+{
+	struct address_space * mapping = io->files.mapping;
+	struct ploop_mapping *m, *pm = NULL;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				m->readers--;
+			} else {
+				BUG_ON(m->readers != -1);
+				m->readers = 0;
+			}
+
+			if (m->readers == 0) {
+				atomic_long_sub(*io->size_ptr,
+						&ploop_io_images_size);
+				*io->size_ptr = 0;
+				mapping->host->i_flags &= ~S_SWAPFILE;
+				list_del(&m->list);
+				pm = m;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+
+	if (pm) {
+		drop_extent_map(&pm->extent_root);
+		BUG_ON(pm->extent_root.map_size);
+		kfree(pm);
+		return 0;
+	}
+	return -ENOENT;
+}
+
+void ploop_dio_downgrade(struct address_space * mapping)
+{
+	struct ploop_mapping * m;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			BUG_ON(m->readers != -1);
+			m->readers = 1;
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+}
+
+int ploop_dio_upgrade(struct ploop_io * io)
+{
+	struct address_space * mapping = io->files.mapping;
+	struct ploop_mapping * m;
+	int err = -ESRCH;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			err = -EBUSY;
+			if (m->readers == 1) {
+				loff_t new_size = i_size_read(io->files.inode);
+				atomic_long_add(new_size - *io->size_ptr,
+						&ploop_io_images_size);
+				*io->size_ptr = new_size;
+
+				m->readers = -1;
+				err = 0;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+	return err;
+}
+
+
+/* The rest of the file was written by Jens Axboe.
+ * I just fixed a few bugs (requests not aligned to the fs block size,
+ * because direct IO is aligned to 512 bytes) and trimmed some useless
+ * functionality.
+ *
+ * In any case, it must be remade: not only because of the GPL, but also
+ * because it is not good.
+ */
+
+static struct kmem_cache *extent_map_cache;
+
+int __init extent_map_init(void)
+{
+	extent_map_cache = kmem_cache_create("ploop_itree",
+						sizeof(struct extent_map), 0,
+						SLAB_MEM_SPREAD, NULL
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+						, NULL
+#endif
+						);
+	if (!extent_map_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void extent_map_exit(void)
+{
+	if (extent_map_cache)
+		kmem_cache_destroy(extent_map_cache);
+}
+
+static void extent_map_tree_init(struct extent_map_tree *tree)
+{
+	tree->map.rb_node = NULL;
+	INIT_LIST_HEAD(&tree->lru_list);
+	tree->map_size = 0;
+	rwlock_init(&tree->lock);
+}
+
+struct extent_map *alloc_extent_map(gfp_t mask)
+{
+	struct extent_map *em;
+
+	em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+	if (em) {
+		atomic_set(&em->refs, 1);
+		INIT_LIST_HEAD(&em->lru_link);
+		atomic_inc(&ploop_extent_maps_count);
+	}
+	return em;
+}
+
+void extent_put(struct extent_map *em)
+{
+	if (!em)
+		return;
+	if (atomic_dec_and_test(&em->refs)) {
+		atomic_dec(&ploop_extent_maps_count);
+		kmem_cache_free(extent_map_cache, em);
+	}
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, sector_t start,
+				   sector_t end, struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct extent_map *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct extent_map, rb_node);
+
+		if (end <= entry->start)
+			p = &(*p)->rb_left;
+		else if (start >= entry->end)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/* Find the extent which contains "offset". If there is no such extent,
+ * *prev_ret is set to the first extent following "offset".
+ */
+static struct rb_node *__tree_search(struct rb_root *root, sector_t offset,
+				     struct rb_node **prev_ret)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct extent_map *entry;
+	struct extent_map *prev_entry = NULL;
+
+	while (n) {
+		entry = rb_entry(n, struct extent_map, rb_node);
+		prev = n;
+		prev_entry = entry;
+
+		if (offset < entry->start)
+			n = n->rb_left;
+		else if (offset >= entry->end)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	while (prev && offset >= prev_entry->end) {
+		prev = rb_next(prev);
+		prev_entry = rb_entry(prev, struct extent_map, rb_node);
+	}
+	*prev_ret = prev;
+	return NULL;
+}
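+
+/* Editorial note: e.g. with extents [0,10) and [20,30) in the tree, a
+ * search for offset 25 returns the [20,30) node itself, while a search for
+ * offset 15 returns NULL and sets *prev_ret to the [20,30) node (the first
+ * extent wholly beyond 15).
+ */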
+
+/* Find the first extent which could intersect a range starting at offset.
+ * Note that it may not actually contain offset.
+ */
+static inline struct rb_node *tree_search(struct rb_root *root, sector_t offset)
+{
+	struct rb_node *prev;
+	struct rb_node *ret;
+	ret = __tree_search(root, offset, &prev);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+static int tree_delete(struct rb_root *root, sector_t offset)
+{
+	struct rb_node *node;
+
+	node = __tree_search(root, offset, NULL);
+	if (!node)
+		return -ENOENT;
+	rb_erase(node, root);
+	return 0;
+}
+
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+{
+	if (prev->end == next->start &&
+	    next->block_start == extent_map_block_end(prev))
+		return 1;
+	return 0;
+}
+
+static inline int purge_lru_mapping(struct extent_map_tree *tree)
+{
+	int max_entries = (max_extent_map_pages << PAGE_SHIFT) /
+		sizeof(struct extent_map);
+
+	return atomic_read(&ploop_extent_maps_count) > max_entries &&
+	       tree->map_size > max(1, min_extent_map_entries) &&
+	       (u64)tree->map_size * atomic_long_read(&ploop_io_images_size) >
+	       (u64)max_entries * i_size_read(tree->mapping->host);
+}
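+
+/* Editorial note: the last comparison above is the cross-multiplied form of
+ *
+ *	map_size / max_entries  >  i_size / ploop_io_images_size
+ *
+ * i.e. purge when this tree consumes a larger share of the global entry
+ * budget than its image contributes to the total size of all opened images.
+ */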
+
+static inline void purge_lru_warn(struct extent_map_tree *tree)
+{
+	int max_entries = (max_extent_map_pages << PAGE_SHIFT) /
+		sizeof(struct extent_map);
+
+	loff_t ratio = i_size_read(tree->mapping->host) * 100;
+	do_div(ratio, atomic_long_read(&ploop_io_images_size));
+
+	printk(KERN_WARNING "Purging lru entry from extent tree for inode %ld "
+	       "(map_size=%d ratio=%lld%%)\n",
+	       tree->mapping->host->i_ino, tree->map_size, ratio);
+
+	/* If even an 8MB average extent size would not fit into the
+	 * budget, blame max_extent_map_pages; otherwise claim the host
+	 * fs is too fragmented */
+	if ((u64)max_entries * (8 * 1024 * 1024) <
+	    atomic_long_read(&ploop_io_images_size))
+		printk(KERN_WARNING "max_extent_map_pages=%d is too low for "
+		       "ploop_io_images_size=%ld bytes\n",
+		       max_extent_map_pages,
+		       atomic_long_read(&ploop_io_images_size));
+	else {
+		loff_t avg_siz = i_size_read(tree->mapping->host);
+		do_div(avg_siz, tree->map_size);
+
+		printk(KERN_WARNING "host fs is too fragmented: average extent"
+		       " size is lesser than %lld bytes\n", avg_siz);
+	}
+}
+
+/*
+ * add_extent_mapping tries a simple forward/backward merge with existing
+ * mappings.  The extent_map struct passed in will be inserted into
+ * the tree directly (no copies made, just a reference taken).
+ */
+static int add_extent_mapping(struct extent_map_tree *tree,
+			      struct extent_map *em)
+{
+	int ret = 0;
+	struct rb_node *rb;
+
+	write_lock_irq(&tree->lock);
+
+	do {
+		rb = tree_insert(&tree->map, em->start, em->end, &em->rb_node);
+		/* A part of this extent may already be in the tree */
+		if (rb) {
+			struct extent_map *tmp =
+				rb_entry(rb, struct extent_map, rb_node);
+			BUG_ON(tmp->block_start - tmp->start !=
+					em->block_start - em->start);
+			if (tmp->start <= em->start &&
+			    tmp->end >= em->end) {
+				ret = -EEXIST;
+				goto out;
+			}
+			if (tmp->start < em->start) {
+				em->start = tmp->start;
+				em->block_start = tmp->block_start;
+			}
+			if (tmp->end > em->end)
+				em->end = tmp->end;
+			rb_erase(rb, &tree->map);
+			list_del_init(&tmp->lru_link);
+			tree->map_size--;
+			extent_put(tmp);
+		} else {
+			list_add_tail(&em->lru_link, &tree->lru_list);
+			tree->map_size++;
+
+			if (purge_lru_mapping(tree)) {
+				struct extent_map *victim_em;
+				static unsigned long purge_lru_time;
+
+				/* Warn about this once per hour */
+				if (printk_timed_ratelimit(&purge_lru_time,
+							   60*60*HZ))
+					purge_lru_warn(tree);
+
+				victim_em = list_entry(tree->lru_list.next,
+						       struct extent_map,
+						       lru_link);
+
+				list_del_init(&victim_em->lru_link);
+				tree->map_size--;
+				rb_erase(&victim_em->rb_node, &tree->map);
+				extent_put(victim_em);
+			}
+		}
+	} while (rb);
+
+	atomic_inc(&em->refs);
+	if (em->start != 0) {
+		rb = rb_prev(&em->rb_node);
+		if (rb) {
+			struct extent_map *merge;
+
+			merge = rb_entry(rb, struct extent_map, rb_node);
+			if (mergable_maps(merge, em)) {
+				em->start = merge->start;
+				em->block_start = merge->block_start;
+				rb_erase(&merge->rb_node, &tree->map);
+				list_del_init(&merge->lru_link);
+				tree->map_size--;
+				extent_put(merge);
+			}
+		}
+	}
+	rb = rb_next(&em->rb_node);
+	if (rb) {
+		struct extent_map *merge;
+
+		merge = rb_entry(rb, struct extent_map, rb_node);
+		if (mergable_maps(em, merge)) {
+			em->end = merge->end;
+			rb_erase(&merge->rb_node, &tree->map);
+			list_del_init(&merge->lru_link);
+			tree->map_size--;
+			extent_put(merge);
+		}
+	}
+
+	trace_add_extent_mapping(em);
+out:
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+
+struct extent_map *
+extent_lookup(struct extent_map_tree *tree, sector_t start)
+{
+	struct extent_map *em = NULL;
+	struct rb_node *rb_node;
+
+	/* extent_lookup() is called under plo->lock, so IRQs are already disabled */
+	read_lock(&tree->lock);
+	rb_node = __tree_search(&tree->map, start, NULL);
+	if (rb_node) {
+		em = rb_entry(rb_node, struct extent_map, rb_node);
+		atomic_inc(&em->refs);
+	}
+	read_unlock(&tree->lock);
+
+	if (em) {
+		write_lock(&tree->lock);
+		/* em cannot have been freed (we hold a reference), but it
+		 * may have been removed from the LRU list before we
+		 * re-acquired the lock */
+		if (!list_empty(&em->lru_link)) {
+			list_del(&em->lru_link);
+			list_add_tail(&em->lru_link, &tree->lru_list);
+		}
+		write_unlock(&tree->lock);
+	}
+
+	return em;
+}
+
+/*
+ * lookup_extent_mapping returns the first extent_map struct in the
+ * tree that intersects the [start, start+len) range.  There may
+ * be additional objects in the tree that intersect, so check the object
+ * returned carefully to make sure you don't need additional lookups.
+ */
+static struct extent_map *
+lookup_extent_mapping(struct extent_map_tree *tree, sector_t start, sector_t len)
+{
+	struct extent_map *em;
+	struct rb_node *rb_node;
+
+	read_lock_irq(&tree->lock);
+	rb_node = tree_search(&tree->map, start);
+	if (!rb_node) {
+		em = NULL;
+		goto out;
+	}
+	em = rb_entry(rb_node, struct extent_map, rb_node);
+	if (em->end <= start || em->start >= start + len) {
+		em = NULL;
+		goto out;
+	}
+	atomic_inc(&em->refs);
+
+out:
+	read_unlock_irq(&tree->lock);
+	return em;
+}
+
+/*
+ * Removes an extent_map struct from the tree. No reference counts are
+ * dropped, and no checks are done to see if the range is in use.
+ */
+static int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+	int ret;
+
+	write_lock_irq(&tree->lock);
+	ret = tree_delete(&tree->map, em->start);
+	if (!ret) {
+		list_del_init(&em->lru_link);
+		tree->map_size--;
+	}
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+
+static struct extent_map *__map_extent_get_extent(struct extent_map_tree *tree,
+						  struct address_space *mapping,
+						  sector_t start, sector_t len, int create,
+						  gfp_t gfp_mask)
+{
+	struct inode *inode = mapping->host;
+	struct extent_map *em;
+	sector_t nstart, result;
+	int ret;
+
+again:
+	em = lookup_extent_mapping(tree, start, len);
+	if (em) {
+		if (em->start <= start && em->end >= start + len)
+			return em;
+
+		/*
+		 * we may have found an extent that starts after the
+		 * requested range.  Double check and alter the length
+		 * appropriately
+		 */
+		if (em->start > start) {
+			len = em->start - start;
+		} else if (!create) {
+			return em;
+		}
+		extent_put(em);
+	}
+	BUG_ON(gfp_mask & GFP_ATOMIC);
+
+	em = alloc_extent_map(gfp_mask);
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * FIXME if there are errors later on, we end up exposing stale
+	 * data on disk while filling holes.
+	 *
+	 * _XXX_ Danger! len is reduced above, therefore _get_extent
+	 * does not allocate all that we need. It works only with pcss
+	 * and only when cluster size <= pcss block size and allocation
+	 * is aligned. If we relax those conditions, the code must be fixed.
+	 */
+	ret = tree->_get_extent(inode, start, len, &nstart, &result, create);
+	if (ret < 0) {
+		extent_put(em);
+		return ERR_PTR(ret);
+	}
+
+	em->start = nstart;
+	em->end = nstart + ret;
+	em->block_start = result;
+
+	ret = add_extent_mapping(tree, em);
+	if (ret == -EEXIST) {
+		extent_put(em);
+		goto again;
+	}
+	return em;
+}
+
+static struct extent_map *__map_extent_bmap(struct ploop_io *io,
+				       struct address_space *mapping,
+				       sector_t start, sector_t len, gfp_t gfp_mask)
+{
+	struct extent_map_tree *tree = io->files.em_tree;
+	struct inode *inode = mapping->host;
+	loff_t start_off = (loff_t)start << 9;
+	struct extent_map *em;
+	struct fiemap_extent_info fieinfo;
+	struct fiemap_extent fi_extent;
+	mm_segment_t old_fs;
+	int ret;
+
+again:
+	em = lookup_extent_mapping(tree, start, len);
+	if (em) {
+		/*
+		 * we may have found an extent that starts after the
+		 * requested range.  Double check and alter the length
+		 * appropriately
+		 */
+		if (em->start > start) {
+			len = em->start - start;
+		} else {
+			return em;
+		}
+		extent_put(em);
+	}
+
+	BUG_ON(gfp_mask & GFP_ATOMIC);
+
+	if (!inode->i_op->fiemap)
+		return ERR_PTR(-EINVAL);
+
+	em = alloc_extent_map(gfp_mask);
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	fieinfo.fi_extents_start = &fi_extent;
+	fieinfo.fi_extents_max = 1;
+	fieinfo.fi_flags = 0;
+	fieinfo.fi_extents_mapped = 0;
+	fi_extent.fe_flags = 0;
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = inode->i_op->fiemap(inode, &fieinfo, start_off, 1);
+
+	/* chase for PSBM-26762: em->block_start == 0 */
+	if (!ret && fieinfo.fi_extents_mapped == 1 &&
+	    !(fi_extent.fe_flags & FIEMAP_EXTENT_UNWRITTEN) &&
+	    (fi_extent.fe_physical >> 9) == 0) {
+		/* see how ext4_fill_fiemap_extents() is implemented */
+		if (!(fi_extent.fe_flags & FIEMAP_EXTENT_DELALLOC)) {
+			printk("bad fiemap(%ld,%ld) on inode=%p &fieinfo=%p"
+			       " i_size=%lld\n", start, len, inode, &fieinfo,
+				i_size_read(inode));
+			BUG();
+		}
+		/* complain about the delalloc case -- ploop always
+		 * fallocates before buffered writes */
+		WARN(1, "ploop%d: delalloc extent [%lld,%lld] for [%lld,%ld];"
+		     " i_size=%lld\n", io->plo->index, fi_extent.fe_logical,
+		     fi_extent.fe_length, start_off, len << 9,
+		     i_size_read(inode));
+		ret = -ENOENT;
+	}
+	set_fs(old_fs);
+
+	if (ret) {
+		extent_put(em);
+		return ERR_PTR(ret);
+	}
+
+	if (fieinfo.fi_extents_mapped != 1) {
+		if (start_off < i_size_read(inode))
+			ploop_msg_once(io->plo, "a hole in image file detected"
+				       " (mapped=%d i_size=%llu off=%llu)",
+				       fieinfo.fi_extents_mapped,
+				       i_size_read(inode), start_off);
+		extent_put(em);
+		return ERR_PTR(-EINVAL);
+	}
+
+	em->start = fi_extent.fe_logical >> 9;
+	em->end = (fi_extent.fe_logical + fi_extent.fe_length) >> 9;
+
+	if (fi_extent.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
+		em->block_start = BLOCK_UNINIT;
+	} else {
+		em->block_start = fi_extent.fe_physical >> 9;
+
+		ret = add_extent_mapping(tree, em);
+		if (ret == -EEXIST) {
+			extent_put(em);
+			goto again;
+		}
+	}
+	return em;
+}
+
+static struct extent_map *__map_extent(struct ploop_io *io,
+				       struct address_space *mapping,
+				       sector_t start, sector_t len, int create,
+				       gfp_t gfp_mask, get_block_t get_block)
+{
+	struct extent_map_tree *tree = io->files.em_tree;
+
+	if (tree->_get_extent)
+		return __map_extent_get_extent(tree, mapping, start, len, create,
+					       gfp_mask);
+	if (create)
+		/* create flag not supported by bmap implementation */
+		return ERR_PTR(-EINVAL);
+
+	return __map_extent_bmap(io, mapping, start, len, gfp_mask);
+}
+
+struct extent_map *map_extent_get_block(struct ploop_io *io,
+					struct address_space *mapping,
+					sector_t start, sector_t len, int create,
+					gfp_t gfp_mask, get_block_t get_block)
+{
+	struct extent_map *em;
+	sector_t last;
+	sector_t map_ahead_len = 0;
+
+	em = __map_extent(io, mapping, start, len, create,
+			  gfp_mask, get_block);
+
+	/*
+	 * if we're doing a write or we found a large extent, return it
+	 */
+	if (IS_ERR(em) || !em || create || start + len < em->end) {
+		return em;
+	}
+
+	/*
+	 * otherwise, try to walk forward a bit and see if we can build
+	 * something bigger.
+	 */
+	do {
+		last = em->end;
+		extent_put(em);
+		em = __map_extent(io, mapping, last, len, create,
+				  gfp_mask, get_block);
+		if (IS_ERR(em) || !em)
+			break;
+		map_ahead_len += em->end - last;
+	} while (em->start <= start && start + len <= em->end &&
+		 map_ahead_len < 1024);
+
+	/* make sure we return the extent for this range */
+	if (!em || IS_ERR(em) || em->start > start ||
+	    start + len > em->end) {
+		if (em && !IS_ERR(em))
+			extent_put(em);
+		em = __map_extent(io, mapping, start, len, create,
+				  gfp_mask, get_block);
+	}
+	return em;
+}
+
+
+struct extent_map *extent_lookup_create(struct ploop_io *io,
+					sector_t start, sector_t len)
+{
+	struct extent_map_tree *tree = io->files.em_tree;
+
+	return map_extent_get_block(io, tree->mapping,
+				    start, len, 0, mapping_gfp_mask(tree->mapping),
+				    NULL);
+}
+
+static int drop_extent_map(struct extent_map_tree *tree)
+{
+	struct extent_map *em;
+	struct rb_node * node;
+
+	write_lock_irq(&tree->lock);
+	while ((node = tree->map.rb_node) != NULL) {
+		em = rb_entry(node, struct extent_map, rb_node);
+		rb_erase(node, &tree->map);
+		list_del_init(&em->lru_link);
+		tree->map_size--;
+		extent_put(em);
+	}
+	write_unlock_irq(&tree->lock);
+	return 0;
+}
+
+void trim_extent_mappings(struct extent_map_tree *tree, sector_t start)
+{
+	struct extent_map *em;
+
+	while ((em = lookup_extent_mapping(tree, start, ((sector_t)(-1ULL)) - start))) {
+		remove_extent_mapping(tree, em);
+		WARN_ON(atomic_read(&em->refs) != 2);
+		/* once for us */
+		extent_put(em);
+		/* No concurrent lookups due to ploop_quiesce(). See WARN_ON above */
+		/* once for the tree */
+		extent_put(em);
+	}
+}
+
+
+void dump_extent_map(struct extent_map_tree *tree)
+{
+	struct rb_node * r = rb_first(&tree->map);
+
+	while (r) {
+		struct extent_map *em0 = rb_entry(r, struct extent_map, rb_node);
+		printk("N=%ld %ld -> %ld\n", (long)em0->start, (long)(em0->end - em0->start), (long)em0->block_start);
+		r = rb_next(r);
+	}
+}
+
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_direct_map.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_direct_map.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_direct_map.h	2015-01-21 12:02:54.711921404 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_direct_map.h	2015-01-21 12:02:57.727841350 +0300
@@ -0,0 +1,61 @@
+#ifndef __INTERVAL_TREE_H__
+#define __INTERVAL_TREE_H__
+
+#include <linux/rbtree.h>
+
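+/* "physical block" value marking an allocated but unwritten extent */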
+#define BLOCK_UNINIT ~((sector_t) 0)
+
+struct extent_map_tree
+{
+	struct rb_root map;
+	struct list_head lru_list;
+	unsigned int map_size; /* # entries in map */
+	rwlock_t lock;
+	struct address_space * mapping;
+	int (*_get_extent)(struct inode *inode, sector_t isec,
+			   unsigned int nr, sector_t *start,
+			   sector_t *psec, int creat);
+};
+
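+/*
+ * A single cached mapping: logical sectors [start, end) of the image
+ * file correspond to physical sectors starting at block_start on the
+ * underlying device. All values are in 512-byte sectors.
+ */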
+struct extent_map
+{
+	struct rb_node rb_node;
+	struct list_head lru_link;
+
+	sector_t	start;
+	sector_t	end;
+
+	sector_t	block_start;
+
+	atomic_t refs;
+};
+
+extern int max_extent_map_pages;
+extern int min_extent_map_entries;
+
+static inline sector_t extent_map_block_end(struct extent_map *em)
+{
+	return em->block_start + (em->end - em->start);
+}
+
+struct extent_map *extent_lookup_create(struct ploop_io *io,
+					sector_t start, sector_t len);
+struct extent_map *extent_lookup(struct extent_map_tree *tree,
+				 sector_t start);
+void extent_put(struct extent_map *em);
+
+struct extent_map *map_extent_get_block(struct ploop_io *io,
+					struct address_space *mapping,
+					sector_t start, sector_t len, int create,
+					gfp_t gfp_mask, get_block_t get_block);
+void trim_extent_mappings(struct extent_map_tree *tree, sector_t start);
+
+int ploop_dio_close(struct ploop_io * io, int rdonly);
+struct extent_map_tree * ploop_dio_open(struct ploop_io * io, int rdonly);
+void ploop_dio_downgrade(struct address_space * mapping);
+int ploop_dio_upgrade(struct ploop_io * io);
+
+int __init extent_map_init(void);
+void extent_map_exit(void);
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_kaio.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_kaio.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_kaio.c	2015-01-21 12:02:55.476901099 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_kaio.c	2015-01-21 12:02:57.709841826 +0300
@@ -0,0 +1,1023 @@
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kthread.h>
+#include <linux/mount.h>
+
+#include <linux/ploop/ploop.h>
+
+/* from fs/inode/fuse.c */
+#define FUSE_SUPER_MAGIC 0x65735546
+
+#define KAIO_PREALLOC (128 * 1024 * 1024) /* 128 MB */
+
+#define KAIO_MAX_PAGES_PER_REQ 32	  /* 128 KB */
+
+/* Used merely as a flag meaning "ploop_kaio_open() succeeded" */
+static struct extent_map_tree
+{
+} dummy_em_tree;
+
+int ploop_kaio_open(struct file * file, int rdonly);
+int ploop_kaio_close(struct address_space * mapping, int rdonly);
+void ploop_kaio_downgrade(struct address_space * mapping);
+int ploop_kaio_upgrade(struct address_space * mapping);
+
+static int __kaio_truncate(struct ploop_io * io, struct file * file, u64 pos);
+static int kaio_truncate(struct ploop_io * io, struct file * file, __u32 a_h);
+
+static void __kaio_queue_fsync_req(struct ploop_request * preq, int prio)
+{
+	struct ploop_device * plo   = preq->plo;
+	struct ploop_delta  * delta = ploop_top_delta(plo);
+	struct ploop_io     * io    = &delta->io;
+
+	if (prio)
+		list_add(&preq->list, &io->fsync_queue);
+	else
+		list_add_tail(&preq->list, &io->fsync_queue);
+
+	io->fsync_qlen++;
+	if (waitqueue_active(&io->fsync_waitq))
+		wake_up_interruptible(&io->fsync_waitq);
+}
+
+static void kaio_queue_fsync_req(struct ploop_request * preq)
+{
+	__kaio_queue_fsync_req(preq, 0);
+}
+
+static void kaio_queue_trunc_req(struct ploop_request * preq)
+{
+	__kaio_queue_fsync_req(preq, 1);
+}
+
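+/*
+ * Complete a request. Successful FUA writes are not completed right
+ * away: the FUA bit is cleared and the request is routed through the
+ * fsync queue, so the data is made durable by an explicit fsync first.
+ */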
+static void kaio_complete_io_state(struct ploop_request * preq)
+{
+	struct ploop_device * plo   = preq->plo;
+	unsigned long flags;
+
+	if (preq->error || !(preq->req_rw & BIO_FUA) ||
+	    preq->eng_state == PLOOP_E_INDEX_READ ||
+	    preq->eng_state == PLOOP_E_TRANS_INDEX_READ ||
+	    preq->eng_state == PLOOP_E_DELTA_READ ||
+	    preq->eng_state == PLOOP_E_TRANS_DELTA_READ) {
+		ploop_complete_io_state(preq);
+		return;
+	}
+
+	preq->req_rw &= ~BIO_FUA;
+
+	spin_lock_irqsave(&plo->lock, flags);
+	kaio_queue_fsync_req(preq);
+	plo->st.bio_syncwait++;
+	spin_unlock_irqrestore(&plo->lock, flags);
+}
+
+static void kaio_complete_io_request(struct ploop_request * preq)
+{
+	if (atomic_dec_and_test(&preq->io_count))
+		kaio_complete_io_state(preq);
+}
+
+struct kaio_req {
+	struct ploop_request *preq;
+	struct bio_vec	      bvecs[0];
+};
+
+static void kaio_rw_aio_complete(u64 data, long res)
+{
+	struct ploop_request * preq = (struct ploop_request *)data;
+
+	if (unlikely(res < 0)) {
+		struct bio *b = preq->aux_bio;
+		printk("kaio_rw_aio_complete: kaio failed with err=%ld "
+		       "(rw=%s; state=%ld/0x%lx; clu=%d; iblk=%d; aux=%ld)\n",
+		       res, (preq->req_rw & (1<<BIO_RW)) ? "WRITE" : "READ",
+		       preq->eng_state, preq->state, preq->req_cluster,
+		       preq->iblock, b ? b->bi_sector : -1);
+		bio_list_for_each(b, &preq->bl)
+			printk(" bio=%p: bi_sector=%ld bi_size=%d\n",
+			       b, b->bi_sector, b->bi_size);
+		ploop_set_error(preq, res);
+	}
+
+	kaio_complete_io_request(preq);
+}
+
+static void kaio_rw_kreq_complete(u64 data, long res)
+{
+	struct kaio_req *kreq = (struct kaio_req *)data;
+	struct ploop_request *preq = kreq->preq;
+
+	kfree(kreq);
+	kaio_rw_aio_complete((u64)preq, res);
+}
+
+static struct kaio_req *kaio_kreq_alloc(struct ploop_request *preq, int *nr_p)
+{
+	static const int nr = KAIO_MAX_PAGES_PER_REQ;
+	struct kaio_req *kreq;
+
+	kreq = kmalloc(offsetof(struct kaio_req, bvecs[nr]), GFP_NOFS);
+	if (kreq) {
+		*nr_p = nr;
+		kreq->preq = preq;
+	}
+
+	return kreq;
+}
+
+static int kaio_kernel_submit(struct file *file, struct kaio_req *kreq,
+		size_t nr_segs, size_t count, loff_t pos, unsigned long rw)
+{
+	struct kiocb *iocb;
+	unsigned short op;
+	struct iov_iter iter;
+	int err;
+
+	iocb = aio_kernel_alloc(GFP_NOIO);
+	if (!iocb)
+		return -ENOMEM;
+
+	if (rw & (1<<BIO_RW))
+		op = IOCB_CMD_WRITE_ITER;
+	else
+		op = IOCB_CMD_READ_ITER;
+
+	iov_iter_init_bvec(&iter, kreq->bvecs, nr_segs, count, 0);
+	aio_kernel_init_iter(iocb, file, op, &iter, pos);
+	aio_kernel_init_callback(iocb, kaio_rw_kreq_complete, (u64)kreq);
+
+	err = aio_kernel_submit(iocb);
+	if (err)
+		printk("kaio_kernel_submit: aio_kernel_submit failed with "
+		       "err=%d (rw=%s; state=%ld/0x%lx; pos=%lld; len=%ld)\n",
+		       err, (rw & (1<<BIO_RW)) ? "WRITE" : "READ",
+		       kreq->preq->eng_state, kreq->preq->state, pos, count);
+	return err;
+}
+
+/*
+ * Pack as many bios from the list pointed to by '*bio_pp' into kreq as
+ * possible, but no more than 'size' bytes. Returns 'copy', the number
+ * of bytes copied.
+ *
+ * <*bio_pp, *idx_p> acts as an iterator for walking the bio list.
+ * NB: the iterator is valid only while 'size' > 'copy'.
+ *
+ * NB: on entry, '*nr_segs' holds the capacity of kreq;
+ *     on return, it holds the actual payload.
+ */
+static size_t kaio_kreq_pack(struct kaio_req *kreq, int *nr_segs,
+			     struct bio **bio_pp, int *idx_p, size_t size)
+{
+	int kreq_nr_max = *nr_segs;
+	struct bio *b = *bio_pp;
+	int idx = *idx_p;
+	struct bio_vec *src_bv = b->bi_io_vec + idx;
+	struct bio_vec *dst_bv = kreq->bvecs;
+	size_t copy = 0;
+
+	BUG_ON(b->bi_idx);
+
+	while (1) {
+		int nr = min_t(int, kreq_nr_max, b->bi_vcnt - idx);
+		BUG_ON(!nr);
+
+		memcpy(dst_bv, src_bv, nr * sizeof(struct bio_vec));
+
+		copy += bvec_length(dst_bv, nr);
+		if (copy >= size) {
+			*nr_segs = dst_bv - kreq->bvecs + nr;
+			return size;
+		}
+
+		dst_bv += nr;
+		src_bv += nr;
+		idx += nr;
+
+		if (b->bi_vcnt == idx) {
+			b = b->bi_next;
+			BUG_ON(!b);
+			src_bv = b->bi_io_vec;
+			idx = 0;
+		}
+
+		kreq_nr_max -= nr;
+		if (kreq_nr_max == 0)
+			break;
+	}
+
+	*bio_pp = b;
+	*idx_p = idx;
+	return copy;
+}
+
+/*
+ * WRITE case:
+ *
+ * sbl is the list of bios; the first bio in the list together with iblk
+ * specifies the destination file offset; the content of the bios in sbl
+ * is the scattered source buffer.
+ *
+ * The goal is to write the source buffer to the file at the given
+ * offset. We do it by stuffing as many bvecs from the source into kreqs
+ * as possible and submitting the kreqs to in-kernel aio.
+ *
+ * READ case:
+ *
+ * The same as WRITE, but here the file plays the role of the source and
+ * the content of the bios in sbl plays the role of the destination.
+ */
+static void kaio_sbl_submit(struct file *file, struct ploop_request *preq,
+			    unsigned long rw, struct bio_list *sbl,
+			    iblock_t iblk, size_t size)
+{
+	struct bio *bio = sbl->head;
+	int idx = 0;
+
+	loff_t off = bio->bi_sector;
+	off = ((loff_t)iblk << preq->plo->cluster_log) |
+		(off & ((1<<preq->plo->cluster_log) - 1));
+
+	if (rw & (1<<BIO_RW))
+		ploop_prepare_tracker(preq, off);
+
+	off <<= 9;
+	/* from now on, 'off' is the position in the file to transmit */
+
+	WARN_ONCE(!(file->f_flags & O_DIRECT), "File opened w/o O_DIRECT");
+
+	ploop_prepare_io_request(preq);
+
+	size <<= 9;
+	while (size > 0) {
+		struct kaio_req *kreq;
+		int nr_segs;
+		size_t copy;
+		int err;
+
+		kreq = kaio_kreq_alloc(preq, &nr_segs);
+		if (!kreq) {
+			ploop_set_error(preq, -ENOMEM);
+			break;
+		}
+
+		copy = kaio_kreq_pack(kreq, &nr_segs, &bio, &idx, size);
+
+		atomic_inc(&preq->io_count);
+		err = kaio_kernel_submit(file, kreq, nr_segs, copy, off, rw);
+		if (err) {
+			ploop_set_error(preq, err);
+			ploop_complete_io_request(preq);
+			kfree(kreq);
+			break;
+		}
+
+		off += copy;
+		size -= copy;
+	}
+
+	kaio_complete_io_request(preq);
+}
+
+static void
+kaio_submit(struct ploop_io *io, struct ploop_request * preq,
+	     unsigned long rw,
+	     struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	if (rw & BIO_FLUSH) {
+		spin_lock_irq(&io->plo->lock);
+		kaio_queue_fsync_req(preq);
+		io->plo->st.bio_syncwait++;
+		spin_unlock_irq(&io->plo->lock);
+		return;
+	}
+
+	if (iblk == PLOOP_ZERO_INDEX)
+		iblk = 0;
+
+	kaio_sbl_submit(io->files.file, preq, rw, sbl, iblk, size);
+}
+
+/* returns non-zero if and only if preq was resubmitted */
+static int kaio_resubmit(struct ploop_request * preq)
+{
+	struct ploop_delta * delta = ploop_top_delta(preq->plo);
+
+	switch (preq->eng_state) {
+	case PLOOP_E_ENTRY:
+		return 0;
+	case PLOOP_E_COMPLETE:
+	case PLOOP_E_RELOC_NULLIFY:
+	case PLOOP_E_DATA_WBI:
+		if (preq->aux_bio) {
+			struct bio_list tbl;
+			tbl.head = tbl.tail = preq->aux_bio;
+			kaio_submit(&delta->io, preq, preq->req_rw, &tbl,
+				    preq->iblock, 1<<preq->plo->cluster_log);
+		} else {
+			kaio_submit(&delta->io, preq, preq->req_rw, &preq->bl,
+				    preq->iblock, preq->req_size);
+		}
+		break;
+	case PLOOP_E_TRANS_DELTA_READ:
+		/* BUG_ON below guarantees that 'case PLOOP_E_DELTA_COPIED'
+		 * is equivalent to the part of 'case PLOOP_E_TRANS_DELTA_READ'
+		 * after bio_bcopy(). This is not trivial. */
+		BUG_ON(!test_bit(PLOOP_REQ_TRANS, &preq->state));
+		/* Fall through ... */
+	case PLOOP_E_DELTA_READ:
+		preq->eng_state = PLOOP_E_DELTA_COPIED; /* skip bcopy() */
+		return 0;
+	default:
+		printk("Resubmit bad state %lu\n", preq->eng_state);
+		BUG();
+	}
+
+	return 1;
+}
+
+static inline int io2level(struct ploop_io * io)
+{
+	struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
+	return delta->level;
+}
+
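+/*
+ * Per-delta service thread: picks requests off io->fsync_queue and
+ * either grows the image file (when preq->prealloc_size is set) or
+ * issues vfs_fsync() on behalf of FLUSH/FUA requests, then puts them
+ * back on the ready queue.
+ */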
+static int kaio_fsync_thread(void * data)
+{
+	struct ploop_io * io = data;
+	struct ploop_device * plo = io->plo;
+
+	set_user_nice(current, -20);
+
+	spin_lock_irq(&plo->lock);
+	while (!kthread_should_stop() || !list_empty(&io->fsync_queue)) {
+		int err;
+		struct ploop_request * preq;
+
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&io->fsync_waitq, &_wait, TASK_INTERRUPTIBLE);
+			if (!list_empty(&io->fsync_queue) ||
+			    kthread_should_stop())
+				break;
+
+			spin_unlock_irq(&plo->lock);
+			schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&io->fsync_waitq, &_wait);
+
+		if (list_empty(&io->fsync_queue) && kthread_should_stop())
+			break;
+
+		preq = list_entry(io->fsync_queue.next, struct ploop_request, list);
+		list_del(&preq->list);
+		io->fsync_qlen--;
+		if (!preq->prealloc_size)
+			plo->st.bio_fsync++;
+		spin_unlock_irq(&plo->lock);
+
+		/* trick: preq->prealloc_size actually holds the new EOF position */
+		if (preq->prealloc_size) {
+			err = kaio_truncate(io, io->files.file,
+					    preq->prealloc_size >> (plo->cluster_log + 9));
+			if (err)
+				ploop_set_error(preq, -EIO);
+		} else {
+			struct file *file = io->files.file;
+			err = vfs_fsync(file, file->f_path.dentry, 1);
+			if (err) {
+				printk("kaio_fsync_thread: vfs_fsync failed "
+				       "with err=%d (i_ino=%ld of level=%d "
+				       "on ploop%d)\n",
+				       err, io->files.inode->i_ino,
+				       io2level(io), plo->index);
+				ploop_set_error(preq, -EIO);
+			} else if (preq->req_rw & BIO_FLUSH) {
+				BUG_ON(!preq->req_size);
+				preq->req_rw &= ~BIO_FLUSH;
+				if (kaio_resubmit(preq)) {
+					spin_lock_irq(&plo->lock);
+					continue;
+				}
+			}
+		}
+
+		spin_lock_irq(&plo->lock);
+		list_add_tail(&preq->list, &plo->ready_queue);
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+	spin_unlock_irq(&plo->lock);
+	return 0;
+}
+
+static void
+kaio_submit_alloc(struct ploop_io *io, struct ploop_request * preq,
+		 struct bio_list * sbl, unsigned int size)
+{
+	struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
+	iblock_t iblk;
+	int log = preq->plo->cluster_log + 9;
+	loff_t clu_siz = 1 << log;
+
+	if (delta->flags & PLOOP_FMT_RDONLY) {
+		ploop_fail_request(preq, -EBADF);
+		return;
+	}
+
+	iblk = io->alloc_head;
+
+	if (unlikely(preq->req_rw & BIO_FLUSH)) {
+		spin_lock_irq(&io->plo->lock);
+		kaio_queue_fsync_req(preq);
+		io->plo->st.bio_syncwait++;
+		spin_unlock_irq(&io->plo->lock);
+		return;
+	}
+
+	/* trick: preq->prealloc_size actually holds the new EOF position */
+	if (unlikely(preq->prealloc_size)) {
+		BUG_ON(preq != io->prealloc_preq);
+		io->prealloc_preq = NULL;
+
+		io->prealloced_size = preq->prealloc_size - ((loff_t)iblk << log);
+		preq->prealloc_size = 0; /* only for sanity */
+	}
+
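+	/* Running out of preallocated space: the first preq to notice it
+	 * asks the fsync thread to grow the file up to the next
+	 * KAIO_PREALLOC boundary; later preqs wait on its delay_list. */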
+	if (unlikely(io->prealloced_size < clu_siz)) {
+		if (!io->prealloc_preq) {
+			loff_t pos = (((loff_t)(iblk + 1) << log) |
+				      (KAIO_PREALLOC - 1)) + 1;
+
+			BUG_ON(preq->prealloc_size);
+			preq->prealloc_size = pos;
+			io->prealloc_preq   = preq;
+
+			spin_lock_irq(&io->plo->lock);
+			kaio_queue_trunc_req(preq);
+			io->plo->st.bio_syncwait++;
+			spin_unlock_irq(&io->plo->lock);
+			return;
+		} else { /* we're not first */
+			list_add_tail(&preq->list,
+				      &io->prealloc_preq->delay_list);
+			return;
+		}
+	}
+
+	io->prealloced_size -= clu_siz;
+	io->alloc_head++;
+
+	preq->iblock = iblk;
+	preq->eng_state = PLOOP_E_DATA_WBI;
+
+	kaio_sbl_submit(io->files.file, preq, 1<<BIO_RW, sbl, iblk, size);
+}
+
+static int kaio_release_prealloced(struct ploop_io * io)
+{
+	int ret;
+
+	if (!io->prealloced_size)
+		return 0;
+
+	ret = kaio_truncate(io, io->files.file, io->alloc_head);
+	if (ret)
+		printk("Can't release %llu prealloced bytes: "
+		       "truncate to %llu failed (%d)\n",
+		       io->prealloced_size,
+		       (loff_t)io->alloc_head << (io->plo->cluster_log + 9),
+		       ret);
+	else
+		io->prealloced_size = 0;
+
+	return ret;
+}
+
+static void
+kaio_destroy(struct ploop_io * io)
+{
+	if (io->files.file) {
+		struct file * file;
+		struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+
+		if (io->fsync_thread) {
+			kthread_stop(io->fsync_thread);
+			io->fsync_thread = NULL;
+		}
+
+		(void)kaio_release_prealloced(io);
+
+		if (io->files.em_tree) {
+			mutex_lock(&io->files.inode->i_mutex);
+			ploop_kaio_close(io->files.mapping, delta->flags & PLOOP_FMT_RDONLY);
+			mutex_unlock(&io->files.inode->i_mutex);
+		}
+
+		file = io->files.file;
+		mutex_lock(&delta->plo->sysfs_mutex);
+		io->files.file = NULL;
+		mutex_unlock(&delta->plo->sysfs_mutex);
+		fput(file);
+	}
+}
+
+static int
+kaio_sync(struct ploop_io * io)
+{
+	struct file *file = io->files.file;
+
+	return vfs_fsync(file, file->f_path.dentry, 0);
+}
+
+static int
+kaio_stop(struct ploop_io * io)
+{
+	return 0;
+}
+
+static int
+kaio_init(struct ploop_io * io)
+{
+	INIT_LIST_HEAD(&io->fsync_queue);
+	init_waitqueue_head(&io->fsync_waitq);
+
+	return 0;
+}
+
+static void
+kaio_io_page(struct ploop_io * io, int op, struct ploop_request * preq,
+	     struct page * page, sector_t sec)
+{
+
+	struct kiocb *iocb;
+	struct iov_iter iter;
+	loff_t pos = (loff_t) sec << 9;
+	struct file *file = io->files.file;
+	int err;
+
+	ploop_prepare_io_request(preq);
+
+	iocb = aio_kernel_alloc(GFP_NOIO);
+	if (!iocb) {
+		ploop_set_error(preq, -ENOMEM);
+		goto out;
+	}
+
+	iov_iter_init_page(&iter, page, PAGE_SIZE, 0);
+	aio_kernel_init_iter(iocb, file, op, &iter, pos);
+	aio_kernel_init_callback(iocb, kaio_rw_aio_complete, (u64)preq);
+
+	atomic_inc(&preq->io_count);
+
+	err = aio_kernel_submit(iocb);
+	if (err) {
+		printk("kaio_io_page: aio_kernel_submit failed with "
+		       "err=%d (rw=%s; state=%ld/0x%lx; pos=%lld)\n",
+		       err, (op == IOCB_CMD_WRITE_ITER) ? "WRITE" : "READ",
+		       preq->eng_state, preq->state, pos);
+		ploop_set_error(preq, err);
+		/* the completion callback will not run; drop its ref */
+		ploop_complete_io_request(preq);
+	}
+
+out:
+	ploop_complete_io_request(preq);
+}
+
+static void
+kaio_read_page(struct ploop_io * io, struct ploop_request * preq,
+		struct page * page, sector_t sec)
+{
+	kaio_io_page(io, IOCB_CMD_READ_ITER, preq, page, sec);
+}
+
+static void
+kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
+		 struct page * page, sector_t sec, int fua)
+{
+	ploop_prepare_tracker(preq, sec);
+	kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
+}
+
+static int
+kaio_sync_readvec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		   sector_t sec)
+{
+	return -EINVAL;
+}
+
+static int
+kaio_sync_writevec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		    sector_t sec)
+{
+	return -EINVAL;
+}
+
+struct kaio_comp {
+	struct completion comp;
+	atomic_t count;
+	int error;
+};
+
+static inline void kaio_comp_init(struct kaio_comp * c)
+{
+	init_completion(&c->comp);
+	atomic_set(&c->count, 1);
+	c->error = 0;
+}
+
+static void kaio_sync_io_complete(u64 data, long err)
+{
+
+	struct kaio_comp *comp = (struct kaio_comp *) data;
+
+	if (unlikely(err < 0)) {
+		if (!comp->error)
+			comp->error = err;
+	}
+
+	if (atomic_dec_and_test(&comp->count))
+		complete(&comp->comp);
+}
+
+static int
+kaio_sync_io(struct ploop_io * io, int op, struct page * page,
+	     unsigned int len, unsigned int off, sector_t sec)
+{
+	struct kiocb *iocb;
+	struct iov_iter iter;
+	struct bio_vec bvec;
+	loff_t pos = (loff_t) sec << 9;
+	struct file *file = io->files.file;
+	struct kaio_comp comp;
+	int err;
+
+	kaio_comp_init(&comp);
+
+	iocb = aio_kernel_alloc(GFP_NOIO);
+	if (!iocb)
+		return -ENOMEM;
+
+	bvec.bv_page = page;
+	bvec.bv_len = len;
+	bvec.bv_offset = off;
+
+	iov_iter_init_bvec(&iter, &bvec, 1, bvec_length(&bvec, 1), 0);
+	aio_kernel_init_iter(iocb, file, op, &iter, pos);
+	aio_kernel_init_callback(iocb, kaio_sync_io_complete, (u64)&comp);
+
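+	/* one reference for this function, one for the completion
+	 * callback; comp.comp fires when the last reference is dropped */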
+	atomic_inc(&comp.count);
+
+	err = aio_kernel_submit(iocb);
+	if (err) {
+		printk("kaio_sync_io: aio_kernel_submit failed with "
+		       "err=%d (rw=%s; pos=%lld; len=%d off=%d)\n",
+		       err, (op == IOCB_CMD_WRITE_ITER) ? "WRITE" : "READ",
+		       pos, len, off);
+		comp.error = err;
+		if (atomic_dec_and_test(&comp.count))
+			complete(&comp.comp);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	if (!err && comp.error)
+		printk("kaio_sync_io: kaio failed with err=%d "
+		       "(rw=%s; pos=%lld; len=%d off=%d)\n",
+		       comp.error,
+		       (op == IOCB_CMD_WRITE_ITER) ? "WRITE" : "READ",
+		       pos, len, off);
+
+	return comp.error;
+}
+
+static int
+kaio_sync_read(struct ploop_io * io, struct page * page, unsigned int len,
+		unsigned int off, sector_t sec)
+{
+	return kaio_sync_io(io, IOCB_CMD_READ_ITER, page, len, off, sec);
+}
+
+static int
+kaio_sync_write(struct ploop_io * io, struct page * page, unsigned int len,
+		 unsigned int off, sector_t sec)
+{
+	int ret;
+
+	ret = kaio_sync_io(io, IOCB_CMD_WRITE_ITER, page, len, off, sec);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return ret;
+}
+
+static int kaio_alloc_sync(struct ploop_io * io, loff_t pos, loff_t len)
+{
+	return __kaio_truncate(io, io->files.file, pos + len);
+}
+
+static int kaio_open(struct ploop_io * io)
+{
+	struct file * file = io->files.file;
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+	int err;
+
+	if (file == NULL)
+		return -EBADF;
+
+	io->files.mapping = file->f_mapping;
+	io->files.inode = io->files.mapping->host;
+	io->files.bdev = io->files.inode->i_sb->s_bdev;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	err = ploop_kaio_open(file, delta->flags & PLOOP_FMT_RDONLY);
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (err)
+		return err;
+
+	io->files.em_tree = &dummy_em_tree;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY)) {
+		io->fsync_thread = kthread_create(kaio_fsync_thread,
+						  io, "ploop_fsync%d",
+						  delta->plo->index);
+		if (io->fsync_thread == NULL) {
+			ploop_kaio_close(io->files.mapping, 0);
+			return -ENOMEM;
+		}
+
+		wake_up_process(io->fsync_thread);
+	}
+
+	return 0;
+}
+
+static int kaio_prepare_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	int err;
+
+	file = dentry_open(dget(F_DENTRY(file)), mntget(F_MNT(file)),
+			   O_RDONLY|O_LARGEFILE|O_DIRECT, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	err = vfs_fsync(file, file->f_path.dentry, 0);
+	if (err) {
+		fput(file);
+		return err;
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int kaio_complete_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	int ret;
+
+	ret = kaio_release_prealloced(io);
+	if (ret)
+		return ret;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	ploop_kaio_downgrade(io->files.mapping);
+
+	if (io->fsync_thread) {
+		kthread_stop(io->fsync_thread);
+		io->fsync_thread = NULL;
+	}
+
+	fput(file);
+	return 0;
+}
+
+static int kaio_prepare_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	int err;
+
+	file = dentry_open(dget(F_DENTRY(file)), mntget(F_MNT(file)),
+			   O_RDWR|O_LARGEFILE|O_DIRECT, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host) {
+		err = -EINVAL;
+		goto prep_merge_done;
+	}
+
+	err = vfs_fsync(file, file->f_path.dentry, 0);
+	if (err)
+		goto prep_merge_done;
+
+	err = ploop_kaio_upgrade(io->files.mapping);
+	if (err)
+		goto prep_merge_done;
+
+	io->fsync_thread = kthread_create(kaio_fsync_thread,
+					  io, "ploop_fsync%d",
+					  io->plo->index);
+	if (io->fsync_thread == NULL) {
+		err = -ENOMEM;
+		goto prep_merge_done;
+	}
+
+	wake_up_process(io->fsync_thread);
+
+	sd->file = file;
+
+prep_merge_done:
+	if (err)
+		fput(file);
+	return err;
+}
+
+static int kaio_start_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	fput(file);
+	return 0;
+}
+
+static int __kaio_truncate(struct ploop_io * io, struct file * file, u64 pos)
+{
+	int err;
+	struct iattr newattrs;
+
+	if (file->f_mapping != io->files.mapping)
+		return -EINVAL;
+
+	newattrs.ia_size  = pos;
+	newattrs.ia_valid = ATTR_SIZE;
+
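+	/* The image is flagged S_SWAPFILE to protect it from userspace
+	 * modification; lift the flag while changing the size, then
+	 * restore it */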
+	mutex_lock(&io->files.inode->i_mutex);
+	io->files.inode->i_flags &= ~S_SWAPFILE;
+	err = notify_change(F_DENTRY(file), &newattrs);
+	io->files.inode->i_flags |= S_SWAPFILE;
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (err) {
+		printk("__kaio_truncate(i_ino=%ld of level=%d on ploop%d, "
+		       "pos=%lld): notify_change failed with err=%d "
+		       "(i_size=%lld)\n",
+		       io->files.inode->i_ino, io2level(io), io->plo->index,
+		       pos, err, i_size_read(io->files.inode));
+		return err;
+	}
+
+	err = vfs_fsync(file, file->f_path.dentry, 0);
+
+	if (err)
+		printk("__kaio_truncate(i_ino=%ld of level=%d on ploop%d, "
+		       "pos=%lld): vfs_fsync failed with err=%d\n",
+		       io->files.inode->i_ino, io2level(io), io->plo->index,
+		       pos, err);
+
+	return err;
+}
+
+static int kaio_truncate(struct ploop_io * io, struct file * file,
+			  __u32 alloc_head)
+{
+	return __kaio_truncate(io, file,
+			       (u64)alloc_head << (io->plo->cluster_log + 9));
+}
+
+static void kaio_unplug(struct ploop_io * io)
+{
+	blk_run_address_space(io->files.file->f_mapping);
+}
+
+static void kaio_queue_settings(struct ploop_io * io, struct request_queue * q)
+{
+	blk_set_stacking_limits(&q->limits);
+}
+
+static void kaio_issue_flush(struct ploop_io * io, struct ploop_request *preq)
+{
+	struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
+
+	preq->eng_state = PLOOP_E_COMPLETE;
+	preq->req_rw &= ~BIO_FLUSH;
+
+	spin_lock_irq(&io->plo->lock);
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		list_add_tail(&preq->list, &io->plo->ready_queue);
+	else
+		kaio_queue_fsync_req(preq);
+
+	spin_unlock_irq(&io->plo->lock);
+}
+
+static int kaio_autodetect(struct ploop_io * io)
+{
+	struct file  * file  = io->files.file;
+	struct inode * inode = file->f_mapping->host;
+
+	if (inode->i_sb->s_magic != FUSE_SUPER_MAGIC)
+		return -1; /* not mine */
+
+	if (!(file->f_flags & O_DIRECT)) {
+		ploop_io_report_fn(file, "File opened w/o O_DIRECT");
+		return -1;
+	}
+
+	if (file->f_mapping->a_ops->direct_IO_bvec == NULL) {
+		printk("Cannot run kaio over fs (%s) w/o direct_IO_bvec\n",
+		       file->f_mapping->host->i_sb->s_type->name);
+		return -1;
+	}
+
+	if (file->f_mapping->a_ops->direct_IO_page == NULL) {
+		printk("Cannot run kaio over fs (%s) w/o direct_IO_page\n",
+		       file->f_mapping->host->i_sb->s_type->name);
+		return -1;
+	}
+
+	return 0;
+}
+
+static struct ploop_io_ops ploop_io_ops_kaio =
+{
+	.id		=	PLOOP_IO_KAIO,
+	.name		=	"kaio",
+	.owner		=	THIS_MODULE,
+
+	.unplug		=	kaio_unplug,
+
+	.alloc		=	kaio_alloc_sync,
+	.submit		=	kaio_submit,
+	.submit_alloc	=	kaio_submit_alloc,
+	.read_page	=	kaio_read_page,
+	.write_page	=	kaio_write_page,
+	.sync_read	=	kaio_sync_read,
+	.sync_write	=	kaio_sync_write,
+	.sync_readvec	=	kaio_sync_readvec,
+	.sync_writevec	=	kaio_sync_writevec,
+
+	.init		=	kaio_init,
+	.destroy	=	kaio_destroy,
+	.open		=	kaio_open,
+	.sync		=	kaio_sync,
+	.stop		=	kaio_stop,
+	.prepare_snapshot =	kaio_prepare_snapshot,
+	.complete_snapshot =	kaio_complete_snapshot,
+	.prepare_merge	=	kaio_prepare_merge,
+	.start_merge	=	kaio_start_merge,
+	.truncate	=	kaio_truncate,
+
+	.queue_settings	=	kaio_queue_settings,
+	.issue_flush	=	kaio_issue_flush,
+
+	.i_size_read	=	generic_i_size_read,
+	.f_mode		=	generic_f_mode,
+
+	.autodetect     =       kaio_autodetect,
+};
+
+static int __init pio_kaio_mod_init(void)
+{
+	return ploop_register_io(&ploop_io_ops_kaio);
+}
+
+static void __exit pio_kaio_mod_exit(void)
+{
+	ploop_unregister_io(&ploop_io_ops_kaio);
+}
+
+module_init(pio_kaio_mod_init);
+module_exit(pio_kaio_mod_exit);
+
+MODULE_LICENSE("GPL");
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_kaio_map.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_kaio_map.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_kaio_map.c	2015-01-21 12:02:55.548899187 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_kaio_map.c	2015-01-21 12:02:55.574898497 +0300
@@ -0,0 +1,126 @@
+#include <linux/ploop/ploop.h>
+
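+/*
+ * Registry of image files opened via kaio: 'readers' > 0 is the number
+ * of read-only openers, 'readers' == -1 means an exclusive writer.
+ */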
+struct ploop_mapping
+{
+	struct list_head	list;
+	struct address_space	* mapping;
+	int			readers;
+};
+
+static LIST_HEAD(ploop_mappings);
+static DEFINE_SPINLOCK(ploop_mappings_lock);
+
+int ploop_kaio_open(struct file * file, int rdonly)
+{
+	int err = 0;
+	struct ploop_mapping *m, *pm;
+	struct address_space * mapping = file->f_mapping;
+
+	pm = kzalloc(sizeof(struct ploop_mapping), GFP_KERNEL);
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				if (m->readers < 0)
+					err = -ETXTBSY;
+				else
+					m->readers++;
+			} else {
+				if (m->readers)
+					err = -EBUSY;
+				else
+					m->readers = -1;
+			}
+			goto kaio_open_done;
+		}
+	}
+
+	if (pm == NULL) {
+		err = -ENOMEM;
+		goto kaio_open_done;
+	}
+
+	if (mapping->host->i_flags & S_SWAPFILE) {
+		err = -EBUSY;
+		goto kaio_open_done;
+	}
+
+	pm->mapping = mapping;
+	pm->readers = rdonly ? 1 : -1;
+	list_add(&pm->list, &ploop_mappings);
+	pm = NULL;
+	mapping->host->i_flags |= S_SWAPFILE;
+
+kaio_open_done:
+	spin_unlock(&ploop_mappings_lock);
+	if (pm)
+		kfree(pm);
+	return err;
+}
+
+int ploop_kaio_close(struct address_space * mapping, int rdonly)
+{
+	struct ploop_mapping *m, *pm = NULL;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				m->readers--;
+			} else {
+				BUG_ON(m->readers != -1);
+				m->readers = 0;
+			}
+
+			if (m->readers == 0) {
+				mapping->host->i_flags &= ~S_SWAPFILE;
+				list_del(&m->list);
+				pm = m;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+
+	if (pm) {
+		kfree(pm);
+		return 0;
+	}
+	return -ENOENT;
+}
+
+void ploop_kaio_downgrade(struct address_space * mapping)
+{
+	struct ploop_mapping * m;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			BUG_ON(m->readers != -1);
+			m->readers = 1;
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+}
+
+int ploop_kaio_upgrade(struct address_space * mapping)
+{
+	struct ploop_mapping * m;
+	int err = -ESRCH;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			err = -EBUSY;
+			if (m->readers == 1) {
+				m->readers = -1;
+				err = 0;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+	return err;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_nfs.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_nfs.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/io_nfs.c	2015-01-21 12:02:54.712921377 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/io_nfs.c	2015-01-21 12:02:57.696842173 +0300
@@ -0,0 +1,1702 @@
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kthread.h>
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/atomic.h>
+
+#include <linux/mount.h>
+
+#include <linux/ploop/ploop.h>
+
+#define MAX_NBIO_PAGES	32
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,25)
+struct workqueue_struct *nfsio_workqueue;
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
+{
+	return filp->private_data;
+}
+#endif
+
+static struct nfs_write_data *nfsio_wbio_alloc(unsigned int pagecount);
+static struct nfs_read_data *nfsio_rbio_alloc(unsigned int pagecount);
+static struct nfs_commit_data *nfsio_cbio_alloc(void);
+static void nfsio_rbio_release(void *);
+static void nfsio_wbio_release(void *);
+static void nfsio_cbio_release(void *);
+static int verify_bounce(struct nfs_write_data * nreq);
+
+extern int nfs_initiate_commit(struct rpc_clnt *clnt,
+			       struct nfs_commit_data *data,
+			       const struct rpc_call_ops *call_ops,
+			       int how);
+
+void nfsio_complete_io_state(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	unsigned long flags;
+
+	spin_lock_irqsave(&plo->lock, flags);
+	if (preq->error)
+		set_bit(PLOOP_S_ABORT, &plo->state);
+
+	if (!preq->error &&
+	    test_bit(PLOOP_REQ_UNSTABLE, &preq->state)) {
+		struct ploop_io * io = &map_writable_delta(preq)->io;
+		list_add_tail(&preq->list, &io->fsync_queue);
+		io->fsync_qlen++;
+		if (waitqueue_active(&io->fsync_waitq))
+			wake_up_interruptible(&io->fsync_waitq);
+		plo->st.bio_syncwait++;
+	} else {
+		list_add_tail(&preq->list, &plo->ready_queue);
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+		    waitqueue_active(&plo->waitq))
+			wake_up_interruptible(&plo->waitq);
+	}
+	spin_unlock_irqrestore(&plo->lock, flags);
+}
+
+
+static inline void nfsio_prepare_io_request(struct ploop_request * preq)
+{
+	atomic_set(&preq->io_count, 1);
+}
+
+static inline void nfsio_complete_io_request(struct ploop_request * preq)
+{
+	if (atomic_dec_and_test(&preq->io_count))
+		nfsio_complete_io_state(preq);
+}
+
+
+static void nfsio_read_result(struct rpc_task *task, void *calldata)
+{
+	int status;
+	struct nfs_read_data *nreq = calldata;
+
+	status = NFS_PROTO(nreq->header->inode)->read_done(task, nreq);
+	if (status != 0)
+		return;
+
+	if (task->tk_status == -ESTALE) {
+		set_bit(NFS_INO_STALE, &NFS_I(nreq->header->inode)->flags);
+		nfs_mark_for_revalidate(nreq->header->inode);
+	}
+}
+
+static void nfsio_read_release(void *calldata)
+{
+	struct nfs_read_data *nreq = calldata;
+	struct ploop_request *preq = (struct ploop_request *) nreq->header->req;
+	int status = nreq->task.tk_status;
+
+	if (unlikely(status < 0))
+		ploop_set_error(preq, status);
+
+	ploop_complete_io_request(preq);
+
+	nfsio_rbio_release(calldata);
+}
+
+static const struct rpc_call_ops nfsio_read_ops = {
+	.rpc_call_done = nfsio_read_result,
+	.rpc_release = nfsio_read_release,
+};
+
+
+static struct nfs_read_data *
+rbio_init(loff_t pos, struct page * page, unsigned int off, unsigned int len,
+	  void * priv, struct inode * inode)
+{
+	struct nfs_read_data * nreq;
+
+	nreq = nfsio_rbio_alloc(MAX_NBIO_PAGES);
+	if (unlikely(nreq == NULL))
+		return NULL;
+
+	nreq->args.offset = pos;
+	nreq->args.pgbase = off;
+	nreq->args.count = len;
+	nreq->pages.pagevec[0] = page;
+	nreq->pages.npages = 1;
+	nreq->header->req = priv;
+	nreq->header->inode = inode;
+	nreq->args.fh = NFS_FH(inode);
+	nreq->args.pages = nreq->pages.pagevec;
+	nreq->res.fattr = &nreq->fattr;
+	nreq->res.eof = 0;
+	return nreq;
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
+
+static int
+rbio_submit(struct ploop_io * io, struct nfs_read_data * nreq,
+	    const struct rpc_call_ops * cb)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(io->files.file);
+	struct inode *inode = io->files.inode;
+	struct rpc_task *task;
+
+	struct rpc_message msg = {
+		.rpc_cred = ctx->cred,
+	};
+
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
+		.callback_ops = cb,
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,25)
+		.workqueue = nfsio_workqueue,
+#endif
+		.flags = RPC_TASK_ASYNC,
+	};
+
+	nreq->res.count = nreq->args.count;
+	nreq->header->cred = msg.rpc_cred;
+	nreq->args.context = ctx;
+
+	task_setup_data.task = &nreq->task;
+	task_setup_data.callback_data = nreq;
+	msg.rpc_argp = &nreq->args;
+	msg.rpc_resp = &nreq->res;
+	NFS_PROTO(inode)->read_setup(nreq, &msg);
+
+	task = rpc_run_task(&task_setup_data);
+	if (unlikely(IS_ERR(task)))
+		return PTR_ERR(task);
+
+	rpc_put_task(task);
+	return 0;
+}
+
+#else
+
+static int
+rbio_submit(struct ploop_io * io, struct nfs_read_data * nreq,
+	    const struct rpc_call_ops * cb)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(io->files.file);
+	struct inode *inode = io->files.inode;
+
+	nreq->res.count = nreq->args.count;
+	nreq->cred = ctx->cred;
+	nreq->args.context = ctx;
+
+	rpc_init_task(&nreq->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, cb, nreq);
+	NFS_PROTO(inode)->read_setup(nreq);
+
+	nreq->task.tk_cookie = (unsigned long) inode;
+
+	lock_kernel();
+	rpc_execute(&nreq->task);
+	unlock_kernel();
+	return 0;
+}
+#endif
+
+static void
+nfsio_submit_read(struct ploop_io *io, struct ploop_request * preq,
+		  struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	struct inode *inode = io->files.inode;
+	size_t rsize = NFS_SERVER(inode)->rsize;
+	struct nfs_read_data *nreq = NULL;
+	loff_t pos;
+	unsigned int prev_end;
+	struct bio * b;
+
+	ploop_prepare_io_request(preq);
+
+	pos = sbl->head->bi_sector;
+	pos = ((loff_t)iblk << preq->plo->cluster_log) | (pos & ((1<<preq->plo->cluster_log) - 1));
+	pos <<= 9;
+
+	prev_end = PAGE_SIZE;
+
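+	/* Batch bvecs into as few READ RPCs as possible: a bvec joins the
+	 * current request if it directly continues the previous one and
+	 * the request stays within rsize and MAX_NBIO_PAGES; prev_end is
+	 * where the previous bvec ended within its page. */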
+	for (b = sbl->head; b != NULL; b = b->bi_next) {
+		int bv_idx;
+
+		for (bv_idx = 0; bv_idx < b->bi_vcnt; bv_idx++) {
+			struct bio_vec * bv = &b->bi_io_vec[bv_idx];
+
+			if (nreq && nreq->args.count + bv->bv_len <= rsize) {
+				if (nreq->pages.pagevec[nreq->pages.npages-1] == bv->bv_page &&
+				    prev_end == bv->bv_offset) {
+					nreq->args.count += bv->bv_len;
+					pos += bv->bv_len;
+					prev_end += bv->bv_len;
+					continue;
+				}
+				if (nreq->pages.npages < MAX_NBIO_PAGES &&
+				    bv->bv_offset == 0 && prev_end == PAGE_SIZE) {
+					nreq->args.count += bv->bv_len;
+					nreq->pages.pagevec[nreq->pages.npages] = bv->bv_page;
+					nreq->pages.npages++;
+					pos += bv->bv_len;
+					prev_end = bv->bv_offset + bv->bv_len;
+					continue;
+				}
+			}
+
+			if (nreq) {
+				int err;
+
+				atomic_inc(&preq->io_count);
+
+				err = rbio_submit(io, nreq, &nfsio_read_ops);
+				if (err) {
+					ploop_set_error(preq, err);
+					ploop_complete_io_request(preq);
+					goto out;
+				}
+			}
+
+			nreq = rbio_init(pos, bv->bv_page, bv->bv_offset,
+					 bv->bv_len, preq, inode);
+
+			if (nreq == NULL) {
+				ploop_set_error(preq, -ENOMEM);
+				goto out;
+			}
+
+			pos += bv->bv_len;
+			prev_end = bv->bv_offset + bv->bv_len;
+		}
+	}
+
+	if (nreq) {
+		int err;
+
+		atomic_inc(&preq->io_count);
+
+		err = rbio_submit(io, nreq, &nfsio_read_ops);
+		if (err) {
+			ploop_set_error(preq, err);
+			ploop_complete_io_request(preq);
+			goto out;
+		}
+	}
+
+out:
+	ploop_complete_io_request(preq);
+}
+
+static void nfsio_write_result(struct rpc_task *task, void *calldata)
+{
+	struct nfs_write_data *data = calldata;
+	struct nfs_writeargs	*argp = &data->args;
+	struct nfs_writeres	*resp = &data->res;
+	int status;
+
+	status = NFS_PROTO(data->header->inode)->write_done(task, data);
+	if (status != 0)
+		return;
+
+	if (task->tk_status >= 0 && resp->count < argp->count)
+		task->tk_status = -EIO;
+}
+
+static void nfsio_write_release(void *calldata)
+{
+	struct nfs_write_data *nreq = calldata;
+	struct ploop_request *preq = (struct ploop_request *) nreq->header->req;
+	int status = nreq->task.tk_status;
+
+	if (unlikely(status < 0))
+		ploop_set_error(preq, status);
+
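+	/* The server has not committed the data to stable storage yet:
+	 * remember its write verifier so that the request is finished
+	 * with a COMMIT via the fsync queue. */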
+	if (!preq->error &&
+	    nreq->res.verf->committed != NFS_FILE_SYNC) {
+		if (!test_and_set_bit(PLOOP_REQ_UNSTABLE, &preq->state))
+			memcpy(&preq->verf, &nreq->res.verf->verifier, 8);
+	}
+	nfsio_complete_io_request(preq);
+
+	nfsio_wbio_release(calldata);
+}
+
+static const struct rpc_call_ops nfsio_write_ops = {
+	.rpc_call_done = nfsio_write_result,
+	.rpc_release = nfsio_write_release,
+};
+
+static struct nfs_write_data *
+wbio_init(loff_t pos, struct page * page, unsigned int off, unsigned int len,
+	  void * priv, struct inode * inode)
+{
+	struct nfs_write_data * nreq;
+
+	nreq = nfsio_wbio_alloc(MAX_NBIO_PAGES);
+	if (unlikely(nreq == NULL))
+		return NULL;
+
+	nreq->args.offset = pos;
+	nreq->args.pgbase = off;
+	nreq->args.count = len;
+	nreq->pages.pagevec[0] = page;
+	nreq->pages.npages = 1;
+	nreq->header->req = priv;
+	nreq->header->inode = inode;
+	nreq->args.fh = NFS_FH(inode);
+	nreq->args.pages = nreq->pages.pagevec;
+	nreq->args.stable = NFS_UNSTABLE;
+	nreq->res.fattr = &nreq->fattr;
+	nreq->res.verf = &nreq->verf;
+	return nreq;
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
+
+static int wbio_submit(struct ploop_io * io, struct nfs_write_data *nreq,
+		       const struct rpc_call_ops * cb)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(io->files.file);
+	struct inode *inode = io->files.inode;
+
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_cred = ctx->cred,
+	};
+
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
+		.callback_ops = cb,
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,25)
+		.workqueue = nfsio_workqueue,
+#endif
+		.flags = RPC_TASK_ASYNC,
+	};
+
+	if (verify_bounce(nreq))
+		return -ENOMEM;
+
+	nreq->res.count = nreq->args.count;
+	nreq->args.context = ctx;
+	nreq->header->cred = msg.rpc_cred;
+
+	task_setup_data.task = &nreq->task;
+	task_setup_data.callback_data = nreq;
+	msg.rpc_argp = &nreq->args;
+	msg.rpc_resp = &nreq->res;
+	NFS_PROTO(inode)->write_setup(nreq, &msg);
+
+	task = rpc_run_task(&task_setup_data);
+	if (unlikely(IS_ERR(task)))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
+}
+
+#else
+
+static int wbio_submit(struct ploop_io * io, struct nfs_write_data *nreq,
+		       const struct rpc_call_ops * cb)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(io->files.file);
+	struct inode *inode = io->files.inode;
+
+	if (verify_bounce(nreq))
+		return -ENOMEM;
+
+	nreq->res.count = nreq->args.count;
+	nreq->args.context = ctx;
+	nreq->cred = ctx->cred;
+
+	rpc_init_task(&nreq->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, cb, nreq);
+	NFS_PROTO(inode)->write_setup(nreq, NFS_UNSTABLE);
+
+	nreq->task.tk_priority = RPC_PRIORITY_NORMAL;
+	nreq->task.tk_cookie = (unsigned long) inode;
+
+	lock_kernel();
+	rpc_execute(&nreq->task);
+	unlock_kernel();
+	return 0;
+}
+
+#endif
+
+
+static void
+nfsio_submit_write(struct ploop_io *io, struct ploop_request * preq,
+		   struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	struct inode *inode = io->files.inode;
+	size_t wsize = NFS_SERVER(inode)->wsize;
+	struct nfs_write_data *nreq = NULL;
+	loff_t pos;
+	struct bio * b;
+	unsigned int prev_end;
+
+	nfsio_prepare_io_request(preq);
+
+	pos = sbl->head->bi_sector;
+	pos = ((loff_t)iblk << preq->plo->cluster_log) | (pos & ((1<<preq->plo->cluster_log) - 1));
+	ploop_prepare_tracker(preq, pos);
+	pos <<= 9;
+
+	prev_end = PAGE_SIZE;
+
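+	/* Same bvec batching as in nfsio_submit_read(), but WRITE RPCs
+	 * are capped by wsize */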
+	for (b = sbl->head; b != NULL; b = b->bi_next) {
+		int bv_idx;
+
+		for (bv_idx = 0; bv_idx < b->bi_vcnt; bv_idx++) {
+			struct bio_vec * bv = &b->bi_io_vec[bv_idx];
+
+			if (nreq && nreq->args.count + bv->bv_len <= wsize) {
+				if (nreq->pages.pagevec[nreq->pages.npages-1] == bv->bv_page &&
+				    prev_end == bv->bv_offset) {
+					nreq->args.count += bv->bv_len;
+					pos += bv->bv_len;
+					prev_end += bv->bv_len;
+					continue;
+				}
+				if (nreq->pages.npages < MAX_NBIO_PAGES &&
+				    bv->bv_offset == 0 && prev_end == PAGE_SIZE) {
+					nreq->args.count += bv->bv_len;
+					nreq->pages.pagevec[nreq->pages.npages] = bv->bv_page;
+					nreq->pages.npages++;
+					pos += bv->bv_len;
+					prev_end = bv->bv_offset + bv->bv_len;
+					continue;
+				}
+			}
+
+			if (nreq) {
+				int err;
+				atomic_inc(&preq->io_count);
+				err = wbio_submit(io, nreq, &nfsio_write_ops);
+				if (err) {
+					ploop_set_error(preq, err);
+					nfsio_complete_io_request(preq);
+					goto out;
+				}
+			}
+
+			nreq = wbio_init(pos, bv->bv_page, bv->bv_offset,
+					 bv->bv_len, preq, inode);
+
+			if (nreq == NULL) {
+				ploop_set_error(preq, -ENOMEM);
+				goto out;
+			}
+
+			prev_end = bv->bv_offset + bv->bv_len;
+			pos += bv->bv_len;
+		}
+	}
+
+	if (nreq) {
+		int err;
+		atomic_inc(&preq->io_count);
+		err = wbio_submit(io, nreq, &nfsio_write_ops);
+		if (err) {
+			ploop_set_error(preq, err);
+			nfsio_complete_io_request(preq);
+		}
+	}
+
+out:
+	nfsio_complete_io_request(preq);
+}
+
+static void
+nfsio_submit(struct ploop_io *io, struct ploop_request * preq,
+	     unsigned long rw,
+	     struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	if (iblk == PLOOP_ZERO_INDEX)
+		iblk = 0;
+
+	if (rw & (1<<BIO_RW))
+		nfsio_submit_write(io, preq, sbl, iblk, size);
+	else
+		nfsio_submit_read(io, preq, sbl, iblk, size);
+}
+
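+/*
+ * Cursor for walking a bio list one bio_vec at a time: the current bio,
+ * the index of the current bio_vec within it, and the number of bytes
+ * already consumed from that bio_vec.
+ */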
+struct bio_list_walk
+{
+	struct bio * cur;
+	int idx;
+	int bv_off;
+};
+
+static void
+nfsio_submit_write_pad(struct ploop_io *io, struct ploop_request * preq,
+		       struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	struct inode *inode = io->files.inode;
+	size_t wsize = NFS_SERVER(inode)->wsize;
+	struct nfs_write_data *nreq = NULL;
+	struct bio_list_walk bw;
+	unsigned prev_end;
+
+	loff_t pos, end_pos, start, end;
+
+	/* pos..end_pos is the range which we are going to write */
+	pos = (loff_t)iblk << (preq->plo->cluster_log + 9);
+	end_pos = pos + (1 << (preq->plo->cluster_log + 9));
+
+	/* start..end is data that we have. The rest must be zero padded. */
+	start = pos + ((sbl->head->bi_sector & ((1<<preq->plo->cluster_log) - 1)) << 9);
+	end = start + (size << 9);
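+	/*
+	 * For example, with cluster_log == 11 (1MB clusters) and iblk == 3,
+	 * pos is 3MB and end_pos is 4MB; the gaps [pos, start) and
+	 * [end, end_pos) are filled from ZERO_PAGE(0) in the loop below.
+	 */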
+
+	nfsio_prepare_io_request(preq);
+	ploop_prepare_tracker(preq, start >> 9);
+
+	prev_end = PAGE_SIZE;
+
+#if 1
+	/* Silence GCC's "may be used uninitialized" warning. */
+	bw.cur = sbl->head;
+	bw.idx = 0;
+	bw.bv_off = 0;
+	BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+#endif
+
+	while (pos < end_pos) {
+		struct page * page;
+		unsigned int poff, plen;
+
+		if (pos < start) {
+			page = ZERO_PAGE(0);
+			poff = 0;
+			plen = start - pos;
+			if (plen > PAGE_SIZE)
+				plen = PAGE_SIZE;
+		} else if (pos >= end) {
+			page = ZERO_PAGE(0);
+			poff = 0;
+			plen = end_pos - pos;
+			if (plen > PAGE_SIZE)
+				plen = PAGE_SIZE;
+		} else {
+			/* pos >= start && pos < end */
+			struct bio_vec * bv;
+
+			if (pos == start) {
+				bw.cur = sbl->head;
+				bw.idx = 0;
+				bw.bv_off = 0;
+				BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+			}
+			bv = bw.cur->bi_io_vec + bw.idx;
+
+			if (bw.bv_off >= bv->bv_len) {
+				bw.idx++;
+				bv++;
+				bw.bv_off = 0;
+				if (bw.idx >= bw.cur->bi_vcnt) {
+					bw.cur = bw.cur->bi_next;
+					bw.idx = 0;
+					bw.bv_off = 0;
+					bv = bw.cur->bi_io_vec;
+				}
+				BUG_ON(bv->bv_len & 511);
+			}
+
+			page = bv->bv_page;
+			poff = bv->bv_offset + bw.bv_off;
+			plen = bv->bv_len - bw.bv_off;
+		}
+
+		if (nreq && nreq->args.count + plen <= wsize) {
+			if (nreq->pages.pagevec[nreq->pages.npages-1] == page &&
+			    prev_end == poff) {
+				nreq->args.count += plen;
+				pos += plen;
+				bw.bv_off += plen;
+				prev_end += plen;
+				continue;
+			}
+			if (nreq->pages.npages < MAX_NBIO_PAGES &&
+			    poff == 0 && prev_end == PAGE_SIZE) {
+				nreq->args.count += plen;
+				nreq->pages.pagevec[nreq->pages.npages] = page;
+				nreq->pages.npages++;
+				pos += plen;
+				bw.bv_off += plen;
+				prev_end = poff + plen;
+				continue;
+			}
+		}
+
+		if (nreq) {
+			int err;
+			atomic_inc(&preq->io_count);
+			err = wbio_submit(io, nreq, &nfsio_write_ops);
+			if (err) {
+				ploop_set_error(preq, err);
+				nfsio_complete_io_request(preq);
+				goto out;
+			}
+		}
+
+		nreq = wbio_init(pos, page, poff, plen, preq, inode);
+
+		if (nreq == NULL) {
+			ploop_set_error(preq, -ENOMEM);
+			goto out;
+		}
+
+		prev_end = poff + plen;
+		pos += plen;
+		bw.bv_off += plen;
+	}
+
+	if (nreq) {
+		int err;
+		atomic_inc(&preq->io_count);
+		err = wbio_submit(io, nreq, &nfsio_write_ops);
+		if (err) {
+			ploop_set_error(preq, err);
+			nfsio_complete_io_request(preq);
+		}
+	}
+
+out:
+	nfsio_complete_io_request(preq);
+}
+
+
+static void
+nfsio_submit_alloc(struct ploop_io *io, struct ploop_request * preq,
+		 struct bio_list * sbl, unsigned int size)
+{
+	iblock_t iblk = io->alloc_head++;
+
+	if (!(io->files.file->f_mode & FMODE_WRITE)) {
+		ploop_fail_request(preq, -EBADF);
+		return;
+	}
+	preq->iblock = iblk;
+	preq->eng_state = PLOOP_E_DATA_WBI;
+
+	nfsio_submit_write_pad(io, preq, sbl, iblk, size);
+}
+
+static void nfsio_destroy(struct ploop_io * io)
+{
+	if (io->fsync_thread) {
+		kthread_stop(io->fsync_thread);
+		io->fsync_thread = NULL;
+	}
+
+	if (io->files.file) {
+		struct file * file = io->files.file;
+		mutex_lock(&io->plo->sysfs_mutex);
+		io->files.file = NULL;
+		if (io->files.mapping)
+			(void)invalidate_inode_pages2(io->files.mapping);
+		mutex_unlock(&io->plo->sysfs_mutex);
+		fput(file);
+	}
+}
+
+static int nfsio_sync(struct ploop_io * io)
+{
+	return 0;
+}
+
+static int nfsio_stop(struct ploop_io * io)
+{
+	return 0;
+}
+
+
+static int
+nfsio_init(struct ploop_io * io)
+{
+	INIT_LIST_HEAD(&io->fsync_queue);
+	init_waitqueue_head(&io->fsync_waitq);
+	return 0;
+}
+
+
+static void
+nfsio_read_page(struct ploop_io * io, struct ploop_request * preq,
+		struct page * page, sector_t sec)
+{
+	struct inode *inode = io->files.inode;
+	struct nfs_read_data *nreq;
+	int err;
+
+	ploop_prepare_io_request(preq);
+
+	nreq = rbio_init((loff_t)sec << 9, page, 0, PAGE_SIZE, preq, inode);
+	if (nreq == NULL) {
+		ploop_set_error(preq, -ENOMEM);
+		goto out;
+	}
+
+	atomic_inc(&preq->io_count);
+
+	err = rbio_submit(io, nreq, &nfsio_read_ops);
+	if (err) {
+		ploop_set_error(preq, err);
+		ploop_complete_io_request(preq);
+	}
+
+out:
+	ploop_complete_io_request(preq);
+}
+
+static void
+nfsio_write_page(struct ploop_io * io, struct ploop_request * preq,
+		 struct page * page, sector_t sec, int fua)
+{
+	struct inode *inode = io->files.inode;
+	struct nfs_write_data *nreq;
+	int err;
+
+	nfsio_prepare_io_request(preq);
+	ploop_prepare_tracker(preq, sec);
+
+	nreq = wbio_init((loff_t)sec << 9, page, 0, PAGE_SIZE, preq, inode);
+
+	if (nreq == NULL) {
+		ploop_set_error(preq, -ENOMEM);
+		goto out;
+	}
+
+	atomic_inc(&preq->io_count);
+	err = wbio_submit(io, nreq, &nfsio_write_ops);
+	if (err) {
+		ploop_set_error(preq, err);
+		nfsio_complete_io_request(preq);
+	}
+
+out:
+	nfsio_complete_io_request(preq);
+}
+
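+/*
+ * Completion counter for synchronous request batches. The count starts
+ * at 1; that reference belongs to the submitter. Each issued RPC takes
+ * one more reference and drops it from its ->rpc_release callback, so
+ * complete() fires only after submission has finished and every RPC has
+ * been released.
+ */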
+struct nfsio_comp
+{
+	struct completion comp;
+	atomic_t count;
+	int error;
+	u64 * verf;
+};
+
+static inline void nfsio_comp_init(struct nfsio_comp * c)
+{
+	init_completion(&c->comp);
+	atomic_set(&c->count, 1);
+	c->error = 0;
+}
+
+static void nfsio_read_release_sync(void *calldata)
+{
+	struct nfs_read_data *nreq = calldata;
+	struct nfsio_comp *comp = (struct nfsio_comp *) nreq->header->req;
+	int status = nreq->task.tk_status;
+
+	if (unlikely(status < 0)) {
+		if (!comp->error)
+			comp->error = status;
+	}
+
+	if (atomic_dec_and_test(&comp->count))
+		complete(&comp->comp);
+
+	nfsio_rbio_release(calldata);
+}
+
+static const struct rpc_call_ops nfsio_read_direct_sync_ops = {
+	.rpc_call_done = nfsio_read_result,
+	.rpc_release = nfsio_read_release_sync,
+};
+
+
+static int
+nfsio_sync_readvec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		   sector_t sec)
+{
+	struct inode *inode = io->files.inode;
+	size_t rsize = NFS_SERVER(inode)->rsize;
+	struct nfs_read_data *nreq = NULL;
+	loff_t pos;
+	int i;
+	struct nfsio_comp comp;
+
+	nfsio_comp_init(&comp);
+
+	pos = (loff_t)sec << 9;
+
+	i = 0;
+	while (i < nr) {
+		int err;
+		int k;
+
+		nreq = rbio_init(pos, pvec[i], 0, PAGE_SIZE, &comp, inode);
+		if (nreq == NULL) {
+			comp.error = -ENOMEM;
+			break;
+		}
+
+		nreq->pages.npages = rsize / PAGE_SIZE;
+		if (nreq->pages.npages > nr - i)
+			nreq->pages.npages = nr - i;
+		for (k = 0; k < nreq->pages.npages; k++) {
+			nreq->pages.pagevec[k] = pvec[i + k];
+		}
+		nreq->args.count = nreq->pages.npages*PAGE_SIZE;
+
+		i += nreq->pages.npages;
+		pos += nreq->pages.npages*PAGE_SIZE;
+
+		atomic_inc(&comp.count);
+
+		err = rbio_submit(io, nreq, &nfsio_read_direct_sync_ops);
+		if (err) {
+			comp.error = err;
+			if (atomic_dec_and_test(&comp.count))
+				complete(&comp.comp);
+			break;
+		}
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	return comp.error;
+}
+
+static void nfsio_write_release_sync(void *calldata)
+{
+	struct nfs_write_data *nreq = calldata;
+	struct nfsio_comp *comp = (struct nfsio_comp *) nreq->header->req;
+	int status = nreq->task.tk_status;
+
+	if (unlikely(status < 0)) {
+		if (!comp->error)
+			comp->error = status;
+	}
+
+	if (atomic_dec_and_test(&comp->count))
+		complete(&comp->comp);
+
+	nfsio_wbio_release(calldata);
+}
+
+static const struct rpc_call_ops nfsio_write_direct_sync_ops = {
+	.rpc_call_done = nfsio_write_result,
+	.rpc_release = nfsio_write_release_sync,
+};
+
+
+static int
+nfsio_sync_writevec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		    sector_t sec)
+{
+	struct inode *inode = io->files.inode;
+	size_t wsize = NFS_SERVER(inode)->wsize;
+	struct nfs_write_data *nreq;
+	loff_t pos;
+	int i;
+
+	struct nfsio_comp comp;
+
+	nfsio_comp_init(&comp);
+
+	pos = (loff_t)sec << 9;
+
+	i = 0;
+	while (i < nr) {
+		int err;
+		int k;
+
+		nreq = wbio_init(pos, pvec[i], 0, PAGE_SIZE, &comp, inode);
+		if (nreq == NULL) {
+			comp.error = -ENOMEM;
+			break;
+		}
+
+		nreq->pages.npages = wsize / PAGE_SIZE;
+		if (nreq->pages.npages > nr - i)
+			nreq->pages.npages = nr - i;
+		for (k = 0; k < nreq->pages.npages; k++) {
+			nreq->pages.pagevec[k] = pvec[i + k];
+		}
+		nreq->args.count = nreq->pages.npages*PAGE_SIZE;
+		nreq->args.stable = NFS_FILE_SYNC;
+
+		i += nreq->pages.npages;
+		pos += nreq->pages.npages*PAGE_SIZE;
+
+		atomic_inc(&comp.count);
+
+		err = wbio_submit(io, nreq, &nfsio_write_direct_sync_ops);
+		if (err) {
+			comp.error = err;
+			if (atomic_dec_and_test(&comp.count))
+				complete(&comp.comp);
+			break;
+		}
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return comp.error;
+}
+
+static int
+nfsio_sync_read(struct ploop_io * io, struct page * page, unsigned int len,
+		unsigned int off, sector_t sec)
+{
+	struct inode *inode = io->files.inode;
+	struct nfs_read_data *nreq = NULL;
+	int err;
+
+	struct nfsio_comp comp;
+
+	nfsio_comp_init(&comp);
+
+	nreq = rbio_init((loff_t)sec << 9, page, off, len, &comp, inode);
+	if (nreq == NULL)
+		return -ENOMEM;
+
+	atomic_inc(&comp.count);
+
+	err = rbio_submit(io, nreq, &nfsio_read_direct_sync_ops);
+	if (err) {
+		comp.error = err;
+		if (atomic_dec_and_test(&comp.count))
+			complete(&comp.comp);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	return comp.error;
+}
+
+static int
+nfsio_sync_write(struct ploop_io * io, struct page * page, unsigned int len,
+		 unsigned int off, sector_t sec)
+{
+	struct inode *inode = io->files.inode;
+	struct nfs_write_data *nreq;
+	struct nfsio_comp comp;
+	int err;
+
+	nfsio_comp_init(&comp);
+
+	nreq = wbio_init((loff_t)sec << 9, page, off, len, &comp, inode);
+	if (nreq == NULL)
+		return -ENOMEM;
+
+	nreq->args.stable = NFS_FILE_SYNC;
+
+	atomic_inc(&comp.count);
+	err = wbio_submit(io, nreq, &nfsio_write_direct_sync_ops);
+	if (err) {
+		comp.error = err;
+		if (atomic_dec_and_test(&comp.count))
+			complete(&comp.comp);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return comp.error;
+}
+
+static int nfsio_alloc_sync(struct ploop_io * io, loff_t pos, loff_t len)
+{
+	int head_len = len & (PAGE_SIZE - 1);
+	int nr_total = len >> PAGE_SHIFT;
+	int nr = 1 << (io->plo->cluster_log + 9 - PAGE_SHIFT);
+	struct page * pvec[nr];
+	int i;
+	int err = 0;
+
+	for (i = 0; i < nr; i++)
+		pvec[i] = ZERO_PAGE(0);
+
+	if (head_len) {
+		err = nfsio_sync_write(io, pvec[0], head_len, 0, pos >> 9);
+		if (err)
+			return err;
+
+		pos += head_len;
+	}
+
+	while (nr_total > 0) {
+		int n = (nr_total < nr) ? nr_total : nr;
+
+		err = nfsio_sync_writevec(io, pvec, n, pos >> 9);
+		if (err)
+			return err;
+
+		pos += n << PAGE_SHIFT;
+		nr_total -= n;
+	}
+
+	io->alloc_head = pos >> (io->plo->cluster_log + 9);
+	return 0;
+}
+
+static void nfsio_commit_result(struct rpc_task *task, void *calldata)
+{
+	struct nfs_commit_data *data = calldata;
+
+	NFS_PROTO(data->inode)->commit_done(task, data);
+}
+
+static void nfsio_commit_release(void *calldata)
+{
+	struct nfs_commit_data *creq = calldata;
+	struct nfsio_comp *comp = (struct nfsio_comp *) creq->dreq;
+	int status = creq->task.tk_status;
+
+	if (status < 0) {
+		if (!comp->error)
+			comp->error = status;
+	}
+
+	memcpy(comp->verf, &creq->verf.verifier, 8);
+
+	if (atomic_dec_and_test(&comp->count))
+		complete(&comp->comp);
+
+	nfsio_cbio_release(calldata);
+}
+
+void nfsio_commit_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_commit_data *data = calldata;
+
+	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
+}
+
+static const struct rpc_call_ops nfsio_commit_ops = {
+	.rpc_call_prepare = nfsio_commit_prepare,
+	.rpc_call_done = nfsio_commit_result,
+	.rpc_release = nfsio_commit_release,
+};
+
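+/*
+ * Take an extra reference on the open context by bumping
+ * lock_context.count directly (presumably because get_nfs_open_context()
+ * is not usable here); paired with nfsio_put_open_context().
+ */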
+static struct nfs_open_context *
+nfsio_get_open_context(struct nfs_open_context *ctx)
+{
+	BUG_ON(!ctx);
+	atomic_inc(&ctx->lock_context.count);
+	return ctx;
+}
+
+static struct nfs_commit_data *
+cbio_init(struct ploop_io * io, void * priv)
+{
+	struct inode *inode = io->files.inode;
+	struct nfs_open_context *ctx;
+	struct nfs_commit_data * creq;
+
+	creq = nfsio_cbio_alloc();
+	if (unlikely(creq == NULL))
+		return NULL;
+
+	ctx = nfs_file_open_context(io->files.file);
+
+	creq->inode	  = inode;
+	creq->cred	  = ctx->cred;
+	creq->mds_ops     = &nfsio_commit_ops;
+	creq->dreq	  = priv;
+
+	creq->args.fh     = NFS_FH(inode);
+	creq->args.offset = 0;
+	creq->args.count  = 0;
+	creq->context     = nfsio_get_open_context(ctx);
+	creq->res.fattr   = &creq->fattr;
+	creq->res.verf    = &creq->verf;
+
+	return creq;
+}
+
+static int nfsio_commit(struct ploop_io *io, u64 * verf)
+{
+	struct inode *inode = io->files.inode;
+	struct nfs_commit_data *creq;
+	struct nfsio_comp comp;
+	int err;
+
+	nfsio_comp_init(&comp);
+	comp.verf = verf;
+
+	creq = cbio_init(io, &comp);
+	if (unlikely(creq == NULL))
+		return -ENOMEM;
+
+	atomic_inc(&comp.count);
+
+	err = nfs_initiate_commit(NFS_CLIENT(inode), creq, creq->mds_ops, 0);
+	if (err) {
+		comp.error = err;
+		if (atomic_dec_and_test(&comp.count))
+			complete(&comp.comp);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	if (err)
+		nfsio_cbio_release(creq);
+
+	return comp.error;
+}
+
+/* Unfortunately, the state machine does not record exactly what it is
+ * doing, so we have to do some ugly "reverse engineering" here, which
+ * is not good, of course. _XXX_ This function is just a proof of
+ * concept; it must be reworked.
+ */
+
+static void resubmit(struct ploop_request * preq)
+{
+	struct ploop_delta * delta = ploop_top_delta(preq->plo);
+	unsigned long sec;
+
+	switch (preq->eng_state) {
+	case PLOOP_E_INDEX_WB:
+		delta = map_writable_delta(preq);
+		map_index(delta, preq, &sec);
+		nfsio_write_page(&delta->io, preq,
+				 preq->sinfo.wi.tpage,
+				 sec, 0);
+		break;
+	case PLOOP_E_DATA_WBI:
+		if (preq->aux_bio) {
+			struct bio_list tbl;
+			tbl.head = tbl.tail = preq->aux_bio;
+			nfsio_submit_write_pad(&delta->io, preq, &tbl,
+					       preq->iblock, 1<<preq->plo->cluster_log);
+		} else {
+			nfsio_submit_write_pad(&delta->io, preq, &preq->bl,
+					       preq->iblock, preq->req_size);
+		}
+		break;
+	case PLOOP_E_COMPLETE:
+	case PLOOP_E_RELOC_NULLIFY:
+		if (preq->aux_bio) {
+			struct bio_list tbl;
+			tbl.head = tbl.tail = preq->aux_bio;
+			nfsio_submit_write(&delta->io, preq, &tbl,
+					   preq->iblock, 1<<preq->plo->cluster_log);
+		} else {
+			nfsio_submit_write(&delta->io, preq, &preq->bl,
+					   preq->iblock, preq->req_size);
+		}
+		break;
+	default:
+		printk("Resubmit bad state %lu\n\n", preq->eng_state);
+		BUG();
+	}
+}
+
+static int nfsio_fsync_thread(void * data)
+{
+	struct ploop_io * io = data;
+	struct ploop_device * plo = io->plo;
+
+	set_user_nice(current, -20);
+
+	spin_lock_irq(&plo->lock);
+	while (!kthread_should_stop() || !list_empty(&io->fsync_queue)) {
+		int err;
+		LIST_HEAD(list);
+		u64 verf;
+
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&io->fsync_waitq, &_wait, TASK_INTERRUPTIBLE);
+			if (!list_empty(&io->fsync_queue) ||
+			    kthread_should_stop())
+				break;
+
+			spin_unlock_irq(&plo->lock);
+			schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&io->fsync_waitq, &_wait);
+
+		if (list_empty(&io->fsync_queue) && kthread_should_stop())
+			break;
+
+		INIT_LIST_HEAD(&list);
+		list_splice_init(&io->fsync_queue, &list);
+		spin_unlock_irq(&plo->lock);
+
+		err = 0;
+		if (!list_empty(&list)) {
+			err = nfsio_commit(io, &verf);
+		}
+
+		spin_lock_irq(&plo->lock);
+
+		while (!list_empty(&list)) {
+			struct ploop_request * preq;
+			preq = list_entry(list.next, struct ploop_request, list);
+			list_del(&preq->list);
+			clear_bit(PLOOP_REQ_UNSTABLE, &preq->state);
+			io->fsync_qlen--;
+
+			if (err) {
+				ploop_set_error(preq, err);
+			} else if (memcmp(&preq->verf, &verf, 8)) {
+				/* The commit verifier changed: the server
+				 * rebooted and may have lost our unstable
+				 * writes. Resubmit them. */
+				spin_unlock_irq(&plo->lock);
+				resubmit(preq);
+				spin_lock_irq(&plo->lock);
+				continue;
+			}
+			list_add_tail(&preq->list, &plo->ready_queue);
+		}
+		plo->st.bio_fsync++;
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+	spin_unlock_irq(&plo->lock);
+	return 0;
+}
+
+static int nfsio_open(struct ploop_io * io)
+{
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+	struct file * file = io->files.file;
+	int err = 0;
+
+	if (file == NULL)
+		return -EBADF;
+
+	err = invalidate_inode_pages2(file->f_mapping);
+	if (err)
+		return err;
+
+	io->files.mapping = file->f_mapping;
+	io->files.inode = io->files.mapping->host;
+	io->files.bdev = NULL;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY)) {
+		io->fsync_thread = kthread_create(nfsio_fsync_thread,
+						  io, "nfsio_commit%d",
+						  delta->plo->index);
+		if (IS_ERR(io->fsync_thread)) {
+			err = PTR_ERR(io->fsync_thread);
+			io->fsync_thread = NULL;
+			goto out;
+		}
+		wake_up_process(io->fsync_thread);
+	}
+
+out:
+	return err;
+}
+
+static int nfsio_prepare_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	int err;
+	struct file * file = io->files.file;
+
+	file = dentry_open(dget(F_DENTRY(file)), mntget(F_MNT(file)), O_RDONLY|O_LARGEFILE, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	err = invalidate_inode_pages2(file->f_mapping);
+	if (err) {
+		fput(file);
+		return err;
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int nfsio_complete_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	(void)invalidate_inode_pages2(io->files.mapping);
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	if (io->fsync_thread) {
+		kthread_stop(io->fsync_thread);
+		io->fsync_thread = NULL;
+	}
+
+	fput(file);
+	return 0;
+}
+
+static int nfsio_prepare_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	int err;
+	struct file * file = io->files.file;
+
+	file = dentry_open(dget(F_DENTRY(file)), mntget(F_MNT(file)), O_RDWR|O_LARGEFILE, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host ||
+	    io->files.bdev != file->f_mapping->host->i_sb->s_bdev) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	err = invalidate_inode_pages2(file->f_mapping);
+	if (err) {
+		fput(file);
+		return err;
+	}
+
+	if (io->fsync_thread == NULL) {
+		io->fsync_thread = kthread_create(nfsio_fsync_thread,
+						  io, "nfsio_commit%d",
+						  io->plo->index);
+		if (IS_ERR(io->fsync_thread)) {
+			err = PTR_ERR(io->fsync_thread);
+			io->fsync_thread = NULL;
+			fput(file);
+			return err;
+		}
+		wake_up_process(io->fsync_thread);
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int nfsio_start_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	fput(file);
+	return 0;
+}
+
+static int nfsio_truncate(struct ploop_io * io, struct file * file,
+			  __u32 alloc_head)
+{
+	int err;
+	struct iattr newattrs;
+
+	if (file->f_mapping != io->files.mapping)
+		return -EINVAL;
+
+	newattrs.ia_size = (u64)alloc_head << (io->plo->cluster_log + 9);
+	newattrs.ia_valid = ATTR_SIZE;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	err = notify_change(F_DENTRY(file), &newattrs);
+	mutex_unlock(&io->files.inode->i_mutex);
+	return err;
+}
+
+static void nfsio_queue_settings(struct ploop_io * io, struct request_queue * q)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)
+	q->max_sectors = 1024;
+	q->max_hw_sectors = 8192;
+	q->max_phys_segments = 2048;
+	q->max_hw_segments = 40;
+#endif
+}
+
+static int nfsio_autodetect(struct ploop_io * io)
+{
+	struct file * file = io->files.file;
+	struct inode * inode = file->f_mapping->host;
+
+	if (inode->i_sb->s_magic != NFS_SUPER_MAGIC)
+		return -1; /* not mine */
+
+	if (strcmp(file->f_mapping->host->i_sb->s_type->name, "nfs")) {
+		printk("%s is not supported; use '-o vers=3' mounting nfs\n",
+		       file->f_mapping->host->i_sb->s_type->name);
+		return -1;
+	}
+
+	if (NFS_SERVER(file->f_mapping->host)->wsize < PAGE_SIZE ||
+	    NFS_SERVER(file->f_mapping->host)->rsize < PAGE_SIZE) {
+		printk("NFS server wsize/rsize too small: %d/%d\n",
+		       NFS_SERVER(file->f_mapping->host)->wsize,
+		       NFS_SERVER(file->f_mapping->host)->rsize);
+		return -1;
+	}
+
+	return 0;
+}
+
+static struct ploop_io_ops ploop_io_ops_nfs =
+{
+	.id		=	PLOOP_IO_NFS,
+	.name		=	"nfs",
+	.owner		=	THIS_MODULE,
+
+	.alloc		=	nfsio_alloc_sync,
+	.submit		=	nfsio_submit,
+	.submit_alloc	=	nfsio_submit_alloc,
+	.read_page	=	nfsio_read_page,
+	.write_page	=	nfsio_write_page,
+	.sync_read	=	nfsio_sync_read,
+	.sync_write	=	nfsio_sync_write,
+	.sync_readvec	=	nfsio_sync_readvec,
+	.sync_writevec	=	nfsio_sync_writevec,
+
+	.init		=	nfsio_init,
+	.destroy	=	nfsio_destroy,
+	.open		=	nfsio_open,
+	.sync		=	nfsio_sync,
+	.stop		=	nfsio_stop,
+	.prepare_snapshot =	nfsio_prepare_snapshot,
+	.complete_snapshot =	nfsio_complete_snapshot,
+	.prepare_merge  =	nfsio_prepare_merge,
+	.start_merge	=	nfsio_start_merge,
+	.truncate	=	nfsio_truncate,
+
+	.queue_settings	=	nfsio_queue_settings,
+
+	.i_size_read	=	generic_i_size_read,
+	.f_mode		=	generic_f_mode,
+
+	.autodetect     =       nfsio_autodetect,
+};
+
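+/*
+ * All three request flavours (read, write, commit) are carved out of a
+ * single slab/mempool object, with the page vector placed inline right
+ * after the header so that no separate pagevec allocation is needed
+ * (see nfsio_rbio_alloc() and nfsio_wbio_alloc()).
+ */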
+union nfsio_bio
+{
+	struct {
+		struct nfs_read_header	r;
+		struct page		*padd[MAX_NBIO_PAGES];
+	} ru;
+	struct {
+		struct nfs_write_header	w;
+		struct page		*padd[MAX_NBIO_PAGES];
+		u32			bounced;
+	} wu;
+	struct {
+		struct nfs_commit_data c;
+	} cu;
+};
+
+static struct kmem_cache *nfsio_bio_cachep;
+static mempool_t *nfsio_bio_mempool;
+
+
+static struct nfs_read_data *nfsio_rbio_alloc(unsigned int pagecount)
+{
+	union nfsio_bio * b = mempool_alloc(nfsio_bio_mempool, GFP_NOFS);
+	struct nfs_read_header *p_hdr;
+	struct nfs_read_data *p;
+
+	if (b == NULL)
+		return NULL;
+
+	p_hdr = &b->ru.r;
+	p = &p_hdr->rpc_data;
+
+	memset(b, 0, sizeof(*b));
+	p->header = &p_hdr->header;
+	INIT_LIST_HEAD(&p->header->pages);
+	p->pages.npages = pagecount;
+	p->pages.pagevec = b->ru.padd;
+	return p;
+}
+
+static struct nfs_write_data *nfsio_wbio_alloc(unsigned int pagecount)
+{
+	union nfsio_bio * b = mempool_alloc(nfsio_bio_mempool, GFP_NOFS);
+	struct nfs_write_header *p_hdr;
+	struct nfs_write_data *p;
+
+	if (b == NULL)
+		return NULL;
+
+	p_hdr = &b->wu.w;
+	p = &p_hdr->rpc_data;
+
+	memset(b, 0, sizeof(*b));
+	p->header = &p_hdr->header;
+	INIT_LIST_HEAD(&p->header->pages);
+	p->pages.npages = pagecount;
+	p->pages.pagevec = b->wu.padd;
+	return p;
+}
+
+static struct nfs_commit_data *nfsio_cbio_alloc(void)
+{
+	union nfsio_bio * b = mempool_alloc(nfsio_bio_mempool, GFP_NOFS);
+	struct nfs_commit_data *p;
+
+	if (b == NULL)
+		return NULL;
+
+	p = &b->cu.c;
+
+	memset(b, 0, sizeof(*b));
+	INIT_LIST_HEAD(&p->pages);
+	return p;
+}
+
+void nfsio_wbio_release(void *data)
+{
+	struct nfs_write_data *p = data;
+	struct nfs_write_header *p_hdr = container_of(p, struct nfs_write_header, rpc_data);
+	union nfsio_bio * b = container_of(p_hdr, union nfsio_bio, wu.w);
+
+	if (b->wu.bounced) {
+		int i;
+
+		for (i=0; i<32; i++) {
+			if (b->wu.bounced & (1<<i))
+				put_page(b->wu.w.rpc_data.pages.pagevec[i]);
+		}
+	}
+
+	mempool_free(b, nfsio_bio_mempool);
+}
+
+void nfsio_rbio_release(void *data)
+{
+	struct nfs_read_data *p = data;
+	struct nfs_read_header *p_hdr = container_of(p, struct nfs_read_header, rpc_data);
+	union nfsio_bio * b = container_of(p_hdr, union nfsio_bio, ru.r);
+	mempool_free(b, nfsio_bio_mempool);
+}
+
+static void nfsio_put_open_context(struct nfs_open_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->lock_context.count))
+		BUG();
+}
+
+void nfsio_cbio_release(void *data)
+{
+	struct nfs_commit_data *p = data;
+	union nfsio_bio * b = container_of(p, union nfsio_bio, cu.c);
+	nfsio_put_open_context(p->context);
+	mempool_free(b, nfsio_bio_mempool);
+}
+
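+/*
+ * NFS cannot safely take page references on slab-backed or zero-refcount
+ * pages that some filesystems hand us. Copy such pages into freshly
+ * allocated ones and record the bounced slots in wu.bounced, so that the
+ * copies can be dropped in nfsio_wbio_release().
+ */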
+int verify_bounce(struct nfs_write_data * nreq)
+{
+	int i;
+
+	for (i = 0; i < nreq->pages.npages; i++) {
+		if (PageSlab(nreq->pages.pagevec[i]) ||
+		    page_count(nreq->pages.pagevec[i]) == 0) {
+			struct page * page;
+			void *ksrc, *kdst;
+			static int once;
+
+			if (!once) {
+				printk("ploop io_nfs got invalid page. XFS? Do not use this crap for Christ's sake.\n");
+				once = 1;
+			}
+
+			page = alloc_page(GFP_NOFS|__GFP_HIGHMEM);
+			if (!page)
+				return -ENOMEM;
+
+			ksrc = kmap_atomic(nreq->pages.pagevec[i], KM_USER0);
+			kdst = kmap_atomic(page, KM_USER1);
+			memcpy(kdst, ksrc, PAGE_SIZE);
+			kunmap_atomic(kdst, KM_USER1);
+			kunmap_atomic(ksrc, KM_USER0);
+			nreq->pages.pagevec[i] = page;
+			((union nfsio_bio*)nreq)->wu.bounced |= (1<<i);
+		}
+	}
+	return 0;
+}
+
+
+static int __init pio_nfs_mod_init(void)
+{
+	nfsio_bio_cachep = kmem_cache_create("nfsio_bio",
+					     sizeof(union nfsio_bio),
+					     0, SLAB_HWCACHE_ALIGN,
+					     NULL
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+					     , NULL
+#endif
+					     );
+	if (nfsio_bio_cachep == NULL)
+		return -ENOMEM;
+
+	nfsio_bio_mempool = mempool_create_slab_pool(128,
+						     nfsio_bio_cachep);
+	if (nfsio_bio_mempool == NULL)
+		return -ENOMEM;
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,25)
+	nfsio_workqueue = create_singlethread_workqueue("nfsio");
+	if (nfsio_workqueue == NULL)
+		return -ENOMEM;
+#endif
+
+	return ploop_register_io(&ploop_io_ops_nfs);
+}
+
+static void __exit pio_nfs_mod_exit(void)
+{
+	ploop_unregister_io(&ploop_io_ops_nfs);
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,25)
+	destroy_workqueue(nfsio_workqueue);
+#endif
+	mempool_destroy(nfsio_bio_mempool);
+	kmem_cache_destroy(nfsio_bio_cachep);
+}
+
+module_init(pio_nfs_mod_init);
+module_exit(pio_nfs_mod_exit);
+
+MODULE_LICENSE("GPL");
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/map.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/map.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/map.c	2015-01-21 12:02:54.712921377 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/map.c	2015-01-21 12:02:57.777840021 +0300
@@ -0,0 +1,1279 @@
+/*
+ * Generic engine for mapping virtual blocks (cluster_t) to indices
+ * in image (iblock_t).
+ *
+ * The mapping is global: it is defined not for some particular delta
+ * but for the whole disk. Therefore it is abstract and does not depend
+ * on a particular virtual disk format. Of course, for some disk types
+ * it may not be so easy to fetch/update the backing store; in practice,
+ * this engine is tightly bound to the organization of index tables in
+ * ploop1.
+ *
+ * Technically, it is just an array of pages with some metainformation
+ * attached to each page. The array may be highly sparse, so it is kept
+ * in an rbtree keyed by the array index
+ * cluster_no / (PAGE_SIZE / sizeof(map_index)).
+ *
+ * Sadly, this closely parallels the linux page cache for a virtual
+ * mapping. "Sadly" because the linux page cache provides only a crippled
+ * implementation of asynchronous read/writeback: it requires synchronous
+ * waits for completion and makes no callbacks on completion. Therefore,
+ * we have to redo all that work here.
+ *
+ * A few words about synchronization. All updates to the map are made
+ * from a single thread. Lookups can happen in an unserialized context,
+ * therefore we protect all critical updates with a spinlock. RCU could
+ * be used too.
+ *
+ * A mapping is UPTODATE when it is in sync with the top delta.
+ * When a mapping is accessed for the first time and there is no mapping
+ * in the top delta, we search the lower-level deltas. We could instead
+ * create an empty mapping, which would have an advantage: when whole
+ * blocks are rewritten we would not even need the lower deltas (_XXX_).
+ */
+
+#include <linux/version.h>
+
+#include <linux/ploop/ploop.h>
+
+/* This defines slot in mapping page. Right now it is 32 bit
+ * and therefore it directly matches ploop1 structure. */
+typedef u32 map_index_t;
+
+#define INDEX_PER_PAGE	(PAGE_SIZE / sizeof(map_index_t))
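+/* With 4KB pages and 32-bit slots this gives 1024 entries, i.e. one map
+ * page covers 1024 clusters (minus PLOOP_MAP_OFFSET in the first page). */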
+
+static struct kmem_cache * ploop_map_cache;
+
+static LIST_HEAD(map_lru);
+static DEFINE_SPINLOCK(map_lru_lock);
+static atomic_t map_pages_nr = ATOMIC_INIT(0);
+
+/*
+ * The additional information kept for each page is:
+ * 1. rb tree link
+ * 2. page
+ * 3. mn_start, mn_end - the first and the last cluster index
+ *    (respectively) that this page maps to iblocks
+ * 4. lru linkage
+ * 5. delta level of the whole page, i.e. the delta where this page
+ *    is backed
+ * 6. Array of delta levels for each map_index in the page.
+ *    If the page is backed at level N, those levels cannot be >N.
+ *    If all the levels == N, the array of levels is not allocated.
+ *    When at least one level is < N, it is stored in the array.
+ *    Note that in this case exporting the page to disk implies
+ *    clearing the irrelevant entries.
+ */
+
+struct map_node
+{
+	struct rb_node		rb_link;
+	cluster_t		mn_start;
+	cluster_t		mn_end;
+	unsigned long		state;
+	atomic_t		refcnt;
+	struct ploop_map	*parent;
+
+	struct page		*page;
+	struct list_head	lru;
+	u8			*levels;
+
+	/* List of preq's blocking on this mapping.
+	 *
+	 * We queue several kinds of requests here:
+	 * 1. If the mapping is not uptodate, all requests which need
+	 *    this mapping are queued here; preq state is ENTRY.
+	 * 2. If a preq requires an index update that is delayed because
+	 *    writeback is in progress; preq state is INDEX_DELAY and the
+	 *    new index is kept in preq->iblock.
+	 * 3. If a preq has started an index update; preq state is INDEX_WB,
+	 *    new indices are sent to io, but they are not inserted into
+	 *    the mapping until writeback is complete.
+	 */
+	struct list_head	io_queue;
+};
+
+cluster_t map_get_mn_end(struct map_node *m)
+{
+	return m->mn_end;
+}
+
+#define MAP_LEVEL(m)		((m)->state & 0xFF)
+#define MAP_SET_LEVEL(m, l)	((m)->state = ((m)->state & ~0xFF) | (l))
+
+#define MAP_UPTODATE(m)		(((m)->state >> 8) & 0xFFUL)
+#define MAP_SET_UPTODATE(m, l)	((m)->state = ((m)->state & ~0xFF00UL) | ((l)<<8))
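+/* m->state packs the backing delta level of the page into bits 0..7 and
+ * the lowest delta level already merged in into bits 8..15; the bit
+ * numbers below (16 and up) are used as flags via test_bit/set_bit. */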
+
+enum {
+	PLOOP_MAP_UPTODATE	= 16,	/* Mapping is in sync with top_delta,
+					 * so we can write the index. But zero
+					 * entries still require reading the
+					 * lower delta indices.
+					 */
+	PLOOP_MAP_READ		= 17,	/* Mapping read is scheduled */
+	PLOOP_MAP_WRITEBACK	= 18,	/* Mapping is under writeback */
+	PLOOP_MAP_ERROR		= 19,	/* Mapping is baaad */
+};
+
+void map_init(struct ploop_device * plo, struct ploop_map * map)
+{
+	INIT_LIST_HEAD(&map->delta_list);
+	map->flags = 0;
+	map->last_activity = jiffies;
+	map->plo = plo;
+	map->rb_root = RB_ROOT;
+	map->lru_buffer_ptr = 0;
+	init_waitqueue_head(&map->destroy_waitq);
+}
+
+/* Deliver a batch of LRU updates from the buffer to the global LRU.
+ * Everything with a zero refcnt is added to the LRU or moved to its tail.
+ * Everything with a non-zero refcnt is removed from the LRU.
+ */
+static void flush_lru_buffer(struct ploop_map * map)
+{
+	int i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&map_lru_lock, flags);
+	for (i = 0; i < map->lru_buffer_ptr; i++) {
+		struct map_node * m = map->lru_buffer[i];
+		if (atomic_dec_and_test(&m->refcnt))
+			list_move_tail(&m->lru, &map_lru);
+		else
+			list_del_init(&m->lru);
+	}
+	spin_unlock_irqrestore(&map_lru_lock, flags);
+
+	map->lru_buffer_ptr = 0;
+}
+
+/*
+ * map_release() must be called under plo->lock, because the pair
+ * atomic_read & atomic_dec_and_test is not atomic by itself.
+ */
+void map_release(struct map_node * m)
+{
+	struct ploop_map * map = m->parent;
+
+	if (atomic_read(&m->refcnt) == 1) {
+		if (!list_empty(&m->lru))
+			return;
+		if (map->lru_buffer_ptr == PLOOP_LRU_BUFFER)
+			flush_lru_buffer(map);
+		map->lru_buffer[map->lru_buffer_ptr++] = m;
+		return;
+	}
+	if (atomic_dec_and_test(&m->refcnt))
+		BUG();
+}
+
+static inline void cond_flush_lru_buffer(struct ploop_map * map)
+{
+	if (map->lru_buffer_ptr == PLOOP_LRU_BUFFER)
+		flush_lru_buffer(map);
+}
+
+
+static struct map_node * map_lookup(struct ploop_map * map, cluster_t block)
+{
+	struct rb_node * n = map->rb_root.rb_node;
+	struct map_node * m;
+
+	while (n) {
+		m = rb_entry(n, struct map_node, rb_link);
+
+		if (block < m->mn_start)
+			n = n->rb_left;
+		else if (block > m->mn_end)
+			n = n->rb_right;
+		else
+			return m;
+	}
+	return NULL;
+}
+
+/* Lookup mapping atomically. */
+
+int ploop_fastmap(struct ploop_map * map, cluster_t block, iblock_t *result)
+{
+	struct map_node * m;
+	u32 idx;
+	map_index_t blk;
+
+	if (unlikely(block >= map->max_index))
+		return -1;
+
+	if (test_bit(PLOOP_MAP_IDENTICAL, &map->flags)) {
+		*result = block;
+		return 0;
+	}
+
+	m = map_lookup(map, block);
+	if (m == NULL)
+		return -1;
+
+	if (atomic_read(&m->refcnt) == 0) {
+		cond_flush_lru_buffer(map);
+		if (atomic_read(&m->refcnt) == 0) {
+			atomic_inc(&m->refcnt);
+			map->lru_buffer[map->lru_buffer_ptr++] = m;
+		}
+	}
+	map->last_activity = jiffies;
+
+	if (!test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		return -1;
+
+	idx = (block + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+	blk = ((map_index_t *)page_address(m->page))[idx] >>
+	       ploop_map_log(map->plo);
+
+	if (blk) {
+		*result = blk;
+		if (m->levels)
+			return m->levels[idx];
+		else
+			return MAP_LEVEL(m);
+	}
+	return -1;
+}
+
+static void map_node_destroy(struct map_node *m)
+{
+	rb_erase(&m->rb_link, &m->parent->rb_root);
+	list_del_init(&m->lru);
+	BUG_ON(atomic_read(&m->refcnt));
+	BUG_ON(!list_empty(&m->io_queue));
+	if (m->page)
+		put_page(m->page);
+	if (m->levels)
+		kfree(m->levels);
+	m->parent->pages--;
+	atomic_dec(&map_pages_nr);
+	kmem_cache_free(ploop_map_cache, m);
+}
+
+static void map_lru_scan(void)
+{
+	int max_loops = atomic_read(&map_pages_nr);
+
+	while (atomic_read(&map_pages_nr) > max_map_pages &&
+	       --max_loops >= 0) {
+		struct ploop_map * map;
+		struct map_node * candidate = NULL;
+
+		spin_lock_irq(&map_lru_lock);
+		if (!list_empty(&map_lru)) {
+			candidate = list_first_entry(&map_lru, struct map_node, lru);
+			atomic_inc(&candidate->refcnt);
+		}
+		spin_unlock_irq(&map_lru_lock);
+
+		if (!candidate)
+			break;
+
+		map = candidate->parent;
+
+		spin_lock_irq(&map->plo->lock);
+		spin_lock(&map_lru_lock);
+
+		if (waitqueue_active(&map->destroy_waitq)) {
+			atomic_dec(&candidate->refcnt);
+			wake_up(&map->destroy_waitq);
+			spin_unlock(&map_lru_lock);
+			spin_unlock_irq(&map->plo->lock);
+			return;
+		}
+
+		list_del_init(&candidate->lru);
+
+		if (atomic_dec_and_test(&candidate->refcnt)) {
+			/* This instance is within its limits; just
+			 * re-add the node to the tail of the LRU.
+			 */
+			if (map->pages <= map->plo->tune.min_map_pages &&
+			    time_after(map->last_activity +
+				       map->plo->tune.max_map_inactivity, jiffies) &&
+			    !test_bit(PLOOP_MAP_DEAD, &map->flags)) {
+				list_add_tail(&candidate->lru, &map_lru);
+			} else {
+				map_node_destroy(candidate);
+			}
+		}
+		spin_unlock(&map_lru_lock);
+		spin_unlock_irq(&map->plo->lock);
+
+		if (!(max_loops & 15))
+			cond_resched();
+	}
+}
+
+static struct map_node *
+map_create(struct ploop_map * map, cluster_t block)
+{
+	struct ploop_device * plo = map->plo;
+	struct rb_node **p, *parent;
+	struct map_node * m;
+	cluster_t ondisk_pageno = (block + PLOOP_MAP_OFFSET) / INDEX_PER_PAGE;
+
+	m = kmem_cache_alloc(ploop_map_cache, GFP_NOFS);
+	if (unlikely(m == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	m->page = alloc_page(GFP_NOFS);
+	if (unlikely(m->page == NULL)) {
+		kmem_cache_free(ploop_map_cache, m);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (ondisk_pageno == 0) {
+		m->mn_start = 0;
+		m->mn_end = INDEX_PER_PAGE - PLOOP_MAP_OFFSET - 1;
+	} else {
+		m->mn_start = ondisk_pageno * INDEX_PER_PAGE - PLOOP_MAP_OFFSET;
+		m->mn_end = m->mn_start + INDEX_PER_PAGE - 1;
+	}
+
+	INIT_LIST_HEAD(&m->io_queue);
+	INIT_LIST_HEAD(&m->lru);
+	m->levels = NULL;
+	m->state = 0;
+	atomic_set(&m->refcnt, 1);
+	m->parent = map;
+
+	spin_lock_irq(&plo->lock);
+
+	p = &map->rb_root.rb_node;
+	parent = NULL;
+
+	while (*p) {
+		struct map_node * entry;
+		parent = *p;
+		entry = rb_entry(parent, struct map_node, rb_link);
+
+		/* Nodes can be deleted by any of the ploop threads,
+		 * but they are inserted only by the ploop thread itself.
+		 * Before calling map_create() we checked that the node
+		 * is absent, therefore:
+		 */
+		BUG_ON(ondisk_pageno ==
+		       (entry->mn_start + PLOOP_MAP_OFFSET) / INDEX_PER_PAGE);
+
+		if (block < entry->mn_start)
+			p = &(*p)->rb_left;
+		else if (block > entry->mn_end)
+			p = &(*p)->rb_right;
+		else
+			printk("map_create: Oops! block=%u; mn_range=[%u..%u]\n",
+			       block, entry->mn_start, entry->mn_end);
+	}
+
+	rb_link_node(&m->rb_link, parent, p);
+	rb_insert_color(&m->rb_link, &map->rb_root);
+
+	map->pages++;
+	atomic_inc(&map_pages_nr);
+	spin_unlock_irq(&plo->lock);
+
+	if (atomic_read(&map_pages_nr) > max_map_pages)
+		map_lru_scan();
+
+	return m;
+}
+
+/* helper for trans_map_get_index() and map_get_index() */
+static iblock_t
+cluster2iblock(struct ploop_request *preq, struct map_node *m, cluster_t block,
+	       u32 *idx)
+{
+	iblock_t iblk;
+	char *fmt;
+
+	BUG_ON (block < INDEX_PER_PAGE - PLOOP_MAP_OFFSET && m->mn_start != 0);
+	BUG_ON (block >= INDEX_PER_PAGE - PLOOP_MAP_OFFSET && m->mn_start !=
+		((block + PLOOP_MAP_OFFSET) &
+		 ~(INDEX_PER_PAGE - 1)) - PLOOP_MAP_OFFSET);
+
+	*idx = (block + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+	iblk = ((map_index_t *)page_address(m->page))[*idx];
+
+	if (likely(iblk != PLOOP_ZERO_INDEX))
+		iblk >>= ploop_map_log(preq->plo);
+
+	if (m == preq->trans_map)
+		fmt = "tmgi %u %d %u [ %u %u ]\n";
+	else if (m == preq->map)
+		fmt = "mgi %u %d %u [ %u %u ]\n";
+	else
+		BUG();
+
+	__TRACE(fmt, block, *idx, iblk,
+		((map_index_t *)page_address(m->page))[0],
+		((map_index_t *)page_address(m->page))[1]);
+
+	return iblk;
+}
+
+int trans_map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result)
+{
+	struct map_node * m = preq->trans_map;
+	u32 idx;
+	map_index_t blk;
+
+	if (m == NULL)
+		return -1;
+
+	blk = cluster2iblock(preq, m, block, &idx);
+
+	if (blk) {
+		*result = blk;
+		return 0;
+	}
+	return -1;
+}
+
+
+int map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result)
+{
+	struct map_node * m = preq->map;
+	u32 idx;
+	map_index_t blk;
+
+	if (m == NULL) {
+		*result = block;
+		return 0;
+	}
+
+	blk = cluster2iblock(preq, m, block, &idx);
+
+	if (blk) {
+		*result = blk;
+		if (m->levels)
+			return m->levels[idx];
+		else
+			return MAP_LEVEL(m);
+	}
+	return -1;
+}
+
+int map_index_fault(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct ploop_delta * top_delta, * delta, * ndelta;
+	struct map_node * m = preq->map;
+	int uptodate_level;
+	sector_t pos;
+	int err;
+
+	uptodate_level = MAP_UPTODATE(m);
+
+	/* All the levels are read, mapping is absent. */
+	if (uptodate_level == 0) {
+		__TRACE("MAP E %u\n", preq->req_cluster);
+		return -1;
+	}
+
+	top_delta = ploop_top_delta(plo);
+	delta = NULL;
+
+	list_for_each_entry(ndelta, &plo->map.delta_list, list) {
+		int rc;
+
+		if (ndelta->level >= uptodate_level)
+			continue;
+
+		rc = ndelta->ops->map_index(ndelta, m->mn_start, &pos);
+		if (rc != 0) {
+			delta = ndelta;
+			break;
+		}
+
+		MAP_SET_UPTODATE(m, ndelta->level);
+		__TRACE("MAP SKIP %u %d\n", preq->req_cluster, ndelta->level);
+	}
+
+	/* Not found anywhere. */
+	if (!delta) {
+		__TRACE("MAP NF %u\n", preq->req_cluster);
+		return -1;
+	}
+
+	/* Mapping is present in lower delta, start merge */
+	spin_lock_irq(&plo->lock);
+	ploop_add_lockout(preq, 0);
+
+	if (test_and_set_bit(PLOOP_MAP_READ, &m->state)) {
+		__TRACE("r %p %u %p\n", preq, preq->req_cluster, m);
+		list_add_tail(&preq->list, &m->io_queue);
+		plo->st.merge_lockouts++;
+		spin_unlock_irq(&plo->lock);
+		/* Someone already scheduled read. */
+		return 0;
+	}
+	spin_unlock_irq(&plo->lock);
+
+	err = -EIO;
+	if (test_bit(PLOOP_MAP_ERROR, &m->state))
+		goto err_out;
+
+	err = -ENOMEM;
+	preq->sinfo.ri.tpage = alloc_page(GFP_NOFS);
+	if (preq->sinfo.ri.tpage == NULL)
+		goto err_out;
+
+	preq->sinfo.ri.level = delta->level;
+	preq->eng_state = PLOOP_E_INDEX_READ;
+
+	plo->st.map_merges++;
+	delta->ops->read_index(delta, preq, preq->sinfo.ri.tpage, pos);
+	return 0;
+
+err_out:
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	ploop_fail_request(preq, err);
+	return 0;
+}
+
+static void map_read_endio(struct ploop_request * preq, struct map_node * m)
+{
+	struct ploop_device * plo = preq->plo;
+	struct list_head * n, *pn;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&plo->lock);
+
+	if (!preq->error) {
+		set_bit(PLOOP_MAP_UPTODATE, &m->state);
+	} else {
+		set_bit(PLOOP_MAP_ERROR, &m->state);
+	}
+	clear_bit(PLOOP_MAP_READ, &m->state);
+
+	__TRACE(">E %p %u %p\n", preq, preq->req_cluster, m);
+
+	list_for_each_safe(n, pn, &m->io_queue) {
+		preq = list_entry(n, struct ploop_request, list);
+		if (preq->eng_state == PLOOP_E_ENTRY) {
+			list_del(&preq->list);
+			list_add_tail(&preq->list, &list);
+		}
+	}
+	if (!list_empty(&list))
+		list_splice(&list, &plo->ready_queue);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void map_merge_endio(struct ploop_request * preq, struct map_node * m)
+{
+	struct ploop_device * plo = preq->plo;
+	struct list_head *n, *pn;
+	LIST_HEAD(list);
+	int i;
+	u32 * map;
+	u32 * merged;
+	int skip = m->mn_start == 0 ? PLOOP_MAP_OFFSET : 0;
+
+	__TRACE(">M %p %u %p\n", preq, preq->req_cluster, m);
+
+	if (unlikely(preq->error))
+		goto abort_update;
+
+	map = page_address(m->page);
+	merged = page_address(preq->sinfo.ri.tpage);
+
+	for (i = skip; i < INDEX_PER_PAGE; i++) {
+		if (map[i] != 0)
+			continue;
+		if (merged[i] == 0)
+			continue;
+		if (preq->sinfo.ri.level != MAP_LEVEL(m)) {
+			if (!m->levels) {
+				m->levels = kmalloc(INDEX_PER_PAGE, GFP_NOFS);
+				if (unlikely(m->levels == NULL)) {
+					preq->error = -ENOMEM;
+					goto abort_update;
+				}
+				memset(m->levels, MAP_LEVEL(m), INDEX_PER_PAGE);
+			}
+			m->levels[i] = preq->sinfo.ri.level;
+		}
+		map[i] = merged[i];
+	}
+
+	put_page(preq->sinfo.ri.tpage);
+	preq->sinfo.ri.tpage = NULL;
+
+	spin_lock_irq(&plo->lock);
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	MAP_SET_UPTODATE(m, preq->sinfo.ri.level);
+	__TRACE("MAP U %u %d\n", preq->req_cluster, preq->sinfo.ri.level);
+	preq->eng_state = PLOOP_E_ENTRY;
+
+flush_queue:
+	list_for_each_safe(n, pn, &m->io_queue) {
+		preq = list_entry(n, struct ploop_request, list);
+		if (preq->eng_state == PLOOP_E_ENTRY) {
+			list_del(&preq->list);
+			list_add_tail(&preq->list, &list);
+		}
+	}
+	if (!list_empty(&list))
+		list_splice(&list, &plo->ready_queue);
+	spin_unlock_irq(&plo->lock);
+	return;
+
+abort_update:
+	put_page(preq->sinfo.ri.tpage);
+	preq->sinfo.ri.tpage = NULL;
+	preq->eng_state = PLOOP_E_COMPLETE;
+
+	spin_lock_irq(&plo->lock);
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	set_bit(PLOOP_MAP_ERROR, &m->state);
+	goto flush_queue;
+}
+
+
+void map_read_complete(struct ploop_request * preq)
+{
+	struct map_node * m = preq->map;
+
+	if (preq->eng_state == PLOOP_E_TRANS_INDEX_READ)
+		m = preq->trans_map;
+
+	if (!test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		map_read_endio(preq, m);
+	else
+		map_merge_endio(preq, m);
+}
+
+static int
+ploop_map_start_read(struct ploop_map * map, struct ploop_request * preq,
+		     struct map_node * m)
+{
+	struct ploop_device * plo = map->plo;
+	struct ploop_delta * top_delta, * delta, * ndelta;
+	sector_t pos;
+
+	top_delta = map_top_delta(map);
+	delta = NULL;
+
+	list_for_each_entry(ndelta, &map->delta_list, list) {
+		int rc;
+
+		rc = ndelta->ops->map_index(ndelta, m->mn_start, &pos);
+		if (rc != 0) {
+			delta = ndelta;
+			break;
+		}
+	}
+
+	if (delta) {
+		__TRACE("MAP R0 %u %d %lu %d\n", preq->req_cluster, delta->level, pos, m->index);
+		/* We know delta, we know position. We can read. */
+		MAP_SET_LEVEL(m, delta->level);
+		MAP_SET_UPTODATE(m, delta->level);
+		if (map == &plo->map)
+			preq->eng_state = PLOOP_E_INDEX_READ;
+		else
+			preq->eng_state = PLOOP_E_TRANS_INDEX_READ;
+		delta->ops->read_index(delta, preq, m->page, pos);
+		plo->st.map_reads++;
+		return 1;
+	}
+
+	/* Otherwise mapping does not exist. */
+	memset(page_address(m->page), 0, PAGE_SIZE);
+	__TRACE("MAP R1 %u %d\n", preq->req_cluster, top_delta->level);
+	MAP_SET_LEVEL(m, top_delta->level);
+	MAP_SET_UPTODATE(m, 0);
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	set_bit(PLOOP_MAP_UPTODATE, &m->state);
+	return 0;
+}
+
+static int ploop_read_map(struct ploop_map * map, struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct map_node * m = (map == &plo->map) ? preq->map : preq->trans_map;
+	int err = 0;
+
+	spin_lock_irq(&plo->lock);
+	if (!test_bit(PLOOP_MAP_UPTODATE, &m->state)) {
+		if (test_bit(PLOOP_MAP_ERROR, &m->state)) {
+			err = -EIO;
+			goto out;
+		}
+
+		if (!test_and_set_bit(PLOOP_MAP_READ, &m->state)) {
+			spin_unlock_irq(&plo->lock);
+
+			return ploop_map_start_read(map, preq, m);
+		} else {
+			__TRACE("g %p %u %p\n", preq, preq->req_cluster, m);
+			plo->st.map_lockouts++;
+			list_add_tail(&preq->list, &m->io_queue);
+			err = 1;
+		}
+	}
+
+out:
+	spin_unlock_irq(&plo->lock);
+	return err;
+}
+
+void ploop_update_map(struct ploop_map * map, int level,
+		      cluster_t block, iblock_t iblk)
+{
+	struct map_node * m;
+	u32 idx;
+	map_index_t *p;
+
+	spin_lock_irq(&map->plo->lock);
+
+	m = map_lookup(map, block);
+	if (!m || !test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		goto out;
+
+	p = (map_index_t *)page_address(m->page);
+	idx = (block  + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+
+	if (p[idx]) {
+		int lvl = m->levels ? m->levels[idx] : MAP_LEVEL(m);
+
+		if (lvl == level)
+			p[idx] = iblk << ploop_map_log(map->plo);
+		else if (lvl < level)
+			printk("Unexpected condition: uptodate map_node %p "
+			       "covering range %u..%u maps %u to %u on level "
+			       "%d, while user-space merge detected mapping "
+			       "on level %d\n", m, m->mn_start, m->mn_end,
+			       block, p[idx] >> map->plo->cluster_log, lvl,
+			       level);
+	}
+out:
+	spin_unlock_irq(&map->plo->lock);
+}
+
+void ploop_update_map_hdr(struct ploop_map * map, u8 *hdr, int hdr_size)
+{
+	struct map_node * m;
+
+	spin_lock_irq(&map->plo->lock);
+
+	m = map_lookup(map, 0);
+	if (m && test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		memcpy(page_address(m->page), hdr, hdr_size);
+
+	spin_unlock_irq(&map->plo->lock);
+}
+EXPORT_SYMBOL(ploop_update_map_hdr);
+
+int ploop_find_trans_map(struct ploop_map * map, struct ploop_request * preq)
+{
+	struct map_node * m;
+	cluster_t block;
+
+	block = preq->req_cluster;
+
+	if (unlikely(block >= map->max_index))
+		return -ERANGE;
+
+	map->last_activity = jiffies;
+
+	m = preq->trans_map;
+	if (m == NULL) {
+		spin_lock_irq(&map->plo->lock);
+		m = map_lookup(map, block);
+		if (m) {
+			atomic_inc(&m->refcnt);
+			if (!list_empty(&m->lru) && atomic_read(&m->refcnt) == 1) {
+				cond_flush_lru_buffer(map);
+				if (atomic_read(&m->refcnt) == 1) {
+					atomic_inc(&m->refcnt);
+					map->lru_buffer[map->lru_buffer_ptr++] = m;
+				}
+			}
+		}
+		spin_unlock_irq(&map->plo->lock);
+
+		if (m == NULL) {
+			struct ploop_delta * mdelta = map_top_delta(map);
+			sector_t sec;
+			if (mdelta->ops->map_index(mdelta, block, &sec) == 0)
+				return 0;
+
+			m = map_create(map, block);
+			if (IS_ERR(m))
+				return PTR_ERR(m);
+		}
+
+		preq->trans_map = m;
+	}
+
+	if (test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		return 0;
+
+	return ploop_read_map(map, preq);
+}
+
+/* Find the mapping for this request. The mapping may not be uptodate yet. */
+
+int ploop_find_map(struct ploop_map * map, struct ploop_request * preq)
+{
+	struct map_node * m;
+	cluster_t block;
+
+	block = preq->req_cluster;
+
+	if (unlikely(block >= map->max_index))
+		return -ERANGE;
+
+	if (test_bit(PLOOP_MAP_IDENTICAL, &map->flags))
+		return 0;
+
+	map->last_activity = jiffies;
+
+	m = preq->map;
+	if (m == NULL) {
+		spin_lock_irq(&map->plo->lock);
+		m = map_lookup(map, block);
+		if (m) {
+			atomic_inc(&m->refcnt);
+			if (!list_empty(&m->lru) && atomic_read(&m->refcnt) == 1) {
+				cond_flush_lru_buffer(map);
+				if (atomic_read(&m->refcnt) == 1) {
+					atomic_inc(&m->refcnt);
+					map->lru_buffer[map->lru_buffer_ptr++] = m;
+				}
+			}
+		}
+		spin_unlock_irq(&map->plo->lock);
+
+		if (m == NULL) {
+			m = map_create(map, block);
+			if (IS_ERR(m))
+				return PTR_ERR(m);
+		}
+
+		preq->map = m;
+	}
+
+	if (test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		return 0;
+
+	return ploop_read_map(map, preq);
+}
+
+
+/* Blank out the entries which refer to another delta.
+ * _XXX_ A little more thought could detect the case when we have no
+ * such entries. Also, the copy cries out for optimization.
+ */
+
+static void copy_index_for_wb(struct page * page, struct map_node * m, int level)
+{
+	int i;
+	u32 * s = page_address(m->page);
+	u32 * d = page_address(page);
+	int skip = 0;
+
+	if (m->mn_start == 0) {
+		skip = PLOOP_MAP_OFFSET;
+		memcpy(d, s, skip * sizeof(u32));
+	}
+
+	for (i = skip; i < INDEX_PER_PAGE; i++) {
+		if (level != (m->levels ? m->levels[i] : MAP_LEVEL(m)))
+			d[i] = 0;
+		else
+			d[i] = s[i];
+	}
+}
+
+/* The data write is committed. Now we need to update the index. */
+
+void ploop_index_update(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct map_node * m = preq->map;
+	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	u32 idx;
+	map_index_t blk;
+	int old_level;
+	struct page * page;
+	sector_t sec;
+
+	/* No way back, we are going to initiate index write. */
+
+	idx = (preq->req_cluster + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+	blk = ((map_index_t *)page_address(m->page))[idx]  >> ploop_map_log(plo);
+	old_level = m->levels ? m->levels[idx] : MAP_LEVEL(m);
+
+	if (top_delta->level != old_level) {
+		if (m->levels == NULL) {
+			u8 * levels = kmalloc(INDEX_PER_PAGE, GFP_NOFS);
+			if (levels == NULL)
+				goto enomem;
+			memset(levels, MAP_LEVEL(m), INDEX_PER_PAGE);
+			m->levels = levels;
+		}
+	}
+
+	BUG_ON (test_bit(PLOOP_REQ_ZERO, &preq->state) && preq->iblock);
+	if (test_bit(PLOOP_REQ_ZERO, &preq->state) && !blk) {
+		printk("Either map_node is corrupted or bug in "
+		       "ploop-balloon (%u)\n", preq->req_cluster);
+		ploop_set_error(preq, -EIO);
+		goto corrupted;
+	}
+
+	if (blk == preq->iblock && top_delta->level == old_level)
+		goto out;
+
+	if (test_and_set_bit(PLOOP_MAP_WRITEBACK, &m->state)) {
+		preq->eng_state = PLOOP_E_INDEX_DELAY;
+		list_add_tail(&preq->list, &m->io_queue);
+		__TRACE("d %p %u %p\n", preq, preq->req_cluster, m);
+		return;
+	}
+
+	page = alloc_page(GFP_NOFS);
+	if (page == NULL) {
+		clear_bit(PLOOP_MAP_WRITEBACK, &m->state);
+		goto enomem;
+	}
+
+	copy_index_for_wb(page, m, top_delta->level);
+
+	((map_index_t*)page_address(page))[idx] = preq->iblock << ploop_map_log(plo);
+
+	preq->eng_state = PLOOP_E_INDEX_WB;
+	get_page(page);
+	preq->sinfo.wi.tpage = page;
+
+	__TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
+	plo->st.map_single_writes++;
+	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
+	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
+				      !!(preq->req_rw & BIO_FUA));
+	put_page(page);
+	return;
+
+enomem:
+	ploop_set_error(preq, -ENOMEM);
+corrupted:
+	set_bit(PLOOP_S_ABORT, &plo->state);
+out:
+	preq->eng_state = PLOOP_E_COMPLETE;
+	spin_lock_irq(&plo->lock);
+	list_add_tail(&preq->list, &plo->ready_queue);
+	spin_unlock_irq(&plo->lock);
+	return;
+}
+EXPORT_SYMBOL(ploop_index_update);
+
+int map_index(struct ploop_delta * delta, struct ploop_request * preq, unsigned long *sec)
+{
+	return delta->ops->map_index(delta, preq->map->mn_start, sec);
+}
+EXPORT_SYMBOL(map_index);
+
+struct ploop_delta * map_writable_delta(struct ploop_request * preq)
+{
+	struct map_node * m = preq->map;
+
+	if (m == NULL)
+		return ploop_top_delta(preq->plo);
+	else
+		return map_top_delta(m->parent);
+}
+EXPORT_SYMBOL(map_writable_delta);
+
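+/*
+ * Swap the index stored in the map page with *iblk: the new block goes
+ * into the map while the displaced one is returned to the caller, which
+ * still needs it afterwards (RELOC_A and ZERO handling in
+ * map_wb_complete()).
+ */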
+static void map_idx_swap(struct map_node *m, unsigned int idx,
+			 iblock_t *iblk, int log)
+{
+	iblock_t iblk2 = ((map_index_t*)page_address(m->page))[idx] >> log;
+	((map_index_t*)page_address(m->page))[idx] = *iblk << log;
+	*iblk = iblk2;
+}
+
+static inline void requeue_req(struct ploop_request *preq,
+			       unsigned long new_eng_state)
+{
+	preq->eng_state = new_eng_state;
+	spin_lock_irq(&preq->plo->lock);
+	list_del(&preq->list);
+	list_add_tail(&preq->list, &preq->plo->ready_queue);
+	spin_unlock_irq(&preq->plo->lock);
+}
+
+/*
+ * Index write-back for the given preq has happened: map_wb_complete()
+ * found the preq in m->io_queue in the PLOOP_E_INDEX_WB eng_state and
+ * updated the in-core page of the L2 table with preq->iblock. Now it's
+ * time to either finalize the preq (the main case), setting eng_state
+ * to PLOOP_E_COMPLETE, or process it further (the RELOC_[A|S] case).
+ */
+static void map_wb_complete_post_process(struct ploop_map *map,
+					 struct ploop_request *preq, int err)
+{
+	struct ploop_device *plo       = map->plo;
+	struct ploop_delta  *top_delta = map_top_delta(map);
+	struct bio_list sbl;
+	int i;
+
+	if (likely(err ||
+		   (!test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
+		    !test_bit(PLOOP_REQ_RELOC_S, &preq->state)))) {
+
+		requeue_req(preq, PLOOP_E_COMPLETE);
+		return;
+	}
+
+	if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+		spin_lock_irq(&plo->lock);
+		del_lockout(preq);
+		map_release(preq->map);
+		preq->map = NULL;
+		spin_unlock_irq(&plo->lock);
+
+		requeue_req(preq, PLOOP_E_RELOC_COMPLETE);
+		return;
+	}
+
+	BUG_ON (!test_bit(PLOOP_REQ_RELOC_A, &preq->state));
+	BUG_ON (!preq->aux_bio);
+
+	sbl.head = sbl.tail = preq->aux_bio;
+	preq->eng_state = PLOOP_E_RELOC_NULLIFY;
+	list_del_init(&preq->list);
+	for (i = 0; i < preq->aux_bio->bi_vcnt; i++)
+		memset(page_address(preq->aux_bio->bi_io_vec[i].bv_page),
+		       0, PAGE_SIZE);
+
+	top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+				  &sbl, preq->iblock, 1<<plo->cluster_log);
+}
+
+static void map_wb_complete(struct map_node * m, int err)
+{
+	struct ploop_device * plo = m->parent->plo;
+	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	struct list_head * cursor, * tmp;
+	struct ploop_request * main_preq;
+	struct page * page = NULL;
+	int delayed = 0;
+	unsigned int idx;
+	sector_t sec;
+	int fua;
+
+	/* First, complete the processing of written-back indices and
+	 * finally instantiate the indices in the mapping cache.
+	 */
+	list_for_each_safe(cursor, tmp, &m->io_queue) {
+		struct ploop_request * preq;
+
+		preq = list_entry(cursor, struct ploop_request, list);
+
+		switch (preq->eng_state) {
+		case PLOOP_E_ENTRY:
+			break;
+		case PLOOP_E_INDEX_WB:
+			idx = (preq->req_cluster + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+			if (!err) {
+				struct ploop_request *pr = preq;
+
+				if (unlikely(test_bit(PLOOP_REQ_ZERO, &preq->state))) {
+					BUG_ON (list_empty(&preq->delay_list));
+					pr = list_first_entry(&preq->delay_list,
+							      struct ploop_request,
+							      list);
+				}
+
+				if (unlikely(test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
+					     test_bit(PLOOP_REQ_ZERO, &preq->state)))
+					map_idx_swap(m, idx, &pr->iblock,
+						     ploop_map_log(plo));
+				else
+					((map_index_t*)page_address(m->page))[idx] =
+						pr->iblock << ploop_map_log(plo);
+
+				if (m->levels) {
+					m->levels[idx] = top_delta->level;
+				} else {
+					BUG_ON(MAP_LEVEL(m) != top_delta->level);
+				}
+			} else {
+				ploop_set_error(preq, err);
+			}
+			put_page(preq->sinfo.wi.tpage);
+			preq->sinfo.wi.tpage = NULL;
+			map_wb_complete_post_process(m->parent, preq, err);
+			break;
+		case PLOOP_E_INDEX_DELAY:
+			if (err) {
+				ploop_set_error(preq, err);
+				preq->eng_state = PLOOP_E_COMPLETE;
+				spin_lock_irq(&plo->lock);
+				list_del(cursor);
+				list_add_tail(cursor, &preq->plo->ready_queue);
+				spin_unlock_irq(&plo->lock);
+			} else {
+				delayed++;
+			}
+			break;
+		}
+	}
+
+	if (!delayed) {
+		clear_bit(PLOOP_MAP_WRITEBACK, &m->state);
+		return;
+	}
+
+	page = alloc_page(GFP_NOFS);
+	if (page)
+		copy_index_for_wb(page, m, top_delta->level);
+
+	main_preq = NULL;
+	fua = 0;
+
+	list_for_each_safe(cursor, tmp, &m->io_queue) {
+		struct ploop_request * preq;
+
+		preq = list_entry(cursor, struct ploop_request, list);
+
+		switch (preq->eng_state) {
+		case PLOOP_E_INDEX_DELAY:
+			if (page == NULL) {
+				ploop_set_error(preq, -ENOMEM);
+				preq->eng_state = PLOOP_E_COMPLETE;
+				spin_lock_irq(&plo->lock);
+				list_del(cursor);
+				list_add_tail(cursor, &plo->ready_queue);
+				spin_unlock_irq(&plo->lock);
+				break;
+			}
+
+			if (preq->req_rw & BIO_FUA)
+				fua = 1;
+
+			preq->eng_state = PLOOP_E_INDEX_WB;
+			get_page(page);
+			preq->sinfo.wi.tpage = page;
+			idx = (preq->req_cluster + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+
+			((map_index_t*)page_address(page))[idx] = preq->iblock << ploop_map_log(plo);
+
+			if (!main_preq) {
+				main_preq = preq;
+				list_del_init(&main_preq->list);
+			}
+			plo->st.map_multi_updates++;
+		}
+	}
+
+	if (!page) {
+		/* Writes are discarded */
+		clear_bit(PLOOP_MAP_WRITEBACK, &m->state);
+		return;
+	}
+
+	__TRACE("wbi2 %p %u %p\n", main_preq, main_preq->req_cluster, m);
+	plo->st.map_multi_writes++;
+	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
+	top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, fua);
+	put_page(page);
+}
+
+void
+ploop_index_wb_complete(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct map_node * m = preq->map;
+
+	spin_lock_irq(&plo->lock);
+	list_add_tail(&preq->list, &m->io_queue);
+	spin_unlock_irq(&plo->lock);
+
+	map_wb_complete(m, preq->error);
+}
+
+void ploop_map_start(struct ploop_map * map, u64 bd_size)
+{
+	struct ploop_device * plo = map->plo;
+
+	map->max_index = (bd_size + (1 << plo->cluster_log) - 1 ) >> plo->cluster_log;
+	map->flags = 0;
+}
+
+
+static void map_wait(struct ploop_map * map)
+{
+	DEFINE_WAIT(_wait);
+	prepare_to_wait(&map->destroy_waitq, &_wait, TASK_UNINTERRUPTIBLE);
+
+	spin_unlock(&map_lru_lock);
+	spin_unlock_irq(&map->plo->lock);
+	io_schedule();
+	spin_lock_irq(&map->plo->lock);
+	spin_lock(&map_lru_lock);
+
+	finish_wait(&map->destroy_waitq, &_wait);
+}
+
+void ploop_map_destroy(struct ploop_map * map)
+{
+	int i;
+	struct rb_node * node;
+
+	spin_lock_irq(&map->plo->lock);
+	set_bit(PLOOP_MAP_DEAD, &map->flags);
+
+	for (i = 0; i < map->lru_buffer_ptr; i++)
+		atomic_dec(&map->lru_buffer[i]->refcnt);
+
+	map->lru_buffer_ptr = 0;
+
+	spin_lock(&map_lru_lock);
+	while ((node = map->rb_root.rb_node) != NULL) {
+		struct map_node * m = rb_entry(node, struct map_node, rb_link);
+		/* refcnt can be non-zero if and only if this node is grabbed
+		 * by map_lru_scan() and in flight between releasing
+		 * map_lru_lock and taking plo->lock. We can skip such an
+		 * entry: it will be destroyed by map_lru_scan() itself,
+		 * because we have set PLOOP_MAP_DEAD.
+		 */
+		if (atomic_read(&m->refcnt) == 0)
+			map_node_destroy(m);
+		else
+			map_wait(map);
+	}
+	spin_unlock(&map_lru_lock);
+	spin_unlock_irq(&map->plo->lock);
+	BUG_ON(map->pages);
+}
+
+void ploop_map_remove_delta(struct ploop_map * map, int level)
+{
+	/* For now. */
+	ploop_map_destroy(map);
+}
+
+
+int __init ploop_map_init(void)
+{
+	ploop_map_cache = kmem_cache_create("ploop_map",
+						sizeof(struct map_node), 0,
+						SLAB_MEM_SPREAD, NULL
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+						, NULL
+#endif
+						);
+	if (!ploop_map_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ploop_map_exit(void)
+{
+	if (ploop_map_cache)
+		kmem_cache_destroy(ploop_map_cache);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/ploop1_image.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/ploop1_image.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/ploop1_image.h	2015-01-21 12:02:54.713921351 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/ploop1_image.h	2015-01-21 12:02:57.718841587 +0300
@@ -0,0 +1,401 @@
+#ifndef __PLOOP1_IMAGE_H__
+#define __PLOOP1_IMAGE_H__ 1
+
+/* Definition of PVD (Parallels Virtual Disk) format
+ *
+ * 1. All the data are in little-endian format.
+ * 2. All the data except for the first cluster are aligned and padded
+ *    to the cluster size. The first cluster is the exception: it
+ *    combines the PVD header (the first 64 bytes of the cluster) with
+ *    the L2 index table (an array of block indices).
+ * 3. Image size must be a multiple of the cluster size. If it is not,
+ *    we assume it is the result of an image extension that failed in
+ *    the middle of a transaction, therefore new allocations start at
+ *    the size rounded down to the cluster size.
+ * 4. Indices must be updated only after the data clusters are
+ *    committed to reliable storage. If we fail to update an index,
+ *    we can get an unused and, maybe, uninitialized or partially
+ *    initialized data cluster. It is lost, forgotten and ignored
+ *    until repair or image rebuild.
+ */
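+
+/* An illustrative sketch of the resulting layout (derived from the
+ * rules above, assuming the default 256K cluster):
+ *
+ *   cluster 0    : PVD header (64 bytes) | L2 index table | padding
+ *   cluster 1..N : data clusters, each aligned and padded to 256K
+ *
+ * m_FirstBlockOffset in the header records where the data clusters
+ * start (in sectors).
+ */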
+
+/*
+ * copy/paste of IMAGE_PARAMETERS from DiskImageComp.h
+ */
+#pragma pack(push,1)
+struct ploop_pvd_header
+{
+	__u8  m_Sig[16];          /* Signature */
+	__u32 m_Type;             /* Disk type */
+	__u32 m_Heads;            /* heads count */
+	__u32 m_Cylinders;        /* tracks count */
+	__u32 m_Sectors;          /* Sectors per track count */
+	__u32 m_Size;             /* Size of disk in tracks */
+	union {                   /* Size of disk in 512-byte sectors */
+		struct {
+			__u32 m_SizeInSectors_v1;
+			__u32 Unused;
+		};
+		__u64 m_SizeInSectors_v2;
+	};
+	__u32 m_DiskInUse;        /* Disk in use */
+	__u32 m_FirstBlockOffset; /* First data block offset (in sectors) */
+	__u32 m_Flags;            /* Misc flags */
+	__u8  m_Reserved[8];      /* Reserved */
+};
+#pragma pack(pop)
+
+/* Compressed disk (version 1) */
+#define PRL_IMAGE_COMPRESSED            2
+
+/* Compressed disk v1 signature */
+#define SIGNATURE_STRUCTURED_DISK_V1 "WithoutFreeSpace"
+
+/* Compressed disk v2 signature */
+#define SIGNATURE_STRUCTURED_DISK_V2 "WithouFreSpacExt"
+
+/* Sign that the disk is in "using" state */
+#define SIGNATURE_DISK_IN_USE		0x746F6E59
+
+/**
+ * Compressed disk image flags
+ */
+#define	CIF_NoFlags		0x00000000 /* No any flags */
+#define	CIF_Empty		0x00000001 /* No any data was written */
+#define	CIF_Invalid		0xFFFFFFFF /* Invalid flag */
+
+
+#define PLOOP1_SECTOR_LOG	9
+#define PLOOP1_DEF_CLUSTER_LOG	9 /* 256K cluster-block */
+#define CLUSTER (1UL << (PLOOP1_DEF_CLUSTER_LOG + PLOOP1_SECTOR_LOG))
+
+/* Helpers to generate PVD-header based on requested bdsize */
+
+#define DEFAULT_HEADS_COUNT   16
+#define DEFAULT_SECTORS_COUNT 63
+#define SECTOR_SIZE (1 << 9)
+
+struct CHSData
+{
+	__u32 Sectors;
+	__u32 Heads;
+	__u32 Cylinders;
+};
+
+#ifdef __KERNEL__
+# define ploop_do_div(n, base) do_div(n, base)
+#else
+# define ploop_do_div(n, base) ({		\
+	__u32 __rem = n % base;			\
+	n /= base;				\
+	__rem;					\
+ })
+#endif
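+
+/*
+ * Note: like the kernel's do_div(), ploop_do_div(n, base) divides n in
+ * place and evaluates to the remainder.  For example, with n = 1000 and
+ * base = 63, ploop_do_div(n, 63) yields 55 and leaves n == 15.
+ */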
+/*
+ * Try to derive the disk's sectors-per-track value
+ */
+static inline __u32
+CalcSectors(const __u64 uiSize)
+{
+	__u64 size = uiSize;
+
+	/* Try to determine sector count */
+	if (!ploop_do_div(size, DEFAULT_SECTORS_COUNT))
+		return DEFAULT_SECTORS_COUNT;
+
+	if (!(uiSize % 32))
+		return 32;
+
+	if (!(uiSize % 16))
+		return 16;
+
+	if (!(uiSize % 8))
+		return 8;
+
+	return ~0;
+}
+
+/*
+ * Try to derive the disk's heads value
+ */
+static inline __u32
+CalcHeads(const __u64 uiSize)
+{
+	__u64 size = uiSize;
+
+	/* Try to determine heads count */
+	if (!ploop_do_div(size, DEFAULT_HEADS_COUNT))
+		return DEFAULT_HEADS_COUNT;
+
+	if (!(uiSize % 8))
+		return 8;
+
+	if (!(uiSize % 4))
+		return 4;
+
+	if (!(uiSize % 2))
+		return 2;
+
+	return ~0;
+}
+
+/*
+ * Convert size to CHS for disks from 504 MB to 8 GB
+ */
+static inline void
+ConvertToCHSLow(__u64 From, struct CHSData *chs)
+{
+	chs->Sectors = DEFAULT_SECTORS_COUNT;
+	chs->Heads = DEFAULT_HEADS_COUNT;
+	ploop_do_div(From, DEFAULT_SECTORS_COUNT * DEFAULT_HEADS_COUNT);
+	chs->Cylinders = From;
+}
+
+/*
+ * Convert size to pure LBA config
+ */
+static inline void
+ConvertToPureLBA(__u64 From, struct CHSData *chs)
+{
+	chs->Sectors = 1;
+	chs->Heads = 1;
+	chs->Cylinders = From;
+}
+
+static inline void
+ConvertToCHS(__u64 From, struct CHSData *chs)
+{
+	__u64 Size;
+
+	/*
+	 * According to the ATA-2 specs:
+	 *  - if the device is above 1,032,192 sectors, the value should be 63;
+	 *    this value never exceeds 63 (3Fh). Note, however, that if the
+	 *    device size is above 16,777,216 sectors, the HDD reports the
+	 *    proper 'magic' numbers in the CHS values, so the range in the
+	 *    middle must be handled separately.
+	 */
+	if ((From > 1032192) && (From < 16777216))
+	{
+		ConvertToCHSLow(From, chs);
+		return;
+	}
+
+	Size = From;
+
+	/* Store size */
+	chs->Sectors = CalcSectors(Size);
+
+	if (chs->Sectors == (__u32)~0)
+		goto PureLBA;
+
+	ploop_do_div(Size, chs->Sectors);
+
+	chs->Heads = CalcHeads(Size);
+
+	if (chs->Heads == (__u32)~0)
+		goto PureLBA;
+
+	ploop_do_div(Size, chs->Heads);
+
+	chs->Cylinders = Size;
+
+	return;
+
+PureLBA:
+	ConvertToPureLBA(From, chs);
+}
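+
+/* Worked example (illustrative): a 2 GiB disk is 4194304 sectors,
+ * which falls into the (1032192, 16777216) range above, so it gets
+ * the fixed geometry of 63 sectors x 16 heads, with
+ * 4194304 / (63 * 16) = 4161 cylinders.
+ */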
+
+static inline __u32
+GetHeaderSize(__u32 m_Size)
+{
+	__u32 Size = sizeof(struct ploop_pvd_header);
+
+	/* Add BAT */
+	Size += m_Size * sizeof(__u32);
+	/* Align to size of sector */
+	Size = (Size + SECTOR_SIZE - 1) & ~(SECTOR_SIZE - 1);
+
+	return Size;
+}
+
+static inline char *
+ploop1_signature(int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		return SIGNATURE_STRUCTURED_DISK_V1;
+	case PLOOP_FMT_V2:
+		return SIGNATURE_STRUCTURED_DISK_V2;
+#ifdef __KERNEL__
+	default:
+		BUG();
+#endif
+	}
+
+	return NULL;
+}
+
+static inline int
+ploop1_version(struct ploop_pvd_header *vh)
+{
+	if (!memcmp(vh->m_Sig, SIGNATURE_STRUCTURED_DISK_V1, sizeof(vh->m_Sig)))
+		return PLOOP_FMT_V1;
+
+	if (!memcmp(vh->m_Sig, SIGNATURE_STRUCTURED_DISK_V2, sizeof(vh->m_Sig)))
+		return PLOOP_FMT_V2;
+
+	return -1;
+}
+
+static inline __u64
+ploop1_max_size(__u32 blocksize, int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		return (__u32)-1;
+	case PLOOP_FMT_V2:
+		return 0xffffffffUL * blocksize;
+	}
+
+	return 0;
+}
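+
+/* E.g. with the default 256K cluster (blocksize = 512 sectors), a v2
+ * image can address up to 0xffffffff * 512 sectors (~1 PiB), while a
+ * v1 image is capped at 2^32 - 1 sectors (~2 TiB).
+ */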
+
+#ifdef __KERNEL__
+static inline u64
+get_SizeInSectors_from_le(struct ploop_pvd_header *vh, int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		return le32_to_cpu(vh->m_SizeInSectors_v1);
+	case PLOOP_FMT_V2:
+		return le64_to_cpu(vh->m_SizeInSectors_v2);
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static inline void
+put_SizeInSectors(u64 SizeInSectors, struct ploop_pvd_header *vh,
+		  int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		vh->m_SizeInSectors_v1 = SizeInSectors;
+		break;
+	case PLOOP_FMT_V2:
+		vh->m_SizeInSectors_v2 = SizeInSectors;
+		break;
+	default:
+		BUG();
+	}
+}
+
+static inline void
+cpu_to_le_SizeInSectors(struct ploop_pvd_header *vh, int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		vh->m_SizeInSectors_v1 = cpu_to_le32(vh->m_SizeInSectors_v1);
+		break;
+	case PLOOP_FMT_V2:
+		vh->m_SizeInSectors_v2 = cpu_to_le64(vh->m_SizeInSectors_v2);
+		break;
+	default:
+		BUG();
+	}
+}
+#endif
+
+/*
+ * Returns: "size to fill" (in bytes)
+ *
+ * NB: m_Flags and m_DiskInUse are being kept as is; our caller
+ * should take care of them.
+ *
+ * NB: Both bdsize and blocksize are measured in sectors.
+ */
+static inline __u32
+generate_pvd_header(struct ploop_pvd_header *vh, __u64 bdsize, __u32 blocksize,
+		    int version)
+{
+	struct CHSData chs;
+	__u32 SizeToFill;
+	__u32 uiAlignmentSize;
+	__u64 SizeInSectors;
+
+	memcpy(vh->m_Sig, ploop1_signature(version) , sizeof(vh->m_Sig));
+	vh->m_Type = PRL_IMAGE_COMPRESSED;
+
+	/* Round up to block size */
+	SizeInSectors = bdsize + blocksize - 1;
+	ploop_do_div(SizeInSectors, blocksize);
+	SizeInSectors *= blocksize;
+	put_SizeInSectors(SizeInSectors, vh, version);
+
+	ConvertToCHS(SizeInSectors, &chs);
+
+	vh->m_Sectors = blocksize;
+	vh->m_Heads = chs.Heads;
+	vh->m_Cylinders = chs.Cylinders;
+
+	ploop_do_div(SizeInSectors, blocksize);
+	vh->m_Size = SizeInSectors;
+
+	uiAlignmentSize = blocksize << 9;
+	SizeToFill = GetHeaderSize(vh->m_Size);
+	/* Align to block size */
+	if (SizeToFill % uiAlignmentSize)
+		SizeToFill += uiAlignmentSize - (SizeToFill % uiAlignmentSize);
+
+	vh->m_FirstBlockOffset = SizeToFill >> 9;
+
+	return SizeToFill;
+}
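+
+/* Worked example (illustrative): for bdsize = 4194304 sectors (2 GiB)
+ * and blocksize = 512 sectors (256K clusters), SizeInSectors stays
+ * 4194304 (already block-aligned), m_Size = 8192 clusters, and the
+ * header is 64 + 8192 * 4 = 32832 bytes, sector-aligned to 33280 and
+ * then block-aligned to 262144 bytes; hence SizeToFill = 262144 and
+ * m_FirstBlockOffset = 512 sectors.
+ */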
+
+
+/* Translation of sector number to offset in image */
+
+#if 0
+
+/* These functions are not really used */
+
+/* Calculate virtual cluster number from virtual sector number */
+
+static inline __u32
+ploop1_cluster(struct ploop_img_header * info, __u64 sector)
+{
+	return sector >> info->cluster_log;
+}
+
+/* Get amount of clusters covered by one L2 table, 32K by default,
+ * which can map 4G of data
+ */
+static inline __u32
+ploop1_clusters_per_l2(struct ploop_img_header * info)
+{
+	return 1 << (info->cluster_log + info->sector_log - 2);
+}
+
+/* Calculate index in L1 table mapping a cluster. */
+
+static inline __u32
+ploop1_l1_index(struct ploop_img_header * info, __u32 cluster)
+{
+	return cluster >> (info->cluster_log + info->sector_log - 2);
+}
+
+/* Calculate index in L2 table mapping a cluster. */
+
+static inline __u32
+ploop1_l2_index(struct ploop_img_header * info, __u32 cluster)
+{
+	return cluster & (ploop1_clusters_per_l2(info) - 1);
+}
+
+/* That's all, simple and stupid */
+
+#endif
+
+#endif /* __PLOOP1_IMAGE_H__ */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/ploop_events.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/ploop_events.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/ploop_events.c	2015-01-21 12:02:55.232907575 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/ploop_events.c	2015-01-21 12:02:55.232907575 +0300
@@ -0,0 +1,9 @@
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+#define CREATE_TRACE_POINTS
+#include "ploop_events.h"
+
+EXPORT_TRACEPOINT_SYMBOL(submit);
+EXPORT_TRACEPOINT_SYMBOL(submit_alloc);
+EXPORT_TRACEPOINT_SYMBOL(cached_submit);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/ploop_events.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/ploop_events.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/ploop_events.h	2015-01-21 12:02:55.233907548 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/ploop_events.h	2015-01-21 12:02:55.359904204 +0300
@@ -0,0 +1,93 @@
+#if !defined(_TRACE_PLOOP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PLOOP_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ploop
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#include <linux/ploop/ploop.h>
+#include "events.h"
+
+DEFINE_EVENT(preq_template, submit,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, submit_alloc,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, cached_submit,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, complete_request,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, req_state_process,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, bio_queue,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, add_lockout,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, del_lockout,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+TRACE_EVENT(preq_lockout,
+	TP_PROTO(struct ploop_request *preq,
+		struct ploop_request *ppreq),
+
+	TP_ARGS(preq, ppreq),
+
+	TP_STRUCT__entry(
+		__field(void *,		ppreq)
+		__field(void *,		preq)
+		__field(cluster_t,	clu)
+		__field(iblock_t,	iblk)
+		__field(unsigned int,	size)
+		__field(unsigned long,	eng_state)
+		__field(unsigned long,	state)
+		__field(unsigned int,	rw)
+	),
+
+	TP_fast_assign(
+		__entry->preq		= preq;
+		__entry->ppreq		= ppreq;
+		__entry->clu		= preq->req_cluster;
+		__entry->iblk		= preq->iblock;
+		__entry->size		= preq->req_size;
+		__entry->eng_state	= preq->eng_state;
+		__entry->state		= preq->state;
+		__entry->rw		= preq->req_rw;
+	),
+
+	TP_printk("ppreq=%p "PREQ_FORMAT, __entry->ppreq, PREQ_ARGS)
+);
+
+DEFINE_EVENT(bio_template, make_request,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio));
+
+DEFINE_EVENT(bio_template, bio_fast_map,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio));
+
+#endif /* _TRACE_PLOOP_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE ploop_events
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/sysfs.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/sysfs.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/sysfs.c	2015-01-21 12:02:54.713921351 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/sysfs.c	2015-01-21 12:02:57.818838934 +0300
@@ -0,0 +1,679 @@
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <asm/uaccess.h>
+
+#include <linux/ploop/ploop.h>
+
+struct delta_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(struct ploop_delta *, char *);
+	ssize_t (*store)(struct ploop_delta *, const char *, size_t);
+};
+
+static ssize_t
+delta_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct delta_sysfs_entry *entry = container_of(attr, struct delta_sysfs_entry, attr);
+	struct ploop_delta *delta = container_of(kobj, struct ploop_delta, kobj);
+
+	if (!entry->show)
+		return -EIO;
+	return entry->show(delta, page);
+}
+
+static ssize_t
+delta_attr_store(struct kobject *kobj, struct attribute *attr,
+		 const char *page, size_t length)
+{
+	struct delta_sysfs_entry *entry = container_of(attr, struct delta_sysfs_entry, attr);
+	struct ploop_delta *delta = container_of(kobj, struct ploop_delta, kobj);
+
+	if (!entry->store)
+		return -EIO;
+
+	return entry->store(delta, page, length);
+}
+
+
+static struct sysfs_ops delta_sysfs_ops = {
+	.show	= delta_attr_show,
+	.store	= delta_attr_store,
+};
+
+static void release_delta(struct kobject *kobj)
+{
+	struct ploop_delta *delta = container_of(kobj, struct ploop_delta, kobj);
+
+	if (delta->ops)
+		ploop_format_put(delta->ops);
+	module_put(THIS_MODULE);
+	kfree(delta);
+}
+
+static ssize_t
+delta_var_show(unsigned int var, char *page)
+{
+	return sprintf(page, "%u\n", var);
+}
+
+static ssize_t
+delta_string_show(char * str, char *page)
+{
+	return sprintf(page, "%s\n", str);
+}
+
+static ssize_t delta_level_show(struct ploop_delta *delta, char *page)
+{
+	return delta_var_show(delta->level, page);
+}
+
+static ssize_t delta_image_show(struct ploop_delta *delta, char *page)
+{
+	char * res;
+	int len = -ENOENT;
+
+	mutex_lock(&delta->plo->sysfs_mutex);
+	if (delta->io.files.file) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+		res = d_path(delta->io.files.file->f_dentry,
+			     delta->io.files.file->f_vfsmnt,
+			     page, PAGE_SIZE-1);
+#else
+		res = d_path(&delta->io.files.file->f_path, page, PAGE_SIZE-1);
+#endif
+		len = PTR_ERR(res);
+		if (!IS_ERR(res)) {
+			len = strlen(res);
+			if (res != page)
+				memmove(page, res, len);
+			page[len] = '\n';
+			len++;
+		}
+	}
+	mutex_unlock(&delta->plo->sysfs_mutex);
+	return len;
+}
+
+static ssize_t delta_format_show(struct ploop_delta *delta, char *page)
+{
+	return delta_string_show(delta->ops->name, page);
+}
+
+static ssize_t delta_io_show(struct ploop_delta *delta, char *page)
+{
+	return delta_string_show(delta->io.ops->name, page);
+}
+
+static ssize_t delta_ro_show(struct ploop_delta *delta, char *page)
+{
+	return sprintf(page, "%d\n", !!(delta->flags & PLOOP_FMT_RDONLY));
+}
+
+static ssize_t delta_trans_show(struct ploop_delta *delta, char *page)
+{
+	struct ploop_device * plo = delta->plo;
+	int trans = 0;
+
+	mutex_lock(&delta->plo->sysfs_mutex);
+	if (plo->trans_map && map_top_delta(plo->trans_map) == delta)
+		trans = 1;
+	mutex_unlock(&delta->plo->sysfs_mutex);
+	return sprintf(page, "%d\n", trans);
+}
+
+static ssize_t delta_dump(struct ploop_delta *delta, char *page)
+{
+	int ret = delta->io.ops->dump ? delta->io.ops->dump(&delta->io) : -1;
+	return sprintf(page, "%d\n", ret);
+}
+
+static struct delta_sysfs_entry delta_level_entry = {
+	.attr = {.name = "level", .mode = S_IRUGO },
+	.show = delta_level_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_image_entry = {
+	.attr = {.name = "image", .mode = S_IRUGO },
+	.show = delta_image_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_format_entry = {
+	.attr = {.name = "format", .mode = S_IRUGO },
+	.show = delta_format_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_io_entry = {
+	.attr = {.name = "io", .mode = S_IRUGO },
+	.show = delta_io_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_ro_entry = {
+	.attr = {.name = "ro", .mode = S_IRUGO },
+	.show = delta_ro_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_trans_entry = {
+	.attr = {.name = "transparent", .mode = S_IRUGO },
+	.show = delta_trans_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_dump_entry = {
+	.attr = {.name = "dump", .mode = S_IRUGO },
+	.show = delta_dump,
+};
+
+static struct attribute *default_attrs[] = {
+	&delta_level_entry.attr,
+	&delta_image_entry.attr,
+	&delta_format_entry.attr,
+	&delta_io_entry.attr,
+	&delta_ro_entry.attr,
+	&delta_trans_entry.attr,
+	&delta_dump_entry.attr,
+	NULL,
+};
+
+struct kobj_type ploop_delta_ktype = {
+	.sysfs_ops	= &delta_sysfs_ops,
+	.default_attrs	= default_attrs,
+	.release	= release_delta,
+};
+
+
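+/*
+ * X-macro trick: <linux/ploop/ploop_stat.h> presumably expands __DO(x)
+ * once per statistics counter.  Including it three times with three
+ * different __DO definitions generates the struct of attributes, their
+ * initializers and the NULL-terminated pointer array, all in the same
+ * order as the u32 counters of plo->st -- which is what allows
+ * pstat_show()/pstat_store() below to index plo->st by attribute
+ * position.
+ */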
+static struct {
+#define __DO(_at)	struct attribute _at;
+#include <linux/ploop/ploop_stat.h>
+#undef __DO
+} _attr_arr = {
+#define __DO(_at)	._at = { .name = __stringify(_at), .mode = S_IRUGO|S_IWUSR, },
+#include <linux/ploop/ploop_stat.h>
+#undef __DO
+};
+
+static struct attribute *stats_attributes[] = {
+#define __DO(_at) &_attr_arr._at,
+#include <linux/ploop/ploop_stat.h>
+#undef __DO
+	NULL
+};
+
+static const struct attribute_group stats_group = {
+	.attrs = stats_attributes,
+};
+
+
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+#define to_disk(obj) dev_to_disk(container_of(obj,struct device,kobj))
+#else
+#define to_disk(obj) container_of(obj,struct gendisk,kobj)
+#endif
+
+static ssize_t pstat_show(struct kobject *kobj, struct attribute *attr,
+			  char *page)
+{
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	int n;
+
+	n = attr - (struct attribute *)&_attr_arr;
+
+	return sprintf(page, "%u\n", ((u32*)&plo->st)[n]);
+}
+
+static ssize_t pstat_store(struct kobject * kobj, struct attribute * attr,
+			   const char *page, size_t count)
+{
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	char *p = (char *) page;
+	unsigned long var;
+	int n;
+
+	var = simple_strtoul(p, &p, 10);
+
+	n = attr - (struct attribute *)&_attr_arr;
+	((u32*)&plo->st)[n] = var;
+	return count;
+}
+
+static u32 show_block_size(struct ploop_device * plo)
+{
+	return 1 << plo->cluster_log;
+}
+
+static u32 show_fmt_version(struct ploop_device * plo)
+{
+	return plo->fmt_version;
+}
+
+static u32 show_total_bios(struct ploop_device * plo)
+{
+	return plo->bio_total;
+}
+
+static u32 show_queued_bios(struct ploop_device * plo)
+{
+	return plo->bio_qlen;
+}
+
+static u32 show_discard_bios(struct ploop_device * plo)
+{
+	return plo->bio_discard_qlen;
+}
+
+static u32 show_active_reqs(struct ploop_device * plo)
+{
+	return plo->active_reqs;
+}
+
+static u32 show_entry_read_sync_reqs(struct ploop_device * plo)
+{
+	return plo->read_sync_reqs;
+}
+
+static u32 show_entry_reqs(struct ploop_device * plo)
+{
+	return plo->entry_qlen;
+}
+
+static u32 show_barrier_reqs(struct ploop_device * plo)
+{
+	return plo->barrier_reqs;
+}
+
+static u32 show_fsync_reqs(struct ploop_device * plo)
+{
+	u32 qlen = 0;
+	mutex_lock(&plo->sysfs_mutex);
+	if (!list_empty(&plo->map.delta_list))
+		qlen = ploop_top_delta(plo)->io.fsync_qlen;
+	mutex_unlock(&plo->sysfs_mutex);
+	return qlen;
+}
+
+static u32 show_fastpath_reqs(struct ploop_device * plo)
+{
+	return plo->fastpath_reqs;
+}
+
+static u32 show_map_pages(struct ploop_device * plo)
+{
+	return plo->map.pages;
+}
+
+static u32 show_running(struct ploop_device * plo)
+{
+	return test_bit(PLOOP_S_RUNNING, &plo->state);
+}
+
+static u32 show_locked(struct ploop_device * plo)
+{
+	return test_bit(PLOOP_S_LOCKED, &plo->locking_state);
+}
+
+static u32 show_aborted(struct ploop_device * plo)
+{
+	return test_bit(PLOOP_S_ABORT, &plo->state);
+}
+
+static int store_aborted(struct ploop_device * plo, u32 val)
+{
+	printk(KERN_INFO "ploop: Force %s aborted state for ploop%d\n",
+	       val ? "set" : "clear", plo->index);
+
+	if (val)
+		set_bit(PLOOP_S_ABORT, &plo->state);
+	else
+		clear_bit(PLOOP_S_ABORT, &plo->state);
+	return 0;
+}
+
+static u32 show_top(struct ploop_device * plo)
+{
+	int top = -1;
+
+	mutex_lock(&plo->sysfs_mutex);
+	if (!list_empty(&plo->map.delta_list))
+		top = ploop_top_delta(plo)->level;
+	if (plo->trans_map)
+		top++;
+	mutex_unlock(&plo->sysfs_mutex);
+	return (u32)top;
+}
+
+static inline u32 get_event_locked(struct ploop_device * plo)
+{
+	if (test_and_clear_bit(PLOOP_S_ENOSPC_EVENT, &plo->state))
+		return PLOOP_EVENT_ENOSPC;
+	else if (test_bit(PLOOP_S_ABORT, &plo->state))
+		return PLOOP_EVENT_ABORTED;
+	else if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return PLOOP_EVENT_STOPPED;
+
+	return 0;
+}
+
+static u32 show_event(struct ploop_device * plo)
+{
+	u32 ret;
+
+	DEFINE_WAIT(_wait);
+	spin_lock_irq(&plo->lock);
+
+	ret = get_event_locked(plo);
+	if (ret) {
+		spin_unlock_irq(&plo->lock);
+		return ret;
+	}
+
+	prepare_to_wait(&plo->event_waitq, &_wait, TASK_INTERRUPTIBLE);
+	spin_unlock_irq(&plo->lock);
+	schedule();
+	spin_lock_irq(&plo->lock);
+	finish_wait(&plo->event_waitq, &_wait);
+
+	ret = get_event_locked(plo);
+
+	spin_unlock_irq(&plo->lock);
+	return ret;
+}
+
+static u32 show_open_count(struct ploop_device * plo)
+{
+	return atomic_read(&plo->open_count);
+}
+
+static ssize_t print_cookie(struct ploop_device * plo, char * page)
+{
+	return sprintf(page, "%s\n", plo->cookie);
+}
+
+#define _TUNE_U32(_name)				\
+static u32 show_##_name(struct ploop_device * plo)	\
+{							\
+	return plo->tune._name;				\
+}							\
+							\
+static int store_##_name(struct ploop_device * plo, u32 val) \
+{							\
+	plo->tune._name = val;				\
+	return 0;					\
+}
+
+#define _TUNE_JIFFIES(_name)				\
+static u32 show_##_name(struct ploop_device * plo)	\
+{							\
+	return (plo->tune._name * 1000) / HZ;		\
+}							\
+							\
+static int store_##_name(struct ploop_device * plo, u32 val) \
+{							\
+	plo->tune._name = (val * HZ) / 1000;		\
+	return 0;					\
+}
+
+#define _TUNE_BOOL	_TUNE_U32
+
+_TUNE_U32(max_requests);
+_TUNE_U32(batch_entry_qlen);
+_TUNE_JIFFIES(batch_entry_delay);
+_TUNE_U32(fsync_max);
+_TUNE_JIFFIES(fsync_delay);
+_TUNE_BOOL(pass_flushes);
+_TUNE_BOOL(pass_fuas);
+_TUNE_BOOL(congestion_detection);
+_TUNE_BOOL(check_zeros);
+_TUNE_U32(min_map_pages);
+_TUNE_JIFFIES(max_map_inactivity);
+_TUNE_BOOL(disable_root_threshold);
+_TUNE_BOOL(disable_user_threshold);
+_TUNE_U32(congestion_high_watermark);
+_TUNE_U32(congestion_low_watermark);
+_TUNE_U32(max_active_requests);
+
+
+struct pattr_sysfs_entry {
+	struct attribute attr;
+	u32 (*show)(struct ploop_device *);
+	int (*store)(struct ploop_device *, __u32 val);
+	ssize_t (*print)(struct ploop_device *, char *page);
+};
+
+#define _A(_name) \
+&((struct pattr_sysfs_entry){ .attr = { .name = __stringify(_name), .mode = S_IRUGO }, .show = show_##_name, }).attr
+
+#define _A2(_name) \
+&((struct pattr_sysfs_entry){ .attr = { .name = __stringify(_name), .mode = S_IRUGO|S_IWUSR }, .show = show_##_name, .store = store_##_name, }).attr
+
+#define _A3(_name)							\
+&((struct pattr_sysfs_entry){ .attr = { .name = __stringify(_name), .mode = S_IRUGO }, .print = print_##_name, }).attr
+
+static struct attribute *state_attributes[] = {
+	_A(block_size),
+	_A(fmt_version),
+	_A(total_bios),
+	_A(queued_bios),
+	_A(discard_bios),
+	_A(active_reqs),
+	_A(entry_reqs),
+	_A(entry_read_sync_reqs),
+	_A(barrier_reqs),
+	_A(fastpath_reqs),
+	_A(fsync_reqs),
+	_A(map_pages),
+	_A(running),
+	_A(locked),
+	_A2(aborted),
+	_A(top),
+	_A(event),
+	_A3(cookie),
+	_A(open_count),
+	NULL
+};
+
+static struct attribute *tune_attributes[] = {
+	_A2(max_requests),
+	_A2(batch_entry_qlen),
+	_A2(batch_entry_delay),
+	_A2(fsync_max),
+	_A2(fsync_delay),
+	_A2(min_map_pages),
+	_A2(max_map_inactivity),
+	_A2(pass_flushes),
+	_A2(pass_fuas),
+	_A2(congestion_detection),
+	_A2(check_zeros),
+	_A2(disable_root_threshold),
+	_A2(disable_user_threshold),
+	_A2(congestion_high_watermark),
+	_A2(congestion_low_watermark),
+	_A2(max_active_requests),
+	NULL
+};
+
+static const struct attribute_group state_group = {
+	.attrs = state_attributes,
+};
+
+static const struct attribute_group tune_group = {
+	.attrs = tune_attributes,
+};
+
+static ssize_t
+pattr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct pattr_sysfs_entry *entry = container_of(attr, struct pattr_sysfs_entry, attr);
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	u32 val;
+
+	if (entry->print)
+		return entry->print(plo, page);
+
+	if (!entry->show)
+		return -EIO;
+	val = entry->show(plo);
+	return sprintf(page, "%u\n", val);
+}
+
+static ssize_t
+pattr_store(struct kobject *kobj, struct attribute *attr,
+	    const char *page, size_t length)
+{
+	struct pattr_sysfs_entry *entry = container_of(attr, struct pattr_sysfs_entry, attr);
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	char *p = (char *) page;
+	unsigned long var;
+	int err;
+
+	if (!entry->store)
+		return -EIO;
+
+	var = simple_strtoul(p, &p, 10);
+
+	err = entry->store(plo, var);
+	return err ? : length;
+}
+
+static struct sysfs_ops pattr_sysfs_ops = {
+	.show	= &pattr_show,
+	.store	= &pattr_store,
+};
+
+static struct sysfs_ops pstat_sysfs_ops = {
+	.show	= &pstat_show,
+	.store	= &pstat_store,
+};
+
+static void pattr_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
+static struct kobj_type pattr_ktype = {
+	.release	= pattr_release,
+	.sysfs_ops	= &pattr_sysfs_ops,
+};
+
+static struct kobj_type pstat_ktype = {
+	.release	= pattr_release,
+	.sysfs_ops	= &pstat_sysfs_ops,
+};
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+struct kobject *kobject_add_attr(struct gendisk *gd, const char *name,
+				 struct kobj_type * type)
+{
+	struct kobject *k;
+	int err;
+	struct kobject * parent = &disk_to_dev(gd)->kobj;
+
+	k = kzalloc(sizeof(*k), GFP_KERNEL);
+	if (!k)
+		return NULL;
+
+	kobject_init(k, type);
+
+	err = kobject_add(k, parent, "%s", name);
+	if (err) {
+		kobject_put(k);
+		return NULL;
+	}
+	return k;
+}
+#else
+struct kobject *kobject_add_attr(struct gendisk *gd, const char *name,
+				 struct kobj_type * type)
+{
+	struct kobject *k;
+	int err;
+	struct kobject * parent = &gd->kobj;
+
+	k = kzalloc(sizeof(*k), GFP_KERNEL);
+	if (!k)
+		return NULL;
+
+	snprintf(k->name, KOBJ_NAME_LEN, "%s", name);
+	k->ktype = type;
+	kobject_init(k);
+
+	k->parent = parent;
+	err = kobject_add(k);
+	if (err) {
+		kobject_put(k);
+		return NULL;
+	}
+	return k;
+}
+#endif
+
+void ploop_sysfs_init(struct ploop_device * plo)
+{
+	plo->pstat_dir = kobject_add_attr(plo->disk, "pstat", &pstat_ktype);
+	if (plo->pstat_dir) {
+		if (sysfs_create_group(plo->pstat_dir, &stats_group))
+			printk("ploop: unable to create pstat dir\n");
+	}
+	plo->pstate_dir = kobject_add_attr(plo->disk, "pstate", &pattr_ktype);
+	if (plo->pstate_dir) {
+		if (sysfs_create_group(plo->pstate_dir, &state_group))
+			printk("ploop: unable to create pstate dir\n");
+	}
+	plo->ptune_dir = kobject_add_attr(plo->disk, "ptune", &pattr_ktype);
+	if (plo->ptune_dir) {
+		if (sysfs_create_group(plo->ptune_dir, &tune_group))
+			printk("ploop: unable to create ptune dir\n");
+	}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+	if (kobject_add(&plo->kobj, kobject_get(&disk_to_dev(plo->disk)->kobj), "%s", "pdelta"))
+		printk("ploop: unable to create pdelta dir\n");
+#else
+	plo->kobj.parent = kobject_get(&plo->disk->kobj);
+	if (kobject_add(&plo->kobj))
+		printk("ploop: unable to create pdelta dir\n");
+#endif
+}
+
+void ploop_sysfs_uninit(struct ploop_device * plo)
+{
+	if (plo->pstat_dir) {
+		sysfs_remove_group(plo->pstat_dir, &stats_group);
+		kobject_del(plo->pstat_dir);
+		kobject_put(plo->pstat_dir);
+		plo->pstat_dir = NULL;
+	}
+	if (plo->pstate_dir) {
+		sysfs_remove_group(plo->pstate_dir, &state_group);
+		kobject_del(plo->pstate_dir);
+		kobject_put(plo->pstate_dir);
+		plo->pstate_dir = NULL;
+	}
+	if (plo->ptune_dir) {
+		sysfs_remove_group(plo->ptune_dir, &tune_group);
+		kobject_del(plo->ptune_dir);
+		kobject_put(plo->ptune_dir);
+		plo->ptune_dir = NULL;
+	}
+	kobject_del(&plo->kobj);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+	kobject_put(&disk_to_dev(plo->disk)->kobj);
+#else
+	kobject_put(&plo->disk->kobj);
+#endif
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/tracker.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/tracker.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/block/ploop/tracker.c	2015-01-21 12:02:54.714921325 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/block/ploop/tracker.c	2015-01-21 12:02:57.766840314 +0300
@@ -0,0 +1,286 @@
+/* Tracker engine detects and records changed clusters.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+
+#include <linux/ploop/ploop.h>
+
+struct track_record
+{
+	struct rb_node	rb_node;
+	u32		start;
+	u32		end;
+};
+
+static int tree_insert(struct rb_root *root, struct track_record *m)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct track_record * entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct track_record, rb_node);
+
+		if (m->start < entry->start)
+			p = &(*p)->rb_left;
+		else if (m->start >= entry->end)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&m->rb_node, parent, p);
+	rb_insert_color(&m->rb_node, root);
+	return 0;
+}
+
+void ploop_tracker_notify(struct ploop_device * plo, sector_t sec)
+{
+	struct track_record * m;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return;
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state))
+		return;
+
+	sec >>= plo->cluster_log;
+
+	m = kmalloc(sizeof(struct track_record), GFP_NOFS);
+	if (m == NULL) {
+		set_bit(PLOOP_S_TRACK_ABORT, &plo->state);
+		return;
+	}
+
+	m->start = sec;
+	m->end = sec + 1;
+
+	spin_lock(&plo->track_lock);
+	if (tree_insert(&plo->track_tree, m)) {
+		kfree(m);
+	} else {
+		struct rb_node * rb;
+		struct track_record * merge;
+
+		if (m->start != 0) {
+			rb = rb_prev(&m->rb_node);
+			if (rb) {
+				merge = rb_entry(rb, struct track_record, rb_node);
+				if (m->start == merge->end) {
+					m->start = merge->start;
+					rb_erase(&merge->rb_node, &plo->track_tree);
+					kfree(merge);
+				}
+			}
+		}
+
+		rb = rb_next(&m->rb_node);
+		if (rb) {
+			merge = rb_entry(rb, struct track_record, rb_node);
+			if (m->end == merge->start) {
+				m->end = merge->end;
+				rb_erase(&merge->rb_node, &plo->track_tree);
+				kfree(merge);
+			}
+		}
+	}
+	spin_unlock(&plo->track_lock);
+}
+EXPORT_SYMBOL(ploop_tracker_notify);
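+
+/*
+ * Merging sketch (illustrative): three notify calls that land in
+ * clusters 5, 7 and then 6 leave a single record [5,8) in the tree --
+ * inserting [6,7) coalesces first with its predecessor [5,6) and then
+ * with its successor [7,8).
+ */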
+
+int ploop_tracker_init(struct ploop_device * plo, unsigned long arg)
+{
+	struct ploop_track_extent e;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	ploop_quiesce(plo);
+
+	e.start = 0;
+	e.end = (u64)ploop_top_delta(plo)->io.alloc_head << (plo->cluster_log + 9);
+	if (copy_to_user((void*)arg, &e, sizeof(struct ploop_track_extent))) {
+		ploop_relax(plo);
+		return -EFAULT;
+	}
+
+	set_bit(PLOOP_S_TRACK, &plo->state);
+	plo->maintenance_type = PLOOP_MNTN_TRACK;
+	plo->track_end = 0;
+	plo->track_ptr = 0;
+	ploop_relax(plo);
+	return 0;
+}
+
+int ploop_tracker_setpos(struct ploop_device * plo, unsigned long arg)
+{
+	u64 pos;
+
+	if (copy_from_user(&pos, (void*)arg, sizeof(u64)))
+		return -EFAULT;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return -EINVAL;
+
+	pos >>= 9;
+
+	if (pos < plo->track_end) {
+		/* _XXX_ It would be good to trim the tail of the track
+		 * tree and to rewind tracking. We will implement this if
+		 * it turns out to be really useful.
+		 */
+		if (pos)
+			return -EINVAL;
+
+		ploop_quiesce(plo);
+
+		clear_bit(PLOOP_S_TRACK_ABORT, &plo->state);
+		ploop_tracker_destroy(plo, 1);
+
+		plo->track_end = pos;
+		plo->track_ptr = 0;
+
+		ploop_relax(plo);
+	} else
+		plo->track_end = pos;
+
+	return 0;
+}
+
+static struct track_record * find_record(struct rb_root * root, u32 start)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node * prev = NULL;
+
+	while (n) {
+		struct track_record * m;
+
+		m = rb_entry(n, struct track_record, rb_node);
+		prev = n;
+
+		if (start < m->start)
+			n = n->rb_left;
+		else if (start >= m->end)
+			n = n->rb_right;
+		else
+			return m;
+	}
+
+	while (prev && start >= rb_entry(prev, struct track_record, rb_node)->end)
+		prev = rb_next(prev);
+
+	if (!prev)
+		return NULL;
+
+	return rb_entry(prev, struct track_record, rb_node);
+}
+
+
+int ploop_tracker_read(struct ploop_device * plo, unsigned long arg)
+{
+	u64 ptr;
+	struct track_record * m;
+	struct ploop_delta * delta;
+	struct ploop_track_extent e;
+	int err;
+
+	if (copy_from_user(&ptr, (void*)arg, sizeof(u64)))
+		return -EFAULT;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return -EINVAL;
+
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state)) {
+		ploop_tracker_destroy(plo, 1);
+		return -ECONNABORTED;
+	}
+
+	delta = ploop_top_delta(plo);
+
+	spin_lock(&plo->track_lock);
+	m = find_record(&plo->track_tree, plo->track_ptr);
+	if (m == NULL) {
+		if (plo->track_end >= ((sector_t)delta->io.alloc_head << plo->cluster_log) &&
+		    plo->track_ptr)
+			m = find_record(&plo->track_tree, 0);
+	}
+
+	if (m) {
+		rb_erase(&m->rb_node, &plo->track_tree);
+		plo->track_ptr = m->end;
+	} else {
+		plo->track_ptr = 0;
+	}
+	spin_unlock(&plo->track_lock);
+
+	err = -EAGAIN;
+	if (m) {
+		e.start = (u64)m->start << (plo->cluster_log + 9);
+		e.end = (u64)m->end << (plo->cluster_log + 9);
+		kfree(m);
+		err = 0;
+	} else if (plo->track_end < ((sector_t)delta->io.alloc_head << plo->cluster_log)) {
+		e.start = (u64)plo->track_end << 9;
+		e.end = (u64)delta->io.alloc_head << (plo->cluster_log + 9);
+		err = 0;
+	}
+
+	if (!err && copy_to_user((void *)arg, &e, sizeof(e))) {
+		set_bit(PLOOP_S_TRACK_ABORT, &plo->state);
+		err = -EFAULT;
+	}
+
+	return err;
+}
+
+int ploop_tracker_stop(struct ploop_device * plo, int force)
+{
+	int err;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return 0;
+
+	ploop_quiesce(plo);
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state))
+		force = 1;
+	err = ploop_tracker_destroy(plo, force);
+	if (!err) {
+		clear_bit(PLOOP_S_TRACK, &plo->state);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+	}
+	ploop_relax(plo);
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state))
+		return -ECONNABORTED;
+	return err;
+}
+
+int ploop_tracker_destroy(struct ploop_device *plo, int force)
+{
+	struct rb_node * n;
+
+	if (RB_EMPTY_ROOT(&plo->track_tree))
+		return 0;
+
+	if (!force)
+		return -EBUSY;
+
+	spin_lock(&plo->track_lock);
+	while ((n = rb_first(&plo->track_tree)) != NULL) {
+		rb_erase(n, &plo->track_tree);
+		kfree(n);
+	}
+	spin_unlock(&plo->track_lock);
+	return 0;
+}
+
+void track_init(struct ploop_device * plo)
+{
+	plo->track_tree = RB_ROOT;
+	spin_lock_init(&plo->track_lock);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/char/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/drivers/char/Kconfig	2014-12-12 23:28:52.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/Kconfig	2015-01-21 12:02:47.713107198 +0300
@@ -459,6 +459,7 @@ config UNIX98_PTYS
 config DEVPTS_MULTIPLE_INSTANCES
 	bool "Support multiple instances of devpts"
 	depends on UNIX98_PTYS
+	default y if VE
 	default n
 	---help---
 	  Enable support for multiple instances of devpts filesystem.
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/char/keyboard.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/keyboard.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/char/keyboard.c	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/keyboard.c	2015-01-21 12:02:42.618242461 +0300
@@ -43,6 +43,7 @@
 #include <linux/reboot.h>
 #include <linux/notifier.h>
 #include <linux/jiffies.h>
+#include <linux/device.h>
 
 extern void ctrl_alt_del(void);
 
@@ -162,6 +163,7 @@ unsigned char kbd_sysrq_xlate[KEY_MAX + 
 static int sysrq_down;
 static int sysrq_alt_use;
 #endif
+int sysrq_key_scancode = KEY_SYSRQ;
 static int sysrq_alt;
 
 /*
@@ -1067,6 +1069,9 @@ static int emulate_raw(struct vc_data *v
 {
 	int code;
 
+	if (keycode == sysrq_key_scancode && sysrq_alt)
+		goto sysrq;
+
 	switch (keycode) {
 		case KEY_PAUSE:
 			put_queue(vc, 0xe1);
@@ -1085,6 +1090,7 @@ static int emulate_raw(struct vc_data *v
 			break;
 
 		case KEY_SYSRQ:
+sysrq:
 			/*
 			 * Real AT keyboards (that's what we're trying
 			 * to emulate here emit 0xe0 0x2a 0xe0 0x37 when
@@ -1179,7 +1185,8 @@ static void kbd_keycode(unsigned int key
 				printk(KERN_WARNING "keyboard.c: can't emulate rawmode for keycode %d\n", keycode);
 
 #ifdef CONFIG_MAGIC_SYSRQ	       /* Handle the SysRq Hack */
-	if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) {
+	if ((keycode == sysrq_key_scancode || keycode == KEY_SYSRQ) &&
+				(sysrq_down || (down == 1 && sysrq_alt))) {
 		if (!sysrq_down) {
 			sysrq_down = down;
 			sysrq_alt_use = sysrq_alt;
@@ -1311,7 +1318,7 @@ static void kbd_event(struct input_handl
  * likes it, it can open it and get events from it. In this (kbd_connect)
  * function, we should decide which VT to bind that keyboard to initially.
  */
-static int kbd_connect(struct input_handler *handler, struct input_dev *dev,
+static int __kbd_connect(struct input_handler *handler, struct input_dev *dev,
 			const struct input_device_id *id)
 {
 	struct input_handle *handle;
@@ -1350,13 +1357,81 @@ static int kbd_connect(struct input_hand
 	return error;
 }
 
-static void kbd_disconnect(struct input_handle *handle)
+static void __kbd_disconnect(struct input_handle *handle)
 {
 	input_close_device(handle);
 	input_unregister_handle(handle);
 	kfree(handle);
 }
 
+extern struct mutex input_mutex;
+/*
+ * To unbind the keyboard, write "unbind" to kbd_bind.
+ * To bind the keyboard to all TTYs, write "all" to kbd_bind (the default).
+ * Binding the keyboard to a specific TTY is not implemented.
+ */
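+/*
+ * Usage sketch (the exact sysfs path is illustrative; the attribute
+ * is created on the input device by kbd_connect() below):
+ *   echo unbind > /sys/class/input/inputN/kbd_bind
+ *   echo all    > /sys/class/input/inputN/kbd_bind
+ */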
+static ssize_t kbd_bind_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t len)
+{
+	struct list_head *node;
+	int ret = -EINVAL;
+	struct input_dev *idev;
+	char *s;
+
+	if (buf[len] != '\0')
+		return -EINVAL;
+
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	s = strchr(buf, '\n');
+	if (s)
+		*s = '\0';
+
+	mutex_lock(&input_mutex);
+	if (!strcmp(buf, "unbind")) {
+		list_for_each(node, &kbd_handler.h_list) {
+			struct input_handle *handle = to_handle_h(node);
+			idev = handle->dev;
+			if (&idev->dev == dev) {
+				__kbd_disconnect(handle);
+				ret = len;
+				break;
+			}
+		}
+	} else if (!strcmp(buf, "all")) {
+		idev = container_of(dev, struct input_dev, dev);
+		ret = __kbd_connect(&kbd_handler, idev, NULL);
+		if (!ret)
+			ret = len;
+	}
+	mutex_unlock(&input_mutex);
+
+	return ret;
+}
+
+static DEVICE_ATTR(kbd_bind, S_IWUSR, NULL , kbd_bind_store);
+
+static int kbd_connect(struct input_handler *handler, struct input_dev *dev,
+			const struct input_device_id *id)
+{
+	int error;
+	error = device_create_file(&dev->dev, &dev_attr_kbd_bind);
+	if (error)
+		return error;
+	error = __kbd_connect(handler, dev, id);
+	if (error)
+		device_remove_file(&dev->dev, &dev_attr_kbd_bind);
+	return error;
+}
+
+static void kbd_disconnect(struct input_handle *handle)
+{
+	device_remove_file(&handle->dev->dev, &dev_attr_kbd_bind);
+	__kbd_disconnect(handle);
+}
+
 /*
  * Start keyboard handler on the new keyboard by refreshing LED state to
  * match the rest of the system.
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/char/mem.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/mem.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/char/mem.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/mem.c	2015-01-21 12:02:44.938180868 +0300
@@ -845,6 +845,9 @@ static ssize_t kmsg_write(struct file * 
 	char *tmp;
 	ssize_t ret;
 
+	if (!ve_is_super(get_exec_env()))
+		return count;
+
 	tmp = kmalloc(count + 1, GFP_KERNEL);
 	if (tmp == NULL)
 		return -ENOMEM;
@@ -921,12 +924,13 @@ static const struct file_operations memo
 	.open		= memory_open,
 };
 
-static char *mem_devnode(struct device *dev, mode_t *mode)
+char *mem_devnode(struct device *dev, mode_t *mode)
 {
 	if (mode && devlist[MINOR(dev->devt)].mode)
 		*mode = devlist[MINOR(dev->devt)].mode;
 	return NULL;
 }
+EXPORT_SYMBOL(mem_devnode);
 
 static struct class *mem_class;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/char/pty.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/pty.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/char/pty.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/pty.c	2015-01-21 12:02:47.783105339 +0300
@@ -29,6 +29,8 @@
 #include <linux/bitops.h>
 #include <linux/devpts_fs.h>
 
+#include <bc/misc.h>
+
 #include <asm/system.h>
 
 #ifdef CONFIG_UNIX98_PTYS
@@ -39,6 +41,8 @@ static struct tty_driver *pts_driver;
 static void pty_close(struct tty_struct *tty, struct file *filp)
 {
 	BUG_ON(!tty);
+
+	ub_pty_uncharge(tty);
 	if (tty->driver->subtype == PTY_TYPE_MASTER)
 		WARN_ON(tty->count > 1);
 	else {
@@ -57,7 +61,8 @@ static void pty_close(struct tty_struct 
 	if (tty->driver->subtype == PTY_TYPE_MASTER) {
 		set_bit(TTY_OTHER_CLOSED, &tty->flags);
 #ifdef CONFIG_UNIX98_PTYS
-		if (tty->driver == ptm_driver)
+		if (tty->link->driver_data &&
+		    tty->driver->flags & TTY_DRIVER_DEVPTS_MEM)
 			devpts_pty_kill(tty->link);
 #endif
 		tty_vhangup(tty->link);
@@ -200,6 +205,10 @@ static int pty_open(struct tty_struct *t
 	if (tty->link->count != 1)
 		goto out;
 
+	retval = -ENOMEM;
+	if (ub_pty_charge(tty))
+		goto out;
+
 	clear_bit(TTY_OTHER_CLOSED, &tty->link->flags);
 	set_bit(TTY_THROTTLED, &tty->flags);
 	retval = 0;
@@ -357,9 +366,12 @@ static const struct tty_operations slave
 	.resize = pty_resize
 };
 
+struct tty_driver *pty_driver, *pty_slave_driver;
+EXPORT_SYMBOL(pty_driver);
+EXPORT_SYMBOL(pty_slave_driver);
+
 static void __init legacy_pty_init(void)
 {
-	struct tty_driver *pty_driver, *pty_slave_driver;
 
 	if (legacy_count <= 0)
 		return;
@@ -590,7 +602,7 @@ static int __ptmx_open(struct inode *ino
 		return index;
 
 	mutex_lock(&tty_mutex);
-	tty = tty_init_dev(ptm_driver, index, 1);
+	tty = tty_init_dev(ptm_driver, index, NULL, 1);
 	mutex_unlock(&tty_mutex);
 
 	if (IS_ERR(tty)) {
@@ -599,8 +611,8 @@ static int __ptmx_open(struct inode *ino
 	}
 
 	set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
-	filp->private_data = tty;
-	file_move(filp, &tty->tty_files);
+
+	tty_add_file(tty, filp);
 
 	retval = devpts_pty_new(inode, tty->link);
 	if (retval)
@@ -693,10 +705,469 @@ static void __init unix98_pty_init(void)
 static inline void unix98_pty_init(void) { }
 #endif
 
+#ifdef CONFIG_VTTYS
+
+static struct tty_driver *vttm_driver;
+struct tty_driver *vtty_driver;
+
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+
+static struct tty_struct *vtty_masters;
+static DEFINE_SPINLOCK(vtty_lock);
+
+static void vtty_line_name(int veid, int idx, char *p)
+{
+	snprintf(p, 64, "v%dtty%d", veid, idx+1);
+}
+
+static void vtty_install_master(int veid, struct tty_struct *vtty)
+{
+	pr_debug("%s %d %d %p\n", __func__, veid, vtty->index, vtty);
+	vtty_line_name(veid, vtty->index, vtty->name);
+	spin_lock(&vtty_lock);
+	vtty->driver_data = vtty_masters;
+	vtty_masters = vtty;
+	spin_unlock(&vtty_lock);
+}
+
+static struct tty_struct *vtty_lookup_master(const char *name)
+{
+	struct tty_struct *tty;
+
+	spin_lock(&vtty_lock);
+	for ( tty = vtty_masters ; tty ; tty = tty->driver_data ) {
+		if (!strcmp(tty->name, name))
+			break;
+	}
+	spin_unlock(&vtty_lock);
+	pr_debug("%s %s %p\n", __func__, name, tty);
+	return tty;
+}
+
+static void vtty_remove_master(struct tty_struct *vtty)
+{
+	struct tty_struct **ptty;
+
+	pr_debug("%s %s %d %p\n", __func__, vtty->name, vtty->index, vtty);
+	spin_lock(&vtty_lock);
+	for ( ptty = &vtty_masters ; *ptty ;
+			ptty = (struct tty_struct **)&(*ptty)->driver_data ) {
+		if (*ptty == vtty) {
+			*ptty = vtty->driver_data;
+			break;
+		}
+	}
+	spin_unlock(&vtty_lock);
+}
+
+static void vtty_install_slave(struct ve_struct *ve, struct tty_struct *vtty)
+{
+	pr_debug("%s %d %d %p\n", __func__, ve->veid, vtty->index, vtty);
+	spin_lock(&vtty_lock);
+	vtty->owner_env = ve;
+	ve->vtty[vtty->index] = vtty;
+	spin_unlock(&vtty_lock);
+}
+
+static struct tty_struct *vtty_lookup_slave(struct ve_struct *ve, int idx)
+{
+	pr_debug("%s %d %d %p\n", __func__, ve->veid, idx, ve->vtty[idx]);
+	return ve->vtty[idx];
+}
+
+static void vtty_remove_slave(struct tty_struct *vtty)
+{
+	pr_debug("%s %d %d %p\n", __func__,
+			vtty->owner_env->veid, vtty->index, vtty);
+	spin_lock(&vtty_lock);
+	if (vtty->owner_env->vtty[vtty->index] == vtty) {
+		vtty->owner_env->vtty[vtty->index] = NULL;
+		vtty->owner_env = get_ve0();
+	}
+	spin_unlock(&vtty_lock);
+}
+
+static struct tty_struct *vtty_lookup(struct tty_driver *driver,
+				      struct inode *inode, int idx)
+{
+	struct tty_struct *tty;
+	struct ve_struct *ve = get_exec_env();
+
+	BUG_ON(driver != vtty_driver);
+
+	if (idx < 0 || idx >= MAX_NR_VTTY)
+		return ERR_PTR(-EIO);
+	tty = vtty_lookup_slave(ve, idx);
+	if (!tty) {
+		char name[64];
+
+		vtty_line_name(ve->veid, idx, name);
+		tty = vtty_lookup_master(name);
+		if (tty) {
+			tty = tty->link;
+			vtty_install_slave(ve, tty);
+		}
+	}
+	if (tty && test_bit(TTY_CLOSING, &tty->flags)) {
+		if (test_bit(TTY_CLOSING, &tty->link->flags)) {
+			pr_debug("%s %d %d %p close-race\n", __func__,
+					ve->veid, idx, tty);
+			tty = NULL;
+		} else
+			clear_bit(TTY_CLOSING, &tty->flags);
+	}
+	pr_debug("%s %s %d %p %d\n", __func__,
+			driver->name, idx, tty, tty ? tty->count : -1);
+	return tty;
+}
+
+static int vtty_install(struct tty_driver *driver, struct tty_struct *tty)
+{
+	struct tty_struct *o_tty;
+	int idx = tty->index;
+	struct ve_struct *ve = get_exec_env();
+
+	BUG_ON(driver != vtty_driver);
+
+	o_tty = alloc_tty_struct();
+	if (!o_tty)
+		return -ENOMEM;
+	if (!try_module_get(driver->other->owner)) {
+		/* This cannot in fact currently happen */
+		free_tty_struct(o_tty);
+		return -ENOMEM;
+	}
+	initialize_tty_struct(o_tty, driver->other, idx);
+
+	tty->termios = kzalloc(sizeof(struct ktermios[2]), GFP_KERNEL);
+	if (tty->termios == NULL)
+		goto free_mem_out;
+	*tty->termios = driver->init_termios;
+	tty->termios_locked = tty->termios + 1;
+
+	o_tty->termios = kzalloc(sizeof(struct ktermios[2]), GFP_KERNEL);
+	if (o_tty->termios == NULL)
+		goto free_mem_out;
+	*o_tty->termios = driver->other->init_termios;
+	o_tty->termios_locked = o_tty->termios + 1;
+
+	tty_driver_kref_get(driver->other);
+	tty_driver_kref_get(driver);
+
+	tty->link   = o_tty;
+	o_tty->link = tty;
+
+	vtty_install_slave(ve, tty);
+	vtty_install_master(ve->veid, o_tty);
+
+	tty->count++;
+	tty->count++;	 /* master hold slave reference */
+	o_tty->count++;  /* slave hold master reference */
+	set_bit(TTY_EXTRA_REFERENCE, &o_tty->flags);
+
+	pr_debug("%s %s %d %p %d\n", __func__,
+			driver->name, idx, tty, tty->count);
+	pr_debug("%s %s %d %p %d\n", __func__,
+			driver->other->name, idx, o_tty, o_tty->count);
+
+	return 0;
+
+free_mem_out:
+	kfree(tty->termios);
+	module_put(o_tty->driver->owner);
+	free_tty_struct(o_tty);
+	return -ENOMEM;
+}
+
+static int vtty_open(struct tty_struct *tty, struct file *filp)
+{
+	pr_debug("%s %s %d %p %d\n", __func__,
+			tty->driver->name, tty->index, tty, tty->count);
+	set_bit(TTY_THROTTLED, &tty->flags);
+	return 0;
+}
+
+static void vtty_close(struct tty_struct *tty, struct file *filp)
+{
+	struct tty_struct *o_tty = tty->link;
+
+	pr_debug("%s %s %d %p %d\n", __func__,
+			tty->driver->name, tty->index, tty, tty->count);
+	pr_debug("%s %s %d %p %d\n", __func__,
+			o_tty->driver->name, o_tty->index, o_tty, o_tty->count);
+
+	if (tty->count > 2)
+		return;
+
+	if (tty->driver == vtty_driver) {
+		if (o_tty->count == 1) {
+			clear_bit(TTY_EXTRA_REFERENCE, &o_tty->flags);
+			o_tty->count--;
+			tty->count--;
+		}
+	} else {
+		if (o_tty->count == 1) {
+			clear_bit(TTY_EXTRA_REFERENCE, &tty->flags);
+			tty->count--;
+		} else
+			o_tty->count++;
+	}
+
+	pr_debug("%s %s %d %p %d\n", __func__,
+			tty->driver->name, tty->index, tty, tty->count);
+	pr_debug("%s %s %d %p %d\n", __func__,
+			o_tty->driver->name, o_tty->index, o_tty, o_tty->count);
+
+	if (tty->count == 1) {
+		set_bit(TTY_OTHER_CLOSED, &tty->flags);
+		set_bit(TTY_OTHER_CLOSED, &o_tty->flags);
+	}
+
+	tty->packet = 0;
+	wake_up_interruptible(&tty->read_wait);
+	wake_up_interruptible(&tty->write_wait);
+	o_tty->packet = 0;
+	wake_up_interruptible(&o_tty->read_wait);
+	wake_up_interruptible(&o_tty->write_wait);
+}
+
+static void vtty_remove(struct tty_driver *driver, struct tty_struct *tty)
+{
+	pr_debug("%s %s %d %p %d\n", __func__,
+			driver->name, tty->index, tty, tty->count);
+}
+
+static void vtty_shutdown(struct tty_struct *tty)
+{
+	pr_debug("%s %s %d %p %d\n", __func__,
+			tty->driver->name, tty->index, tty, tty->count);
+	if (tty->driver == vtty_driver)
+		vtty_remove_slave(tty);
+	else
+		vtty_remove_master(tty);
+	kfree(tty->termios);
+}
+
+static int vtty_write(struct tty_struct *tty, const unsigned char *buf, int c)
+{
+	struct tty_struct *to = tty->link;
+
+	if (tty->stopped)
+		return 0;
+
+	if (c > 0) {
+		/* Stuff the data into the input queue of the other end */
+		c = tty_insert_flip_string(to, buf, c);
+		/* And shovel */
+		if (c) {
+			tty_flip_buffer_push(to);
+			tty_wakeup(tty);
+		} else if (to->count < 2)
+			/* throw out the data */
+			tty_perform_flush(to, TCIFLUSH);
+	}
+	return c;
+}
+
+static int vtty_write_room(struct tty_struct *tty)
+{
+	struct tty_struct *to = tty->link;
+
+	if (tty->stopped)
+		return 0;
+	if (to->count < 2)
+		return 4096;
+	return pty_space(to);
+}
+
+static const struct tty_operations vttm_ops = {
+	.lookup = vtty_lookup,
+	.install = vtty_install,
+	.remove = vtty_remove,
+	.open = vtty_open,
+	.close = vtty_close,
+	.shutdown = vtty_shutdown,
+	.write = vtty_write,
+	.write_room = vtty_write_room,
+	.flush_buffer = pty_flush_buffer,
+	.chars_in_buffer = pty_chars_in_buffer,
+	.unthrottle = pty_unthrottle,
+	.set_termios = pty_set_termios,
+	.resize = pty_resize,
+};
+
+static const struct tty_operations vtty_ops = {
+	.lookup = vtty_lookup,
+	.install = vtty_install,
+	.remove = vtty_remove,
+	.open = vtty_open,
+	.close = vtty_close,
+	.shutdown = vtty_shutdown,
+	.write = vtty_write,
+	.write_room = vtty_write_room,
+	.flush_buffer = pty_flush_buffer,
+	.chars_in_buffer = pty_chars_in_buffer,
+	.unthrottle = pty_unthrottle,
+	.set_termios = pty_set_termios,
+};
+
+static struct file_operations vtty_fops;
+
+int vtty_open_master(int veid, int idx)
+{
+	struct tty_struct *tty;
+	struct file *file;
+	int fd, err;
+	char name[64];
+
+	err = -ENODEV;
+	if (idx < 0 || idx >= MAX_NR_VTTY)
+		goto err_out;
+
+	fd = get_unused_fd_flags(0);
+	err = fd;
+	if (fd < 0)
+		goto err_out;
+
+	vtty_line_name(veid, idx, name);
+	file = anon_inode_getfile(name, &vtty_fops, NULL, O_RDWR);
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto err_file;
+
+	lock_kernel();
+	mutex_lock(&tty_mutex);
+	tty = vtty_lookup_master(name);
+	if (!tty || (test_bit(TTY_CLOSING, &tty->flags) &&
+		     test_bit(TTY_CLOSING, &tty->link->flags))) {
+		tty = tty_init_dev(vtty_driver, idx, tty, 1);
+		err = PTR_ERR(tty);
+		if (IS_ERR(tty))
+			goto err_tty;
+		tty->count--;
+		vtty_remove_slave(tty);
+		tty = tty->link;
+		vtty_remove_master(tty);
+		vtty_install_master(veid, tty);
+	}
+
+	pr_debug("%s %s %d %p %d\n", __func__,
+			tty->driver->name, tty->index, tty, tty->count);
+
+	err = -EBUSY;
+	if (tty->count > 1)
+		goto err_tty;
+
+	tty->count++;
+	mutex_unlock(&tty_mutex);
+	tty_add_file(tty, file);
+	clear_bit(TTY_CLOSING, &tty->flags);
+	set_bit(TTY_THROTTLED, &tty->flags);
+	unlock_kernel();
+
+	fd_install(fd, file);
+	return fd;
+
+err_tty:
+	mutex_unlock(&tty_mutex);
+	unlock_kernel();
+	file->f_op = NULL;
+	fput(file);
+err_file:
+	put_unused_fd(fd);
+err_out:
+	return err;
+}
+EXPORT_SYMBOL(vtty_open_master);
+
+#include <linux/ve_proto.h>
+
+static int vtty_ve_init(void *data)
+{
+	return 0;
+}
+
+static void vtty_ve_fini(void *data)
+{
+	struct ve_struct *ve = data;
+	struct tty_struct *vtty;
+	int idx;
+
+	mutex_lock(&tty_mutex);
+	for (idx = 0 ; idx < MAX_NR_VTTY ; idx++) {
+		vtty = vtty_lookup_slave(ve, idx);
+		if (vtty)
+			vtty_remove_slave(vtty);
+	}
+	mutex_unlock(&tty_mutex);
+}
+
+static struct ve_hook vtty_ve_hook = {
+	.init		= vtty_ve_init,
+	.fini		= vtty_ve_fini,
+	.owner		= THIS_MODULE,
+};
+
+static void __init vtty_init(void)
+{
+	tty_default_fops(&vtty_fops);
+
+	vttm_driver = alloc_tty_driver(MAX_NR_VTTY);
+	if (!vttm_driver)
+		panic("Couldn't allocate vttm driver");
+	vtty_driver = alloc_tty_driver(MAX_NR_VTTY);
+	if (!vtty_driver)
+		panic("Couldn't allocate vtty driver");
+
+	vttm_driver->owner = THIS_MODULE;
+	vttm_driver->driver_name = "vttm";
+	vttm_driver->name = "vttm";
+	vttm_driver->name_base = 1;
+	vttm_driver->major = TTY_MAJOR;
+	vttm_driver->minor_start = 1;
+	vttm_driver->type = TTY_DRIVER_TYPE_PTY;
+	vttm_driver->subtype = PTY_TYPE_MASTER;
+	vttm_driver->init_termios = tty_std_termios;
+	vttm_driver->init_termios.c_iflag = 0;
+	vttm_driver->init_termios.c_oflag = 0;
+	vttm_driver->init_termios.c_cflag = B38400 | CS8 | CREAD;
+	vttm_driver->init_termios.c_lflag = 0;
+	vttm_driver->flags = TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW |
+		TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM |
+		TTY_DRIVER_INSTALLED;
+	vttm_driver->other = vtty_driver;
+	tty_set_operations(vttm_driver, &vttm_ops);
+	cdev_init(&vttm_driver->cdev, &vtty_fops);
+
+	vtty_driver->owner = THIS_MODULE;
+	vtty_driver->driver_name = "vtty";
+	vtty_driver->name = "vtty";
+	vtty_driver->name_base = 1;
+	vtty_driver->major = TTY_MAJOR;
+	vtty_driver->minor_start = 1;
+	vtty_driver->type = TTY_DRIVER_TYPE_PTY;
+	vtty_driver->subtype = PTY_TYPE_SLAVE;
+	vtty_driver->init_termios = tty_std_termios;
+	vtty_driver->init_termios.c_cflag = B38400 | CS8 | CREAD;
+	vtty_driver->flags = TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW |
+		TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM |
+		TTY_DRIVER_INSTALLED;
+	vtty_driver->other = vttm_driver;
+	tty_set_operations(vtty_driver, &vtty_ops);
+	cdev_init(&vtty_driver->cdev, &vtty_fops);
+
+	ve_hook_register(VE_SS_CHAIN, &vtty_ve_hook);
+}
+#else
+static inline void ve_pty_init(void) { }
+#endif
+
 static int __init pty_init(void)
 {
 	legacy_pty_init();
 	unix98_pty_init();
+	vtty_init();
 	return 0;
 }
 module_init(pty_init);
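
Editorial note: the vtty/vttm code above is a container-side twist on the classic pty pair, and vtty_open_master() is exported so host-side code can grab the master end of a container's console by (veid, index) and get a plain file descriptor back. A minimal sketch of an in-kernel caller; only vtty_open_master() comes from this patch, the surrounding names are illustrative:

	/* Hypothetical host-side helper: attach to container veid's
	 * first virtual console. Returns an fd that behaves like a
	 * pty master, or a negative errno (-ENODEV, -EBUSY, ...). */
	static int attach_ct_console(int veid)
	{
		int fd = vtty_open_master(veid, 0);

		if (fd < 0)
			return fd;
		/* reads: console output of the CT; writes: its input */
		return fd;
	}
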
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/char/random.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/random.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/char/random.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/random.c	2015-01-21 12:02:44.323197195 +0300
@@ -1134,6 +1134,11 @@ static ssize_t random_write(struct file 
 {
 	size_t ret;
 
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()))
+		return count;
+#endif
+
 	ret = write_pool(&blocking_pool, buffer, count);
 	if (ret)
 		return ret;
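
Editorial note: the random_write() guard above makes writes from inside a container report success without mixing anything into the host pool, so in-container entropy feeders keep working while the pool stays isolated. The observable semantics from userspace, as a hypothetical illustration:

	/* Run inside a CT: write() reports full success, but the
	 * host's entropy estimate does not move. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char junk[64] = "not really random";
		int fd = open("/dev/random", O_WRONLY);

		if (fd < 0)
			return 1;
		printf("wrote %zd bytes\n", write(fd, junk, sizeof(junk)));
		close(fd);
		return 0;
	}
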
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/char/sysrq.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/sysrq.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/char/sysrq.c	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/sysrq.c	2015-01-21 12:02:58.673816242 +0300
@@ -37,7 +37,13 @@
 #include <linux/vt_kern.h>
 #include <linux/workqueue.h>
 #include <linux/hrtimer.h>
+#include <linux/kallsyms.h>
+#include <linux/slab.h>
 #include <linux/oom.h>
+#include <linux/nmi.h>
+#include <net/dst.h>
+
+#include <bc/oom_kill.h>
 
 #include <asm/ptrace.h>
 #include <asm/irq_regs.h>
@@ -197,6 +203,19 @@ static struct sysrq_key_op sysrq_showloc
 #define sysrq_showlocks_op (*(struct sysrq_key_op *)0)
 #endif
 
+#ifdef CONFIG_SCHED_DEBUG
+static void sysrq_handle_sched_debug(int key, struct tty_struct *tty)
+{
+	show_sched_debug();
+}
+
+static struct sysrq_key_op sysrq_sched_debug_op = {
+	.handler	= sysrq_handle_sched_debug,
+	.help_msg	= "show-sched-state(A)",
+	.action_msg	= "CPU Scheduler State",
+};
+#endif
+
 #ifdef CONFIG_SMP
 static DEFINE_SPINLOCK(show_lock);
 
@@ -250,8 +269,8 @@ static struct sysrq_key_op sysrq_showall
 static void sysrq_handle_showregs(int key, struct tty_struct *tty)
 {
 	struct pt_regs *regs = get_irq_regs();
-	if (regs)
-		show_regs(regs);
+
+	nmi_show_regs(regs, 0);
 	perf_event_print_debug();
 }
 static struct sysrq_key_op sysrq_showregs_op = {
@@ -302,7 +321,15 @@ static struct sysrq_key_op sysrq_ftrace_
 
 static void sysrq_handle_showmem(int key, struct tty_struct *tty)
 {
+	struct user_beancounter *ub;
+
+	rcu_read_lock();
+	for_each_beancounter(ub)
+		show_ub_mem(ub);
+	rcu_read_unlock();
+
 	show_mem(0);
+	show_slab_info();
 }
 static struct sysrq_key_op sysrq_showmem_op = {
 	.handler	= sysrq_handle_showmem,
@@ -318,7 +345,7 @@ static void send_sig_all(int sig)
 {
 	struct task_struct *p;
 
-	for_each_process(p) {
+	for_each_process_all(p) {
 		if (p->mm && !is_global_init(p))
 			/* Not swapper, init nor kernel thread */
 			force_sig(sig, p);
@@ -339,6 +366,8 @@ static struct sysrq_key_op sysrq_term_op
 
 static void moom_callback(struct work_struct *ignored)
 {
+	ub_oom_start(&global_oom_ctrl);
+	global_oom_ctrl.kill_counter = 0;
 	out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL);
 }
 
@@ -394,7 +423,357 @@ static struct sysrq_key_op sysrq_unrt_op
 /* Key Operations table and lock */
 static DEFINE_SPINLOCK(sysrq_key_table_lock);
 
-static struct sysrq_key_op *sysrq_key_table[36] = {
+#define SYSRQ_KEY_TABLE_LENGTH 37
+static struct sysrq_key_op **sysrq_key_table;
+static struct sysrq_key_op *sysrq_default_key_table[];
+
+#ifdef CONFIG_SYSRQ_DEBUG
+#define SYSRQ_NAMELEN_MAX	64
+#define SYSRQ_DUMP_LINES	32
+
+static struct sysrq_key_op *sysrq_debug_key_table[];
+static struct sysrq_key_op *sysrq_input_key_table[];
+static unsigned long *dump_address;
+static struct kmem_cache *dump_slab_ptr;
+static int orig_console_loglevel;
+static void (*sysrq_input_return)(char *) = NULL;
+
+static unsigned long dump_offset, dump_index, dump_count;
+
+static bool dump_skip(void)
+{
+	if (!dump_count || ++dump_index <= dump_offset)
+		return true;
+	dump_count--;
+	return false;
+}
+
+static void dump_mem(void)
+{
+	unsigned long value[4];
+	mm_segment_t old_fs;
+	int line, err;
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	err = 0;
+
+	for (line = 0; line < SYSRQ_DUMP_LINES; line++) {
+		err |= __get_user(value[0], dump_address++);
+		err |= __get_user(value[1], dump_address++);
+		err |= __get_user(value[2], dump_address++);
+		err |= __get_user(value[3], dump_address++);
+		if (err) {
+			printk("Invalid address %p\n", dump_address - 4);
+			break;
+		}
+#if BITS_PER_LONG == 32
+		printk("0x%p: %08lx %08lx %08lx %08lx\n",
+				dump_address - 4,
+				value[0], value[1], value[2], value[3]);
+#else
+		printk("0x%p: %016lx %016lx %016lx %016lx\n",
+				dump_address - 4,
+				value[0], value[1], value[2], value[3]);
+#endif
+	}
+	set_fs(old_fs);
+}
+
+static void write_mem(unsigned long val)
+{
+	mm_segment_t old_fs;
+	unsigned long old_val;
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	if (__get_user(old_val, dump_address)) {
+		printk("Invalid address %p\n", dump_address);
+		goto out;
+	}
+
+#if BITS_PER_LONG == 32
+	printk("Changing [%p] from %08lx to %08lx\n",
+			dump_address, old_val, val);
+#else
+	printk("Changing [%p] from %016lx to %016lx\n",
+			dump_address, old_val, val);
+#endif
+	__put_user(val, dump_address);
+out:
+	set_fs(old_fs);
+}
+
+static void handle_read(int key, struct tty_struct *tty)
+{
+	static int pos;
+	static int upper_case;
+	static char str[SYSRQ_NAMELEN_MAX];
+
+	if (key == 0) {
+		/* actually 0 is not only shift... */
+		upper_case = 1;
+		return;
+	}
+
+	if (key == 0x0d || pos == SYSRQ_NAMELEN_MAX - 1) {
+		/* enter */
+		sysrq_key_table = sysrq_debug_key_table;
+		str[pos] = '\0';
+		pos = upper_case = 0;
+		printk("\n");
+		if (sysrq_input_return == NULL)
+			printk("No return handler!!!\n");
+		else
+			sysrq_input_return(str);
+		return;
+	}
+
+	/* check for allowed symbols */
+	if (key == '-') {
+		if (upper_case)
+			key = '_';
+		goto correct;
+	}
+	if (key >= 'a' && key <= 'z') {
+		if (upper_case)
+			key = key - 'a' + 'A';
+		goto correct;
+	}
+	if (key >= '0' && key <= '9')
+		goto correct;
+
+	upper_case = 0;
+	return;
+
+correct:
+	str[pos] = key;
+	printk("%c", (char)key);
+	pos++;
+	upper_case = 0;
+}
+
+static struct sysrq_key_op input_read = {
+	.handler	= handle_read,
+	.help_msg	= "",
+	.action_msg	= NULL,
+};
+
+static struct sysrq_key_op *sysrq_input_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
+	[0 ... SYSRQ_KEY_TABLE_LENGTH - 1] = &input_read,
+};
+
+static void return_dump_mem(char *str)
+{
+	unsigned long address;
+	char *end;
+
+	address = simple_strtoul(str, &end, 0);
+	if (*end != '\0') {
+		printk("Bad address [%s]\n", str);
+		return;
+	}
+
+	dump_address = (unsigned long *)address;
+	dump_mem();
+}
+
+static void handle_dump_mem(int key, struct tty_struct *tty)
+{
+	sysrq_input_return = return_dump_mem;
+	sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_dump_mem = {
+	.handler	= handle_dump_mem,
+	.help_msg	= "Dump",
+	.action_msg	= "Enter address:",
+};
+
+static void dump_slab_obj(void *obj)
+{
+	struct user_beancounter *ubc = NULL;
+
+	if (dump_skip())
+		return;
+
+	if (dump_slab_ptr->flags & SLAB_UBC)
+		ubc = *ub_slab_ptr(dump_slab_ptr, obj);
+
+	printk(KERN_DEBUG"obj %p idx %lu ubc %p\n", obj, dump_index, ubc);
+	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET,
+			16, sizeof(long), obj, dump_slab_ptr->buffer_size, false);
+}
+
+static void dump_slab(void)
+{
+	dump_index = 0;
+	dump_count = 100;
+	slab_obj_walk(dump_slab_ptr, dump_slab_obj);
+	dump_offset = dump_index;
+}
+
+static void return_dump_slab(char *str)
+{
+	unsigned long address;
+	char *end;
+
+	address = simple_strtoul(str, &end, 0);
+	if (*end != '\0') {
+		printk("Bad address [%s]\n", str);
+		return;
+	}
+
+	dump_slab_ptr = (struct kmem_cache *)address;
+	if (!virt_addr_valid(dump_slab_ptr) ||
+	    !PageSlab(virt_to_page(dump_slab_ptr))) {
+		printk("Non-slab address [%s]\n", str);
+		dump_slab_ptr = NULL;
+		return;
+	}
+
+	printk(KERN_DEBUG "SLAB %p %s size %d objuse %d\n",
+			dump_slab_ptr, dump_slab_ptr->name,
+			dump_slab_ptr->buffer_size, dump_slab_ptr->objuse);
+
+	dump_address = NULL;
+	dump_offset = 0;
+	dump_slab();
+}
+
+static void handle_dump_slab(int key, struct tty_struct *tty)
+{
+	sysrq_input_return = return_dump_slab;
+	sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_dump_slab = {
+	.handler	= handle_dump_slab,
+	.help_msg	= "Slab",
+	.action_msg	= "Enter address:",
+};
+
+static void handle_dump_net(int key, struct tty_struct *tty)
+{
+	dst_cache_dump();
+}
+
+static struct sysrq_key_op debug_dump_net = {
+	.handler	= handle_dump_net,
+	.help_msg	= "Net",
+	.action_msg	= "Dumping networking guts:",
+};
+
+static void return_resolve(char *str)
+{
+	unsigned long address;
+
+	address = kallsyms_lookup_name(str);
+	printk("%s : %lx\n", str, address);
+	if (address) {
+		dump_address = (unsigned long *)address;
+		printk("Now you can dump it via X\n");
+	}
+}
+
+static void handle_resolve(int key, struct tty_struct *tty)
+{
+	sysrq_input_return = return_resolve;
+	sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_resolve = {
+	.handler	= handle_resolve,
+	.help_msg	= "Resolve",
+	.action_msg	= "Enter symbol name:",
+};
+
+static void return_write_mem(char *str)
+{
+	unsigned long address;
+	unsigned long value;
+	char *end;
+
+	address = simple_strtoul(str, &end, 0);
+	if (*end != '-') {
+		printk("Bad address in %s\n", str);
+		return;
+	}
+	value = simple_strtoul(end + 1, &end, 0);
+	if (*end != '\0') {
+		printk("Bad value in %s\n", str);
+		return;
+	}
+
+	dump_address = (unsigned long *)address;
+	write_mem(value);
+}
+
+static void handle_write_mem(int key, struct tty_struct *tty)
+{
+	sysrq_input_return = return_write_mem;
+	sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_write_mem = {
+	.handler	= handle_write_mem,
+	.help_msg	= "Writemem",
+	.action_msg	= "Enter address-value:",
+};
+
+static void handle_next(int key, struct tty_struct *tty)
+{
+	if (dump_address)
+		dump_mem();
+	else if (dump_slab_ptr)
+		dump_slab();
+}
+
+static struct sysrq_key_op debug_next = {
+	.handler	= handle_next,
+	.help_msg	= "neXt",
+	.action_msg	= "continuing",
+};
+
+static void handle_quit(int key, struct tty_struct *tty)
+{
+	sysrq_key_table = sysrq_default_key_table;
+	console_loglevel = orig_console_loglevel;
+}
+
+static struct sysrq_key_op debug_quit = {
+	.handler	= handle_quit,
+	.help_msg	= "Quit",
+	.action_msg	= "Thank you for using debugger",
+};
+
+static struct sysrq_key_op *sysrq_debug_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
+	[13] = &debug_dump_mem,		/* d */
+	[23] = &debug_dump_net,		/* n */
+	[26] = &debug_quit,		/* q */
+	[27] = &debug_resolve,		/* r */
+	[28] = &debug_dump_slab,	/* s */
+	[32] = &debug_write_mem,	/* w */
+	[33] = &debug_next,		/* x */
+};
+
+static void sysrq_handle_debug(int key, struct tty_struct *tty)
+{
+	orig_console_loglevel = console_loglevel;
+	console_loglevel = 8;
+	sysrq_key_table = sysrq_debug_key_table;
+	printk("Welcome sysrq debugging mode\n"
+			"Press H for help\n");
+}
+
+static struct sysrq_key_op sysrq_debug_op = {
+	.handler        = sysrq_handle_debug,
+	.help_msg       = "debuG",
+	.action_msg     = "Select desired action",
+};
+#endif
+
+static struct sysrq_key_op *sysrq_default_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
 	&sysrq_loglevel_op,		/* 0 */
 	&sysrq_loglevel_op,		/* 1 */
 	&sysrq_loglevel_op,		/* 2 */
@@ -410,14 +789,22 @@ static struct sysrq_key_op *sysrq_key_ta
 	 * a: Don't use for system provided sysrqs, it is handled specially on
 	 * sparc and will never arrive.
 	 */
+#ifdef CONFIG_SCHED_DEBUG
+	&sysrq_sched_debug_op,		/* a */
+#else
 	NULL,				/* a */
+#endif
 	&sysrq_reboot_op,		/* b */
 	&sysrq_crash_op,		/* c & ibm_emac driver debug */
 	&sysrq_showlocks_op,		/* d */
 	&sysrq_term_op,			/* e */
 	&sysrq_moom_op,			/* f */
 	/* g: May be registered for the kernel debugger */
+#ifdef CONFIG_SYSRQ_DEBUG
+	&sysrq_debug_op,		/* g */
+#else
 	NULL,				/* g */
+#endif
 	NULL,				/* h - reserved for help */
 	&sysrq_kill_op,			/* i */
 #ifdef CONFIG_BLOCK
@@ -449,8 +836,11 @@ static struct sysrq_key_op *sysrq_key_ta
 	/* y: May be registered on sparc64 for global register dump */
 	NULL,				/* y */
 	&sysrq_ftrace_dump_op,		/* z */
+	NULL,				/* for debugger */
 };
 
+static struct sysrq_key_op **sysrq_key_table = sysrq_default_key_table;
+
 /* key2index calculation, -1 on invalid index */
 static int sysrq_key_table_key2index(int key)
 {
@@ -460,6 +850,10 @@ static int sysrq_key_table_key2index(int
 		retval = key - '0';
 	else if ((key >= 'a') && (key <= 'z'))
 		retval = key + 10 - 'a';
+#ifdef CONFIG_SYSRQ_DEBUG
+	else if (key == 0 || key == 0x0d || key == '-')
+		retval = SYSRQ_KEY_TABLE_LENGTH - 1;
+#endif
 	else
 		retval = -1;
 	return retval;
@@ -470,21 +864,21 @@ static int sysrq_key_table_key2index(int
  */
 struct sysrq_key_op *__sysrq_get_key_op(int key)
 {
-        struct sysrq_key_op *op_p = NULL;
-        int i;
+	struct sysrq_key_op *op_p = NULL;
+	int i;
 
 	i = sysrq_key_table_key2index(key);
 	if (i != -1)
-	        op_p = sysrq_key_table[i];
-        return op_p;
+		op_p = sysrq_key_table[i];
+	return op_p;
 }
 
 static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p)
 {
-        int i = sysrq_key_table_key2index(key);
+	int i = sysrq_key_table_key2index(key);
 
-        if (i != -1)
-                sysrq_key_table[i] = op_p;
+	if (i != -1)
+		sysrq_key_table[i] = op_p;
 }
 
 /*
@@ -507,25 +901,25 @@ void __handle_sysrq(int key, struct tty_
 	 */
 	orig_log_level = console_loglevel;
 	console_loglevel = 7;
-	printk(KERN_INFO "SysRq : ");
 
-        op_p = __sysrq_get_key_op(key);
-        if (op_p) {
+	op_p = __sysrq_get_key_op(key);
+	if (op_p) {
 		/*
 		 * Should we check for enabled operations (/proc/sysrq-trigger
 		 * should not) and is the invoked operation enabled?
 		 */
 		if (!check_mask || sysrq_on_mask(op_p->enable_mask)) {
-			printk("%s\n", op_p->action_msg);
+			if (op_p->action_msg)
+				printk("%s\n", op_p->action_msg);
 			console_loglevel = orig_log_level;
 			op_p->handler(key, tty);
 		} else {
 			printk("This sysrq operation is disabled.\n");
 		}
 	} else {
-		printk("HELP : ");
+		printk("SysRq HELP : ");
 		/* Only print the help msg once per handler */
-		for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) {
+		for (i = 0; i < SYSRQ_KEY_TABLE_LENGTH; i++) {
 			if (sysrq_key_table[i]) {
 				int j;
 
@@ -555,7 +949,7 @@ void handle_sysrq(int key, struct tty_st
 EXPORT_SYMBOL(handle_sysrq);
 
 static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p,
-                                struct sysrq_key_op *remove_op_p)
+				struct sysrq_key_op *remove_op_p)
 {
 
 	int retval;
@@ -591,12 +985,29 @@ EXPORT_SYMBOL(unregister_sysrq_key);
 static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
 				   size_t count, loff_t *ppos)
 {
+	struct ve_struct *cur = get_exec_env();
+	static int pnum = 10;
+
 	if (count) {
-		char c;
+		int i, cnt;
+		char c[32];
 
-		if (get_user(c, buf))
+		cnt = min(count, sizeof(c));
+		if (copy_from_user(c, buf, cnt))
 			return -EFAULT;
-		__handle_sysrq(c, NULL, 0);
+
+
+		for (i = 0; i < cnt && c[i] != '\n'; i++) {
+			if (!ve_is_super(cur))	{
+				if (!pnum)
+					continue;
+				printk("SysRq: CT#%u sent '%c' magic key.\n",
+						cur->veid, c[i]);
+				pnum--;
+				continue;
+			}
+			__handle_sysrq(c[i], NULL, 0);
+		}
 	}
 	return count;
 }
@@ -607,7 +1018,7 @@ static const struct file_operations proc
 
 static int __init sysrq_init(void)
 {
-	proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations);
+	proc_create("sysrq-trigger", S_IWUSR, &glob_proc_root, &proc_sysrq_trigger_operations);
 	return 0;
 }
 module_init(sysrq_init);
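
Editorial note: the CONFIG_SYSRQ_DEBUG machinery above hinges on one indirection: sysrq_key_table is now a pointer that gets swapped between the default table, the debug-command table, and an input table whose every slot is the same line-reading handler, with sysrq_input_return acting as the continuation once a full line has been typed. Reduced to its skeleton (all names below are illustrative, not the patch's):

	typedef void (*key_fn)(int key);

	static void read_key(int key);

	static key_fn default_table[36];          /* normal handlers */
	static key_fn input_table[36] = {
		[0 ... 35] = read_key,            /* every key feeds the reader */
	};
	static key_fn *active_table = default_table;
	static void (*on_line_done)(char *line);  /* continuation */

	static void read_key(int key)
	{
		static char line[64];
		static int pos;

		if (key == '\r' || pos == sizeof(line) - 1) {
			line[pos] = '\0';
			pos = 0;
			active_table = default_table; /* pop input mode */
			on_line_done(line);
			return;
		}
		line[pos++] = key;
	}

	static void prompt(void (*done)(char *))  /* push input mode */
	{
		on_line_done = done;
		active_table = input_table;
	}
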
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/char/tty_io.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/tty_io.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/char/tty_io.c	2014-12-12 23:28:56.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/tty_io.c	2015-01-21 12:02:49.064071334 +0300
@@ -96,6 +96,8 @@
 #include <linux/bitops.h>
 #include <linux/delay.h>
 #include <linux/seq_file.h>
+#include <linux/nsproxy.h>
+#include <linux/ve.h>
 
 #include <linux/uaccess.h>
 #include <asm/system.h>
@@ -106,6 +108,7 @@
 
 #include <linux/kmod.h>
 #include <linux/nsproxy.h>
+#include <bc/kmem.h>
 
 #undef TTY_DEBUG_HANGUP
 
@@ -130,12 +133,16 @@ EXPORT_SYMBOL(tty_std_termios);
    into this file */
 
 LIST_HEAD(tty_drivers);			/* linked list of tty drivers */
+EXPORT_SYMBOL(tty_drivers);
 
 /* Mutex to protect creating and releasing a tty. This is shared with
    vt.c for deeply disgusting hack reasons */
 DEFINE_MUTEX(tty_mutex);
 EXPORT_SYMBOL(tty_mutex);
 
+/* Spinlock to protect the tty->tty_files list */
+DEFINE_SPINLOCK(tty_files_lock);
+
 static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *);
 static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *);
 ssize_t redirected_tty_write(struct file *, const char __user *,
@@ -166,7 +173,7 @@ static void proc_set_tty(struct task_str
 
 struct tty_struct *alloc_tty_struct(void)
 {
-	return kzalloc(sizeof(struct tty_struct), GFP_KERNEL);
+	return kzalloc(sizeof(struct tty_struct), GFP_KERNEL_UBC);
 }
 
 /**
@@ -185,6 +192,36 @@ void free_tty_struct(struct tty_struct *
 	kfree(tty);
 }
 
+/* Associate a new file with the tty structure */
+void tty_add_file(struct tty_struct *tty, struct file *file)
+{
+	struct tty_file_private *priv;
+
+	/* XXX: must implement proper error handling in callers */
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL|__GFP_NOFAIL);
+
+	priv->tty = tty;
+	priv->file = file;
+	file->private_data = priv;
+
+	spin_lock(&tty_files_lock);
+	list_add(&priv->list, &tty->tty_files);
+	spin_unlock(&tty_files_lock);
+}
+
+/* Delete file from its tty */
+void tty_del_file(struct file *file)
+{
+	struct tty_file_private *priv = file->private_data;
+
+	spin_lock(&tty_files_lock);
+	list_del(&priv->list);
+	spin_unlock(&tty_files_lock);
+	file->private_data = NULL;
+	kfree(priv);
+}
+
+
 #define TTY_NUMBER(tty) ((tty)->index + (tty)->driver->name_base)
 
 /**
@@ -235,15 +272,17 @@ static int check_tty_count(struct tty_st
 	struct list_head *p;
 	int count = 0;
 
-	file_list_lock();
+	spin_lock(&tty_files_lock);
 	list_for_each(p, &tty->tty_files) {
 		count++;
 	}
-	file_list_unlock();
+	spin_unlock(&tty_files_lock);
 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
 	    tty->driver->subtype == PTY_TYPE_SLAVE &&
 	    tty->link && tty->link->count)
 		count++;
+	if (test_bit(TTY_EXTRA_REFERENCE, &tty->flags))
+		count++;
 	if (tty->count != count) {
 		printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) "
 				    "!= #fd's(%d) in %s\n",
@@ -274,9 +313,22 @@ static struct tty_driver *get_tty_driver
 		if (device < base || device >= base + p->num)
 			continue;
 		*index = device - base;
-		return tty_driver_kref_get(p);
+#ifdef CONFIG_VE
+		if (in_interrupt())
+			goto found;
+		if (p->major != PTY_MASTER_MAJOR && p->major != PTY_SLAVE_MAJOR)
+			goto found;
+		if (ve_is_super(p->owner_env) && ve_is_super(get_exec_env()))
+			goto found;
+		if (!ve_accessible_strict(p->owner_env, get_exec_env()))
+			continue;
+#endif
+		goto found;
 	}
 	return NULL;
+
+found:
+	return tty_driver_kref_get(p);
 }
 
 #ifdef CONFIG_CONSOLE_POLL
@@ -499,6 +551,7 @@ static void do_tty_hangup(struct work_st
 	struct file *cons_filp = NULL;
 	struct file *filp, *f = NULL;
 	struct task_struct *p;
+	struct tty_file_private *priv;
 	int    closecount = 0, n;
 	unsigned long flags;
 	int refs = 0;
@@ -510,16 +563,19 @@ static void do_tty_hangup(struct work_st
 	lock_kernel();
 
 	spin_lock(&redirect_lock);
-	if (redirect && redirect->private_data == tty) {
+	if (redirect && file_tty(redirect) == tty) {
 		f = redirect;
 		redirect = NULL;
 	}
 	spin_unlock(&redirect_lock);
 
+	set_bit(TTY_HUPPING, &tty->flags);
+
 	check_tty_count(tty, "do_tty_hangup");
-	file_list_lock();
+	spin_lock(&tty_files_lock);
 	/* This breaks for file handles being sent over AF_UNIX sockets ? */
-	list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) {
+	list_for_each_entry(priv, &tty->tty_files, list) {
+		filp = priv->file;
 		if (filp->f_op->write == redirected_tty_write)
 			cons_filp = filp;
 		if (filp->f_op->write != tty_write)
@@ -528,7 +584,7 @@ static void do_tty_hangup(struct work_st
 		tty_fasync(-1, filp, 0);	/* can't block */
 		filp->f_op = &hung_up_tty_fops;
 	}
-	file_list_unlock();
+	spin_unlock(&tty_files_lock);
 
 	tty_ldisc_hangup(tty);
 
@@ -567,7 +623,6 @@ static void do_tty_hangup(struct work_st
 	tty->session = NULL;
 	tty->pgrp = NULL;
 	tty->ctrl_status = 0;
-	set_bit(TTY_HUPPED, &tty->flags);
 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 
 	/* Account for the p->signal references we killed */
@@ -593,6 +648,7 @@ static void do_tty_hangup(struct work_st
 	 * can't yet guarantee all that.
 	 */
 	set_bit(TTY_HUPPED, &tty->flags);
+	clear_bit(TTY_HUPPING, &tty->flags);
 	tty_ldisc_enable(tty);
 	unlock_kernel();
 	if (f)
@@ -874,12 +930,10 @@ static ssize_t tty_read(struct file *fil
 			loff_t *ppos)
 {
 	int i;
-	struct tty_struct *tty;
-	struct inode *inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct tty_struct *tty = file_tty(file);
 	struct tty_ldisc *ld;
 
-	tty = (struct tty_struct *)file->private_data;
-	inode = file->f_path.dentry->d_inode;
 	if (tty_paranoia_check(tty, inode, "tty_read"))
 		return -EIO;
 	if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags)))
@@ -1048,12 +1102,11 @@ void tty_write_message(struct tty_struct
 static ssize_t tty_write(struct file *file, const char __user *buf,
 						size_t count, loff_t *ppos)
 {
-	struct tty_struct *tty;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct tty_struct *tty = file_tty(file);
+	struct tty_ldisc *ld;
 	ssize_t ret;
-	struct tty_ldisc *ld;
 
-	tty = (struct tty_struct *)file->private_data;
 	if (tty_paranoia_check(tty, inode, "tty_write"))
 		return -EIO;
 	if (!tty || !tty->ops->write ||
@@ -1169,7 +1222,7 @@ int tty_init_termios(struct tty_struct *
 
 	tp = tty->driver->termios[idx];
 	if (tp == NULL) {
-		tp = kzalloc(sizeof(struct ktermios[2]), GFP_KERNEL);
+		tp = kzalloc(sizeof(struct ktermios[2]), GFP_KERNEL_UBC);
 		if (tp == NULL)
 			return -ENOMEM;
 		memcpy(tp, &tty->driver->init_termios,
@@ -1247,7 +1300,9 @@ static int tty_reopen(struct tty_struct 
 {
 	struct tty_driver *driver = tty->driver;
 
-	if (test_bit(TTY_CLOSING, &tty->flags))
+	if (test_bit(TTY_CLOSING, &tty->flags) ||
+			test_bit(TTY_HUPPING, &tty->flags) ||
+			test_bit(TTY_LDISC_CHANGING, &tty->flags))
 		return -EIO;
 
 	if (driver->type == TTY_DRIVER_TYPE_PTY &&
@@ -1297,7 +1352,7 @@ static int tty_reopen(struct tty_struct 
  */
 
 struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx,
-								int first_ok)
+					struct tty_struct *i_tty, int first_ok)
 {
 	struct tty_struct *tty;
 	int retval;
@@ -1404,9 +1459,9 @@ static void release_one_tty(struct work_
 	tty_driver_kref_put(driver);
 	module_put(driver->owner);
 
-	file_list_lock();
+	spin_lock(&tty_files_lock);
 	list_del_init(&tty->tty_files);
-	file_list_unlock();
+	spin_unlock(&tty_files_lock);
 
 	put_pid(tty->pgrp);
 	put_pid(tty->session);
@@ -1476,7 +1531,8 @@ static void release_tty(struct tty_struc
  */
 void tty_release_dev(struct file *filp)
 {
-	struct tty_struct *tty, *o_tty;
+	struct tty_struct *tty = file_tty(filp);
+	struct tty_struct *o_tty;
 	int	pty_master, tty_closing, o_tty_closing, do_sleep;
 	int	devpts;
 	int	idx;
@@ -1484,7 +1540,6 @@ void tty_release_dev(struct file *filp)
 	struct 	inode *inode;
 
 	inode = filp->f_path.dentry->d_inode;
-	tty = (struct tty_struct *)filp->private_data;
 	if (tty_paranoia_check(tty, inode, "tty_release_dev"))
 		return;
 
@@ -1495,7 +1550,8 @@ void tty_release_dev(struct file *filp)
 	idx = tty->index;
 	pty_master = (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
 		      tty->driver->subtype == PTY_TYPE_MASTER);
-	devpts = (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) != 0;
+	devpts = tty->driver->major == UNIX98_PTY_SLAVE_MAJOR ||
+		 tty->driver->major == UNIX98_PTY_MASTER_MAJOR;
 	o_tty = tty->link;
 
 #ifdef TTY_PARANOIA_CHECK
@@ -1504,7 +1560,7 @@ void tty_release_dev(struct file *filp)
 				  "free (%s)\n", tty->name);
 		return;
 	}
-	if (!devpts) {
+	if ((tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) == 0) {
 		if (tty != tty->driver->ttys[idx]) {
 			printk(KERN_DEBUG "tty_release_dev: driver.table[%d] not tty "
 			       "for (%s)\n", idx, tty->name);
@@ -1632,8 +1688,7 @@ void tty_release_dev(struct file *filp)
 	 *  - do_tty_hangup no longer sees this file descriptor as
 	 *    something that needs to be handled for hangups.
 	 */
-	file_kill(filp);
-	filp->private_data = NULL;
+	tty_del_file(filp);
 
 	/*
 	 * Perform some housekeeping before deciding whether to return.
@@ -1707,7 +1762,7 @@ void tty_release_dev(struct file *filp)
 
 static int __tty_open(struct inode *inode, struct file *filp)
 {
-	struct tty_struct *tty = NULL;
+	struct tty_struct *tty = NULL, *c_tty = NULL;
 	int noctty, retval;
 	struct tty_driver *driver;
 	int index;
@@ -1731,15 +1786,36 @@ retry_open:
 		}
 		driver = tty_driver_kref_get(tty->driver);
 		index = tty->index;
+		c_tty = tty;
 		filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
 		/* noctty = 1; */
 		/* FIXME: Should we take a driver reference ? */
 		tty_kref_put(tty);
 		goto got_driver;
 	}
+#ifdef CONFIG_VTTYS
+	if (!ve_is_super(get_exec_env()) &&
+		MAJOR(device) == TTY_MAJOR &&
+		MINOR(device) <= MAX_NR_VTTY) {
+		driver = tty_driver_kref_get(vtty_driver);
+		if (MINOR(device)) {
+			index = MINOR(device) - 1;
+		} else {
+			index = 0;
+			noctty = 1;
+		}
+		goto got_driver;
+	}
+#endif
 #ifdef CONFIG_VT
 	if (device == MKDEV(TTY_MAJOR, 0)) {
 		extern struct tty_driver *console_driver;
+#ifdef CONFIG_VE
+		if (!ve_is_super(get_exec_env())) {
+			mutex_unlock(&tty_mutex);
+			return -ENODEV;
+		}
+#endif
 		driver = tty_driver_kref_get(console_driver);
 		index = fg_console;
 		noctty = 1;
@@ -1747,7 +1823,20 @@ retry_open:
 	}
 #endif
 	if (device == MKDEV(TTYAUX_MAJOR, 1)) {
-		struct tty_driver *console_driver = console_device(&index);
+		struct tty_driver *console_driver = NULL;
+#ifdef CONFIG_VE
+		if (!ve_is_super(get_exec_env())) {
+# ifdef CONFIG_VTTYS
+			console_driver = vtty_driver;
+			index = 0;
+			/* reset fops, sometimes there might be console_fops
+			 * picked from inode->i_cdev in chrdev_open() */
+			filp->f_op = &tty_fops;
+# endif
+		} else
+#endif
+			console_driver = console_device(&index);
+
 		if (console_driver) {
 			driver = tty_driver_kref_get(console_driver);
 			if (driver) {
@@ -1782,15 +1871,15 @@ got_driver:
 		if (retval)
 			tty = ERR_PTR(retval);
 	} else
-		tty = tty_init_dev(driver, index, 0);
+		tty = tty_init_dev(driver, index, c_tty, 0);
 
 	mutex_unlock(&tty_mutex);
 	tty_driver_kref_put(driver);
 	if (IS_ERR(tty))
 		return PTR_ERR(tty);
 
-	filp->private_data = tty;
-	file_move(filp, &tty->tty_files);
+	tty_add_file(tty, filp);
+
 	check_tty_count(tty, "tty_open");
 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
 	    tty->driver->subtype == PTY_TYPE_MASTER)
@@ -1889,11 +1978,10 @@ static int tty_release(struct inode *ino
 
 static unsigned int tty_poll(struct file *filp, poll_table *wait)
 {
-	struct tty_struct *tty;
+	struct tty_struct *tty = file_tty(filp);
 	struct tty_ldisc *ld;
 	int ret = 0;
 
-	tty = (struct tty_struct *)filp->private_data;
 	if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_poll"))
 		return 0;
 
@@ -1906,12 +1994,11 @@ static unsigned int tty_poll(struct file
 
 static int tty_fasync(int fd, struct file *filp, int on)
 {
-	struct tty_struct *tty;
+	struct tty_struct *tty = file_tty(filp);
 	unsigned long flags;
 	int retval = 0;
 
 	lock_kernel();
-	tty = (struct tty_struct *)filp->private_data;
 	if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_fasync"))
 		goto out;
 
@@ -2078,6 +2165,8 @@ static int tioccons(struct file *file)
 {
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+	if (!ve_is_super(get_exec_env()))
+		return -EACCES;
 	if (file->f_op->write == redirected_tty_write) {
 		struct file *f;
 		spin_lock(&redirect_lock);
@@ -2459,13 +2548,13 @@ EXPORT_SYMBOL(tty_pair_get_pty);
  */
 long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct tty_struct *tty, *real_tty;
+	struct tty_struct *tty = file_tty(file);
+	struct tty_struct *real_tty;
 	void __user *p = (void __user *)arg;
 	int retval;
 	struct tty_ldisc *ld;
 	struct inode *inode = file->f_dentry->d_inode;
 
-	tty = (struct tty_struct *)file->private_data;
 	if (tty_paranoia_check(tty, inode, "tty_ioctl"))
 		return -EINVAL;
 
@@ -2565,6 +2654,11 @@ long tty_ioctl(struct file *file, unsign
 			break;
 		}
 		break;
+	case TIOSAK:
+		if (real_tty == tty && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		__do_SAK(real_tty);
+		return 0;
 	}
 	if (tty->ops->ioctl) {
 		retval = (tty->ops->ioctl)(tty, file, cmd, arg);
@@ -2587,7 +2681,7 @@ static long tty_compat_ioctl(struct file
 				unsigned long arg)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct tty_struct *tty = file->private_data;
+	struct tty_struct *tty = file_tty(file);
 	struct tty_ldisc *ld;
 	int retval = -ENOIOCTLCMD;
 
@@ -2658,7 +2752,7 @@ void __do_SAK(struct tty_struct *tty)
 	/* Now kill any processes that happen to have the
 	 * tty open.
 	 */
-	do_each_thread(g, p) {
+	do_each_thread_all(g, p) {
 		if (p->signal->tty == tty) {
 			printk(KERN_NOTICE "SAK: killed process %d"
 			    " (%s): task_session(p)==tty->session\n",
@@ -2679,7 +2773,7 @@ void __do_SAK(struct tty_struct *tty)
 				if (!filp)
 					continue;
 				if (filp->f_op->read == tty_read &&
-				    filp->private_data == tty) {
+				    file_tty(filp) == tty) {
 					printk(KERN_NOTICE "SAK: killed process %d"
 					    " (%s): fd#%d opened to the tty\n",
 					    task_pid_nr(p), p->comm, i);
@@ -2690,7 +2784,7 @@ void __do_SAK(struct tty_struct *tty)
 			spin_unlock(&p->files->file_lock);
 		}
 		task_unlock(p);
-	} while_each_thread(g, p);
+	} while_each_thread_all(g, p);
 	read_unlock(&tasklist_lock);
 #endif
 }
@@ -2757,6 +2851,7 @@ void initialize_tty_struct(struct tty_st
 	tty->ops = driver->ops;
 	tty->index = idx;
 	tty_line_name(driver, idx, tty->name);
+	tty->owner_env = driver->owner_env;
 }
 
 /**
@@ -2849,6 +2944,7 @@ struct tty_driver *alloc_tty_driver(int 
 		driver->magic = TTY_DRIVER_MAGIC;
 		driver->num = lines;
 		/* later we'll move allocation of tables here */
+		driver->owner_env = get_ve(get_exec_env());
 	}
 	return driver;
 }
@@ -2883,6 +2979,7 @@ static void destruct_tty_driver(struct k
 		kfree(p);
 		cdev_del(&driver->cdev);
 	}
+	put_ve(driver->owner_env);
 	kfree(driver);
 }
 
@@ -2957,6 +3054,7 @@ int tty_register_driver(struct tty_drive
 	}
 
 	mutex_lock(&tty_mutex);
+	driver->owner_env = get_exec_env();
 	list_add(&driver->tty_drivers, &tty_drivers);
 	mutex_unlock(&tty_mutex);
 
@@ -3130,3 +3228,61 @@ static int __init tty_init(void)
 	return 0;
 }
 module_init(tty_init);
+
+#ifdef CONFIG_UNIX98_PTYS
+int init_ve_tty_class(void)
+{
+	struct class * ve_tty_class;
+	struct device * res;
+
+	ve_tty_class = class_create(THIS_MODULE, "tty");
+	if (IS_ERR(ve_tty_class))
+		return -ENOMEM;
+
+	res = device_create(ve_tty_class, NULL,
+				MKDEV(TTYAUX_MAJOR, 0), NULL, "tty");
+	if (IS_ERR(res))
+		goto err_class;
+
+	res = device_create(ve_tty_class, NULL,
+				MKDEV(TTYAUX_MAJOR, 1), NULL, "console");
+	if (IS_ERR(res))
+		goto err_tty;
+
+	res = device_create(ve_tty_class, NULL,
+				MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx");
+	if (IS_ERR(res))
+		goto err_console;
+
+	get_exec_env()->tty_class = ve_tty_class;
+	return 0;
+
+err_console:
+	device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 1));
+err_tty:
+	device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 0));
+err_class:
+	class_destroy(ve_tty_class);
+	return PTR_ERR(res);
+}
+
+void fini_ve_tty_class(void)
+{
+	struct class *ve_tty_class = get_exec_env()->tty_class;
+
+	device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 0));
+	device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 1));
+	device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 2));
+	class_destroy(ve_tty_class);
+}
+#else
+int init_ve_tty_class(void)
+{
+	return 0;
+}
+void fini_ve_tty_class(void)
+{
+}
+#endif
+EXPORT_SYMBOL(init_ve_tty_class);
+EXPORT_SYMBOL(fini_ve_tty_class);
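
Editorial note: the recurring tty_io.c change above (file_tty() instead of casting file->private_data) comes from interposing a per-open tty_file_private record, so each tty can walk its own open files under the new tty_files_lock rather than the global file list lock. The accessor shape, with the struct layout inferred from tty_add_file()/tty_del_file() in this patch:

	struct tty_file_private {
		struct tty_struct *tty;
		struct file *file;
		struct list_head list;   /* linked on tty->tty_files */
	};

	/* file->private_data now points at the wrapper, not the tty */
	static inline struct tty_struct *file_tty(struct file *file)
	{
		struct tty_file_private *priv = file->private_data;

		return priv ? priv->tty : NULL;
	}
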
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/char/tty_ldisc.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/tty_ldisc.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/char/tty_ldisc.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/tty_ldisc.c	2015-01-21 12:02:42.982232797 +0300
@@ -647,7 +647,7 @@ int tty_set_ldisc(struct tty_struct *tty
 		goto enable;
 	}
 
-	if (test_bit(TTY_HUPPED, &tty->flags)) {
+	if (test_bit(TTY_HUPPING, &tty->flags)) {
 		/* We were raced by the hangup method. It will have stomped
 		   the ldisc data and closed the ldisc down */
 		clear_bit(TTY_LDISC_CHANGING, &tty->flags);
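
Editorial note: this pairs with the do_tty_hangup() hunks earlier in the patch: TTY_HUPPING is set while the hangup runs and cleared when it finishes, while TTY_HUPPED stays sticky, so racing paths back off only during the teardown window. In outline:

	set_bit(TTY_HUPPING, &tty->flags);   /* hangup in progress */
	/* ... tear down ldisc, redirect, per-file ops ... */
	set_bit(TTY_HUPPED, &tty->flags);    /* sticky: it happened */
	clear_bit(TTY_HUPPING, &tty->flags); /* transient: it is over */

	/* reopen / set_ldisc need only dodge the in-flight teardown: */
	if (test_bit(TTY_HUPPING, &tty->flags))
		return -EIO;
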
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/char/vc_screen.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/vc_screen.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/char/vc_screen.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/char/vc_screen.c	2015-01-21 12:02:44.422194567 +0300
@@ -35,6 +35,8 @@
 #include <linux/console.h>
 #include <linux/device.h>
 #include <linux/smp_lock.h>
+#include <linux/ve_task.h>
+
 
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
@@ -481,16 +483,22 @@ static struct class *vc_class;
 
 void vcs_make_sysfs(int index)
 {
+	struct ve_struct *ve = set_exec_env(get_ve0());
+
 	device_create(vc_class, NULL, MKDEV(VCS_MAJOR, index + 1), NULL,
 		      "vcs%u", index + 1);
 	device_create(vc_class, NULL, MKDEV(VCS_MAJOR, index + 129), NULL,
 		      "vcsa%u", index + 1);
+	set_exec_env(ve);
 }
 
 void vcs_remove_sysfs(int index)
 {
+	struct ve_struct *ve = set_exec_env(get_ve0());
+
 	device_destroy(vc_class, MKDEV(VCS_MAJOR, index + 1));
 	device_destroy(vc_class, MKDEV(VCS_MAJOR, index + 129));
+	set_exec_env(ve);
 }
 
 int __init vcs_init(void)
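
Editorial note: the vcs hunks above wrap device creation in a switch to the host environment (ve0) so the vcs/vcsa nodes always land in the host's device namespace, no matter which context triggers the call. The save/switch/restore idiom generalizes; a sketch assuming the set_exec_env()/get_ve0() API used throughout this patch:

	static void run_in_host_context(void (*fn)(void *), void *arg)
	{
		/* set_exec_env() returns the previous environment,
		 * so the bracketing nests and restores safely. */
		struct ve_struct *old = set_exec_env(get_ve0());

		fn(arg);
		set_exec_env(old);
	}
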
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/cpufreq/cpufreq.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/cpufreq/cpufreq.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/cpufreq/cpufreq.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/cpufreq/cpufreq.c	2015-01-21 12:02:41.561270525 +0300
@@ -1075,6 +1075,7 @@ err_out_unregister:
 
 err_unlock_policy:
 	unlock_policy_rwsem_write(cpu);
+	free_cpumask_var(policy->related_cpus);
 err_free_cpumask:
 	free_cpumask_var(policy->cpus);
 err_free_policy:
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/cpufreq/cpufreq_stats.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/cpufreq/cpufreq_stats.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/cpufreq/cpufreq_stats.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/cpufreq/cpufreq_stats.c	2015-01-21 12:02:41.563270472 +0300
@@ -347,18 +347,21 @@ static int __init cpufreq_stats_init(voi
 	if (ret)
 		return ret;
 
+	register_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
+	for_each_online_cpu(cpu)
+		cpufreq_update_policy(cpu);
+
 	ret = cpufreq_register_notifier(&notifier_trans_block,
 				CPUFREQ_TRANSITION_NOTIFIER);
 	if (ret) {
 		cpufreq_unregister_notifier(&notifier_policy_block,
 				CPUFREQ_POLICY_NOTIFIER);
+		unregister_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
+		for_each_online_cpu(cpu)
+			cpufreq_stats_free_table(cpu);
 		return ret;
 	}
 
-	register_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
-	for_each_online_cpu(cpu) {
-		cpufreq_update_policy(cpu);
-	}
 	return 0;
 }
 static void __exit cpufreq_stats_exit(void)
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/cpuidle/governors/menu.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/cpuidle/governors/menu.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/cpuidle/governors/menu.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/cpuidle/governors/menu.c	2015-01-21 12:02:54.226934278 +0300
@@ -121,18 +121,6 @@ struct menu_device {
 	int		interval_ptr;
 };
 
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
-static int get_loadavg(void)
-{
-	unsigned long this = this_cpu_load();
-
-
-	return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10;
-}
-
 static inline int which_bucket(unsigned int duration)
 {
 	int bucket = 0;
@@ -172,7 +160,7 @@ static inline int performance_multiplier
 
 	/* for higher loadavg, we are more reluctant */
 
-	mult += 2 * get_loadavg();
+	mult += 10 * nr_active_cpu();
 
 	/* for IO wait tasks (per cpu!) we add 5x each */
 	mult += 10 * nr_iowait_cpu(smp_processor_id());
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/gpu/drm/radeon/evergreen.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/gpu/drm/radeon/evergreen.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/gpu/drm/radeon/evergreen.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/gpu/drm/radeon/evergreen.c	2015-01-21 12:02:42.695240416 +0300
@@ -4381,7 +4381,7 @@ int evergreen_irq_set(struct radeon_devi
 	u32 thermal_int = 0;
 
 	if (!rdev->irq.installed) {
-		WARN(1, "Can't enable IRQ/MSI because no handler is installed\n");
+		printk(KERN_ALERT "Can't enable IRQ/MSI because no handler is installed\n");
 		return -EINVAL;
 	}
 	/* don't enable anything if the ih is disabled */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/gpu/drm/radeon/r600.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/gpu/drm/radeon/r600.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/gpu/drm/radeon/r600.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/gpu/drm/radeon/r600.c	2015-01-21 12:02:42.691240524 +0300
@@ -3514,7 +3514,7 @@ int r600_irq_set(struct radeon_device *r
 	u32 thermal_int = 0;
 
 	if (!rdev->irq.installed) {
-		WARN(1, "Can't enable IRQ/MSI because no handler is installed\n");
+		printk(KERN_ALERT "Can't enable IRQ/MSI because no handler is installed\n");
 		return -EINVAL;
 	}
 	/* don't enable anything if the ih is disabled */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/hwmon/coretemp.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/hwmon/coretemp.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/hwmon/coretemp.c	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/hwmon/coretemp.c	2015-01-21 12:02:41.624268851 +0300
@@ -730,7 +730,7 @@ static void __cpuinit get_core_online(un
 	 * sensors. We check this bit only, all the early CPUs
 	 * without thermal sensors will be filtered out.
 	 */
-	if (!cpu_has(c, X86_FEATURE_DTS))
+	if (!cpu_has(c, X86_FEATURE_DTHERM))
 		return;
 
 	if (!pdev) {
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/infiniband/ulp/ipoib/ipoib_netlink.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/infiniband/ulp/ipoib/ipoib_netlink.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/infiniband/ulp/ipoib/ipoib_netlink.c	2015-01-21 12:02:51.142016168 +0300
@@ -136,7 +136,7 @@ static int ipoib_new_child_link(struct n
 	return err;
 }
 
-static void ipoib_unregister_child_dev(struct net_device *dev)
+static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head)
 {
 	struct ipoib_dev_priv *priv, *ppriv;
 
@@ -144,7 +144,7 @@ static void ipoib_unregister_child_dev(s
 	ppriv = netdev_priv(priv->parent);
 
 	down_write(&ppriv->vlan_rwsem);
-	unregister_netdevice_queue(dev, NULL);
+	unregister_netdevice_queue(dev, head);
 	list_del(&priv->list);
 	up_write(&ppriv->vlan_rwsem);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/input/input.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/input/input.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/input/input.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/input/input.c	2015-01-21 12:02:42.615242542 +0300
@@ -40,7 +40,7 @@ static LIST_HEAD(input_handler_list);
  * be mutually exclusive which simplifies locking in drivers implementing
  * input handlers.
  */
-static DEFINE_MUTEX(input_mutex);
+DEFINE_MUTEX(input_mutex);
 
 static struct input_handler *input_table[8];
 
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/isdn/mISDN/dsp_core.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/isdn/mISDN/dsp_core.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/isdn/mISDN/dsp_core.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/isdn/mISDN/dsp_core.c	2015-01-21 12:02:41.378275384 +0300
@@ -1216,8 +1216,7 @@ static void dsp_cleanup(void)
 {
 	mISDN_unregister_Bprotocol(&DSP);
 
-	if (timer_pending(&dsp_spl_tl))
-		del_timer(&dsp_spl_tl);
+	del_timer_sync(&dsp_spl_tl);
 
 	if (!list_empty(&dsp_ilist)) {
 		printk(KERN_ERR "mISDN_dsp: Audio DSP object inst list not "
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/media/rc/ir-raw.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/media/rc/ir-raw.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/media/rc/ir-raw.c	2014-12-12 23:29:04.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/media/rc/ir-raw.c	2015-01-21 12:02:41.340276392 +0300
@@ -29,11 +29,6 @@ static DEFINE_MUTEX(ir_raw_handler_lock)
 static LIST_HEAD(ir_raw_handler_list);
 static u64 available_protocols;
 
-#ifdef MODULE
-/* Used to load the decoders */
-static struct work_struct wq_load;
-#endif
-
 static int ir_raw_event_thread(void *data)
 {
 	struct ir_raw_event ev;
@@ -343,8 +338,7 @@ void ir_raw_handler_unregister(struct ir
 }
 EXPORT_SYMBOL(ir_raw_handler_unregister);
 
-#ifdef MODULE
-static void init_decoders(struct work_struct *work)
+void ir_raw_init(void)
 {
 	/* Load the decoder modules */
 
@@ -359,12 +353,3 @@ static void init_decoders(struct work_st
 	   it is needed to change the CONFIG_MODULE test at rc-core.h
 	 */
 }
-#endif
-
-void ir_raw_init(void)
-{
-#ifdef MODULE
-	INIT_WORK(&wq_load, init_decoders);
-	schedule_work(&wq_load);
-#endif
-}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/media/rc/rc-core-priv.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/media/rc/rc-core-priv.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/media/rc/rc-core-priv.h	2014-12-12 23:29:03.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/media/rc/rc-core-priv.h	2015-01-21 12:02:41.340276392 +0300
@@ -149,42 +149,42 @@ void ir_raw_init(void);
 
 /* from ir-nec-decoder.c */
 #ifdef CONFIG_IR_NEC_DECODER_MODULE
-#define load_nec_decode()	request_module("ir-nec-decoder")
+#define load_nec_decode()	request_module_nowait("ir-nec-decoder")
 #else
 #define load_nec_decode()	0
 #endif
 
 /* from ir-rc5-decoder.c */
 #ifdef CONFIG_IR_RC5_DECODER_MODULE
-#define load_rc5_decode()	request_module("ir-rc5-decoder")
+#define load_rc5_decode()	request_module_nowait("ir-rc5-decoder")
 #else
 #define load_rc5_decode()	0
 #endif
 
 /* from ir-rc6-decoder.c */
 #ifdef CONFIG_IR_RC6_DECODER_MODULE
-#define load_rc6_decode()	request_module("ir-rc6-decoder")
+#define load_rc6_decode()	request_module_nowait("ir-rc6-decoder")
 #else
 #define load_rc6_decode()	0
 #endif
 
 /* from ir-jvc-decoder.c */
 #ifdef CONFIG_IR_JVC_DECODER_MODULE
-#define load_jvc_decode()	request_module("ir-jvc-decoder")
+#define load_jvc_decode()	request_module_nowait("ir-jvc-decoder")
 #else
 #define load_jvc_decode()	0
 #endif
 
 /* from ir-sony-decoder.c */
 #ifdef CONFIG_IR_SONY_DECODER_MODULE
-#define load_sony_decode()	request_module("ir-sony-decoder")
+#define load_sony_decode()	request_module_nowait("ir-sony-decoder")
 #else
 #define load_sony_decode()	0
 #endif
 
 /* from ir-lirc-codec.c */
 #ifdef CONFIG_IR_LIRC_CODEC_MODULE
-#define load_lirc_codec()	request_module("ir-lirc-codec")
+#define load_lirc_codec()	request_module_nowait("ir-lirc-codec")
 #else
 #define load_lirc_codec()	0
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/Makefile
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/Makefile	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/Makefile	2015-01-21 12:02:45.178174496 +0300
@@ -46,6 +46,10 @@ ucc_geth_driver-objs := ucc_geth.o ucc_g
 
 obj-$(CONFIG_FSL_PQ_MDIO) += fsl_pq_mdio.o
 
+obj-$(CONFIG_VE_NETDEV) += vznetdev.o
+vznetdev-objs := venetdev.o veip_mgmt.o
+obj-$(CONFIG_VE_ETHDEV) += vzethdev.o
+
 #
 # link order important here
 #
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/atlx/atl1.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/atlx/atl1.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/atlx/atl1.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/atlx/atl1.c	2015-01-21 12:02:42.657241427 +0300
@@ -2478,7 +2478,7 @@ static irqreturn_t atl1_intr(int irq, vo
 					"pcie phy link down %x\n", status);
 			if (netif_running(adapter->netdev)) {	/* reset MAC */
 				iowrite32(0, adapter->hw.hw_addr + REG_IMR);
-				schedule_work(&adapter->pcie_dma_to_rst_task);
+				schedule_work(&adapter->reset_dev_task);
 				return IRQ_HANDLED;
 			}
 		}
@@ -2490,7 +2490,7 @@ static irqreturn_t atl1_intr(int irq, vo
 					"pcie DMA r/w error (status = 0x%x)\n",
 					status);
 			iowrite32(0, adapter->hw.hw_addr + REG_IMR);
-			schedule_work(&adapter->pcie_dma_to_rst_task);
+			schedule_work(&adapter->reset_dev_task);
 			return IRQ_HANDLED;
 		}
 
@@ -2635,10 +2635,10 @@ static void atl1_down(struct atl1_adapte
 	atl1_clean_rx_ring(adapter);
 }
 
-static void atl1_tx_timeout_task(struct work_struct *work)
+static void atl1_reset_dev_task(struct work_struct *work)
 {
 	struct atl1_adapter *adapter =
-		container_of(work, struct atl1_adapter, tx_timeout_task);
+		container_of(work, struct atl1_adapter, reset_dev_task);
 	struct net_device *netdev = adapter->netdev;
 
 	netif_device_detach(netdev);
@@ -3049,12 +3049,10 @@ static int __devinit atl1_probe(struct p
 		    (unsigned long)adapter);
 	adapter->phy_timer_pending = false;
 
-	INIT_WORK(&adapter->tx_timeout_task, atl1_tx_timeout_task);
+	INIT_WORK(&adapter->reset_dev_task, atl1_reset_dev_task);
 
 	INIT_WORK(&adapter->link_chg_task, atlx_link_chg_task);
 
-	INIT_WORK(&adapter->pcie_dma_to_rst_task, atl1_tx_timeout_task);
-
 	err = register_netdev(netdev);
 	if (err)
 		goto err_common;
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/atlx/atl1.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/atlx/atl1.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/atlx/atl1.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/atlx/atl1.h	2015-01-21 12:02:42.658241400 +0300
@@ -762,9 +762,8 @@ struct atl1_adapter {
 	u16 link_speed;
 	u16 link_duplex;
 	spinlock_t lock;
-	struct work_struct tx_timeout_task;
+	struct work_struct reset_dev_task;
 	struct work_struct link_chg_task;
-	struct work_struct pcie_dma_to_rst_task;
 
 	struct timer_list phy_config_timer;
 	bool phy_timer_pending;
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/atlx/atlx.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/atlx/atlx.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/atlx/atlx.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/atlx/atlx.c	2015-01-21 12:02:42.658241400 +0300
@@ -189,7 +189,7 @@ static void atlx_tx_timeout(struct net_d
 {
 	struct atlx_adapter *adapter = netdev_priv(netdev);
 	/* Do the reset outside of interrupt context */
-	schedule_work(&adapter->tx_timeout_task);
+	schedule_work(&adapter->reset_dev_task);
 }
 
 /*
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/e1000/e1000.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/e1000/e1000.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/e1000/e1000.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/e1000/e1000.h	2015-01-21 12:02:42.647241692 +0300
@@ -311,8 +311,6 @@ struct e1000_adapter {
 	struct delayed_work watchdog_task;
 	struct delayed_work fifo_stall_task;
 	struct delayed_work phy_info_task;
-
-	struct mutex mutex;
 };
 
 enum e1000_state_t {
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/e1000/e1000_main.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/e1000/e1000_main.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/e1000/e1000_main.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/e1000/e1000_main.c	2015-01-21 12:02:42.648241666 +0300
@@ -484,13 +484,20 @@ static void e1000_down_and_stop(struct e
 {
 	set_bit(__E1000_DOWN, &adapter->flags);
 
-	/* Only kill reset task if adapter is not resetting */
-	if (!test_bit(__E1000_RESETTING, &adapter->flags))
-		cancel_work_sync(&adapter->reset_task);
-
 	cancel_delayed_work_sync(&adapter->watchdog_task);
+
+	/*
+	 * Since the watchdog task can reschedule other tasks, we should cancel
+	 * it first; otherwise a work item may still be running after the
+	 * adapter has been turned down.
+	 */
+
 	cancel_delayed_work_sync(&adapter->phy_info_task);
 	cancel_delayed_work_sync(&adapter->fifo_stall_task);
+
+	/* Only kill reset task if adapter is not resetting */
+	if (!test_bit(__E1000_RESETTING, &adapter->flags))
+		cancel_work_sync(&adapter->reset_task);
 }
 
 void e1000_down(struct e1000_adapter *adapter)
@@ -535,21 +542,8 @@ void e1000_down(struct e1000_adapter *ad
 	e1000_clean_all_rx_rings(adapter);
 }
 
-static void e1000_reinit_safe(struct e1000_adapter *adapter)
-{
-	while (test_and_set_bit(__E1000_RESETTING, &adapter->flags))
-		msleep(1);
-	mutex_lock(&adapter->mutex);
-	e1000_down(adapter);
-	e1000_up(adapter);
-	mutex_unlock(&adapter->mutex);
-	clear_bit(__E1000_RESETTING, &adapter->flags);
-}
-
 void e1000_reinit_locked(struct e1000_adapter *adapter)
 {
-	/* if rtnl_lock is not held the call path is bogus */
-	ASSERT_RTNL();
 	WARN_ON(in_interrupt());
 	while (test_and_set_bit(__E1000_RESETTING, &adapter->flags))
 		msleep(1);
@@ -1268,7 +1262,6 @@ static int __devinit e1000_sw_init(struc
 	e1000_irq_disable(adapter);
 
 	spin_lock_init(&adapter->stats_lock);
-	mutex_init(&adapter->mutex);
 
 	set_bit(__E1000_DOWN, &adapter->flags);
 
@@ -2285,11 +2278,8 @@ static void e1000_update_phy_info_task(s
 	struct e1000_adapter *adapter = container_of(work,
 						     struct e1000_adapter,
 						     phy_info_task.work);
-	if (test_bit(__E1000_DOWN, &adapter->flags))
-		return;
-	mutex_lock(&adapter->mutex);
+
 	e1000_phy_get_info(&adapter->hw, &adapter->phy_info);
-	mutex_unlock(&adapter->mutex);
 }
 
 /**
@@ -2305,9 +2295,6 @@ static void e1000_82547_tx_fifo_stall_ta
 	struct net_device *netdev = adapter->netdev;
 	u32 tctl;
 
-	if (test_bit(__E1000_DOWN, &adapter->flags))
-		return;
-	mutex_lock(&adapter->mutex);
 	if (atomic_read(&adapter->tx_fifo_stall)) {
 		if ((er32(TDT) == er32(TDH)) &&
 		   (er32(TDFT) == er32(TDFH)) &&
@@ -2328,7 +2315,6 @@ static void e1000_82547_tx_fifo_stall_ta
 			schedule_delayed_work(&adapter->fifo_stall_task, 1);
 		}
 	}
-	mutex_unlock(&adapter->mutex);
 }
 
 bool e1000_has_link(struct e1000_adapter *adapter)
@@ -2382,10 +2368,6 @@ static void e1000_watchdog(struct work_s
 	struct e1000_tx_ring *txdr = adapter->tx_ring;
 	u32 link, tctl;
 
-	if (test_bit(__E1000_DOWN, &adapter->flags))
-		return;
-
-	mutex_lock(&adapter->mutex);
 	link = e1000_has_link(adapter);
 	if ((netif_carrier_ok(netdev)) && link)
 		goto link_up;
@@ -2475,7 +2457,7 @@ link_up:
 			adapter->tx_timeout_count++;
 			schedule_work(&adapter->reset_task);
 			/* exit immediately since reset is imminent */
-			goto unlock;
+			return;
 		}
 	}
 
@@ -2504,9 +2486,6 @@ link_up:
 	/* Reschedule the task */
 	if (!test_bit(__E1000_DOWN, &adapter->flags))
 		schedule_delayed_work(&adapter->watchdog_task, 2 * HZ);
-
-unlock:
-	mutex_unlock(&adapter->mutex);
 }
 
 enum latency_range {
@@ -3435,10 +3414,8 @@ static void e1000_reset_task(struct work
 	struct e1000_adapter *adapter =
 		container_of(work, struct e1000_adapter, reset_task);
 
-	if (test_bit(__E1000_DOWN, &adapter->flags))
-		return;
 	e_err(drv, "Reset adapter\n");
-	e1000_reinit_safe(adapter);
+	e1000_reinit_locked(adapter);
 }
 
 /**
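
Editorial note: the e1000 teardown reorder above encodes a general workqueue rule: cancel the work item that reschedules others before cancelling its dependents, or cancel_delayed_work_sync() on a dependent can race with a fresh requeue from the still-live watchdog. The safe ordering, with illustrative names:

	#include <linux/workqueue.h>

	static struct delayed_work watchdog_work;  /* may re-arm phy_work */
	static struct delayed_work phy_work;
	static struct work_struct reset_work;
	static unsigned long state;
	#define SHUTTING_DOWN 0

	static void safe_teardown(void)
	{
		/* watchdog_fn() is assumed to test this bit before re-arming */
		set_bit(SHUTTING_DOWN, &state);
		cancel_delayed_work_sync(&watchdog_work); /* rescheduler first */
		cancel_delayed_work_sync(&phy_work);      /* then its dependents */
		cancel_work_sync(&reset_work);
	}
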
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/e1000e/82571.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/e1000e/82571.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/e1000e/82571.c	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/e1000e/82571.c	2015-01-21 12:02:42.650241612 +0300
@@ -364,6 +364,17 @@ static s32 e1000_get_variants_82571(stru
 		if (global_quad_port_a == 4)
 			global_quad_port_a = 0;
 		break;
+	case E1000_DEV_ID_82574L:
+	case E1000_DEV_ID_82574LA:
+		/*
+		 * The 82574 family can hang when PCIe power saving is
+		 * switched on. Intel will not fix it, so PCIe power
+		 * management has to be disabled for this family.
+		 */
+		pci_disable_link_state(pdev, PCIE_LINK_STATE_L0S |
+					     PCIE_LINK_STATE_L1 |
+					     PCIE_LINK_STATE_CLKPM);
+		/* fall through */
 	default:
 		break;
 	}
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/loopback.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/loopback.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/loopback.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/loopback.c	2015-01-21 12:02:51.191014870 +0300
@@ -77,6 +77,13 @@ static netdev_tx_t loopback_xmit(struct 
 	struct pcpu_lstats *lb_stats;
 	int len;
 
+#ifdef CONFIG_VE
+	if (unlikely(get_exec_env()->disable_net)) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+#endif
+
 	skb_orphan(skb);
 
 	skb->protocol = eth_type_trans(skb, dev);
@@ -152,9 +159,15 @@ static void loopback_dev_free(struct net
 	free_netdev(dev);
 }
 
+static void loopback_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx)
+{
+}
+
 static const struct net_device_ops loopback_ops = {
 	.ndo_init      = loopback_dev_init,
 	.ndo_start_xmit= loopback_xmit,
+	.ndo_cpt = loopback_cpt,
 };
 
 static const struct net_device_ops_ext loopback_ops_ext = {
@@ -186,6 +199,7 @@ static void loopback_setup(struct net_de
 		| NETIF_F_NETNS_LOCAL
 		| NETIF_F_VLAN_CHALLENGED
 		| NETIF_F_LOOPBACK;
+	dev->vz_features |= NETIF_F_VIRTUAL;
 	dev->ethtool_ops	= &loopback_ethtool_ops;
 	dev->header_ops		= &eth_header_ops;
 	dev->netdev_ops		= &loopback_ops;
@@ -221,15 +235,7 @@ out:
 	return err;
 }
 
-static __net_exit void loopback_net_exit(struct net *net)
-{
-	struct net_device *dev = net->loopback_dev;
-
-	unregister_netdev(dev);
-}
-
 /* Registered in net/core/dev.c */
 struct pernet_operations __net_initdata loopback_net_ops = {
        .init = loopback_net_init,
-       .exit = loopback_net_exit,
 };
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/macvlan.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/macvlan.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/macvlan.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/macvlan.c	2015-01-21 12:02:51.142016168 +0300
@@ -712,13 +712,13 @@ static int macvlan_newlink(struct net_de
 				      dev_forward_skb);
 }
 
-void macvlan_dellink(struct net_device *dev)
+void macvlan_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct macvlan_port *port = vlan->port;
 
 	list_del_rcu(&vlan->list);
-	unregister_netdevice(dev);
+	unregister_netdevice_queue(dev, head);
 
 	if (list_empty(&port->vlans))
 		macvlan_port_destroy(port->dev);
@@ -808,7 +808,7 @@ static int macvlan_device_event(struct n
 			break;
 
 		list_for_each_entry_safe(vlan, next, &port->vlans, list)
-			vlan->dev->rtnl_link_ops->dellink(vlan->dev);
+			vlan->dev->rtnl_link_ops->dellink(vlan->dev, NULL);
 		break;
 	}
 	return NOTIFY_DONE;
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/macvtap.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/macvtap.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/macvtap.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/macvtap.c	2015-01-21 12:02:51.142016168 +0300
@@ -323,7 +323,7 @@ out:
 	return err;
 }
 
-static void macvtap_dellink(struct net_device *dev)
+static void macvtap_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct macvlan_dev *vlan;
 
@@ -332,7 +332,7 @@ static void macvtap_dellink(struct net_d
 		       MKDEV(MAJOR(macvtap_major), vlan->minor));
 
 	macvtap_del_queues(dev);
-	macvlan_dellink(dev);
+	macvlan_dellink(dev, head);
 	macvtap_free_minor(vlan);
 }
 
@@ -445,12 +445,8 @@ static inline struct sk_buff *macvtap_al
 {
 	struct sk_buff *skb;
 
-	/* Under a page?  Don't bother with paged skb. */
-	if (prepad + len < PAGE_SIZE || !linear)
-		linear = len;
-
-	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   err);
+	linear = len;
+	skb = sock_alloc_send_skb(sk, prepad + linear, noblock, err);
 	if (!skb)
 		return NULL;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/ppp_generic.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/ppp_generic.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/ppp_generic.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/ppp_generic.c	2015-01-21 12:02:51.236013675 +0300
@@ -53,6 +53,9 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
+#include <linux/ve_task.h>
+#include <linux/vzcalluser.h>
+
 #define PPP_VERSION	"2.4.2"
 
 /*
@@ -379,8 +382,10 @@ static int ppp_open(struct inode *inode,
 	/*
 	 * This could (should?) be enforced by the permissions on /dev/ppp.
 	 */
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
+	if (!net_generic(get_exec_env()->ve_netns, ppp_net_id)) /* no VE_FEATURE_PPP */
+		return -EACCES;
 	return 0;
 }
 
@@ -880,6 +885,9 @@ static __net_init int ppp_init_net(struc
 	struct ppp_net *pn;
 	int err;
 
+	if (!(get_exec_env()->features & VE_FEATURE_PPP))
+		return net_assign_generic(net, ppp_net_id, NULL);
+
 	pn = kzalloc(sizeof(*pn), GFP_KERNEL);
 	if (!pn)
 		return -ENOMEM;
@@ -906,6 +914,9 @@ static __net_exit void ppp_exit_net(stru
 	struct ppp_net *pn;
 
 	pn = net_generic(net, ppp_net_id);
+	if (!pn) /* no VE_FEATURE_PPP */
+		return;
+
 	idr_destroy(&pn->units_idr);
 	/*
 	 * if someone has cached our net then
@@ -918,6 +929,7 @@ static __net_exit void ppp_exit_net(stru
 static struct pernet_operations ppp_net_ops = {
 	.init = ppp_init_net,
 	.exit = ppp_exit_net,
+	.id = &ppp_net_id,
 };
 
 #define PPP_MAJOR	108
@@ -930,7 +942,7 @@ static int __init ppp_init(void)
 
 	printk(KERN_INFO "PPP generic driver version " PPP_VERSION "\n");
 
-	err = register_pernet_gen_device(&ppp_net_id, &ppp_net_ops);
+	err = register_pernet_device(&ppp_net_ops);
 	if (err) {
 		printk(KERN_ERR "failed to register PPP pernet device (%d)\n", err);
 		goto out;
@@ -956,7 +968,7 @@ static int __init ppp_init(void)
 out_chrdev:
 	unregister_chrdev(PPP_MAJOR, "ppp");
 out_net:
-	unregister_pernet_gen_device(ppp_net_id, &ppp_net_ops);
+	unregister_pernet_device(&ppp_net_ops);
 out:
 	return err;
 }
@@ -1096,6 +1108,7 @@ static void ppp_setup(struct net_device 
 	dev->type = ARPHRD_PPP;
 	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
 	dev->features |= NETIF_F_NETNS_LOCAL;
+	dev->vz_features |= NETIF_F_VIRTUAL;
 	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
 	set_netdev_ops_ext(dev, &ppp_netdev_ops_ext);
 }
@@ -2164,12 +2177,14 @@ int ppp_register_net_channel(struct net 
 	struct channel *pch;
 	struct ppp_net *pn;
 
+	pn = ppp_pernet(net);
+	if (!pn)
+		return -EACCES;
+
 	pch = kzalloc(sizeof(struct channel), GFP_KERNEL);
 	if (!pch)
 		return -ENOMEM;
 
-	pn = ppp_pernet(net);
-
 	pch->ppp = NULL;
 	pch->chan = chan;
 	pch->chan_net = net;
@@ -2611,16 +2626,16 @@ ppp_create_interface(struct net *net, in
 	 */
 	dev_net_set(dev, net);
 
-	ret = -EEXIST;
 	mutex_lock(&pn->all_ppp_mutex);
 
 	if (unit < 0) {
 		unit = unit_get(&pn->units_idr, ppp);
 		if (unit < 0) {
-			*retp = unit;
+			ret = unit;
 			goto out2;
 		}
 	} else {
+		ret = -EEXIST;
 		if (unit_find(&pn->units_idr, unit))
 			goto out2; /* unit already exists */
 		/*
@@ -2695,10 +2710,10 @@ static void ppp_shutdown_interface(struc
 		ppp->closing = 1;
 		ppp_unlock(ppp);
 		unregister_netdev(ppp->dev);
+		unit_put(&pn->units_idr, ppp->file.index);
 	} else
 		ppp_unlock(ppp);
 
-	unit_put(&pn->units_idr, ppp->file.index);
 	ppp->file.dead = 1;
 	ppp->owner = NULL;
 	wake_up_interruptible(&ppp->file.rwait);
@@ -2878,7 +2893,7 @@ static void __exit ppp_cleanup(void)
 	unregister_chrdev(PPP_MAJOR, "ppp");
 	device_destroy(ppp_class, MKDEV(PPP_MAJOR, 0));
 	class_destroy(ppp_class);
-	unregister_pernet_gen_device(ppp_net_id, &ppp_net_ops);
+	unregister_pernet_device(&ppp_net_ops);
 }
 
 /*
@@ -2886,8 +2901,7 @@ static void __exit ppp_cleanup(void)
  * by holding all_ppp_mutex
  */
 
-/* associate pointer with specified number */
-static int unit_set(struct idr *p, void *ptr, int n)
+static int __unit_alloc(struct idr *p, void *ptr, int n)
 {
 	int unit, err;
 
@@ -2898,10 +2912,24 @@ again:
 	}
 
 	err = idr_get_new_above(p, ptr, n, &unit);
-	if (err == -EAGAIN)
-		goto again;
+	if (err < 0) {
+		if (err == -EAGAIN)
+			goto again;
+		return err;
+	}
 
-	if (unit != n) {
+	return unit;
+}
+
+/* associate pointer with specified number */
+static int unit_set(struct idr *p, void *ptr, int n)
+{
+	int unit;
+
+	unit = __unit_alloc(p, ptr, n);
+	if (unit < 0)
+		return unit;
+	else if (unit != n) {
 		idr_remove(p, unit);
 		return -EINVAL;
 	}
@@ -2912,19 +2940,7 @@ again:
 /* get new free unit number and associate pointer with it */
 static int unit_get(struct idr *p, void *ptr)
 {
-	int unit, err;
-
-again:
-	if (!idr_pre_get(p, GFP_KERNEL)) {
-		printk(KERN_ERR "PPP: No free memory for idr\n");
-		return -ENOMEM;
-	}
-
-	err = idr_get_new_above(p, ptr, 0, &unit);
-	if (err == -EAGAIN)
-		goto again;
-
-	return unit;
+	return __unit_alloc(p, ptr, 0);
 }
 
 /* put unit number back to a pool */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/pppoe.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/pppoe.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/pppoe.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/pppoe.c	2015-01-21 12:02:51.237013648 +0300
@@ -77,6 +77,7 @@
 #include <linux/file.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/vzcalluser.h>
 
 #include <linux/nsproxy.h>
 #include <net/net_namespace.h>
@@ -452,6 +453,8 @@ static int pppoe_rcv(struct sk_buff *skb
 		goto drop;
 
 	pn = pppoe_pernet(dev_net(dev));
+	if (!pn) /* no VE_FEATURE_PPP */
+		goto drop;
 
 	/* Note that get_item does a sock_hold(), so sk_pppox(po)
 	 * is known to be safe.
@@ -494,6 +497,9 @@ static int pppoe_disc_rcv(struct sk_buff
 		goto abort;
 
 	pn = pppoe_pernet(dev_net(dev));
+	if (!pn) /* no VE_FEATURE_PPP */
+		goto abort;
+
 	po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex);
 	if (po) {
 		struct sock *sk = sk_pppox(po);
@@ -547,6 +553,9 @@ static int pppoe_create(struct net *net,
 {
 	struct sock *sk;
 
+	if (!(get_exec_env()->features & VE_FEATURE_PPP))
+		return -EACCES;
+
 	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto);
 	if (!sk)
 		return -ENOMEM;
@@ -1144,6 +1153,9 @@ static __net_init int pppoe_init_net(str
 	struct proc_dir_entry *pde;
 	int err;
 
+	if (!(get_exec_env()->features & VE_FEATURE_PPP))
+		return net_assign_generic(net, pppoe_net_id, NULL);
+
 	pn = kzalloc(sizeof(*pn), GFP_KERNEL);
 	if (!pn)
 		return -ENOMEM;
@@ -1173,8 +1185,11 @@ static __net_exit void pppoe_exit_net(st
 {
 	struct pppoe_net *pn;
 
-	proc_net_remove(net, "pppoe");
 	pn = net_generic(net, pppoe_net_id);
+	if (!pn) /* no VE_FEATURE_PPP */
+		return;
+
+	proc_net_remove(net, "pppoe");
 	/*
 	 * if someone has cached our net then
 	 * further net_generic call will return NULL
@@ -1186,13 +1201,14 @@ static __net_exit void pppoe_exit_net(st
 static struct pernet_operations pppoe_net_ops = {
 	.init = pppoe_init_net,
 	.exit = pppoe_exit_net,
+	.id = &pppoe_net_id,
 };
 
 static int __init pppoe_init(void)
 {
 	int err;
 
-	err = register_pernet_gen_device(&pppoe_net_id, &pppoe_net_ops);
+	err = register_pernet_device(&pppoe_net_ops);
 	if (err)
 		goto out;
 
@@ -1213,7 +1229,7 @@ static int __init pppoe_init(void)
 out_unregister_pppoe_proto:
 	proto_unregister(&pppoe_sk_proto);
 out_unregister_net_ops:
-	unregister_pernet_gen_device(pppoe_net_id, &pppoe_net_ops);
+	unregister_pernet_device(&pppoe_net_ops);
 out:
 	return err;
 }
@@ -1225,7 +1241,7 @@ static void __exit pppoe_exit(void)
 	dev_remove_pack(&pppoes_ptype);
 	unregister_pppox_proto(PX_PROTO_OE);
 	proto_unregister(&pppoe_sk_proto);
-	unregister_pernet_gen_device(pppoe_net_id, &pppoe_net_ops);
+	unregister_pernet_device(&pppoe_net_ops);
 }
 
 module_init(pppoe_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/pppol2tp.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/pppol2tp.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/pppol2tp.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/pppol2tp.c	2015-01-21 12:02:51.237013648 +0300
@@ -97,6 +97,7 @@
 #include <net/ip.h>
 #include <net/udp.h>
 #include <net/xfrm.h>
+#include <linux/vzcalluser.h>
 
 #include <asm/byteorder.h>
 #include <asm/atomic.h>
@@ -1589,6 +1590,9 @@ static int pppol2tp_create(struct net *n
 	int error = -ENOMEM;
 	struct sock *sk;
 
+	if (!(get_exec_env()->features & VE_FEATURE_PPP))
+		return -EACCES;
+
 	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto);
 	if (!sk)
 		goto out;
@@ -2606,6 +2610,9 @@ static __net_init int pppol2tp_init_net(
 	struct proc_dir_entry *pde;
 	int err;
 
+	if (!(get_exec_env()->features & VE_FEATURE_PPP))
+		return net_assign_generic(net, pppol2tp_net_id, NULL);
+
 	pn = kzalloc(sizeof(*pn), GFP_KERNEL);
 	if (!pn)
 		return -ENOMEM;
@@ -2636,8 +2643,11 @@ static __net_exit void pppol2tp_exit_net
 {
 	struct pppoe_net *pn;
 
-	proc_net_remove(net, "pppol2tp");
 	pn = net_generic(net, pppol2tp_net_id);
+	if (!pn) /* no VE_FEATURE_PPP */
+		return;
+
+	proc_net_remove(net, "pppol2tp");
 	/*
 	 * if someone has cached our net then
 	 * further net_generic call will return NULL
@@ -2649,13 +2659,14 @@ static __net_exit void pppol2tp_exit_net
 static struct pernet_operations pppol2tp_net_ops = {
 	.init = pppol2tp_init_net,
 	.exit = pppol2tp_exit_net,
+	.id = &pppol2tp_net_id,
 };
 
 static int __init pppol2tp_init(void)
 {
 	int err;
 
-	err = register_pernet_gen_device(&pppol2tp_net_id, &pppol2tp_net_ops);
+	err = register_pernet_device(&pppol2tp_net_ops);
 	if (err)
 		goto out;
 
@@ -2675,7 +2686,7 @@ out:
 out_unregister_pppol2tp_proto:
 	proto_unregister(&pppol2tp_sk_proto);
 out_unregister_pernet_dev:
-	unregister_pernet_gen_device(pppol2tp_net_id, &pppol2tp_net_ops);
+	unregister_pernet_device(&pppol2tp_net_ops);
 	goto out;
 }
 
@@ -2683,7 +2694,7 @@ static void __exit pppol2tp_exit(void)
 {
 	unregister_pppox_proto(PX_PROTO_OL2TP);
 	proto_unregister(&pppol2tp_sk_proto);
-	unregister_pernet_gen_device(pppol2tp_net_id, &pppol2tp_net_ops);
+	unregister_pernet_device(&pppol2tp_net_ops);
 }
 
 module_init(pppol2tp_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/r8169.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/r8169.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/r8169.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/r8169.c	2015-01-21 12:02:42.669241108 +0300
@@ -6074,13 +6074,6 @@ static int rtl_rx(struct net_device *dev
 			tp->rx_stats.bytes += pkt_size;
 			u64_stats_update_end(&tp->rx_stats.syncp);
 		}
-
-		/* Work around for AMD plateform. */
-		if ((desc->opts2 & cpu_to_le32(0xfffe000)) &&
-		    (tp->mac_version == RTL_GIGA_MAC_VER_05)) {
-			desc->opts2 = 0;
-			cur_rx++;
-		}
 	}
 
 	count = cur_rx - tp->cur_rx;
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/sky2.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/sky2.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/sky2.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/sky2.c	2015-01-21 12:02:42.637241958 +0300
@@ -140,6 +140,7 @@ static DEFINE_PCI_DEVICE_TABLE(sky2_id_t
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x436D) }, /* 88E8055 */
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4370) }, /* 88E8075 */
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4380) }, /* 88E8057 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4381) }, /* 88E8059 */
 	{ 0 }
 };
 
@@ -603,6 +604,16 @@ static void sky2_phy_init(struct sky2_hw
 		/* apply workaround for integrated resistors calibration */
 		gm_phy_write(hw, port, PHY_MARV_PAGE_ADDR, 17);
 		gm_phy_write(hw, port, PHY_MARV_PAGE_DATA, 0x3f60);
+	} else if (hw->chip_id == CHIP_ID_YUKON_OPT && hw->chip_rev == 0) {
+		/* apply fixes in PHY AFE */
+		gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 0x00ff);
+
+		/* apply RDAC termination workaround */
+		gm_phy_write(hw, port, 24, 0x2800);
+		gm_phy_write(hw, port, 23, 0x2001);
+
+		/* set page register back to 0 */
+		gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 0);
 	} else if (hw->chip_id != CHIP_ID_YUKON_EX &&
 		   hw->chip_id < CHIP_ID_YUKON_SUPR) {
 		/* no effect on Yukon-XL */
@@ -2096,6 +2107,27 @@ out:
 	spin_unlock(&sky2->phy_lock);
 }
 
+/* Special quick link interrupt (Yukon-2 Optima only) */
+static void sky2_qlink_intr(struct sky2_hw *hw)
+{
+	struct sky2_port *sky2 = netdev_priv(hw->dev[0]);
+	u32 imask;
+	u16 phy;
+
+	/* disable irq */
+	imask = sky2_read32(hw, B0_IMSK);
+	imask &= ~Y2_IS_PHY_QLNK;
+	sky2_write32(hw, B0_IMSK, imask);
+
+	/* reset PHY Link Detect */
+	phy = sky2_pci_read16(hw, PSM_CONFIG_REG4);
+	sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
+	sky2_pci_write16(hw, PSM_CONFIG_REG4, phy | 1);
+	sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
+
+	sky2_link_up(sky2);
+}
+
 /* Transmit timeout is only called if we are running, carrier is up
  * and tx queue is full (stopped).
  */
@@ -2751,6 +2783,9 @@ static int sky2_poll(struct napi_struct 
 	if (status & Y2_IS_IRQ_PHY2)
 		sky2_phy_intr(hw, 1);
 
+	if (status & Y2_IS_PHY_QLNK)
+		sky2_qlink_intr(hw);
+
 	while ((idx = sky2_read16(hw, STAT_PUT_IDX)) != hw->st_idx) {
 		work_done += sky2_status_intr(hw, work_limit - work_done, idx);
 
@@ -2800,6 +2835,7 @@ static u32 sky2_mhz(const struct sky2_hw
 	case CHIP_ID_YUKON_EX:
 	case CHIP_ID_YUKON_SUPR:
 	case CHIP_ID_YUKON_UL_2:
+	case CHIP_ID_YUKON_OPT:
 		return 125;
 
 	case CHIP_ID_YUKON_FE:
@@ -2893,6 +2929,12 @@ static int __devinit sky2_init(struct sk
 			| SKY2_HW_ADV_POWER_CTL;
 		break;
 
+	case CHIP_ID_YUKON_OPT:
+		hw->flags = SKY2_HW_GIGABIT
+			| SKY2_HW_NEW_LE
+			| SKY2_HW_ADV_POWER_CTL;
+		break;
+
 	default:
 		dev_err(&hw->pdev->dev, "unsupported chip type 0x%x\n",
 			hw->chip_id);
@@ -2973,6 +3015,48 @@ static void sky2_reset(struct sky2_hw *h
 				     | GMC_BYP_RETR_ON);
 	}
 
+	if (hw->chip_id == CHIP_ID_YUKON_OPT) {
+		u16 reg;
+		u32 msk;
+
+		if (hw->chip_rev == 0) {
+			/* disable PCI-E PHY power down (set PHY reg 0x80, bit 7) */
+			sky2_write32(hw, Y2_PEX_PHY_DATA, (0x80UL << 16) | (1 << 7));
+
+			/* set PHY Link Detect Timer to 1.1 second (11x 100ms) */
+			reg = 10;
+		} else {
+			/* set PHY Link Detect Timer to 0.4 second (4x 100ms) */
+			reg = 3;
+		}
+
+		reg <<= PSM_CONFIG_REG4_TIMER_PHY_LINK_DETECT_BASE;
+
+		/* reset PHY Link Detect */
+		sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
+		sky2_pci_write16(hw, PSM_CONFIG_REG4,
+				 reg | PSM_CONFIG_REG4_RST_PHY_LINK_DETECT);
+		sky2_pci_write16(hw, PSM_CONFIG_REG4, reg);
+
+
+		/* enable PHY Quick Link */
+		msk = sky2_read32(hw, B0_IMSK);
+		msk |= Y2_IS_PHY_QLNK;
+		sky2_write32(hw, B0_IMSK, msk);
+
+		/* check if PSMv2 was running before */
+		reg = sky2_pci_read16(hw, PSM_CONFIG_REG3);
+		if (reg & PCI_EXP_LNKCTL_ASPMC) {
+			int cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+			/* restore the PCIe Link Control register */
+			sky2_pci_write16(hw, cap + PCI_EXP_LNKCTL, reg);
+		}
+		sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
+
+		/* re-enable PEX PM in PEX PHY debug reg. 8 (clear bit 12) */
+		sky2_write32(hw, Y2_PEX_PHY_DATA, PEX_DB_ACCESS | (0x08UL << 16));
+	}
+
 	/* Clear I2C IRQ noise */
 	sky2_write32(hw, B2_I2C_IRQ, 1);
 
@@ -4446,9 +4530,11 @@ static const char *sky2_name(u8 chipid, 
 		"FE+",		/* 0xb8 */
 		"Supreme",	/* 0xb9 */
 		"UL 2",		/* 0xba */
+		"Unknown",	/* 0xbb */
+		"Optima",	/* 0xbc */
 	};
 
-	if (chipid >= CHIP_ID_YUKON_XL && chipid < CHIP_ID_YUKON_UL_2)
+	if (chipid >= CHIP_ID_YUKON_XL && chipid < CHIP_ID_YUKON_OPT)
 		strncpy(buf, name[chipid - CHIP_ID_YUKON_XL], sz);
 	else
 		snprintf(buf, sz, "(chip %#x)", chipid);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/sky2.h linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/sky2.h
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/sky2.h	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/sky2.h	2015-01-21 12:02:42.626242248 +0300
@@ -16,6 +16,13 @@ enum {
 	PCI_DEV_REG5    = 0x88,
 	PCI_CFG_REG_0	= 0x90,
 	PCI_CFG_REG_1	= 0x94,
+
+	PSM_CONFIG_REG0  = 0x98,
+	PSM_CONFIG_REG1  = 0x9C,
+	PSM_CONFIG_REG2  = 0x160,
+	PSM_CONFIG_REG3  = 0x164,
+	PSM_CONFIG_REG4  = 0x168,
+
 };
 
 /* Yukon-2 */
@@ -48,6 +55,37 @@ enum pci_dev_reg_2 {
 	PCI_USEDATA64	= 1<<0,		/* Use 64Bit Data bus ext */
 };
 
+/*	PCI_OUR_REG_3		32 bit	Our Register 3 (Yukon-ECU only) */
+enum pci_dev_reg_3 {
+	P_CLK_ASF_REGS_DIS	= 1<<18,/* Disable Clock ASF (Yukon-Ext.) */
+	P_CLK_COR_REGS_D0_DIS	= 1<<17,/* Disable Clock Core Regs D0 */
+	P_CLK_MACSEC_DIS	= 1<<17,/* Disable Clock MACSec (Yukon-Ext.) */
+	P_CLK_PCI_REGS_D0_DIS	= 1<<16,/* Disable Clock PCI  Regs D0 */
+	P_CLK_COR_YTB_ARB_DIS	= 1<<15,/* Disable Clock YTB  Arbiter */
+	P_CLK_MAC_LNK1_D3_DIS	= 1<<14,/* Disable Clock MAC  Link1 D3 */
+	P_CLK_COR_LNK1_D0_DIS	= 1<<13,/* Disable Clock Core Link1 D0 */
+	P_CLK_MAC_LNK1_D0_DIS	= 1<<12,/* Disable Clock MAC  Link1 D0 */
+	P_CLK_COR_LNK1_D3_DIS	= 1<<11,/* Disable Clock Core Link1 D3 */
+	P_CLK_PCI_MST_ARB_DIS	= 1<<10,/* Disable Clock PCI  Master Arb. */
+	P_CLK_COR_REGS_D3_DIS	= 1<<9,	/* Disable Clock Core Regs D3 */
+	P_CLK_PCI_REGS_D3_DIS	= 1<<8,	/* Disable Clock PCI  Regs D3 */
+	P_CLK_REF_LNK1_GM_DIS	= 1<<7,	/* Disable Clock Ref. Link1 GMAC */
+	P_CLK_COR_LNK1_GM_DIS	= 1<<6,	/* Disable Clock Core Link1 GMAC */
+	P_CLK_PCI_COMMON_DIS	= 1<<5,	/* Disable Clock PCI  Common */
+	P_CLK_COR_COMMON_DIS	= 1<<4,	/* Disable Clock Core Common */
+	P_CLK_PCI_LNK1_BMU_DIS	= 1<<3,	/* Disable Clock PCI  Link1 BMU */
+	P_CLK_COR_LNK1_BMU_DIS	= 1<<2,	/* Disable Clock Core Link1 BMU */
+	P_CLK_PCI_LNK1_BIU_DIS	= 1<<1,	/* Disable Clock PCI  Link1 BIU */
+	P_CLK_COR_LNK1_BIU_DIS	= 1<<0,	/* Disable Clock Core Link1 BIU */
+	PCIE_OUR3_WOL_D3_COLD_SET = P_CLK_ASF_REGS_DIS |
+				    P_CLK_COR_REGS_D0_DIS |
+				    P_CLK_COR_LNK1_D0_DIS |
+				    P_CLK_MAC_LNK1_D0_DIS |
+				    P_CLK_PCI_MST_ARB_DIS |
+				    P_CLK_COR_COMMON_DIS |
+				    P_CLK_COR_LNK1_BMU_DIS,
+};
+
 /*	PCI_OUR_REG_4		32 bit	Our Register 4 (Yukon-ECU only) */
 enum pci_dev_reg_4 {
 				/* (Link Training & Status State Machine) */
@@ -114,7 +152,7 @@ enum pci_dev_reg_5 {
 				     P_GAT_PCIE_RX_EL_IDLE,
 };
 
-#/*	PCI_CFG_REG_1			32 bit	Config Register 1 (Yukon-Ext only) */
+/*	PCI_CFG_REG_1			32 bit	Config Register 1 (Yukon-Ext only) */
 enum pci_cfg_reg1 {
 	P_CF1_DIS_REL_EVT_RST	= 1<<24, /* Dis. Rel. Event during PCIE reset */
 										/* Bit 23..21: Release Clock on Event */
@@ -145,6 +183,72 @@ enum pci_cfg_reg1 {
 					P_CF1_ENA_TXBMU_WR_IDLE,
 };
 
+/* Yukon-Optima */
+enum {
+	PSM_CONFIG_REG1_AC_PRESENT_STATUS = 1<<31,   /* AC Present Status */
+
+	PSM_CONFIG_REG1_PTP_CLK_SEL	  = 1<<29,   /* PTP Clock Select */
+	PSM_CONFIG_REG1_PTP_MODE	  = 1<<28,   /* PTP Mode */
+
+	PSM_CONFIG_REG1_MUX_PHY_LINK	  = 1<<27,   /* PHY Energy Detect Event */
+
+	PSM_CONFIG_REG1_EN_PIN63_AC_PRESENT = 1<<26,  /* Enable LED_DUPLEX for ac_present */
+	PSM_CONFIG_REG1_EN_PCIE_TIMER	  = 1<<25,    /* Enable PCIe Timer */
+	PSM_CONFIG_REG1_EN_SPU_TIMER	  = 1<<24,    /* Enable SPU Timer */
+	PSM_CONFIG_REG1_POLARITY_AC_PRESENT = 1<<23,  /* AC Present Polarity */
+
+	PSM_CONFIG_REG1_EN_AC_PRESENT	  = 1<<21,    /* Enable AC Present */
+
+	PSM_CONFIG_REG1_EN_GPHY_INT_PSM	= 1<<20,      /* Enable GPHY INT for PSM */
+	PSM_CONFIG_REG1_DIS_PSM_TIMER	= 1<<19,      /* Disable PSM Timer */
+};
+
+/* Yukon-Supreme */
+enum {
+	PSM_CONFIG_REG1_GPHY_ENERGY_STS	= 1<<31, /* GPHY Energy Detect Status */
+
+	PSM_CONFIG_REG1_UART_MODE_MSK	= 3<<29, /* UART_Mode */
+	PSM_CONFIG_REG1_CLK_RUN_ASF	= 1<<28, /* Enable Clock Free Running for ASF Subsystem */
+	PSM_CONFIG_REG1_UART_CLK_DISABLE= 1<<27, /* Disable UART clock */
+	PSM_CONFIG_REG1_VAUX_ONE	= 1<<26, /* Tie internal Vaux to 1'b1 */
+	PSM_CONFIG_REG1_UART_FC_RI_VAL	= 1<<25, /* Default value for UART_RI_n */
+	PSM_CONFIG_REG1_UART_FC_DCD_VAL	= 1<<24, /* Default value for UART_DCD_n */
+	PSM_CONFIG_REG1_UART_FC_DSR_VAL	= 1<<23, /* Default value for UART_DSR_n */
+	PSM_CONFIG_REG1_UART_FC_CTS_VAL	= 1<<22, /* Default value for UART_CTS_n */
+	PSM_CONFIG_REG1_LATCH_VAUX	= 1<<21, /* Enable Latch current Vaux_avlbl */
+	PSM_CONFIG_REG1_FORCE_TESTMODE_INPUT= 1<<20, /* Force Testmode pin as input PAD */
+	PSM_CONFIG_REG1_UART_RST	= 1<<19, /* UART_RST */
+	PSM_CONFIG_REG1_PSM_PCIE_L1_POL	= 1<<18, /* PCIE L1 Event Polarity for PSM */
+	PSM_CONFIG_REG1_TIMER_STAT	= 1<<17, /* PSM Timer Status */
+	PSM_CONFIG_REG1_GPHY_INT	= 1<<16, /* GPHY INT Status */
+	PSM_CONFIG_REG1_FORCE_TESTMODE_ZERO= 1<<15, /* Force internal Testmode as 1'b0 */
+	PSM_CONFIG_REG1_EN_INT_ASPM_CLKREQ = 1<<14, /* ENABLE INT for CLKRUN on ASPM and CLKREQ */
+	PSM_CONFIG_REG1_EN_SND_TASK_ASPM_CLKREQ	= 1<<13, /* ENABLE Snd_task for CLKRUN on ASPM and CLKREQ */
+	PSM_CONFIG_REG1_DIS_CLK_GATE_SND_TASK	= 1<<12, /* Disable CLK_GATE control snd_task */
+	PSM_CONFIG_REG1_DIS_FF_CHIAN_SND_INTA	= 1<<11, /* Disable flip-flop chain for sndmsg_inta */
+
+	PSM_CONFIG_REG1_DIS_LOADER	= 1<<9, /* Disable Loader SM after PSM Goes back to IDLE */
+	PSM_CONFIG_REG1_DO_PWDN		= 1<<8, /* Do Power Down, Start PSM Scheme */
+	PSM_CONFIG_REG1_DIS_PIG		= 1<<7, /* Disable Plug-in-Go SM after PSM Goes back to IDLE */
+	PSM_CONFIG_REG1_DIS_PERST	= 1<<6, /* Disable Internal PCIe Reset after PSM Goes back to IDLE */
+	PSM_CONFIG_REG1_EN_REG18_PD	= 1<<5, /* Enable REG18 Power Down for PSM */
+	PSM_CONFIG_REG1_EN_PSM_LOAD	= 1<<4, /* Disable EEPROM Loader after PSM Goes back to IDLE */
+	PSM_CONFIG_REG1_EN_PSM_HOT_RST	= 1<<3, /* Enable PCIe Hot Reset for PSM */
+	PSM_CONFIG_REG1_EN_PSM_PERST	= 1<<2, /* Enable PCIe Reset Event for PSM */
+	PSM_CONFIG_REG1_EN_PSM_PCIE_L1	= 1<<1, /* Enable PCIe L1 Event for PSM */
+	PSM_CONFIG_REG1_EN_PSM		= 1<<0, /* Enable PSM Scheme */
+};
+
+/*	PSM_CONFIG_REG4				0x0168	PSM Config Register 4 */
+enum {
+						/* PHY Link Detect Timer */
+	PSM_CONFIG_REG4_TIMER_PHY_LINK_DETECT_MSK = 0xf<<4,
+	PSM_CONFIG_REG4_TIMER_PHY_LINK_DETECT_BASE = 4,
+
+	PSM_CONFIG_REG4_DEBUG_TIMER	    = 1<<1, /* Debug Timer */
+	PSM_CONFIG_REG4_RST_PHY_LINK_DETECT = 1<<0, /* Reset GPHY Link Detect */
+};
+
 
 #define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \
 			       PCI_STATUS_SIG_SYSTEM_ERROR | \
@@ -197,6 +301,9 @@ enum csr_regs {
 	B2_I2C_IRQ	= 0x0168,
 	B2_I2C_SW	= 0x016c,
 
+	Y2_PEX_PHY_DATA = 0x0170,
+	Y2_PEX_PHY_ADDR = 0x0172,
+
 	B3_RAM_ADDR	= 0x0180,
 	B3_RAM_DATA_LO	= 0x0184,
 	B3_RAM_DATA_HI	= 0x0188,
@@ -317,6 +424,10 @@ enum {
 	Y2_IS_CHK_TXS2	= 1<<9,		/* Descriptor error TXS 2 */
 	Y2_IS_CHK_TXA2	= 1<<8,		/* Descriptor error TXA 2 */
 
+	Y2_IS_PSM_ACK	= 1<<7,		/* PSM Acknowledge (Yukon-Optima only) */
+	Y2_IS_PTP_TIST	= 1<<6,		/* PTP Time Stamp (Yukon-Optima only) */
+	Y2_IS_PHY_QLNK	= 1<<5,		/* PHY Quick Link (Yukon-Optima only) */
+
 	Y2_IS_IRQ_PHY1	= 1<<4,		/* Interrupt from PHY 1 */
 	Y2_IS_IRQ_MAC1	= 1<<3,		/* Interrupt from MAC 1 */
 	Y2_IS_CHK_RX1	= 1<<2,		/* Descriptor error Rx 1 */
@@ -435,6 +546,7 @@ enum {
  	CHIP_ID_YUKON_FE_P = 0xb8, /* YUKON-2 FE+ */
 	CHIP_ID_YUKON_SUPR = 0xb9, /* YUKON-2 Supreme */
 	CHIP_ID_YUKON_UL_2 = 0xba, /* YUKON-2 Ultra 2 */
+	CHIP_ID_YUKON_OPT  = 0xbc, /* YUKON-2 Optima */
 };
 enum yukon_ec_rev {
 	CHIP_REV_YU_EC_A1    = 0,  /* Chip Rev. for Yukon-EC A1/A0 */
@@ -459,6 +571,8 @@ enum yukon_ex_rev {
 };
 enum yukon_supr_rev {
 	CHIP_REV_YU_SU_A0    = 0,
+	CHIP_REV_YU_SU_B0    = 1,
+	CHIP_REV_YU_SU_B1    = 3,
 };
 
 
@@ -513,6 +627,12 @@ enum {
 	TIM_T_STEP	= 1<<0,	/* Test step */
 };
 
+/*	Y2_PEX_PHY_ADDR/DATA		PEX PHY address and data reg  (Yukon-2 only) */
+enum {
+	PEX_RD_ACCESS	= 1<<31, /* Access Mode Read = 1, Write = 0 */
+	PEX_DB_ACCESS	= 1<<30, /* Access to debug register */
+};
+
 /*	B3_RAM_ADDR		32 bit	RAM Address, to read or write */
 					/* Bit 31..19:	reserved */
 #define RAM_ADR_RAN	0x0007ffffL	/* Bit 18.. 0:	RAM Address Range */
@@ -754,6 +874,42 @@ enum {
 	BMU_TX_CLR_IRQ_TCP	= 1<<11, /* Clear IRQ on TCP segment length mismatch */
 };
 
+/*	TBMU_TEST			0x06B8	Transmit BMU Test Register */
+enum {
+	TBMU_TEST_BMU_TX_CHK_AUTO_OFF		= 1<<31, /* BMU Tx Checksum Auto Calculation Disable */
+	TBMU_TEST_BMU_TX_CHK_AUTO_ON		= 1<<30, /* BMU Tx Checksum Auto Calculation Enable */
+	TBMU_TEST_HOME_ADD_PAD_FIX1_EN		= 1<<29, /* Home Address Padding FIX1 Enable */
+	TBMU_TEST_HOME_ADD_PAD_FIX1_DIS		= 1<<28, /* Home Address Padding FIX1 Disable */
+	TBMU_TEST_ROUTING_ADD_FIX_EN		= 1<<27, /* Routing Address Fix Enable */
+	TBMU_TEST_ROUTING_ADD_FIX_DIS		= 1<<26, /* Routing Address Fix Disable */
+	TBMU_TEST_HOME_ADD_FIX_EN		= 1<<25, /* Home address checksum fix enable */
+	TBMU_TEST_HOME_ADD_FIX_DIS		= 1<<24, /* Home address checksum fix disable */
+
+	TBMU_TEST_TEST_RSPTR_ON			= 1<<22, /* Testmode Shadow Read Ptr On */
+	TBMU_TEST_TEST_RSPTR_OFF		= 1<<21, /* Testmode Shadow Read Ptr Off */
+	TBMU_TEST_TESTSTEP_RSPTR		= 1<<20, /* Teststep Shadow Read Ptr */
+
+	TBMU_TEST_TEST_RPTR_ON			= 1<<18, /* Testmode Read Ptr On */
+	TBMU_TEST_TEST_RPTR_OFF			= 1<<17, /* Testmode Read Ptr Off */
+	TBMU_TEST_TESTSTEP_RPTR			= 1<<16, /* Teststep Read Ptr */
+
+	TBMU_TEST_TEST_WSPTR_ON			= 1<<14, /* Testmode Shadow Write Ptr On */
+	TBMU_TEST_TEST_WSPTR_OFF		= 1<<13, /* Testmode Shadow Write Ptr Off */
+	TBMU_TEST_TESTSTEP_WSPTR		= 1<<12, /* Teststep Shadow Write Ptr */
+
+	TBMU_TEST_TEST_WPTR_ON			= 1<<10, /* Testmode Write Ptr On */
+	TBMU_TEST_TEST_WPTR_OFF			= 1<<9, /* Testmode Write Ptr Off */
+	TBMU_TEST_TESTSTEP_WPTR			= 1<<8,			/* Teststep Write Ptr */
+
+	TBMU_TEST_TEST_REQ_NB_ON		= 1<<6, /* Testmode Req Nbytes/Addr On */
+	TBMU_TEST_TEST_REQ_NB_OFF		= 1<<5, /* Testmode Req Nbytes/Addr Off */
+	TBMU_TEST_TESTSTEP_REQ_NB		= 1<<4, /* Teststep Req Nbytes/Addr */
+
+	TBMU_TEST_TEST_DONE_IDX_ON		= 1<<2, /* Testmode Done Index On */
+	TBMU_TEST_TEST_DONE_IDX_OFF		= 1<<1, /* Testmode Done Index Off */
+	TBMU_TEST_TESTSTEP_DONE_IDX		= 1<<0,	/* Teststep Done Index */
+};
+
 /* Queue Prefetch Unit Offsets, use Y2_QADDR() to address (Yukon-2 only)*/
 /* PREF_UNIT_CTRL	32 bit	Prefetch Control register */
 enum {
@@ -1674,6 +1830,12 @@ enum {
 
 /*	RX_GMF_CTRL_T	32 bit	Rx GMAC FIFO Control/Test */
 enum {
+	RX_GCLKMAC_ENA	= 1<<31,	/* RX MAC Clock Gating Enable */
+	RX_GCLKMAC_OFF	= 1<<30,
+
+	RX_STFW_DIS	= 1<<29,	/* RX Store and Forward Disable */
+	RX_STFW_ENA	= 1<<28,
+
 	RX_TRUNC_ON	= 1<<27,  	/* enable  packet truncation */
 	RX_TRUNC_OFF	= 1<<26, 	/* disable packet truncation */
 	RX_VLAN_STRIP_ON = 1<<25,	/* enable  VLAN stripping */
@@ -1711,6 +1873,20 @@ enum {
 	GMF_RX_CTRL_DEF	= GMF_OPER_ON | GMF_RX_F_FL_ON,
 };
 
+/*	RX_GMF_FL_CTRL	16 bit	Rx GMAC FIFO Flush Control (Yukon-Supreme) */
+enum {
+	RX_IPV6_SA_MOB_ENA	= 1<<9,	/* IPv6 SA Mobility Support Enable */
+	RX_IPV6_SA_MOB_DIS	= 1<<8,	/* IPv6 SA Mobility Support Disable */
+	RX_IPV6_DA_MOB_ENA	= 1<<7,	/* IPv6 DA Mobility Support Enable */
+	RX_IPV6_DA_MOB_DIS	= 1<<6,	/* IPv6 DA Mobility Support Disable */
+	RX_PTR_SYNCDLY_ENA	= 1<<5,	/* Pointers Delay Synch Enable */
+	RX_PTR_SYNCDLY_DIS	= 1<<4,	/* Pointers Delay Synch Disable */
+	RX_ASF_NEWFLAG_ENA	= 1<<3,	/* RX ASF Flag New Logic Enable */
+	RX_ASF_NEWFLAG_DIS	= 1<<2,	/* RX ASF Flag New Logic Disable */
+	RX_FLSH_MISSPKT_ENA	= 1<<1,	/* RX Flush Miss-Packet Enable */
+	RX_FLSH_MISSPKT_DIS	= 1<<0,	/* RX Flush Miss-Packet Disable */
+};
+
 /*	TX_GMF_EA		32 bit	Tx GMAC FIFO End Address */
 enum {
 	TX_DYN_WM_ENA	= 3,	/* Yukon-FE+ specific */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/tg3.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/tg3.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/tg3.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/tg3.c	2015-01-21 12:02:42.665241214 +0300
@@ -237,6 +237,11 @@ static int tg3_debug = -1;	/* -1 == use 
 module_param(tg3_debug, int, 0);
 MODULE_PARM_DESC(tg3_debug, "Tigon3 bitmapped debugging message enable value");
 
+static int tg3_pcie_mrrs_boost;
+module_param(tg3_pcie_mrrs_boost, int, S_IRUGO);
+MODULE_PARM_DESC(tg3_pcie_mrrs_boost, "Increase Tigon3 PCI MRRS "
+				      "(can result in a performance boost)");
+
 #define TG3_DRV_DATA_FLAG_10_100_ONLY	0x0001
 #define TG3_DRV_DATA_FLAG_5705_10_100	0x0002
 
@@ -8903,6 +8908,10 @@ static void tg3_restore_pci_state(struct
 
 	pci_write_config_word(tp->pdev, PCI_COMMAND, tp->pci_cmd);
 
+	if (tg3_pcie_mrrs_boost && tg3_flag(tp, PCI_EXPRESS) &&
+	    tg3_asic_rev(tp) != ASIC_REV_5785)
+		pcie_set_readrq(tp->pdev, tp->pcie_readrq);
+
 	if (!tg3_flag(tp, PCI_EXPRESS)) {
 		pci_write_config_byte(tp->pdev, PCI_CACHE_LINE_SIZE,
 				      tp->pci_cacheline_sz);
@@ -9139,6 +9148,9 @@ static int tg3_chip_reset(struct tg3 *tp
 			val16 |= PCI_EXP_DEVCTL_PAYLOAD;
 		pcie_capability_clear_word(tp->pdev, PCI_EXP_DEVCTL, val16);
 
+		if (tg3_pcie_mrrs_boost)
+			pcie_set_readrq(tp->pdev, tp->pcie_readrq);
+
 		/* Clear error status */
 		pcie_capability_write_word(tp->pdev, PCI_EXP_DEVSTA,
 				      PCI_EXP_DEVSTA_CED |
@@ -16381,6 +16393,14 @@ static int __devinit tg3_get_invariants(
 
 		tg3_flag_set(tp, PCI_EXPRESS);
 
+		if (tg3_pcie_mrrs_boost) {
+			tp->pcie_readrq = 4096;
+			if (tg3_asic_rev(tp) == ASIC_REV_5719 ||
+			    tg3_asic_rev(tp) == ASIC_REV_5720)
+				tp->pcie_readrq = 2048;
+			pcie_set_readrq(tp->pdev, tp->pcie_readrq);
+		}
+
 		pcie_capability_read_word(tp->pdev, PCI_EXP_LNKCTL, &lnkctl);
 		if (lnkctl & PCI_EXP_LNKCTL_CLKREQ_EN) {
 			if (tg3_asic_rev(tp) == ASIC_REV_5906) {
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/tun.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/tun.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/tun.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/tun.c	2015-01-21 12:02:48.012099260 +0300
@@ -61,6 +61,7 @@
 #include <linux/crc32.h>
 #include <linux/nsproxy.h>
 #include <linux/virtio_net.h>
+#include <linux/file.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
@@ -69,6 +70,9 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
 
@@ -453,6 +457,10 @@ static void tun_poll_controller(struct n
 	return;
 }
 #endif
+
+static void tun_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx);
+
 static const struct net_device_ops tun_netdev_ops = {
 	.ndo_uninit		= tun_net_uninit,
 	.ndo_open		= tun_net_open,
@@ -462,6 +470,7 @@ static const struct net_device_ops tun_n
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= tun_poll_controller,
 #endif
+	.ndo_cpt		= tun_cpt,
 };
 
 static const struct net_device_ops tap_netdev_ops = {
@@ -476,6 +485,7 @@ static const struct net_device_ops tap_n
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= tun_poll_controller,
 #endif
+	.ndo_cpt		= tun_cpt,
 };
 
 /* Initialize net device. */
@@ -557,12 +567,8 @@ static struct sk_buff *tun_alloc_skb(str
 
 	sock_update_classid(sk);
 
-	/* Under a page?  Don't bother with paged skb. */
-	if (prepad + len < PAGE_SIZE || !linear)
-		linear = len;
-
-	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   &err);
+	linear = len;
+	skb = sock_alloc_send_skb(sk, prepad + linear, noblock, &err);
 	if (!skb)
 		return ERR_PTR(err);
 
@@ -904,6 +910,7 @@ static void tun_setup(struct net_device 
 
 	dev->ethtool_ops = &tun_ethtool_ops;
 	dev->destructor = tun_free_netdev;
+	dev->vz_features |= NETIF_F_VIRTUAL;
 }
 
 /* Trivial set of netlink ops to allow deleting tun or tap
@@ -1072,7 +1079,7 @@ static int tun_set_iff(struct net *net, 
 
 		if (((tun->owner != -1 && cred->euid != tun->owner) ||
 		     (tun->group != -1 && !in_egroup_p(tun->group))) &&
-		    !capable(CAP_NET_ADMIN))
+		    !capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			return -EPERM;
 		err = security_tun_dev_open(tun->security);
 		if (err < 0)
@@ -1086,7 +1093,7 @@ static int tun_set_iff(struct net *net, 
 		char *name;
 		unsigned long flags = 0;
 
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			return -EPERM;
 		err = security_tun_dev_create();
 		if (err < 0)
@@ -1141,9 +1148,10 @@ static int tun_set_iff(struct net *net, 
 		if (err < 0)
 			goto err_free_dev;
 
-		if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
-		    device_create_file(&tun->dev->dev, &dev_attr_owner) ||
-		    device_create_file(&tun->dev->dev, &dev_attr_group))
+		if ((dev_net(tun->dev) == &init_net) &&
+			(device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
+			device_create_file(&tun->dev->dev, &dev_attr_owner) ||
+			device_create_file(&tun->dev->dev, &dev_attr_group)))
 			printk(KERN_ERR "Failed to create tun sysfs files\n");
 
 		err = tun_attach(tun, file);
@@ -1622,6 +1630,223 @@ static const struct ethtool_ops tun_etht
 	.set_rx_csum	= tun_set_rx_csum
 };
 
+static void cpt_dump_tap_filter(struct tap_filter *flt,
+		struct cpt_ops *ops, struct cpt_context *ctx)
+{
+	struct cpt_tap_filter_image v;
+	loff_t saved_obj;
+
+	ops->push_object(&saved_obj, ctx);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_TAP_FILTER;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_VOID;
+
+	v.cpt_count = flt->count;
+
+	BUILD_BUG_ON(sizeof(flt->mask) != sizeof(v.cpt_mask));
+	memcpy(v.cpt_mask, flt->mask, sizeof(v.cpt_mask));
+
+	BUILD_BUG_ON(sizeof(flt->addr) != sizeof(v.cpt_addr));
+	memcpy(v.cpt_addr, flt->addr, sizeof(v.cpt_addr));
+
+	ops->write(&v, sizeof(v), ctx);
+
+	ops->pop_object(&saved_obj, ctx);
+}
+
+static void tun_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx)
+{
+	struct cpt_tuntap_image v;
+	struct tun_struct *tun;
+
+	tun = netdev_priv(dev);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_TUNTAP;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_VOID;
+
+	v.cpt_owner = tun->owner;
+	v.cpt_flags = tun->flags;
+	v.cpt_bindfile = 0;
+
+	if (tun->tfile)
+		v.cpt_bindfile = ops->lookup_object(CPT_OBJ_FILE, tun->tfile->socket.file, ctx);
+
+	v.cpt_if_flags = 0;
+	memset(v.cpt_dev_addr, 0, sizeof(v.cpt_dev_addr));
+	memset(v.cpt_chr_filter, 0, sizeof(v.cpt_chr_filter));
+	memset(v.cpt_net_filter, 0, sizeof(v.cpt_net_filter));
+
+	ops->write(&v, sizeof(v), ctx);
+
+	cpt_dump_tap_filter(&tun->txflt, ops, ctx);
+}
+
+static int rst_restore_tap_filter(loff_t start, struct cpt_tuntap_image *ti,
+			struct tap_filter *flt, struct rst_ops *ops,
+			struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_tap_filter_image fi;
+	loff_t pos;
+
+	/* disable filtering */
+	flt->count = 0;
+
+	pos = start + ti->cpt_hdrlen;
+
+	/* no tap filter image? */
+	if (pos >= start + ti->cpt_next)
+		goto convert;
+
+	err = ops->get_object(CPT_OBJ_NET_TAP_FILTER, pos,
+			&fi, sizeof(fi), ctx);
+	if (err)
+		return err;
+
+	BUILD_BUG_ON(sizeof(flt->mask) != sizeof(fi.cpt_mask));
+	memcpy(flt->mask, fi.cpt_mask, sizeof(fi.cpt_mask));
+
+	BUILD_BUG_ON(sizeof(flt->addr) != sizeof(fi.cpt_addr));
+	memcpy(flt->addr, fi.cpt_addr, sizeof(fi.cpt_addr));
+
+	flt->count = fi.cpt_count;
+
+	return 0;
+
+convert:
+	/* From OLD filtering code:
+	 * Decide whether to accept this packet. This code is designed to
+	 * behave identically to an Ethernet interface. Accept the packet if
+	 * - we are promiscuous.
+	 * - the packet is addressed to us.
+	 * - the packet is broadcast.
+	 * - the packet is multicast and
+	 *   - we are multicast promiscuous.
+	 *   - we belong to the multicast group.
+	 */
+
+	/* accept all, this is default if filter is untouched */
+	if (ti->cpt_if_flags & IFF_PROMISC)
+		return 0;
+
+	/* accept packets addressed to character device's hardware address */
+	BUILD_BUG_ON(sizeof(flt->addr[0]) != sizeof(ti->cpt_dev_addr));
+	memcpy(flt->addr[0], ti->cpt_dev_addr, sizeof(ti->cpt_dev_addr));
+
+	/* accept broadcast */
+	memset(flt->addr[1], ~0, sizeof(flt->addr[1]));
+
+	/* accept hashed multicast: hash function the same as in old code */
+	BUILD_BUG_ON(sizeof(flt->mask) != sizeof(ti->cpt_chr_filter));
+	memcpy(flt->mask, ti->cpt_chr_filter, sizeof(ti->cpt_chr_filter));
+
+	/* accept all multicast */
+	if (ti->cpt_if_flags & IFF_ALLMULTI)
+		memset(flt->mask, ~0, sizeof(flt->mask));
+
+	/* two exact filters: hw addr and broadcast */
+	flt->count = 2;
+
+	return 0;
+}
+
+static int tun_rst(loff_t start, struct cpt_netdev_image *di,
+		struct rst_ops *ops, struct cpt_context *ctx)
+{
+	int err = -ENODEV;
+	struct cpt_tuntap_image ti;
+	struct net_device *dev;
+	struct file *bind_file = NULL;
+	struct tun_struct *tun;
+	struct net *net = current->nsproxy->net_ns;
+	loff_t pos;
+
+	pos = start + di->cpt_hdrlen;
+	err = ops->get_object(CPT_OBJ_NET_TUNTAP, pos,
+			&ti, sizeof(ti), ctx);
+	if (err)
+		return err;
+
+	if (ti.cpt_bindfile) {
+		bind_file = ops->rst_file(ti.cpt_bindfile, -1, ctx);
+		if (IS_ERR(bind_file))
+			return PTR_ERR(bind_file);
+
+		if (bind_file->private_data == NULL) {
+			err = tun_chr_open(NULL, bind_file);
+			if (err)
+				goto out_tf;
+		}
+	}
+
+	err = -ENOMEM;
+	dev = alloc_netdev(sizeof(struct tun_struct), di->cpt_name, tun_setup);
+	if (!dev)
+		goto out_tf;
+
+	dev_net_set(dev, net);
+	dev->rtnl_link_ops = &tun_link_ops;
+
+	tun = netdev_priv(dev);
+
+	tun->dev = dev;
+	tun->owner = ti.cpt_owner;
+	tun->flags = ti.cpt_flags;
+	tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+
+	tun->sndbuf = INT_MAX;
+
+	tun_net_init(dev);
+
+	err = rst_restore_tap_filter(pos, &ti, &tun->txflt, ops, ctx);
+	if (err < 0)
+		goto out_sk;
+
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto out_sk;
+
+	pos += ti.cpt_next;
+	if (pos < start + di->cpt_next) {
+		struct cpt_hwaddr_image hw;
+		/* Restore hardware address */
+		err = ops->get_object(CPT_OBJ_NET_HWADDR, pos,
+				&hw, sizeof(hw), ctx);
+		if (err)
+			goto out_unreg;
+
+		memcpy(dev->dev_addr, hw.cpt_dev_addr,
+				sizeof(hw.cpt_dev_addr));
+	}
+
+	if (bind_file) {
+		err = tun_attach(tun, bind_file);
+		if (err)
+			goto out_unreg;
+		fput(bind_file);
+	}
+
+	return 0;
+
+out_unreg:
+	unregister_netdevice(dev);
+out_sk:
+	tun_free_netdev(dev);
+out_tf:
+	if (bind_file)
+		fput(bind_file);
+	return err;
+}
+
+static struct netdev_rst tun_netdev_rst = {
+	.cpt_object = CPT_OBJ_NET_TUNTAP,
+	.ndo_rst = tun_rst,
+};
 
 static int __init tun_init(void)
 {
@@ -1641,6 +1866,8 @@ static int __init tun_init(void)
 		printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
 		goto err_misc;
 	}
+
+	register_netdev_rst(&tun_netdev_rst);
 	return  0;
 err_misc:
 	rtnl_link_unregister(&tun_link_ops);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/veip_mgmt.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/veip_mgmt.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/veip_mgmt.c	2015-01-21 12:02:45.178174496 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/veip_mgmt.c	2015-01-21 12:02:45.675161301 +0300
@@ -0,0 +1,173 @@
+/*
+ *  veip_mgmt.c
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+/*
+ * Virtual Networking device used to change VE ownership on packets
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <linux/inet.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <linux/venet.h>
+
+static void veip_free(struct veip_struct *veip)
+{
+	kfree(veip);
+}
+
+static void veip_release(struct ve_struct *ve)
+{
+	struct veip_struct *veip;
+
+	veip = ve->veip;
+	ve->veip = NULL;
+	barrier();
+	veip_put(veip);
+}
+
+static int veip_create(struct ve_struct *ve)
+{
+	struct veip_struct *veip;
+
+	veip = veip_findcreate(ve->veid);
+	if (veip == NULL)
+		return -ENOMEM;
+
+	ve->veip = veip;
+	return 0;
+}
+
+static int skb_extract_addr(struct sk_buff *skb,
+		struct ve_addr_struct *addr, int dir)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		addr->family = AF_INET;
+		addr->key[0] = 0;
+		addr->key[1] = 0;
+		addr->key[2] = 0;
+		addr->key[3] = (dir ? ip_hdr(skb)->daddr : ip_hdr(skb)->saddr);
+		return 0;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case __constant_htons(ETH_P_IPV6):
+		addr->family = AF_INET6;
+		memcpy(&addr->key, dir ?
+				ipv6_hdr(skb)->daddr.s6_addr32 :
+				ipv6_hdr(skb)->saddr.s6_addr32,
+				sizeof(addr->key));
+		return 0;
+#endif
+	}
+
+	return -EAFNOSUPPORT;
+}
+
+static struct ve_struct *venet_find_ve(struct ve_addr_struct *addr, int dir)
+{
+	struct ip_entry_struct *entry;
+	struct ve_struct *ve = NULL;
+
+	entry = venet_entry_lookup(addr);
+	if (entry != NULL)
+		ve = ACCESS_ONCE(entry->active_env);
+
+	return ve;
+}
+
+static struct ve_struct *veip_lookup(struct sk_buff *skb)
+{
+	struct ve_struct *ve, *ve_old;
+	int dir;
+	struct ve_addr_struct addr;
+
+	ve_old = skb->owner_env;
+	dir = ve_is_super(ve_old);
+	if (skb_extract_addr(skb, &addr, dir) < 0)
+		goto out_drop_nolock;
+
+	rcu_read_lock();
+	if (!dir) {
+		/* from VE to host */
+		ve = venet_find_ve(&addr, 0);
+		if (ve == NULL) {
+			if (!venet_ext_lookup(ve_old, &addr))
+				goto out_drop;
+		} else {
+			if (!ve_accessible_strict(ve, ve_old))
+				goto out_source;
+		}
+
+		ve = get_ve0();
+	} else {
+		/* from host to VE */
+		ve = venet_find_ve(&addr, 1);
+		if (ve == NULL)
+			goto out_drop;
+	}
+	rcu_read_unlock();
+
+	return ve;
+
+out_drop:
+	rcu_read_unlock();
+out_drop_nolock:
+	return ERR_PTR(-ESRCH);
+
+out_source:
+	rcu_read_unlock();
+	if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) {
+		printk(KERN_WARNING "Dropped packet, source wrong "
+		       "veid=%u src-IP=%u.%u.%u.%u "
+		       "dst-IP=%u.%u.%u.%u\n",
+		       skb->owner_env->veid,
+		       NIPQUAD(ip_hdr(skb)->saddr),
+		       NIPQUAD(ip_hdr(skb)->daddr));
+	}
+	return ERR_PTR(-EACCES);
+}
+
+void veip_cleanup(void)
+{
+	int i;
+	struct veip_struct *veip;
+
+	spin_lock(&veip_lock);
+	for (i = 0; i < VEIP_HASH_SZ; i++)
+		while (!hlist_empty(ip_entry_hash_table + i)) {
+			struct ip_entry_struct *entry;
+
+			entry = hlist_entry(ip_entry_hash_table[i].first,
+					struct ip_entry_struct, ip_hash);
+			hlist_del(&entry->ip_hash);
+			list_del(&entry->ve_list);
+			kfree(entry);
+		}
+
+	/* vzredir may leave some veip-s behind */
+	while (!list_empty(&veip_lh)) {
+		veip = list_first_entry(&veip_lh, struct veip_struct, list);
+		veip_put(veip);
+	}
+	spin_unlock(&veip_lock);
+}
+
+static struct veip_pool_ops open_pool_ops = {
+	.veip_create = veip_create,
+	.veip_release = veip_release,
+	.veip_free = veip_free,
+	.veip_lookup = veip_lookup,
+};
+
+struct veip_pool_ops *veip_pool_ops = &open_pool_ops;
+EXPORT_SYMBOL(veip_pool_ops);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/venetdev.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/venetdev.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/venetdev.c	2015-01-21 12:02:45.179174469 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/venetdev.c	2015-01-21 12:02:51.373010036 +0300
@@ -0,0 +1,1131 @@
+/*
+ *  venetdev.c
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+/*
+ * Common part for Virtuozzo virtual network devices
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/tcp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/addrconf.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/if_ether.h>	/* For the statistics structure. */
+#include <linux/if_arp.h>	/* For ARPHRD_ETHER */
+#include <linux/ethtool.h>
+#include <linux/venet.h>
+#include <linux/ve_proto.h>
+#include <linux/vzctl.h>
+#include <linux/vzctl_venet.h>
+
+struct hlist_head ip_entry_hash_table[VEIP_HASH_SZ];
+DEFINE_SPINLOCK(veip_lock);
+LIST_HEAD(veip_lh);
+
+#define ip_entry_hash_function(ip)  (ntohl(ip) & (VEIP_HASH_SZ - 1))
+
+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip)
+{
+	hlist_add_head_rcu(&entry->ip_hash,
+			ip_entry_hash_table +
+			ip_entry_hash_function(entry->addr.key[3]));
+	list_add(&entry->ve_list, &veip->ip_lh);
+}
+
+static void ip_entry_free(struct rcu_head *rcu)
+{
+	struct ip_entry_struct *e;
+
+	e = container_of(rcu, struct ip_entry_struct, rcu);
+	kfree(e);
+}
+
+void ip_entry_unhash(struct ip_entry_struct *entry)
+{
+	list_del(&entry->ve_list);
+	hlist_del_rcu(&entry->ip_hash);
+	call_rcu(&entry->rcu, ip_entry_free);
+}
+
+static void veip_free(struct rcu_head *rcu)
+{
+	struct veip_struct *veip;
+
+	veip = container_of(rcu, struct veip_struct, rcu);
+	veip_pool_ops->veip_free(veip);
+}
+
+int veip_put(struct veip_struct *veip)
+{
+	if (!list_empty(&veip->ip_lh))
+		return 0;
+	if (!list_empty(&veip->src_lh))
+		return 0;
+	if (!list_empty(&veip->dst_lh))
+		return 0;
+
+	list_del(&veip->list);
+	call_rcu(&veip->rcu, veip_free);
+	return 1;
+}
+
+struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *entry;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_rcu(entry, n, ip_entry_hash_table +
+			ip_entry_hash_function(addr->key[3]), ip_hash)
+		if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0)
+			return entry;
+	return NULL;
+}
+
+struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve,
+		struct ve_addr_struct *addr)
+{
+	struct ext_entry_struct *entry;
+	struct veip_struct *veip;
+
+	veip = ACCESS_ONCE(ve->veip);
+	if (veip == NULL)
+		return NULL;
+
+	list_for_each_entry_rcu (entry, &veip->ext_lh, list)
+		if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0)
+			return entry;
+	return NULL;
+}
+
+static int venet_ext_add(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ext_entry_struct *entry, *found;
+	int err;
+
+	if (ve->veip == NULL)
+		return -ENONET;
+
+	entry = kzalloc(sizeof(struct ext_entry_struct), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	spin_lock(&veip_lock);
+	err = -EADDRINUSE;
+	found = venet_ext_lookup(ve, addr);
+	if (found != NULL)
+		goto out_unlock;
+
+	entry->addr = *addr;
+	list_add_rcu(&entry->list, &ve->veip->ext_lh);
+	err = 0;
+	entry = NULL;
+out_unlock:
+	spin_unlock(&veip_lock);
+	if (entry != NULL)
+		kfree(entry);
+	return err;
+}
+
+static void venet_ext_free(struct rcu_head *rcu)
+{
+	struct ext_entry_struct *e;
+
+	e = container_of(rcu, struct ext_entry_struct, rcu);
+	kfree(e);
+}
+
+static void venet_ext_release(struct ext_entry_struct *e)
+{
+	list_del_rcu(&e->list);
+	call_rcu(&e->rcu, venet_ext_free);
+}
+
+static int venet_ext_del(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ext_entry_struct *found;
+	int err;
+
+	if (ve->veip == NULL)
+		return -ENONET;
+
+	err = -EADDRNOTAVAIL;
+	spin_lock(&veip_lock);
+	found = venet_ext_lookup(ve, addr);
+	if (found == NULL)
+		goto out;
+
+	venet_ext_release(found);
+	err = 0;
+out:
+	spin_unlock(&veip_lock);
+	return err;
+}
+
+static void venet_ext_clean(struct ve_struct *ve)
+{
+	struct ext_entry_struct *entry, *tmp;
+
+	if (ve->veip == NULL)
+		return;
+
+	spin_lock(&veip_lock);
+	list_for_each_entry_safe (entry, tmp, &ve->veip->ext_lh, list)
+		venet_ext_release(entry);
+	spin_unlock(&veip_lock);
+}
+
+struct veip_struct *veip_find(envid_t veid)
+{
+	struct veip_struct *ptr;
+
+	list_for_each_entry(ptr, &veip_lh, list) {
+		if (ptr->veid != veid)
+			continue;
+		return ptr;
+	}
+	return NULL;
+}
+
+struct veip_struct *veip_findcreate(envid_t veid)
+{
+	struct veip_struct *ptr;
+
+	ptr = veip_find(veid);
+	if (ptr != NULL)
+		return ptr;
+
+	ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC);
+	if (ptr == NULL)
+		return NULL;
+	memset(ptr, 0, sizeof(struct veip_struct));
+	INIT_LIST_HEAD(&ptr->ip_lh);
+	INIT_LIST_HEAD(&ptr->src_lh);
+	INIT_LIST_HEAD(&ptr->dst_lh);
+	INIT_LIST_HEAD(&ptr->ext_lh);
+	ptr->veid = veid;
+	list_add(&ptr->list, &veip_lh);
+	return ptr;
+}
+
+static int veip_start(struct ve_struct *ve)
+{
+	int err, get;
+
+	spin_lock(&veip_lock);
+
+	get = ve->veip == NULL;
+	err = veip_pool_ops->veip_create(ve);
+	if (!err && get && !ve_is_super(ve))
+		__module_get(THIS_MODULE);
+
+	spin_unlock(&veip_lock);
+
+	return err;
+}
+
+static void veip_stop(struct ve_struct *ve)
+{
+	struct list_head *p, *tmp;
+
+	spin_lock(&veip_lock);
+	if (ve->veip == NULL)
+		goto unlock;
+	list_for_each_safe(p, tmp, &ve->veip->ip_lh) {
+		struct ip_entry_struct *ptr;
+		ptr = list_entry(p, struct ip_entry_struct, ve_list);
+		ptr->active_env = NULL;
+
+		if (ptr->tgt_veip == NULL)
+			ip_entry_unhash(ptr);
+	}
+
+	veip_pool_ops->veip_release(ve);
+	if (!ve_is_super(ve))
+		module_put(THIS_MODULE);
+unlock:
+	spin_unlock(&veip_lock);
+}
+
+static int veip_entry_conflict(struct ip_entry_struct *entry, struct ve_struct *ve)
+{
+	if (entry->active_env != NULL)
+		return -EADDRINUSE;
+	if (entry->tgt_veip && entry->tgt_veip->veid != ve->veid)
+		return -EADDRNOTAVAIL;
+
+	entry->active_env = ve;
+	return 0;
+}
+
+static int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *entry, *found;
+	int err;
+
+	entry = kzalloc(sizeof(struct ip_entry_struct), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	if (ve->veip == NULL) {
+		/* This can happen if we load venet AFTER ve was started */
+		err = veip_start(ve);
+		if (err < 0)
+			goto out;
+	}
+
+	spin_lock(&veip_lock);
+	found = venet_entry_lookup(addr);
+	if (found != NULL) {
+		err = veip_entry_conflict(found, ve);
+		goto out_unlock;
+	}
+
+	entry->active_env = ve;
+	entry->addr = *addr;
+	ip_entry_hash(entry, ve->veip);
+
+	err = 0;
+	entry = NULL;
+out_unlock:
+	spin_unlock(&veip_lock);
+out:
+	if (entry != NULL)
+		kfree(entry);
+
+	return err;
+}
+
+static int veip_entry_del(envid_t veid, struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *found;
+	int err;
+
+	err = -EADDRNOTAVAIL;
+	spin_lock(&veip_lock);
+	found = venet_entry_lookup(addr);
+	if (found == NULL)
+		goto out;
+	if (found->active_env == NULL)
+		goto out;
+	if (found->active_env->veid != veid)
+		goto out;
+
+	err = 0;
+	found->active_env = NULL;
+
+	if (found->tgt_veip == NULL)
+		ip_entry_unhash(found);
+out:
+	spin_unlock(&veip_lock);
+	return err;
+}
+
+static int convert_sockaddr(struct sockaddr *addr, int addrlen,
+		struct ve_addr_struct *veaddr)
+{
+	int err;
+
+	switch (addr->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin;
+
+		err = -EINVAL;
+		if (addrlen != sizeof(struct sockaddr_in))
+			break;
+
+		err = 0;
+		sin = (struct sockaddr_in *)addr;
+		veaddr->family = AF_INET;
+		veaddr->key[0] = 0;
+		veaddr->key[1] = 0;
+		veaddr->key[2] = 0;
+		veaddr->key[3] = sin->sin_addr.s_addr;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sin;
+
+		err = -EINVAL;
+		if (addrlen != sizeof(struct sockaddr_in6))
+			break;
+
+		err = 0;
+		sin = (struct sockaddr_in6 *)addr;
+		veaddr->family = AF_INET6;
+		memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key));
+		break;
+	}
+	default:
+		err = -EAFNOSUPPORT;
+	}
+	return err;
+}
+
+int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
+		struct ve_addr_struct *veaddr)
+{
+	int err;
+	char addr[MAX_SOCK_ADDR];
+
+	err = move_addr_to_kernel(uaddr, addrlen, (struct sockaddr *)&addr);
+	if (err < 0)
+		goto out;
+
+	err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr);
+out:
+	return err;
+}
+
+void veaddr_print(char *str, int len, struct ve_addr_struct *a)
+{
+	if (a->family == AF_INET)
+		snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3]));
+	else
+		snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x",
+				ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF,
+				ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF,
+				ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF,
+				ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF
+			);
+}
+
+/*
+ * Device functions
+ */
+
+static int venet_open(struct net_device *dev)
+{
+	if (!ve_is_super(get_exec_env()) && !try_module_get(THIS_MODULE))
+		return -EBUSY;
+	return 0;
+}
+
+static int venet_close(struct net_device *master)
+{
+	if (!ve_is_super(get_exec_env()))
+		module_put(THIS_MODULE);
+	return 0;
+}
+
+static void venet_destructor(struct net_device *dev)
+{
+	struct venet_stats *stats = (struct venet_stats *)dev->ml_priv;
+	if (stats == NULL)
+		return;
+	free_percpu(stats->real_stats);
+	kfree(stats);
+	dev->ml_priv = NULL;
+}
+
+/*
+ * The higher levels take care of making this non-reentrant (it's
+ * called with bh's disabled).
+ */
+static int venet_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device_stats *stats;
+	struct net_device *rcv = NULL;
+	struct ve_struct *ve;
+	int length;
+
+	stats = venet_stats(dev, smp_processor_id());
+	ve = get_exec_env();
+	if (unlikely(ve->disable_net))
+		goto outf;
+
+	if (skb->protocol == __constant_htons(ETH_P_IP)) {
+		struct iphdr *iph;
+		iph = ip_hdr(skb);
+		if (ipv4_is_multicast(iph->daddr))
+			goto outf;
+	} else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h;
+		ip6h = ipv6_hdr(skb);
+		if (ipv6_addr_is_multicast(&ip6h->daddr))
+			goto outf;
+		skb_orphan(skb);
+	} else {
+		goto outf;
+	}
+
+	ve = veip_pool_ops->veip_lookup(skb);
+	if (IS_ERR(ve))
+		goto outf;
+
+	skb->owner_env = ve;
+	if (unlikely(ve->disable_net))
+		goto outf;
+
+	rcv = ve->_venet_dev;
+	if (!rcv)
+		/* VE going down */
+		goto outf;
+
+	dev_hold(rcv);
+
+	if (!(rcv->flags & IFF_UP)) {
+		/* Target VE does not want to receive packets */
+		dev_put(rcv);
+		goto outf;
+	}
+
+	skb->pkt_type = PACKET_HOST;
+	skb->dev = rcv;
+
+	/*
+	 * If there is not enough space for the L2 header, allocate one.
+	 * Remember that traffic can reach a VE from the outside world,
+	 * in which case we have to clean up the MAC address of such a
+	 * packet. The same applies to traffic coming from inside a VE:
+	 * if TUN is used and the traffic gets fragmented, we may reach a
+	 * point where there is no L2 header at all and hard_header_len
+	 * is simply ignored (that parameter is only a hint for the upper
+	 * net layers, never a guarantee that a header will be provided).
+	 * To unify how packets look after venet, we always produce an L2
+	 * header with a zeroed MAC.
+	 */
+	if (unlikely(skb_headroom(skb) < dev->hard_header_len)) {
+		struct sk_buff *skb2;
+
+		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+		if (!skb2) {
+			dev_put(rcv);
+			goto outf;
+		}
+		if (skb->sk)
+			skb_set_owner_w(skb2, skb->sk);
+		kfree_skb(skb);
+		skb = skb2;
+	}
+
+	skb_reset_mac_header(skb);
+	memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len);
+
+	nf_reset(skb);
+	length = skb->len;
+
+	netif_rx(skb);
+
+	stats->tx_bytes += length;
+	stats->tx_packets++;
+	if (rcv) {
+		struct net_device_stats *rcv_stats;
+
+		rcv_stats = venet_stats(rcv, smp_processor_id());
+		rcv_stats->rx_bytes += length;
+		rcv_stats->rx_packets++;
+		dev_put(rcv);
+	}
+
+	return 0;
+
+outf:
+	kfree_skb(skb);
+	++stats->tx_dropped;
+	return 0;
+}
+
+static struct net_device_stats *get_stats(struct net_device *dev)
+{
+	int i;
+	struct venet_stats *stats;
+
+	stats = (struct venet_stats *)dev->ml_priv;
+	memset(&stats->stats, 0, sizeof(struct net_device_stats));
+	for_each_possible_cpu(i) {
+		struct net_device_stats *dev_stats;
+
+		dev_stats = venet_stats(dev, i);
+		stats->stats.rx_bytes   += dev_stats->rx_bytes;
+		stats->stats.tx_bytes   += dev_stats->tx_bytes;
+		stats->stats.rx_packets += dev_stats->rx_packets;
+		stats->stats.tx_packets += dev_stats->tx_packets;
+		stats->stats.tx_dropped += dev_stats->tx_dropped;
+	}
+
+	return &stats->stats;
+}
+
+/* Initialize the rest of the venet device. */
+int venet_init_dev(struct net_device *dev)
+{
+	struct venet_stats *stats;
+
+	stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL);
+	if (stats == NULL)
+		goto fail;
+	stats->real_stats = alloc_percpu(struct net_device_stats);
+	if (stats->real_stats == NULL)
+		goto fail_free;
+	dev->ml_priv = stats;
+
+	/*
+	 *	Fill in the generic fields of the device structure.
+	 */
+	dev->type		= ARPHRD_VOID;
+	dev->hard_header_len	= ETH_HLEN;
+	dev->mtu		= 1500; /* eth_mtu */
+	dev->tx_queue_len	= 0;
+
+	memset(dev->broadcast, 0xFF, ETH_ALEN);
+
+	/* New-style flags. */
+	dev->flags		= IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT;
+	return 0;
+
+fail_free:
+	kfree(stats);
+fail:
+	return -ENOMEM;
+}
+
+static const struct net_device_ops venet_netdev_ops;
+
+static int
+venet_set_op(struct net_device *dev, u32 data,
+	     int (*fop)(struct net_device *, u32))
+{
+	struct net_device *nd;
+	struct net *net;
+	int ret = 0;
+
+	for_each_net(net) {
+		for_each_netdev(net, nd) {
+			/* no rollback here! */
+			if (nd->netdev_ops == &venet_netdev_ops)
+				ret |= fop(nd, data);
+		}
+	}
+	return ret;
+}
+
+static unsigned long common_features;
+
+static int venet_op_set_sg(struct net_device *dev, u32 data)
+{
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	if (data)
+		common_features |= NETIF_F_SG;
+	else
+		common_features &= ~NETIF_F_SG;
+
+	return venet_set_op(dev, data, ethtool_op_set_sg);
+}
+
+static int venet_op_set_tx_csum(struct net_device *dev, u32 data)
+{
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	if (data)
+		common_features |= NETIF_F_IP_CSUM;
+	else
+		common_features &= ~NETIF_F_IP_CSUM;
+
+	return venet_set_op(dev, data, ethtool_op_set_tx_csum);
+}
+
+static int
+venet_op_set_tso(struct net_device *dev, u32 data)
+{
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	if (data)
+		common_features |= NETIF_F_TSO;
+	else
+		common_features &= ~NETIF_F_TSO;
+
+	return venet_set_op(dev, data, ethtool_op_set_tso);
+}
+
+#define venet_op_set_rx_csum venet_op_set_tx_csum
+
+static struct ethtool_ops venet_ethtool_ops = {
+	.get_sg = ethtool_op_get_sg,
+	.set_sg = venet_op_set_sg,
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.set_tx_csum = venet_op_set_tx_csum,
+	.get_rx_csum = ethtool_op_get_tx_csum,
+	.set_rx_csum = venet_op_set_rx_csum,
+	.get_tso = ethtool_op_get_tso,
+	.set_tso = venet_op_set_tso,
+};
+
+static void venet_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx)
+{
+}
+
+static const struct net_device_ops venet_netdev_ops = {
+	.ndo_start_xmit = venet_xmit,
+	.ndo_get_stats = get_stats,
+	.ndo_open = venet_open,
+	.ndo_stop = venet_close,
+	.ndo_init = venet_init_dev,
+	.ndo_cpt = venet_cpt,
+};
+
+static void venet_setup(struct net_device *dev)
+{
+	/*
+	 * No other features, as they are:
+	 *  - checksumming is required, and nobody else will do our job
+	 */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED;
+	dev->vz_features |= NETIF_F_VENET | NETIF_F_VIRTUAL;
+
+	dev->netdev_ops = &venet_netdev_ops;
+	dev->destructor = venet_destructor;
+
+	dev->features |= common_features;
+
+	SET_ETHTOOL_OPS(dev, &venet_ethtool_ops);
+}
+
+#ifdef CONFIG_PROC_FS
+static void veaddr_seq_print(struct seq_file *m, struct ve_struct *ve)
+{
+	struct ip_entry_struct *entry;
+	struct veip_struct *veip;
+
+	spin_lock(&veip_lock);
+	veip = ACCESS_ONCE(ve->veip);
+	if (veip == NULL)
+		goto unlock;
+	list_for_each_entry(entry, &veip->ip_lh, ve_list) {
+		char addr[40];
+
+		if (entry->active_env == NULL)
+			continue;
+
+		veaddr_print(addr, sizeof(addr), &entry->addr);
+		if (entry->addr.family == AF_INET)
+			seq_printf(m, " %15s", addr);
+		else
+			seq_printf(m, " %39s", addr);
+	}
+unlock:
+	spin_unlock(&veip_lock);
+}
+
+static void *veip_seq_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t l;
+	struct hlist_node *p;
+	struct ip_entry_struct *s;
+	int i;
+
+	l = *pos;
+	rcu_read_lock();
+	if (l == 0) {
+		m->private = (void *)0;
+		return SEQ_START_TOKEN;
+	}
+
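+	/*
+	 * Resuming at *pos != 0: skip forward to the l-th hash entry,
+	 * remembering the index of the next bucket in m->private so
+	 * veip_seq_next() can continue the scan from there.
+	 */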
+	for (i = 0; i < VEIP_HASH_SZ; i++) {
+		hlist_for_each_entry_rcu(s, p, ip_entry_hash_table + i, ip_hash) {
+			if (--l == 0) {
+				m->private = (void *)(long)(i + 1);
+				return p;
+			}
+		}
+	}
+	return NULL;
+}
+
+static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct hlist_node *p;
+	int i;
+
+	if (v == SEQ_START_TOKEN)
+		goto find;
+
+	p = rcu_dereference(((struct hlist_node *)v)->next);
+	if (p != NULL)
+		goto found;
+
+find:
+	for (i = (int)(long)m->private; i < VEIP_HASH_SZ; i++) {
+		p = rcu_dereference(ip_entry_hash_table[i].first);
+		if (p != NULL) {
+			m->private = (void *)(long)(i + 1);
+found:
+			(*pos)++;
+			return p;
+		}
+	}
+
+	return NULL;
+}
+
+static void veip_seq_stop(struct seq_file *m, void *v)
+{
+	rcu_read_unlock();
+}
+
+static int veip_seq_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *p;
+	struct ip_entry_struct *entry;
+	struct veip_struct *veip;
+	char s[40];
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(m, "Version: 2.5\n");
+		return 0;
+	}
+
+	p = (struct hlist_node *)v;
+	entry = hlist_entry(p, struct ip_entry_struct, ip_hash);
+	veaddr_print(s, sizeof(s), &entry->addr);
+	veip = ACCESS_ONCE(entry->tgt_veip);
+	seq_printf(m, "%39s %10u\n", s, veip == NULL ? 0 : veip->veid);
+	return 0;
+}
+
+static struct seq_operations veip_seq_op = {
+	.start	= veip_seq_start,
+	.next	= veip_seq_next,
+	.stop	= veip_seq_stop,
+	.show	= veip_seq_show,
+};
+
+static int veip_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &veip_seq_op);
+}
+
+static struct file_operations proc_veip_operations = {
+	.open		= veip_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#endif
+
+static int real_ve_ip_map(envid_t veid, int op, struct sockaddr __user *uaddr,
+		int addrlen)
+{
+	int err;
+	struct ve_struct *ve;
+	struct ve_addr_struct addr;
+
+	err = -EPERM;
+	if (!capable_setveid())
+		goto out;
+
+	err = sockaddr_to_veaddr(uaddr, addrlen, &addr);
+	if (err < 0)
+		goto out;
+
+	switch (op) {
+		case VE_IP_ADD:
+			ve = get_ve_by_id(veid);
+			err = -ESRCH;
+			if (!ve)
+				goto out;
+
+			down_read(&ve->op_sem);
+			if (ve->is_running)
+				err = veip_entry_add(ve, &addr);
+			up_read(&ve->op_sem);
+			put_ve(ve);
+			break;
+
+		case VE_IP_DEL:
+			err = veip_entry_del(veid, &addr);
+			break;
+		case VE_IP_EXT_ADD:
+			ve = get_ve_by_id(veid);
+			err = -ESRCH;
+			if (!ve)
+				goto out;
+
+			down_read(&ve->op_sem);
+			err = venet_ext_add(ve, &addr);
+			up_read(&ve->op_sem);
+			put_ve(ve);
+			break;
+		case VE_IP_EXT_DEL:
+			ve = get_ve_by_id(veid);
+			err = -ESRCH;
+			if (!ve)
+				goto out;
+
+			down_read(&ve->op_sem);
+			err = venet_ext_del(ve, &addr);
+			up_read(&ve->op_sem);
+			put_ve(ve);
+			break;
+		default:
+			err = -EINVAL;
+	}
+
+out:
+	return err;
+}
+
+int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	err = -ENOTTY;
+	switch (cmd) {
+	case VENETCTL_VE_IP_MAP: {
+		struct vzctl_ve_ip_map s;
+		err = -EFAULT;
+		if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+			break;
+		err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen);
+		break;
+	}
+	}
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case VENETCTL_COMPAT_VE_IP_MAP: {
+		struct compat_vzctl_ve_ip_map cs;
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void __user *)arg, sizeof(cs)))
+			break;
+
+		err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr),
+				cs.addrlen);
+		break;
+	}
+	default:
+		err = venet_ioctl(file, cmd, arg);
+		break;
+	}
+	return err;
+}
+#endif
+
+static struct vzioctlinfo venetcalls = {
+	.type		= VENETCTLTYPE,
+	.ioctl		= venet_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_venet_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+int venet_dev_start(struct ve_struct *ve)
+{
+	struct net_device *dev_venet;
+	int err;
+
+	dev_venet = alloc_netdev(0, "venet%d", venet_setup);
+	if (!dev_venet)
+		return -ENOMEM;
+	dev_net_set(dev_venet, ve->ve_netns);
+	err = dev_alloc_name(dev_venet, dev_venet->name);
+	if (err < 0)
+		goto err;
+
+	dev_venet->features |= NETIF_F_NETNS_LOCAL;
+
+	if ((err = register_netdev(dev_venet)) != 0)
+		goto err;
+	ve->_venet_dev = dev_venet;
+	return 0;
+err:
+	free_netdev(dev_venet);
+	printk(KERN_ERR "VENET initialization error err=%d\n", err);
+	return err;
+}
+
+static __net_init int venet_init_net(struct net *net)
+{
+	struct ve_struct *env;
+	int err;
+
+	env = get_exec_env();
+	if (env->veip) {
+		if (ve_is_super(env))
+			return 0;
+		return -EEXIST;
+	}
+
+	env->ve_netns = net;
+
+	err = veip_start(env);
+	if (err != 0)
+		goto err;
+
+	err = venet_dev_start(env);
+	if (err)
+		goto err_free;
+	return 0;
+
+err_free:
+	veip_stop(env);
+err:
+	env->ve_netns = NULL;
+	return err;
+}
+
+static __net_exit void venet_exit_net(struct list_head *net_exit_list)
+{
+	struct net *net;
+	struct ve_struct *env, *old_env;
+	struct net_device *dev;
+	LIST_HEAD(netdev_kill_list);
+
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		env = net->owner_ve;
+		old_env = set_exec_env(env);
+
+		if (env->ve_netns != net)
+			goto next;
+
+		venet_ext_clean(env);
+		veip_stop(env);
+
+		dev = env->_venet_dev;
+		if (dev == NULL)
+			goto next;
+
+		rtnl_lock();
+		unregister_netdevice_queue(dev, &netdev_kill_list);
+		rtnl_unlock();
+next:
+		set_exec_env(old_env);
+	}
+
+	rtnl_lock();
+	unregister_netdevice_many(&netdev_kill_list);
+	rtnl_unlock();
+
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		env = net->owner_ve;
+
+		if (env->ve_netns != net)
+			continue;
+
+		dev = env->_venet_dev;
+		if (dev == NULL)
+			continue;
+
+		env->_venet_dev = NULL;
+
+		old_env = set_exec_env(env);
+		free_netdev(dev);
+		set_exec_env(old_env);
+	}
+}
+
+static struct pernet_operations venet_net_ops = {
+	.init = venet_init_net,
+	.exit_batch = venet_exit_net,
+};
+
+__init int venet_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *de;
+#endif
+	int i, err;
+
+	if (get_ve0()->_venet_dev != NULL)
+		return -EEXIST;
+
+	for (i = 0; i < VEIP_HASH_SZ; i++)
+		INIT_HLIST_HEAD(ip_entry_hash_table + i);
+
+	err = register_pernet_device(&venet_net_ops);
+	if (err)
+		return err;
+
+#ifdef CONFIG_PROC_FS
+	de = proc_create("veip", S_IFREG | S_IRUSR, proc_vz_dir,
+			&proc_veip_operations);
+	if (de == NULL)
+		printk(KERN_WARNING "venet: can't make veip proc entry\n");
+#endif
+
+	vzioctl_register(&venetcalls);
+	vzmon_register_veaddr_print_cb(veaddr_seq_print);
+	return 0;
+}
+
+__exit void venet_exit(void)
+{
+	vzmon_unregister_veaddr_print_cb(veaddr_seq_print);
+	vzioctl_unregister(&venetcalls);
+	unregister_pernet_device(&venet_net_ops);
+
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("veip", proc_vz_dir);
+#endif
+	veip_cleanup();
+
+	/* Ensure there are no outstanding rcu callbacks */
+	rcu_barrier();
+
+	BUG_ON(!list_empty(&veip_lh));
+}
+
+module_init(venet_init);
+module_exit(venet_exit);
+
+MODULE_AUTHOR("Parallels <info@parallels.com>");
+MODULE_DESCRIPTION("Virtuozzo Virtual Network Device");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("vznet");
+
+EXPORT_SYMBOL(veip_lock);
+EXPORT_SYMBOL(ip_entry_hash);
+EXPORT_SYMBOL(ip_entry_unhash);
+EXPORT_SYMBOL(sockaddr_to_veaddr);
+EXPORT_SYMBOL(veaddr_print);
+EXPORT_SYMBOL(venet_entry_lookup);
+EXPORT_SYMBOL(veip_find);
+EXPORT_SYMBOL(veip_findcreate);
+EXPORT_SYMBOL(veip_put);
+EXPORT_SYMBOL(venet_ext_lookup);
+EXPORT_SYMBOL(veip_lh);
+EXPORT_SYMBOL(ip_entry_hash_table);
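
Note on the ioctl interface exported above: VENETCTL_VE_IP_MAP takes a
struct vzctl_ve_ip_map carrying a VE id, an op code (VE_IP_ADD, VE_IP_DEL,
VE_IP_EXT_ADD, VE_IP_EXT_DEL) and a pointer to a user-space sockaddr.  A
minimal user-space sketch is below; the /dev/vzctl node and the header
providing the definitions are assumptions (they belong to the vzctl
userspace, not to this patch), and only the argument layout is taken from
real_ve_ip_map() above.

	/* Hypothetical sketch: map 10.0.0.5 into VE 101. */
	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <arpa/inet.h>
	#include <linux/vzctl_venet.h>	/* assumed: VENETCTL_VE_IP_MAP & co. */

	int main(void)
	{
		struct sockaddr_in sin;
		struct vzctl_ve_ip_map m;
		int fd = open("/dev/vzctl", O_RDWR);	/* assumed device node */

		if (fd < 0)
			return 1;
		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		inet_pton(AF_INET, "10.0.0.5", &sin.sin_addr);
		m.veid = 101;			/* target VE */
		m.op = VE_IP_ADD;		/* VE_IP_DEL undoes the mapping */
		m.addr = (struct sockaddr *)&sin;
		m.addrlen = sizeof(sin);
		if (ioctl(fd, VENETCTL_VE_IP_MAP, &m) < 0)
			perror("VENETCTL_VE_IP_MAP");
		close(fd);
		return 0;
	}

The kernel side copies the sockaddr with move_addr_to_kernel() and converts
it with convert_sockaddr() before taking veip_lock, so the user buffer is
never touched under the spinlock.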
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/veth.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/veth.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/veth.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/veth.c	2015-01-21 12:02:51.156015798 +0300
@@ -412,7 +412,7 @@ err_register_peer:
 	return err;
 }
 
-static void veth_dellink(struct net_device *dev)
+static void veth_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct veth_priv *priv;
 	struct net_device *peer;
@@ -425,12 +425,12 @@ static void veth_dellink(struct net_devi
 	 * not being freed before one RCU grace period.
 	 */
 	RCU_INIT_POINTER(priv->peer, NULL);
-	unregister_netdevice(dev);
+	unregister_netdevice_queue(dev, head);
 
 	if (peer) {
 		priv = netdev_priv(peer);
 		RCU_INIT_POINTER(priv->peer, NULL);
-		unregister_netdevice(peer);
+		unregister_netdevice_queue(peer, head);
 	}
 }
 
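
The veth.c hunk above switches pair teardown from immediate
unregister_netdevice() calls to unregister_netdevice_queue(), so both ends
of a pair (and many pairs during batch cleanup) go through a single rtnl
pass and RCU grace period.  The calling idiom, as the rtnl core uses it via
rtnl_link_ops->dellink (names illustrative):

	LIST_HEAD(kill_list);

	rtnl_lock();
	veth_dellink(dev, &kill_list);		/* queues dev and its peer */
	unregister_netdevice_many(&kill_list);	/* unregisters all at once */
	rtnl_unlock();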
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/vxlan.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/vxlan.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/vxlan.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/vxlan.c	2015-01-21 12:02:51.224013994 +0300
@@ -1969,7 +1969,7 @@ static int vxlan_newlink(struct net_devi
 	return 0;
 }
 
-static void vxlan_dellink(struct net_device *dev)
+static void vxlan_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
@@ -1980,7 +1980,7 @@ static void vxlan_dellink(struct net_dev
 	spin_unlock(&vn->sock_lock);
 
 	list_del(&vxlan->next);
-	unregister_netdevice(dev);
+	unregister_netdevice_queue(dev, head);
 }
 
 static size_t vxlan_get_size(const struct net_device *dev)
@@ -2065,20 +2065,9 @@ static struct rtnl_link_ops vxlan_link_o
 
 static __net_init int vxlan_init_net(struct net *net)
 {
-	struct vxlan_net *vn;
-	int rc;
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	unsigned int h;
 
-	vn = kzalloc(sizeof(struct vxlan_net), GFP_KERNEL);
-	if (vn == NULL)
-		return -ENOMEM;
-
-	rc = net_assign_generic(net, vxlan_net_id, vn);
-	if (rc < 0) {
-		kfree(vn);
-		return rc;
-	}
-
 	INIT_LIST_HEAD(&vn->vxlan_list);
 	spin_lock_init(&vn->sock_lock);
 
@@ -2099,12 +2088,13 @@ static __net_exit void vxlan_exit_net(st
 		unregister_netdevice_queue(vxlan->dev, &list);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
-	kfree(vn);
 }
 
 static struct pernet_operations vxlan_net_ops = {
 	.init = vxlan_init_net,
 	.exit = vxlan_exit_net,
+	.id   = &vxlan_net_id,
+	.size = sizeof(struct vxlan_net),
 };
 
 static int __init vxlan_init_module(void)
@@ -2117,7 +2107,7 @@ static int __init vxlan_init_module(void
 
 	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
 
-	rc = register_pernet_gen_device(&vxlan_net_id, &vxlan_net_ops);
+	rc = register_pernet_device(&vxlan_net_ops);
 	if (rc)
 		goto out1;
 
@@ -2128,7 +2118,7 @@ static int __init vxlan_init_module(void
 	return 0;
 
 out2:
-	unregister_pernet_gen_device(vxlan_net_id, &vxlan_net_ops);
+	unregister_pernet_device(&vxlan_net_ops);
 out1:
 	destroy_workqueue(vxlan_wq);
 	return rc;
@@ -2139,7 +2129,7 @@ static void __exit vxlan_cleanup_module(
 {
 	rtnl_link_unregister(&vxlan_link_ops);
 	destroy_workqueue(vxlan_wq);
-	unregister_pernet_gen_device(vxlan_net_id, &vxlan_net_ops);
+	unregister_pernet_device(&vxlan_net_ops);
 	rcu_barrier();
 }
 module_exit(vxlan_cleanup_module);
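
The vxlan change above drops the open-coded kzalloc()/net_assign_generic()
pair: with .id and .size set in pernet_operations, register_pernet_device()
has the core allocate zeroed per-namespace storage, so vxlan_init_net()
only initializes it and vxlan_exit_net() no longer frees it.  Any code path
can then fetch the state, as vxlan_dellink() above already does:

	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);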
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/net/vzethdev.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/vzethdev.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/net/vzethdev.c	2015-01-21 12:02:45.179174469 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/net/vzethdev.c	2015-01-21 12:02:51.373010036 +0300
@@ -0,0 +1,741 @@
+/*
+ *  veth.c
+ *
+ *  Copyright (C) 2006  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+/*
+ * Virtual ethernet device used to change VE ownership on packets
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/tcp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/if_ether.h>	/* For the statistics structure. */
+#include <linux/if_arp.h>	/* For ARPHRD_ETHER */
+#include <linux/if_bridge.h>
+#include <linux/ethtool.h>
+#include <linux/ve_proto.h>
+#include <linux/veth.h>
+#include <linux/vzctl.h>
+#include <linux/vzctl_veth.h>
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/vzcalluser.h>
+
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
+static LIST_HEAD(veth_hwaddr_list);
+static DEFINE_RWLOCK(ve_hwaddr_lock);
+static DECLARE_MUTEX(hwaddr_sem);
+
+static struct net_device *veth_dev_start(char *dev_addr, char *name);
+
+static struct veth_struct *hwaddr_entry_lookup(char *name)
+{
+	struct veth_struct *entry;
+
+	list_for_each_entry(entry, &veth_hwaddr_list, hwaddr_list) {
+		BUG_ON(entry->pair == NULL);
+		if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0)
+			return entry;
+	}
+	return NULL;
+}
+
+static int veth_entry_add(struct ve_struct *ve, char *dev_addr, char *name,
+		char *dev_addr_ve, char *name_ve)
+{
+	struct net_device *dev_ve;
+	struct net_device *dev_ve0;
+	struct ve_struct *old_env;
+	char dev_name[IFNAMSIZ];
+	int err;
+
+	down(&hwaddr_sem);
+
+	if (name[0] == '\0') {
+		snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid);
+	} else {
+		memcpy(dev_name, name, IFNAMSIZ - 1);
+		dev_name[IFNAMSIZ - 1] = '\0';
+	}
+	dev_ve0 = veth_dev_start(dev_addr, dev_name);
+	if (IS_ERR(dev_ve0)) {
+		err = PTR_ERR(dev_ve0);
+		goto err;
+	}
+
+	old_env = set_exec_env(ve);
+	if (name_ve[0] == '\0') {
+		sprintf(dev_name, "eth%%d");
+	} else {
+		memcpy(dev_name, name_ve, IFNAMSIZ - 1);
+		dev_name[IFNAMSIZ - 1] = '\0';
+	}
+	dev_ve = veth_dev_start(dev_addr_ve, dev_name);
+	if (IS_ERR(dev_ve)) {
+		err = PTR_ERR(dev_ve);
+		goto err_ve;
+	}
+	set_exec_env(old_env);
+	veth_from_netdev(dev_ve)->pair = dev_ve0;
+	veth_from_netdev(dev_ve)->me = dev_ve;
+	veth_from_netdev(dev_ve0)->pair = dev_ve;
+	veth_from_netdev(dev_ve0)->me = dev_ve0;
+
+	write_lock(&ve_hwaddr_lock);
+	list_add(&(veth_from_netdev(dev_ve)->hwaddr_list), &veth_hwaddr_list);
+	write_unlock(&ve_hwaddr_lock);
+
+	up(&hwaddr_sem);
+	return 0;
+
+err_ve:
+	set_exec_env(old_env);
+	unregister_netdev(dev_ve0);
+err:
+	up(&hwaddr_sem);
+	return err;
+}
+
+static void veth_pair_del(struct ve_struct *env, struct veth_struct *entry,
+			  struct list_head *head)
+{
+	struct net_device *dev;
+	struct ve_struct *old_env;
+
+	write_lock(&ve_hwaddr_lock);
+	list_del(&entry->hwaddr_list);
+	write_unlock(&ve_hwaddr_lock);
+
+	dev = entry->pair;
+	BUG_ON(entry->pair == NULL);
+
+	veth_from_netdev(dev)->pair = NULL;
+	entry->pair = NULL;
+	rtnl_lock();
+	old_env = set_exec_env(dev->owner_env);
+	dev_close(dev);
+
+	/*
+	 * Now device from VE0 does not send or receive anything,
+	 * i.e. dev->hard_start_xmit won't be called.
+	 */
+	set_exec_env(env);
+	unregister_netdevice_queue(veth_to_netdev(entry), head);
+	set_exec_env(dev->owner_env);
+	unregister_netdevice_queue(dev, head);
+	set_exec_env(old_env);
+	rtnl_unlock();
+}
+
+static int veth_entry_del(struct ve_struct *ve, char *name)
+{
+	struct veth_struct *found;
+	int err;
+
+	err = -ENODEV;
+	down(&hwaddr_sem);
+	found = hwaddr_entry_lookup(name);
+	if (found == NULL)
+		goto out;
+	if (veth_to_netdev(found)->owner_env != ve)
+		goto out;
+
+	err = 0;
+	veth_pair_del(ve, found, NULL);
+
+out:
+	up(&hwaddr_sem);
+	return err;
+}
+
+static int veth_allow_change_mac(envid_t veid, char *name, int allow)
+{
+	struct ve_struct *ve;
+	struct veth_struct *found;
+	int err;
+
+	err = -ESRCH;
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return err;
+
+	down_read(&ve->op_sem);
+	if (!ve->is_running)
+		goto out_ve;
+	err = -ENODEV;
+	down(&hwaddr_sem);
+	found = hwaddr_entry_lookup(name);
+	if (found == NULL)
+		goto out_sem;
+	if (veth_to_netdev(found)->owner_env != ve)
+		goto out_sem;
+
+	err = 0;
+	found->allow_mac_change = allow;
+
+out_sem:
+	up(&hwaddr_sem);
+out_ve:
+	up_read(&ve->op_sem);
+	put_ve(ve);
+	return err;
+}
+
+/*
+ * Device functions
+ */
+
+static int veth_open(struct net_device *dev)
+{
+	return 0;
+}
+
+static int veth_close(struct net_device *master)
+{
+	return 0;
+}
+
+static void veth_destructor(struct net_device *dev)
+{
+	free_percpu(veth_from_netdev(dev)->real_stats);
+	free_netdev(dev);
+}
+
+static struct net_device_stats *get_stats(struct net_device *dev)
+{
+	int i;
+	struct net_device_stats *stats;
+
+	stats = &veth_from_netdev(dev)->stats;
+	memset(stats, 0, sizeof(struct net_device_stats));
+	for_each_possible_cpu(i) {
+		struct net_device_stats *dev_stats;
+
+		dev_stats = veth_stats(dev, i);
+		stats->rx_bytes   += dev_stats->rx_bytes;
+		stats->tx_bytes   += dev_stats->tx_bytes;
+		stats->rx_packets += dev_stats->rx_packets;
+		stats->tx_packets += dev_stats->tx_packets;
+		stats->tx_dropped += dev_stats->tx_dropped;
+	}
+
+	return stats;
+}
+
+/*
+ * The higher levels take care of making this non-reentrant (it's
+ * called with bh's disabled).
+ */
+static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device_stats *stats;
+	struct net_device *rcv = NULL;
+	struct veth_struct *entry;
+	int length;
+
+	stats = veth_stats(dev, smp_processor_id());
+	if (unlikely(get_exec_env()->disable_net))
+		goto outf;
+
+	entry = veth_from_netdev(dev);
+	rcv = entry->pair;
+	if (!rcv)
+		/* VE going down */
+		goto outf;
+
+	if (!(rcv->flags & IFF_UP)) {
+		/* Target VE does not want to receive packets */
+		goto outf;
+	}
+
+	if (unlikely(rcv->owner_env->disable_net))
+		goto outf;
+	/*
+	 * MAC filtering: unless MAC changes are explicitly allowed, a
+	 * packet going from VE0 to a VE must be addressed to the VE
+	 * device's MAC, and a packet leaving a VE must carry that
+	 * device's source MAC (multicast and bridged ports are exempt).
+	 */
+	if (ve_is_super(dev->owner_env) &&
+			!veth_from_netdev(rcv)->allow_mac_change) {
+		/* from VE0 to VEX */
+		if (ve_is_super(rcv->owner_env))
+			goto out;
+		if (is_multicast_ether_addr(
+					((struct ethhdr *)skb->data)->h_dest))
+			goto out;
+		if (!rcv->br_port &&
+		    compare_ether_addr(((struct ethhdr *)skb->data)->h_dest,
+				       rcv->dev_addr))
+			goto outf;
+	} else if (!ve_is_super(dev->owner_env) &&
+			!entry->allow_mac_change) {
+		/* from VEX to VE0 */
+		if (!skb->dev->br_port &&
+		    compare_ether_addr(((struct ethhdr *)skb->data)->h_source,
+				       dev->dev_addr))
+			goto outf;
+	}
+
+out:
+	skb->owner_env = rcv->owner_env;
+
+	skb->pkt_type = PACKET_HOST;
+	skb->protocol = eth_type_trans(skb, rcv);
+
+	if (skb->protocol != __constant_htons(ETH_P_IP))
+		skb_orphan(skb);
+
+	nf_reset(skb);
+	length = skb->len;
+	skb_init_brmark(skb);
+
+	netif_rx(skb);
+
+	stats->tx_bytes += length;
+	stats->tx_packets++;
+	if (rcv) {
+		struct net_device_stats *rcv_stats;
+		rcv_stats = veth_stats(rcv, smp_processor_id());
+		rcv_stats->rx_bytes += length;
+		rcv_stats->rx_packets++;
+	}
+
+	return 0;
+
+outf:
+	kfree_skb(skb);
+	stats->tx_dropped++;
+	return 0;
+}
+
+static int veth_set_mac(struct net_device *dev, void *p)
+{
+	struct sockaddr *addr = p;
+
+	if (!ve_is_super(dev->owner_env) &&
+			!veth_from_netdev(dev)->allow_mac_change)
+		return -EPERM;
+	if (netif_running(dev))
+		return -EBUSY;
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+
+	return 0;
+}
+
+static int veth_init_dev(struct net_device *dev)
+{
+	veth_from_netdev(dev)->real_stats =
+		alloc_percpu(struct net_device_stats);
+	if (veth_from_netdev(dev)->real_stats == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int
+veth_set_op(struct net_device *dev, u32 data,
+	     int (*fop)(struct net_device *, u32))
+{
+	struct net_device *pair;
+	int ret = 0;
+
+	ret = fop(dev, data);
+	if (ret < 0)
+		goto out;
+
+	pair = veth_from_netdev(dev)->pair;
+	if (pair)
+		ret = fop(pair, data);
+out:
+	return ret;
+}
+
+static int veth_op_set_sg(struct net_device *dev, u32 data)
+{
+	return veth_set_op(dev, data, ethtool_op_set_sg);
+}
+
+static int veth_op_set_tx_csum(struct net_device *dev, u32 data)
+{
+	return veth_set_op(dev, data, ethtool_op_set_tx_csum);
+}
+
+static int
+veth_op_set_tso(struct net_device *dev, u32 data)
+{
+	return veth_set_op(dev, data, ethtool_op_set_tso);
+}
+
+#define veth_op_set_rx_csum veth_op_set_tx_csum
+
+static struct ethtool_ops veth_ethtool_ops = {
+	.get_sg = ethtool_op_get_sg,
+	.set_sg = veth_op_set_sg,
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.set_tx_csum = veth_op_set_tx_csum,
+	.get_rx_csum = ethtool_op_get_tx_csum,
+	.set_rx_csum = veth_op_set_rx_csum,
+	.get_tso = ethtool_op_get_tso,
+	.set_tso = veth_op_set_tso,
+};
+
+static void veth_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx)
+{
+	struct cpt_veth_image v;
+	struct veth_struct *veth;
+
+	veth = veth_from_netdev(dev);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_VETH;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_VOID;
+
+	v.cpt_allow_mac_change = veth->allow_mac_change;
+
+	ops->write(&v, sizeof(v), ctx);
+}
+
+static int veth_rst(loff_t pos, struct cpt_netdev_image *di,
+		struct rst_ops *ops,
+		struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_veth_image vi;
+	struct veth_struct *veth;
+	struct net_device *dev;
+
+	pos = pos + di->cpt_hdrlen;
+	err = ops->get_object(CPT_OBJ_NET_VETH, pos,
+			&vi, sizeof(vi), ctx);
+	if (err)
+		return err;
+
+	dev = __dev_get_by_name(get_exec_env()->ve_ns->net_ns, di->cpt_name);
+	if (dev == NULL)
+		return -ENODEV;
+
+	veth = veth_from_netdev(dev);
+	veth->allow_mac_change = vi.cpt_allow_mac_change;
+
+	return 0;
+}
+
+static struct netdev_rst veth_netdev_rst = {
+	.cpt_object = CPT_OBJ_NET_VETH,
+	.ndo_rst = veth_rst,
+};
+
+static const struct net_device_ops veth_ops = {
+	.ndo_init = veth_init_dev,
+	.ndo_start_xmit = veth_xmit,
+	.ndo_get_stats = get_stats,
+	.ndo_open = veth_open,
+	.ndo_stop = veth_close,
+	.ndo_set_mac_address = veth_set_mac,
+	.ndo_cpt = veth_cpt,
+};
+
+static void veth_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &veth_ops;
+	dev->destructor = veth_destructor;
+	dev->tx_queue_len = 0;
+
+	/*
+	 * No other features, as they are:
+	 *  - checksumming is required, and nobody else will do our job
+	 */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HIGHDMA;
+	dev->vz_features |= NETIF_F_VENET | NETIF_F_VIRTUAL;
+
+	SET_ETHTOOL_OPS(dev, &veth_ethtool_ops);
+}
+
+#ifdef CONFIG_PROC_FS
+#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x"
+#define ADDR_ARG(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5]
+static int vehwaddr_seq_show(struct seq_file *m, void *v)
+{
+	struct list_head *p;
+	struct veth_struct *entry;
+
+	p = (struct list_head *)v;
+	if (p == &veth_hwaddr_list) {
+		seq_puts(m, "Version: 1.0\n");
+		return 0;
+	}
+	entry = list_entry(p, struct veth_struct, hwaddr_list);
+	seq_printf(m, ADDR_FMT " %16s ",
+			ADDR_ARG(entry->pair->dev_addr), entry->pair->name);
+	seq_printf(m, ADDR_FMT " %16s %10u %5s\n",
+			ADDR_ARG(veth_to_netdev(entry)->dev_addr),
+			veth_to_netdev(entry)->name,
+			VEID(veth_to_netdev(entry)->owner_env),
+			entry->allow_mac_change ? "allow" : "deny");
+	return 0;
+}
+
+static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos)
+{
+	read_lock(&ve_hwaddr_lock);
+	return seq_list_start_head(&veth_hwaddr_list, *pos);
+}
+
+static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &veth_hwaddr_list, pos);
+}
+
+static void vehwaddr_seq_stop(struct seq_file *m, void *v)
+{
+	read_unlock(&ve_hwaddr_lock);
+}
+
+static struct seq_operations vehwaddr_seq_op = {
+	.start	= vehwaddr_seq_start,
+	.next	= vehwaddr_seq_next,
+	.stop	= vehwaddr_seq_stop,
+	.show	= vehwaddr_seq_show,
+};
+
+static int vehwaddr_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &vehwaddr_seq_op);
+}
+
+static struct file_operations proc_vehwaddr_operations = {
+	.open		= vehwaddr_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#endif
+
+static int real_ve_hwaddr(envid_t veid, int op,
+		unsigned char *dev_addr, int addrlen, char *name,
+		unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve)
+{
+	int err;
+	struct ve_struct *ve;
+	char ve_addr[ETH_ALEN];
+
+	err = -EPERM;
+	if (!capable(CAP_NET_ADMIN))
+		goto out;
+
+	err = -EINVAL;
+	switch (op) {
+	case VE_ETH_ADD:
+		if (addrlen != ETH_ALEN)
+			goto out;
+		if (addrlen_ve != ETH_ALEN && addrlen_ve != 0)
+			goto out;
+		/*
+		 * If no VE-side address is given, derive it by setting bit
+		 * 0x80 in dev_addr[3]; that bit must then be clear in the
+		 * host-side address.
+		 */
+		if (addrlen_ve == 0 && (dev_addr[3] & 0x80))
+			goto out;
+		if (addrlen_ve == 0) {
+			memcpy(ve_addr, dev_addr, ETH_ALEN);
+			ve_addr[3] |= 0x80;
+		} else {
+			memcpy(ve_addr, dev_addr_ve, ETH_ALEN);
+		}
+
+		ve = get_ve_by_id(veid);
+		err = -ESRCH;
+		if (!ve)
+			goto out;
+
+		down_read(&ve->op_sem);
+		if (ve->is_running)
+			err = veth_entry_add(ve, dev_addr, name, ve_addr, name_ve);
+		up_read(&ve->op_sem);
+		put_ve(ve);
+		break;
+
+	case VE_ETH_DEL:
+		if (name[0] == '\0')
+			goto out;
+		ve = get_ve_by_id(veid);
+		err = -ESRCH;
+		if (!ve)
+			goto out;
+
+		down_read(&ve->op_sem);
+		if (ve->is_running)
+			err = veth_entry_del(ve, name);
+		up_read(&ve->op_sem);
+		put_ve(ve);
+		break;
+	case VE_ETH_ALLOW_MAC_CHANGE:
+	case VE_ETH_DENY_MAC_CHANGE:
+		err = veth_allow_change_mac(veid, name,
+						op == VE_ETH_ALLOW_MAC_CHANGE);
+		break;
+	}
+
+out:
+	return err;
+}
+
+static int veth_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	err = -ENOTTY;
+	switch (cmd) {
+	case VETHCTL_VE_HWADDR: {
+		struct vzctl_ve_hwaddr s;
+
+		err = -EFAULT;
+		if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+			break;
+		err = real_ve_hwaddr(s.veid, s.op, s.dev_addr, s.addrlen,
+				     s.dev_name, s.dev_addr_ve, s.addrlen_ve,
+				     s.dev_name_ve);
+		break;
+	}
+	}
+	return err;
+}
+
+static struct vzioctlinfo vethcalls = {
+	.type		= VETHCTLTYPE,
+	.ioctl		= veth_ioctl,
+	.compat_ioctl	= veth_ioctl,
+	.owner		= THIS_MODULE,
+};
+
+static struct net_device *veth_dev_start(char *dev_addr, char *name)
+{
+	struct net_device *dev;
+	int err;
+
+	if (!is_valid_ether_addr(dev_addr))
+		return ERR_PTR(-EADDRNOTAVAIL);
+
+	dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+	dev->nd_net = get_exec_env()->ve_netns;
+	if (strchr(dev->name, '%')) {
+		err = dev_alloc_name(dev, dev->name);
+		if (err < 0)
+			goto err;
+	}
+	if ((err = register_netdev(dev)) != 0)
+		goto err;
+
+	memcpy(dev->dev_addr, dev_addr, ETH_ALEN);
+	dev->addr_len = ETH_ALEN;
+
+	return dev;
+err:
+	free_netdev(dev);
+	printk(KERN_ERR "%s initialization error err=%d\n", name, err);
+	return ERR_PTR(err);
+}
+
+static __net_exit void veth_exit_net(struct list_head *net_exit_list)
+{
+	struct net *net;
+	struct veth_struct *entry, *tmp;
+	LIST_HEAD(netdev_kill_list);
+
+	down(&hwaddr_sem);
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		struct ve_struct *old_env;
+
+		old_env = set_exec_env(net->owner_ve);
+		list_for_each_entry_safe(entry, tmp,
+					 &veth_hwaddr_list, hwaddr_list)
+			if (net == veth_to_netdev(entry)->nd_net)
+				veth_pair_del(net->owner_ve, entry,
+					      &netdev_kill_list);
+		set_exec_env(old_env);
+	}
+	up(&hwaddr_sem);
+
+	rtnl_lock();
+	unregister_netdevice_many(&netdev_kill_list);
+	rtnl_unlock();
+}
+
+static struct pernet_operations veth_net_ops = {
+	.exit_batch = veth_exit_net,
+};
+
+static __init int veth_init(void)
+{
+	int err;
+	struct proc_dir_entry *de;
+
+	err = register_pernet_device(&veth_net_ops);
+	if (err)
+		return err;
+
+#ifdef CONFIG_PROC_FS
+	de = proc_create("veth", S_IFREG|S_IRUSR, proc_vz_dir,
+			&proc_vehwaddr_operations);
+	if (de == NULL)
+		printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n");
+#endif
+
+	register_netdev_rst(&veth_netdev_rst);
+	vzioctl_register(&vethcalls);
+	return 0;
+}
+
+static __exit void veth_exit(void)
+{
+	vzioctl_unregister(&vethcalls);
+	unregister_pernet_device(&veth_net_ops);
+	unregister_netdev_rst(&veth_netdev_rst);
+
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("veth", proc_vz_dir);
+#endif
+}
+
+module_init(veth_init);
+module_exit(veth_exit);
+
+MODULE_AUTHOR("Andrey Mirkin <amirkin@sw.ru>");
+MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device");
+MODULE_LICENSE("GPL v2");
+
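A worked example of the default VE-side MAC derivation in real_ve_hwaddr()
above: when no VE address is supplied, the host address is copied and bit
0x80 of byte 3 is set (host addresses that already carry that bit are
rejected, so the two ends cannot collide).  The address value here is made
up for illustration:

	unsigned char host[ETH_ALEN] = { 0x00, 0x18, 0x51, 0x0a, 0xbb, 0xcc };
	unsigned char ve[ETH_ALEN];

	memcpy(ve, host, ETH_ALEN);
	ve[3] |= 0x80;	/* 0x0a -> 0x8a: ve is 00:18:51:8a:bb:cc */
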
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/parport/parport_pc.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/parport/parport_pc.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/parport/parport_pc.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/parport/parport_pc.c	2015-01-21 12:02:42.687240629 +0300
@@ -2908,6 +2908,7 @@ enum parport_pc_pci_cards {
 	netmos_9805,
 	netmos_9815,
 	netmos_9901,
+	netmos_9865,
 	quatech_sppxp100,
 };
 
@@ -2989,6 +2990,7 @@ static struct parport_pc_pci {
 	/* netmos_9805 */               { 1, { { 0, -1 }, } },
 	/* netmos_9815 */               { 2, { { 0, -1 }, { 2, -1 }, } },
 	/* netmos_9901 */               { 1, { { 0, -1 }, } },
+	/* netmos_9865 */               { 1, { { 0, -1 }, } },
 	/* quatech_sppxp100 */		{ 1, { { 0, 1 }, } },
 };
 
@@ -3092,6 +3094,10 @@ static const struct pci_device_id parpor
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9815 },
 	{ PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9901,
 	  0xA000, 0x2000, 0, 0, netmos_9901 },
+	{ PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9865,
+	  0xA000, 0x1000, 0, 0, netmos_9865 },
+	{ PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9865,
+	  0xA000, 0x2000, 0, 0, netmos_9865 },
 	/* Quatech SPPXP-100 Parallel port PCI ExpressCard */
 	{ PCI_VENDOR_ID_QUATECH, PCI_DEVICE_ID_QUATECH_SPPXP_100,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, quatech_sppxp100 },
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/regulator/ab3100.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/regulator/ab3100.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/regulator/ab3100.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/regulator/ab3100.c	2015-01-21 12:02:41.351276100 +0300
@@ -241,24 +241,12 @@ static int ab3100_disable_regulator(stru
 	 * LDO D is a special regulator. When it is disabled, the entire
 	 * system is shut down. So this is handled specially.
 	 */
+	pr_info("Called ab3100_disable_regulator\n");
 	if (abreg->regreg == AB3100_LDO_D) {
-		int i;
-
 		dev_info(&reg->dev, "disabling LDO D - shut down system\n");
-		/*
-		 * Set regulators to default values, ignore any errors,
-		 * we're going DOWN
-		 */
-		for (i = 0; i < ARRAY_SIZE(ab3100_reg_init_order); i++) {
-			(void) ab3100_set_register_interruptible(abreg->ab3100,
-					ab3100_reg_init_order[i],
-					abreg->plfdata->reg_initvals[i]);
-		}
-
 		/* Setting LDO D to 0x00 cuts the power to the SoC */
 		return ab3100_set_register_interruptible(abreg->ab3100,
 							 AB3100_LDO_D, 0x00U);
-
 	}
 
 	/*
@@ -607,13 +595,6 @@ static int __init ab3100_regulators_prob
 		}
 	}
 
-	if (err) {
-		dev_err(&pdev->dev,
-			"LDO D regulator initialization failed with error %d\n",
-			err);
-		return err;
-	}
-
 	/* Register the regulators */
 	for (i = 0; i < AB3100_NUM_REGULATORS; i++) {
 		struct ab3100_regulator *reg = &ab3100_regulators[i];
@@ -688,7 +669,7 @@ static __init int ab3100_regulators_init
 
 static __exit void ab3100_regulators_exit(void)
 {
-	platform_driver_register(&ab3100_regulators_driver);
+	platform_driver_unregister(&ab3100_regulators_driver);
 }
 
 subsys_initcall(ab3100_regulators_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/scsi/hosts.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/scsi/hosts.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/scsi/hosts.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/scsi/hosts.c	2015-01-21 12:02:41.790264445 +0300
@@ -420,9 +420,8 @@ struct Scsi_Host *scsi_host_alloc(struct
 
 	device_initialize(&shost->shost_gendev);
 	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
-#ifndef CONFIG_SYSFS_DEPRECATED
-	shost->shost_gendev.bus = &scsi_bus_type;
-#endif
+	if (!sysfs_deprecated)
+		shost->shost_gendev.bus = &scsi_bus_type;
 	shost->shost_gendev.type = &scsi_host_type;
 
 	device_initialize(&shost->shost_dev);
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/scsi/scsi_scan.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/scsi/scsi_scan.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/scsi/scsi_scan.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/scsi/scsi_scan.c	2015-01-21 12:02:41.791264419 +0300
@@ -435,9 +435,8 @@ static struct scsi_target *scsi_alloc_ta
 	kref_init(&starget->reap_ref);
 	dev->parent = get_device(parent);
 	dev_set_name(dev, "target%d:%d:%d", shost->host_no, channel, id);
-#ifndef CONFIG_SYSFS_DEPRECATED
-	dev->bus = &scsi_bus_type;
-#endif
+	if (!sysfs_deprecated)
+		dev->bus = &scsi_bus_type;
 	dev->type = &scsi_target_type;
 	starget->id = id;
 	starget->channel = channel;
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/serial/8250_pci.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/serial/8250_pci.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/serial/8250_pci.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/serial/8250_pci.c	2015-01-21 12:02:42.687240629 +0300
@@ -819,7 +819,8 @@ static int pci_netmos_init(struct pci_de
 	/* subdevice 0x00PS means <P> parallel, <S> serial */
 	unsigned int num_serial = dev->subsystem_device & 0xf;
 
-	if (dev->device == PCI_DEVICE_ID_NETMOS_9901)
+	if ((dev->device == PCI_DEVICE_ID_NETMOS_9901) ||
+		(dev->device == PCI_DEVICE_ID_NETMOS_9865))
 		return 0;
 
 	if (dev->subsystem_vendor == PCI_VENDOR_ID_IBM &&
@@ -1585,6 +1586,7 @@ enum pci_board_num_t {
 
 	pbn_b0_bt_1_115200,
 	pbn_b0_bt_2_115200,
+	pbn_b0_bt_4_115200,
 	pbn_b0_bt_8_115200,
 
 	pbn_b0_bt_1_460800,
@@ -1811,6 +1813,12 @@ static struct pciserial_board pci_boards
 		.base_baud	= 115200,
 		.uart_offset	= 8,
 	},
+	[pbn_b0_bt_4_115200] = {
+		.flags		= FL_BASE0|FL_BASE_BARS,
+		.num_ports	= 4,
+		.base_baud	= 115200,
+		.uart_offset	= 8,
+	},
 	[pbn_b0_bt_8_115200] = {
 		.flags		= FL_BASE0|FL_BASE_BARS,
 		.num_ports	= 8,
@@ -3805,6 +3813,18 @@ static struct pci_device_id serial_pci_t
 		pbn_brcm_trumanage },
 
 	/*
+	 * Best Connectivity PCI Multi I/O cards
+	 */
+
+	{	PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9865,
+		0xA000, 0x1000,
+		0, 0, pbn_b0_1_115200 },
+
+	{	PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9865,
+		0xA000, 0x3004,
+		0, 0, pbn_b0_bt_4_115200 },
+
+	/*
 	 * These entries match devices with class COMMUNICATION_SERIAL,
 	 * COMMUNICATION_MODEM or COMMUNICATION_MULTISERIAL
 	 */
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/staging/pohmelfs/path_entry.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/staging/pohmelfs/path_entry.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/staging/pohmelfs/path_entry.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/staging/pohmelfs/path_entry.c	2015-01-21 12:02:42.162254568 +0300
@@ -45,9 +45,9 @@ int pohmelfs_construct_path_string(struc
 		return -ENOENT;
 	}
 
-	read_lock(&current->fs->lock);
+	spin_lock(&current->fs->lock);
 	path.mnt = mntget(current->fs->root.mnt);
-	read_unlock(&current->fs->lock);
+	spin_unlock(&current->fs->lock);
 
 	path.dentry = d;
 
@@ -92,9 +92,9 @@ int pohmelfs_path_length(struct pohmelfs
 		return -ENOENT;
 	}
 
-	read_lock(&current->fs->lock);
+	spin_lock(&current->fs->lock);
 	root = dget(current->fs->root.dentry);
-	read_unlock(&current->fs->lock);
+	spin_unlock(&current->fs->lock);
 
 	spin_lock(&dcache_lock);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/video/logo/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/logo/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/drivers/video/logo/Kconfig	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/logo/Kconfig	2015-01-21 12:02:58.147830203 +0300
@@ -82,4 +82,9 @@ config LOGO_M32R_CLUT224
 	depends on M32R
 	default y
 
+config LOGO_PSBM_CLUT224
+	bool
+	depends on LOGO
+	default y
+
 endif # LOGO
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/video/logo/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/logo/Makefile
--- linux-2.6.32-504.3.3.el6.orig/drivers/video/logo/Makefile	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/logo/Makefile	2015-01-21 12:02:58.147830203 +0300
@@ -15,6 +15,7 @@ obj-$(CONFIG_LOGO_SUPERH_MONO)		+= logo_
 obj-$(CONFIG_LOGO_SUPERH_VGA16)		+= logo_superh_vga16.o
 obj-$(CONFIG_LOGO_SUPERH_CLUT224)	+= logo_superh_clut224.o
 obj-$(CONFIG_LOGO_M32R_CLUT224)		+= logo_m32r_clut224.o
+obj-$(CONFIG_LOGO_PSBM_CLUT224)		+= logo_psbm_clut224.o
 
 obj-$(CONFIG_SPU_BASE)			+= logo_spe_clut224.o
 
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/video/logo/logo.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/logo/logo.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/video/logo/logo.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/logo/logo.c	2015-01-21 12:02:58.147830203 +0300
@@ -21,6 +21,18 @@
 #include <asm/bootinfo.h>
 #endif
 
+#ifdef CONFIG_LOGO_PSBM_CLUT224
+
+static int logo_psbm = 0;
+static int __init psbm(char *arg)
+{
+	logo_psbm = 1;
+	return 0;
+}
+__setup("psbm", psbm);
+
+#endif
+
 static int nologo;
 module_param(nologo, bool, 0);
 MODULE_PARM_DESC(nologo, "Disables startup logo");
@@ -100,6 +112,11 @@ const struct linux_logo * __init_refok f
 		/* M32R Linux logo */
 		logo = &logo_m32r_clut224;
 #endif
+#ifdef CONFIG_LOGO_PSBM_CLUT224
+		/* PSBM logo */
+		if (logo_psbm)
+			logo = &logo_psbm_clut224;
+#endif
 	}
 	return logo;
 }
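
One note on the "psbm" boot parameter added above: a __setup() handler
conventionally returns 1 once it has consumed the option, while returning
0, as here, also passes the unrecognized "psbm" argument on to init.  A
sketch of the conventional form (handler name hypothetical):

	static int __init psbm_setup(char *arg)
	{
		logo_psbm = 1;
		return 1;	/* consumed; don't pass the option to init */
	}
	__setup("psbm", psbm_setup);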
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/video/logo/logo_psbm_clut224.ppm linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/logo/logo_psbm_clut224.ppm
--- linux-2.6.32-504.3.3.el6.orig/drivers/video/logo/logo_psbm_clut224.ppm	2015-01-21 12:02:58.150830123 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/logo/logo_psbm_clut224.ppm	2015-01-21 12:02:58.150830123 +0300
@@ -0,0 +1,19204 @@
+P3
+# CREATOR: GIMP PNM Filter Version 1.1
+80 80
+255
+# [80x80 CLUT224 pixel data for the PSBM boot logo: 19,200 RGB samples elided]
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+121
+146
+192
+157
+162
+180
+157
+162
+180
+157
+162
+180
+173
+172
+176
+175
+174
+176
+177
+179
+181
+177
+179
+181
+177
+179
+181
+177
+179
+181
+178
+179
+182
+178
+179
+182
+180
+182
+185
+182
+182
+186
+182
+184
+187
+182
+184
+187
+182
+184
+187
+167
+171
+180
+145
+152
+186
+123
+140
+181
+110
+128
+172
+102
+119
+168
+102
+119
+168
+107
+122
+171
+110
+128
+172
+110
+128
+172
+111
+129
+174
+112
+132
+175
+112
+132
+175
+112
+132
+175
+112
+132
+175
+112
+132
+175
+112
+132
+175
+114
+133
+178
+119
+140
+184
+119
+140
+184
+117
+136
+179
+117
+136
+179
+117
+136
+179
+118
+139
+183
+118
+139
+183
+118
+139
+183
+109
+140
+191
+127
+143
+182
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+232
+233
+234
+228
+229
+230
+230
+231
+232
+231
+232
+233
+232
+233
+234
+232
+233
+234
+230
+231
+232
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+221
+222
+227
+210
+212
+214
+189
+189
+191
+177
+179
+181
+173
+172
+176
+173
+172
+176
+173
+172
+176
+173
+172
+176
+177
+179
+181
+177
+179
+181
+177
+179
+181
+177
+179
+181
+178
+179
+182
+180
+182
+185
+180
+182
+185
+180
+182
+185
+167
+171
+180
+120
+133
+172
+99
+115
+164
+99
+118
+168
+102
+119
+168
+105
+121
+166
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+111
+129
+174
+111
+129
+174
+112
+132
+175
+112
+132
+175
+112
+132
+175
+112
+132
+175
+119
+139
+182
+119
+139
+182
+114
+133
+178
+114
+133
+178
+117
+136
+179
+118
+139
+183
+119
+140
+184
+125
+142
+182
+122
+145
+187
+121
+135
+173
+188
+87
+73
+220
+44
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+227
+228
+229
+230
+231
+232
+231
+232
+233
+230
+231
+232
+232
+233
+234
+231
+232
+233
+230
+231
+232
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+239
+240
+242
+244
+244
+245
+244
+244
+245
+230
+231
+232
+207
+208
+210
+182
+184
+187
+173
+172
+176
+173
+172
+176
+173
+172
+176
+173
+172
+176
+177
+179
+181
+177
+179
+181
+177
+179
+181
+177
+179
+181
+178
+179
+182
+178
+179
+182
+157
+162
+180
+101
+116
+162
+96
+110
+155
+101
+116
+162
+107
+122
+163
+109
+124
+168
+110
+128
+172
+110
+128
+172
+109
+124
+168
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+111
+129
+174
+112
+132
+175
+112
+132
+175
+112
+132
+175
+117
+136
+179
+118
+136
+180
+114
+133
+178
+117
+136
+179
+119
+139
+182
+123
+140
+181
+123
+140
+181
+119
+135
+179
+108
+126
+179
+109
+122
+168
+173
+85
+78
+220
+44
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+228
+229
+230
+230
+231
+232
+228
+229
+231
+230
+231
+232
+230
+231
+232
+230
+231
+232
+230
+231
+232
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+230
+231
+232
+234
+235
+236
+239
+240
+242
+244
+244
+245
+244
+244
+245
+239
+240
+242
+222
+223
+224
+196
+198
+200
+178
+179
+182
+173
+172
+176
+173
+172
+176
+173
+172
+176
+175
+174
+176
+177
+179
+181
+178
+179
+182
+177
+179
+181
+157
+162
+180
+112
+117
+158
+96
+110
+155
+101
+112
+157
+101
+116
+162
+107
+122
+163
+107
+122
+163
+112
+127
+169
+111
+127
+170
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+111
+129
+175
+120
+138
+180
+123
+140
+181
+123
+140
+181
+123
+140
+181
+120
+137
+180
+117
+130
+172
+109
+122
+168
+104
+119
+165
+99
+118
+168
+112
+127
+169
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+226
+227
+228
+228
+229
+230
+228
+229
+231
+230
+231
+232
+228
+229
+231
+230
+231
+232
+228
+229
+230
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+228
+229
+230
+230
+231
+232
+231
+232
+233
+232
+233
+234
+239
+240
+242
+244
+244
+245
+244
+244
+245
+244
+244
+245
+232
+233
+234
+212
+213
+215
+189
+189
+191
+175
+174
+176
+173
+172
+176
+173
+172
+176
+173
+172
+176
+175
+174
+176
+157
+162
+180
+112
+117
+158
+96
+110
+155
+101
+112
+157
+101
+112
+157
+101
+112
+157
+101
+112
+157
+107
+122
+163
+111
+123
+164
+112
+127
+169
+114
+129
+171
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+110
+128
+172
+112
+132
+175
+115
+134
+176
+127
+143
+182
+127
+141
+181
+120
+132
+173
+112
+125
+170
+104
+119
+165
+99
+115
+164
+101
+116
+162
+114
+128
+171
+137
+154
+190
+172
+179
+196
+197
+88
+72
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+224
+226
+227
+227
+228
+229
+228
+229
+230
+228
+229
+230
+228
+229
+231
+230
+231
+232
+227
+228
+229
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+226
+227
+228
+230
+231
+232
+230
+231
+232
+230
+231
+232
+231
+232
+233
+231
+232
+233
+234
+235
+236
+239
+240
+242
+244
+244
+245
+244
+244
+245
+239
+240
+242
+226
+227
+228
+202
+203
+205
+180
+182
+185
+173
+172
+176
+173
+172
+176
+157
+162
+180
+101
+112
+157
+96
+110
+155
+101
+112
+157
+96
+110
+155
+96
+110
+155
+101
+112
+157
+101
+112
+157
+101
+112
+157
+101
+116
+162
+111
+123
+164
+112
+127
+169
+116
+130
+170
+112
+132
+175
+112
+132
+175
+119
+134
+175
+121
+135
+173
+120
+133
+172
+121
+135
+173
+113
+125
+167
+98
+114
+161
+98
+114
+161
+104
+119
+165
+120
+132
+173
+146
+158
+190
+191
+196
+216
+211
+218
+231
+195
+200
+215
+197
+88
+72
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+224
+226
+227
+226
+227
+228
+226
+227
+228
+227
+228
+229
+228
+229
+230
+228
+229
+231
+227
+228
+229
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+224
+226
+227
+228
+229
+230
+228
+229
+230
+230
+231
+232
+230
+231
+232
+231
+232
+233
+231
+232
+233
+231
+232
+233
+234
+235
+236
+239
+240
+242
+244
+244
+245
+244
+244
+245
+244
+244
+245
+234
+235
+236
+220
+221
+222
+192
+192
+195
+157
+162
+180
+112
+117
+158
+96
+110
+155
+101
+112
+157
+101
+112
+157
+96
+110
+155
+96
+110
+155
+96
+110
+155
+101
+112
+157
+101
+112
+157
+101
+116
+162
+101
+116
+162
+107
+122
+163
+117
+128
+167
+121
+135
+173
+120
+133
+172
+111
+123
+164
+101
+116
+162
+104
+119
+165
+104
+119
+165
+107
+122
+163
+127
+141
+181
+165
+170
+188
+195
+200
+215
+207
+210
+224
+191
+196
+216
+140
+156
+194
+119
+135
+179
+188
+87
+73
+222
+47
+36
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+223
+224
+225
+224
+226
+227
+224
+225
+226
+226
+227
+228
+226
+227
+228
+227
+228
+229
+224
+226
+227
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+223
+224
+225
+226
+227
+228
+228
+229
+230
+228
+229
+230
+228
+229
+230
+230
+231
+232
+230
+231
+232
+231
+232
+233
+232
+233
+234
+231
+232
+233
+232
+233
+234
+234
+235
+236
+239
+240
+242
+244
+244
+245
+244
+244
+245
+244
+244
+245
+221
+222
+227
+172
+179
+196
+121
+135
+173
+120
+133
+172
+113
+125
+165
+101
+112
+157
+101
+112
+157
+96
+110
+155
+96
+110
+155
+96
+110
+155
+101
+112
+157
+101
+112
+157
+101
+112
+157
+110
+120
+164
+113
+125
+165
+101
+112
+157
+96
+110
+155
+101
+112
+157
+120
+133
+172
+145
+152
+186
+172
+179
+196
+200
+202
+216
+195
+200
+215
+172
+179
+196
+131
+147
+192
+112
+130
+177
+108
+137
+186
+113
+131
+175
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+223
+224
+225
+224
+226
+227
+224
+225
+226
+226
+227
+228
+224
+225
+226
+224
+226
+227
+223
+224
+225
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+221
+221
+222
+224
+226
+227
+226
+227
+228
+226
+227
+228
+228
+229
+230
+228
+229
+230
+228
+229
+230
+230
+231
+232
+230
+231
+232
+230
+231
+232
+231
+232
+233
+232
+233
+234
+232
+233
+234
+234
+235
+236
+239
+240
+242
+239
+240
+242
+221
+222
+227
+207
+210
+224
+210
+216
+228
+191
+196
+216
+160
+166
+188
+127
+141
+181
+117
+128
+167
+112
+117
+158
+101
+112
+157
+96
+110
+155
+101
+112
+157
+101
+112
+157
+96
+110
+155
+96
+110
+155
+101
+112
+157
+101
+112
+157
+117
+128
+167
+146
+158
+190
+191
+196
+216
+207
+210
+224
+191
+196
+216
+149
+161
+191
+118
+136
+180
+108
+126
+179
+108
+126
+179
+114
+133
+178
+108
+137
+186
+119
+135
+179
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+223
+224
+225
+223
+224
+225
+224
+225
+226
+224
+225
+226
+224
+225
+226
+224
+226
+227
+222
+223
+224
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+220
+221
+222
+224
+225
+226
+224
+225
+226
+224
+226
+227
+226
+227
+228
+226
+227
+228
+228
+229
+230
+228
+229
+230
+228
+229
+230
+230
+231
+232
+230
+231
+232
+231
+232
+233
+231
+232
+233
+232
+233
+234
+232
+233
+234
+232
+233
+234
+200
+202
+216
+121
+146
+192
+109
+140
+191
+172
+179
+196
+191
+196
+216
+207
+210
+224
+186
+187
+194
+157
+162
+180
+121
+135
+173
+112
+117
+158
+101
+112
+157
+96
+110
+155
+96
+110
+155
+107
+122
+163
+127
+141
+181
+160
+166
+188
+191
+196
+216
+207
+210
+224
+191
+196
+216
+140
+156
+194
+112
+130
+177
+108
+126
+179
+108
+126
+179
+114
+133
+178
+118
+136
+180
+119
+135
+179
+108
+137
+186
+119
+135
+179
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+221
+221
+222
+222
+223
+224
+224
+225
+226
+223
+224
+225
+224
+225
+226
+224
+225
+226
+221
+221
+222
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+220
+221
+222
+224
+225
+226
+223
+224
+225
+222
+223
+224
+223
+224
+225
+226
+227
+228
+226
+227
+228
+227
+228
+229
+227
+228
+229
+230
+231
+232
+230
+231
+232
+230
+231
+232
+230
+231
+232
+230
+231
+232
+231
+232
+233
+231
+232
+233
+200
+202
+216
+119
+140
+184
+108
+126
+179
+140
+156
+194
+140
+156
+194
+140
+156
+194
+191
+196
+216
+200
+202
+216
+195
+200
+215
+172
+179
+196
+127
+143
+182
+121
+135
+173
+145
+152
+186
+172
+179
+196
+200
+202
+216
+207
+210
+224
+172
+179
+196
+122
+138
+181
+119
+135
+179
+114
+133
+178
+111
+128
+175
+112
+130
+177
+119
+135
+179
+114
+133
+178
+114
+133
+178
+119
+135
+179
+108
+137
+186
+118
+134
+176
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+220
+221
+222
+222
+223
+224
+222
+223
+224
+223
+224
+225
+224
+225
+226
+223
+224
+225
+220
+221
+222
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+217
+218
+219
+222
+223
+224
+226
+227
+228
+227
+228
+229
+224
+225
+226
+222
+223
+224
+222
+223
+224
+224
+226
+227
+227
+228
+229
+227
+228
+229
+228
+229
+230
+230
+231
+232
+230
+231
+232
+230
+231
+232
+230
+231
+232
+230
+231
+232
+200
+202
+216
+121
+146
+192
+108
+126
+179
+121
+146
+192
+131
+147
+192
+131
+147
+192
+131
+147
+192
+140
+156
+194
+172
+179
+196
+191
+196
+216
+207
+210
+224
+207
+210
+224
+207
+210
+224
+191
+196
+216
+157
+164
+189
+114
+133
+178
+108
+123
+172
+108
+123
+172
+122
+138
+181
+122
+138
+181
+114
+133
+178
+112
+130
+177
+114
+133
+178
+114
+133
+178
+114
+133
+178
+114
+133
+178
+108
+137
+186
+116
+132
+174
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+220
+221
+222
+222
+223
+224
+221
+221
+222
+222
+223
+224
+222
+223
+224
+223
+224
+225
+220
+221
+222
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+215
+216
+217
+220
+221
+222
+228
+229
+230
+234
+235
+236
+232
+233
+234
+228
+229
+230
+224
+225
+226
+221
+221
+222
+223
+224
+225
+224
+226
+227
+227
+228
+229
+228
+229
+230
+228
+229
+230
+230
+231
+232
+230
+231
+232
+230
+231
+232
+200
+202
+216
+121
+146
+192
+108
+126
+179
+108
+126
+179
+108
+126
+179
+109
+140
+191
+131
+147
+192
+140
+156
+194
+131
+147
+192
+131
+147
+192
+191
+196
+216
+221
+222
+227
+140
+156
+194
+112
+128
+175
+119
+135
+179
+111
+128
+175
+111
+128
+175
+112
+130
+177
+122
+138
+181
+120
+137
+180
+112
+130
+177
+112
+130
+177
+112
+130
+177
+114
+133
+178
+114
+133
+178
+114
+133
+178
+108
+137
+186
+116
+132
+174
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+220
+221
+222
+220
+221
+222
+221
+221
+222
+222
+223
+224
+221
+221
+222
+223
+224
+225
+218
+219
+220
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+212
+214
+216
+218
+219
+220
+221
+221
+222
+227
+228
+229
+232
+233
+234
+234
+235
+236
+234
+235
+236
+228
+229
+231
+226
+227
+228
+222
+223
+224
+222
+223
+224
+224
+226
+227
+227
+228
+229
+228
+229
+230
+228
+229
+230
+228
+229
+230
+200
+202
+216
+121
+146
+192
+108
+137
+186
+131
+147
+192
+109
+140
+191
+108
+126
+179
+108
+126
+179
+108
+137
+186
+121
+146
+192
+131
+147
+192
+172
+179
+196
+211
+218
+231
+115
+130
+174
+108
+123
+172
+122
+138
+181
+114
+133
+178
+112
+128
+175
+112
+128
+175
+122
+138
+181
+120
+137
+180
+112
+130
+177
+112
+128
+175
+112
+130
+177
+112
+130
+177
+114
+133
+178
+114
+133
+178
+108
+137
+186
+116
+132
+174
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+218
+219
+220
+218
+219
+220
+220
+221
+222
+220
+221
+222
+221
+221
+222
+221
+221
+222
+217
+218
+219
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+212
+214
+216
+217
+218
+219
+218
+219
+220
+218
+219
+220
+218
+219
+220
+224
+225
+226
+230
+231
+232
+234
+235
+236
+234
+235
+236
+231
+232
+233
+227
+228
+229
+224
+225
+226
+222
+223
+224
+224
+225
+226
+226
+227
+228
+227
+228
+229
+195
+200
+215
+119
+140
+184
+108
+126
+179
+135
+149
+191
+140
+156
+194
+131
+147
+192
+121
+146
+192
+108
+137
+186
+108
+126
+179
+108
+126
+179
+149
+161
+191
+221
+222
+227
+127
+141
+181
+112
+128
+175
+118
+134
+176
+112
+128
+175
+110
+126
+172
+111
+128
+175
+119
+135
+179
+119
+135
+179
+112
+128
+175
+112
+128
+175
+112
+130
+177
+112
+130
+177
+112
+130
+177
+115
+130
+174
+108
+137
+186
+116
+132
+174
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+212
+197
+195
+216
+217
+218
+218
+219
+220
+218
+219
+220
+218
+219
+220
+220
+221
+222
+218
+219
+220
+217
+218
+219
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+212
+213
+215
+217
+218
+219
+218
+219
+220
+217
+218
+219
+217
+218
+219
+218
+219
+220
+218
+219
+220
+223
+224
+225
+228
+229
+230
+232
+233
+234
+234
+235
+236
+232
+233
+234
+228
+229
+230
+224
+225
+226
+222
+223
+224
+223
+224
+225
+195
+200
+215
+119
+140
+184
+99
+118
+168
+108
+126
+179
+118
+139
+183
+131
+147
+192
+140
+156
+194
+140
+156
+194
+121
+146
+192
+108
+137
+186
+140
+156
+194
+210
+216
+228
+115
+130
+174
+108
+123
+172
+115
+130
+174
+111
+128
+175
+111
+128
+175
+115
+130
+174
+127
+141
+181
+127
+141
+181
+114
+133
+178
+112
+128
+175
+111
+128
+175
+111
+128
+175
+112
+130
+177
+115
+130
+174
+108
+137
+186
+116
+132
+174
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+215
+216
+217
+218
+219
+220
+218
+219
+220
+218
+219
+220
+218
+219
+220
+218
+219
+220
+217
+218
+219
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+210
+211
+213
+216
+217
+218
+216
+217
+218
+217
+218
+219
+218
+219
+220
+217
+218
+219
+217
+218
+219
+217
+218
+219
+218
+219
+220
+221
+221
+222
+227
+228
+229
+232
+233
+234
+234
+235
+236
+234
+235
+236
+231
+232
+233
+224
+226
+227
+191
+196
+216
+108
+126
+179
+99
+118
+168
+108
+126
+179
+108
+126
+179
+108
+126
+179
+108
+137
+186
+121
+146
+192
+131
+147
+192
+140
+156
+194
+172
+179
+196
+210
+216
+228
+111
+125
+171
+107
+122
+171
+118
+134
+176
+118
+134
+176
+119
+135
+179
+122
+138
+181
+122
+138
+181
+123
+140
+181
+123
+140
+181
+122
+138
+181
+119
+135
+179
+112
+128
+175
+111
+128
+175
+112
+128
+175
+108
+126
+179
+115
+130
+174
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+215
+216
+217
+216
+217
+218
+217
+218
+219
+218
+219
+220
+218
+219
+220
+218
+219
+220
+215
+216
+217
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+208
+209
+211
+215
+216
+217
+212
+214
+216
+212
+213
+215
+215
+216
+217
+217
+218
+219
+218
+219
+220
+217
+218
+219
+217
+218
+219
+217
+218
+219
+218
+219
+220
+220
+221
+222
+224
+225
+226
+231
+232
+233
+234
+235
+236
+234
+235
+236
+207
+210
+224
+131
+147
+192
+99
+118
+168
+99
+118
+168
+99
+118
+168
+108
+126
+179
+108
+126
+179
+108
+126
+179
+108
+126
+179
+109
+140
+191
+172
+179
+196
+211
+218
+231
+127
+141
+181
+115
+130
+174
+127
+141
+181
+127
+141
+181
+119
+135
+179
+115
+130
+174
+110
+126
+172
+110
+126
+172
+115
+130
+174
+120
+137
+180
+127
+141
+181
+122
+138
+181
+119
+135
+179
+118
+134
+176
+108
+126
+179
+111
+127
+170
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+215
+216
+217
+216
+217
+218
+216
+217
+218
+216
+217
+218
+216
+217
+218
+218
+219
+220
+212
+214
+216
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+207
+208
+210
+212
+213
+215
+217
+218
+219
+218
+219
+220
+210
+212
+214
+210
+211
+213
+215
+216
+217
+217
+218
+219
+217
+218
+219
+217
+218
+219
+217
+218
+219
+217
+218
+219
+218
+219
+220
+220
+221
+222
+223
+224
+225
+227
+228
+229
+210
+216
+228
+172
+179
+196
+140
+156
+194
+131
+147
+192
+108
+137
+186
+99
+118
+168
+99
+118
+168
+108
+126
+179
+108
+126
+179
+108
+126
+179
+140
+156
+194
+211
+218
+231
+127
+143
+182
+119
+134
+175
+119
+135
+179
+112
+128
+175
+108
+123
+172
+108
+123
+172
+107
+122
+171
+108
+123
+172
+107
+122
+171
+110
+126
+172
+112
+130
+177
+119
+135
+179
+122
+138
+181
+127
+141
+181
+118
+139
+183
+119
+134
+175
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+212
+214
+216
+215
+216
+217
+216
+217
+218
+216
+217
+218
+216
+217
+218
+216
+217
+218
+212
+213
+215
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+206
+206
+208
+210
+212
+214
+220
+221
+222
+230
+231
+232
+226
+227
+228
+218
+219
+220
+212
+214
+216
+212
+213
+215
+215
+216
+217
+217
+218
+219
+218
+219
+220
+218
+219
+220
+218
+219
+220
+218
+219
+220
+218
+219
+220
+218
+219
+220
+195
+200
+215
+135
+149
+191
+135
+149
+191
+172
+179
+196
+157
+164
+189
+140
+156
+194
+114
+133
+178
+108
+126
+179
+108
+126
+179
+118
+139
+183
+157
+164
+189
+210
+216
+228
+118
+131
+173
+107
+122
+171
+108
+123
+172
+107
+122
+171
+107
+122
+171
+107
+122
+171
+108
+123
+172
+108
+123
+172
+108
+123
+172
+108
+123
+172
+107
+122
+171
+108
+123
+172
+112
+128
+175
+118
+131
+173
+118
+139
+183
+127
+141
+181
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+212
+213
+215
+212
+214
+216
+215
+216
+217
+215
+216
+217
+216
+217
+218
+216
+217
+218
+212
+213
+215
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+202
+204
+207
+209
+210
+212
+215
+216
+217
+222
+223
+224
+228
+229
+230
+228
+229
+231
+227
+228
+229
+222
+223
+224
+215
+216
+217
+212
+213
+215
+212
+213
+215
+216
+217
+218
+218
+219
+220
+218
+219
+220
+217
+218
+219
+217
+218
+219
+191
+196
+216
+112
+130
+177
+99
+118
+168
+108
+126
+179
+135
+149
+191
+172
+179
+196
+172
+179
+196
+172
+179
+196
+131
+147
+192
+108
+126
+179
+149
+161
+191
+207
+210
+224
+109
+123
+169
+101
+118
+168
+107
+122
+171
+107
+122
+171
+107
+122
+171
+107
+122
+171
+107
+122
+171
+107
+122
+171
+108
+123
+172
+108
+123
+172
+108
+123
+172
+108
+123
+172
+107
+122
+171
+108
+123
+172
+108
+126
+179
+114
+129
+171
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+212
+213
+215
+212
+213
+215
+212
+214
+216
+212
+214
+216
+215
+216
+217
+215
+216
+217
+210
+212
+214
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+202
+203
+205
+208
+209
+211
+208
+209
+211
+209
+210
+212
+212
+213
+215
+217
+218
+219
+224
+226
+227
+228
+229
+231
+228
+229
+230
+224
+225
+226
+218
+219
+220
+212
+213
+215
+212
+213
+215
+216
+217
+218
+218
+219
+220
+217
+218
+219
+194
+197
+208
+108
+126
+179
+99
+118
+168
+108
+126
+179
+108
+126
+179
+119
+140
+184
+191
+196
+216
+234
+235
+236
+172
+179
+196
+108
+126
+179
+131
+147
+192
+207
+210
+224
+109
+123
+169
+101
+118
+168
+108
+123
+172
+107
+122
+171
+107
+122
+171
+107
+122
+171
+107
+122
+171
+107
+122
+171
+108
+123
+172
+107
+122
+171
+108
+123
+172
+108
+123
+172
+108
+123
+172
+108
+123
+172
+105
+121
+170
+109
+124
+168
+173
+85
+78
+220
+44
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+210
+212
+214
+212
+213
+215
+212
+214
+216
+212
+213
+215
+212
+214
+216
+212
+214
+216
+210
+211
+213
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+201
+202
+204
+208
+209
+211
+208
+209
+211
+208
+209
+211
+208
+209
+211
+207
+208
+210
+210
+212
+214
+217
+218
+219
+222
+223
+224
+227
+228
+229
+228
+229
+231
+226
+227
+228
+221
+221
+222
+215
+216
+217
+212
+213
+215
+212
+214
+216
+194
+197
+208
+122
+138
+181
+108
+126
+179
+118
+136
+180
+108
+126
+179
+105
+121
+170
+140
+156
+194
+210
+216
+228
+172
+179
+196
+108
+126
+179
+131
+147
+192
+207
+210
+224
+109
+123
+169
+99
+115
+164
+105
+121
+170
+105
+121
+170
+107
+122
+171
+107
+122
+171
+107
+122
+171
+107
+122
+171
+107
+122
+171
+107
+122
+171
+108
+123
+172
+107
+122
+171
+108
+123
+172
+107
+122
+171
+99
+118
+168
+105
+121
+166
+173
+85
+78
+221
+43
+35
+227
+48
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+210
+211
+213
+212
+213
+215
+212
+213
+215
+212
+213
+215
+212
+214
+216
+212
+213
+215
+209
+210
+212
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+200
+201
+203
+207
+208
+210
+207
+208
+210
+209
+210
+212
+209
+210
+212
+207
+208
+210
+208
+209
+211
+209
+210
+212
+209
+210
+212
+215
+216
+217
+220
+221
+222
+226
+227
+228
+228
+229
+231
+228
+229
+230
+224
+225
+226
+216
+217
+218
+195
+199
+206
+137
+154
+190
+108
+126
+179
+105
+121
+170
+99
+118
+168
+99
+118
+168
+108
+126
+179
+121
+146
+192
+109
+140
+191
+108
+126
+179
+135
+149
+191
+207
+210
+224
+109
+123
+169
+99
+115
+164
+105
+121
+170
+105
+121
+170
+105
+121
+170
+105
+121
+170
+107
+122
+171
+107
+122
+171
+107
+122
+171
+107
+122
+171
+105
+121
+170
+105
+121
+170
+105
+121
+170
+105
+121
+170
+108
+126
+179
+120
+133
+172
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+210
+211
+213
+210
+212
+214
+210
+212
+214
+212
+213
+215
+212
+213
+215
+212
+213
+215
+209
+210
+212
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+196
+198
+200
+206
+206
+208
+202
+204
+207
+202
+204
+207
+207
+208
+210
+209
+210
+212
+209
+210
+212
+208
+209
+211
+208
+209
+211
+209
+210
+212
+209
+210
+212
+212
+213
+215
+218
+219
+220
+224
+226
+227
+228
+229
+231
+228
+229
+231
+217
+220
+223
+157
+164
+189
+108
+126
+179
+99
+118
+168
+99
+118
+168
+99
+118
+168
+99
+118
+168
+99
+118
+168
+99
+118
+168
+99
+118
+168
+135
+149
+191
+207
+210
+224
+109
+123
+169
+99
+115
+164
+105
+121
+170
+105
+121
+170
+105
+121
+170
+105
+121
+170
+105
+121
+170
+105
+120
+168
+107
+122
+171
+105
+121
+170
+102
+119
+168
+105
+121
+170
+112
+128
+175
+119
+134
+175
+119
+140
+184
+127
+141
+181
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+209
+210
+212
+209
+210
+212
+210
+212
+214
+210
+212
+214
+210
+212
+214
+212
+213
+215
+208
+209
+211
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+195
+197
+199
+202
+204
+207
+207
+208
+210
+206
+206
+208
+201
+202
+204
+202
+203
+205
+207
+208
+210
+209
+210
+212
+209
+210
+212
+208
+209
+211
+208
+209
+211
+209
+210
+212
+210
+211
+213
+210
+212
+214
+215
+216
+217
+221
+221
+222
+211
+212
+222
+172
+179
+196
+149
+161
+191
+135
+149
+191
+112
+130
+177
+99
+118
+168
+99
+118
+168
+99
+118
+168
+99
+118
+168
+99
+118
+168
+135
+149
+191
+207
+210
+224
+109
+122
+168
+99
+115
+164
+105
+120
+168
+105
+120
+168
+105
+120
+168
+105
+120
+168
+105
+120
+168
+101
+118
+168
+101
+118
+168
+107
+122
+171
+116
+132
+174
+120
+137
+180
+127
+141
+181
+127
+141
+181
+122
+145
+187
+121
+135
+173
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+207
+208
+210
+209
+210
+212
+210
+211
+213
+209
+210
+212
+210
+211
+213
+210
+212
+214
+207
+208
+210
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+195
+197
+199
+202
+203
+205
+212
+213
+215
+224
+225
+226
+218
+219
+220
+209
+210
+212
+201
+202
+204
+201
+202
+204
+206
+206
+208
+209
+210
+212
+208
+209
+211
+208
+209
+211
+208
+209
+211
+209
+210
+212
+210
+211
+213
+209
+210
+212
+194
+197
+208
+140
+156
+194
+140
+156
+194
+172
+179
+196
+172
+179
+196
+149
+161
+191
+126
+144
+186
+112
+130
+177
+105
+121
+170
+99
+118
+168
+135
+149
+191
+207
+210
+224
+105
+121
+166
+99
+115
+164
+105
+120
+168
+105
+120
+168
+102
+119
+168
+101
+118
+168
+105
+120
+168
+110
+126
+172
+119
+134
+175
+127
+141
+181
+127
+141
+181
+127
+141
+181
+127
+141
+181
+121
+135
+173
+117
+136
+179
+121
+135
+173
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+206
+206
+208
+208
+209
+211
+208
+209
+211
+209
+210
+212
+209
+210
+212
+209
+210
+212
+206
+206
+208
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+195
+197
+199
+201
+202
+204
+206
+206
+208
+215
+216
+217
+222
+223
+224
+224
+225
+226
+221
+221
+222
+215
+216
+217
+206
+206
+208
+201
+202
+204
+202
+204
+207
+208
+209
+211
+209
+210
+212
+209
+210
+212
+209
+210
+212
+208
+209
+211
+189
+191
+197
+122
+138
+181
+108
+123
+172
+126
+144
+186
+149
+161
+191
+172
+179
+196
+157
+164
+189
+126
+144
+186
+114
+133
+178
+118
+136
+180
+149
+161
+191
+207
+210
+224
+104
+119
+165
+98
+114
+161
+104
+119
+165
+99
+115
+164
+102
+119
+168
+114
+128
+171
+127
+141
+181
+127
+143
+182
+127
+141
+181
+127
+141
+181
+127
+141
+181
+121
+135
+173
+119
+134
+175
+120
+132
+173
+117
+136
+179
+121
+135
+173
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+206
+206
+208
+207
+208
+210
+207
+208
+210
+208
+209
+211
+208
+209
+211
+208
+209
+211
+202
+204
+207
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+192
+192
+195
+200
+201
+203
+200
+201
+203
+200
+201
+203
+206
+206
+208
+212
+213
+215
+218
+219
+220
+224
+225
+226
+224
+225
+226
+217
+218
+219
+208
+209
+211
+202
+203
+205
+202
+203
+205
+206
+206
+208
+209
+210
+212
+208
+209
+211
+189
+191
+197
+122
+138
+181
+105
+121
+170
+111
+128
+175
+112
+130
+177
+125
+142
+182
+131
+147
+192
+120
+137
+180
+114
+133
+178
+120
+137
+180
+149
+161
+191
+210
+216
+228
+120
+133
+172
+105
+121
+166
+109
+124
+168
+116
+130
+170
+127
+141
+181
+127
+141
+181
+134
+150
+190
+127
+141
+181
+121
+135
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+119
+134
+175
+121
+135
+173
+117
+136
+179
+121
+135
+173
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+206
+206
+208
+206
+206
+208
+207
+208
+210
+207
+208
+210
+207
+208
+210
+208
+209
+211
+202
+204
+207
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+192
+192
+195
+196
+198
+200
+199
+201
+202
+199
+201
+202
+199
+201
+202
+199
+201
+202
+202
+203
+205
+208
+209
+211
+216
+217
+218
+222
+223
+224
+224
+226
+227
+221
+221
+222
+212
+213
+215
+202
+204
+207
+202
+203
+205
+202
+204
+207
+189
+191
+197
+127
+141
+181
+108
+123
+172
+108
+123
+172
+108
+123
+172
+111
+128
+175
+112
+130
+177
+114
+133
+178
+112
+130
+177
+114
+133
+178
+149
+161
+191
+210
+216
+228
+127
+141
+181
+121
+135
+173
+127
+141
+181
+127
+141
+181
+127
+141
+181
+121
+135
+173
+127
+141
+181
+121
+135
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+115
+134
+176
+120
+133
+172
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+206
+206
+208
+206
+206
+208
+206
+206
+208
+206
+206
+208
+207
+208
+210
+207
+208
+210
+202
+204
+207
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+189
+189
+191
+196
+198
+200
+196
+198
+200
+199
+201
+202
+196
+198
+200
+199
+201
+202
+199
+201
+202
+199
+201
+202
+201
+202
+204
+207
+208
+210
+212
+214
+216
+220
+221
+222
+224
+226
+227
+224
+225
+226
+216
+217
+218
+206
+206
+208
+172
+179
+196
+118
+131
+173
+114
+133
+178
+135
+149
+191
+123
+140
+181
+111
+128
+175
+108
+123
+172
+112
+130
+177
+112
+130
+177
+112
+130
+177
+149
+161
+191
+207
+210
+224
+120
+133
+172
+120
+133
+172
+127
+141
+181
+121
+135
+173
+117
+130
+172
+116
+130
+170
+121
+135
+173
+121
+135
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+115
+134
+176
+120
+133
+172
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+202
+204
+207
+202
+204
+207
+206
+206
+208
+206
+206
+208
+206
+206
+208
+206
+206
+208
+201
+202
+204
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+189
+189
+191
+195
+197
+199
+195
+197
+199
+196
+198
+200
+196
+198
+200
+199
+201
+202
+199
+201
+202
+199
+201
+202
+200
+201
+203
+199
+201
+202
+200
+201
+203
+206
+206
+208
+210
+212
+214
+218
+219
+220
+224
+225
+226
+226
+227
+228
+200
+202
+216
+127
+141
+181
+102
+119
+168
+119
+135
+179
+135
+149
+191
+137
+154
+190
+127
+143
+182
+114
+133
+178
+108
+123
+172
+108
+126
+179
+146
+158
+190
+207
+210
+224
+116
+130
+170
+117
+128
+167
+121
+135
+173
+117
+130
+172
+116
+126
+167
+116
+130
+170
+121
+135
+173
+121
+135
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+115
+134
+176
+120
+133
+172
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+201
+202
+204
+202
+204
+207
+202
+204
+207
+202
+204
+207
+206
+206
+208
+206
+206
+208
+200
+201
+203
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+189
+189
+191
+195
+197
+199
+195
+197
+199
+195
+197
+199
+195
+197
+199
+196
+198
+200
+196
+198
+200
+199
+201
+202
+199
+201
+202
+199
+201
+202
+200
+201
+203
+200
+201
+203
+200
+201
+203
+202
+203
+205
+208
+209
+211
+215
+216
+217
+206
+208
+218
+172
+179
+196
+140
+156
+194
+127
+141
+181
+112
+128
+175
+116
+132
+174
+127
+143
+182
+137
+154
+190
+127
+143
+182
+114
+133
+178
+145
+152
+186
+207
+210
+224
+117
+128
+167
+116
+126
+167
+121
+135
+173
+116
+130
+170
+117
+128
+167
+117
+130
+172
+121
+135
+173
+121
+135
+173
+120
+132
+173
+118
+131
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+120
+132
+173
+115
+134
+176
+120
+133
+172
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+200
+201
+203
+202
+203
+205
+202
+204
+207
+202
+204
+207
+202
+204
+207
+202
+204
+207
+199
+201
+202
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+185
+187
+189
+193
+195
+197
+193
+195
+197
+193
+195
+197
+195
+197
+199
+195
+197
+199
+195
+197
+199
+196
+198
+200
+196
+198
+200
+199
+201
+202
+199
+201
+202
+199
+201
+202
+200
+201
+203
+200
+201
+203
+199
+201
+202
+199
+201
+202
+186
+187
+194
+145
+152
+186
+149
+161
+191
+172
+179
+196
+157
+164
+189
+131
+147
+192
+119
+135
+179
+122
+138
+181
+126
+144
+186
+135
+149
+191
+167
+174
+189
+207
+210
+224
+117
+128
+167
+117
+128
+167
+121
+135
+173
+120
+133
+172
+116
+126
+167
+117
+128
+167
+121
+135
+173
+121
+135
+173
+116
+130
+170
+117
+130
+172
+117
+130
+172
+118
+131
+173
+120
+132
+173
+120
+132
+173
+115
+134
+176
+120
+133
+172
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+199
+201
+202
+201
+202
+204
+202
+203
+205
+202
+203
+205
+202
+204
+207
+202
+204
+207
+199
+201
+202
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+182
+184
+187
+192
+192
+195
+193
+195
+197
+193
+195
+197
+193
+195
+197
+193
+195
+197
+195
+197
+199
+195
+197
+199
+195
+197
+199
+196
+198
+200
+196
+198
+200
+199
+201
+202
+199
+201
+202
+199
+201
+202
+200
+201
+203
+199
+201
+202
+178
+179
+191
+118
+131
+173
+112
+128
+175
+149
+161
+191
+172
+179
+196
+191
+196
+216
+146
+158
+190
+108
+123
+172
+105
+121
+170
+119
+135
+179
+157
+164
+189
+211
+218
+231
+121
+135
+173
+117
+128
+167
+121
+135
+173
+116
+130
+170
+116
+126
+167
+117
+128
+167
+120
+133
+172
+120
+133
+172
+117
+128
+167
+116
+130
+170
+116
+130
+170
+117
+130
+172
+117
+130
+172
+120
+132
+173
+114
+133
+175
+120
+133
+172
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+199
+201
+202
+201
+202
+204
+201
+202
+204
+201
+202
+204
+202
+203
+205
+202
+203
+205
+196
+198
+200
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+182
+182
+186
+192
+192
+195
+192
+192
+195
+192
+192
+195
+192
+192
+195
+193
+195
+197
+195
+197
+199
+193
+195
+197
+195
+197
+199
+195
+197
+199
+195
+197
+199
+196
+198
+200
+196
+198
+200
+199
+201
+202
+199
+201
+202
+199
+201
+202
+178
+179
+191
+119
+134
+175
+101
+118
+168
+115
+130
+174
+127
+141
+181
+149
+161
+191
+157
+164
+189
+127
+143
+182
+108
+123
+172
+102
+119
+168
+135
+149
+191
+210
+216
+228
+121
+135
+173
+116
+126
+167
+121
+135
+173
+117
+128
+167
+116
+126
+167
+117
+128
+167
+120
+133
+172
+120
+133
+172
+117
+128
+167
+116
+130
+170
+117
+128
+167
+116
+130
+170
+116
+130
+170
+120
+132
+173
+112
+132
+175
+117
+128
+167
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+196
+198
+200
+200
+201
+203
+200
+201
+203
+201
+202
+204
+201
+202
+204
+201
+202
+204
+196
+198
+200
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+180
+182
+185
+189
+189
+191
+189
+189
+191
+189
+189
+191
+192
+192
+195
+192
+192
+195
+192
+192
+195
+193
+195
+197
+195
+197
+199
+193
+195
+197
+195
+197
+199
+196
+198
+200
+196
+198
+200
+196
+198
+200
+196
+198
+200
+196
+198
+200
+178
+179
+191
+119
+134
+175
+101
+118
+168
+102
+119
+168
+102
+119
+168
+110
+126
+172
+119
+135
+179
+134
+150
+190
+134
+150
+190
+127
+143
+182
+149
+161
+191
+207
+210
+224
+113
+125
+165
+113
+125
+165
+120
+133
+172
+117
+128
+167
+116
+126
+167
+116
+126
+167
+120
+133
+172
+120
+133
+172
+117
+128
+167
+117
+128
+167
+117
+128
+167
+117
+128
+167
+117
+128
+167
+117
+128
+167
+112
+132
+175
+117
+128
+167
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+196
+198
+200
+199
+201
+202
+199
+201
+202
+200
+201
+203
+200
+201
+203
+200
+201
+203
+195
+197
+199
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+178
+179
+182
+189
+189
+191
+189
+189
+191
+189
+189
+191
+189
+189
+191
+189
+189
+191
+192
+192
+195
+192
+192
+195
+192
+192
+195
+193
+195
+197
+195
+197
+199
+193
+195
+197
+195
+197
+199
+196
+198
+200
+196
+198
+200
+196
+198
+200
+171
+175
+188
+118
+131
+173
+114
+129
+172
+127
+141
+181
+119
+134
+175
+105
+120
+168
+102
+119
+168
+112
+128
+175
+118
+134
+176
+127
+141
+181
+160
+166
+188
+210
+216
+228
+120
+133
+172
+113
+125
+165
+120
+133
+172
+117
+128
+167
+113
+125
+165
+116
+126
+167
+120
+133
+172
+120
+133
+172
+117
+128
+167
+117
+128
+167
+117
+128
+167
+117
+128
+167
+117
+128
+167
+117
+128
+167
+112
+132
+175
+117
+128
+167
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+196
+198
+200
+199
+201
+202
+199
+201
+202
+199
+201
+202
+199
+201
+202
+200
+201
+203
+195
+197
+199
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+178
+179
+182
+185
+187
+189
+185
+187
+189
+189
+189
+191
+189
+189
+191
+189
+189
+191
+189
+189
+191
+189
+189
+191
+192
+192
+195
+192
+192
+195
+189
+189
+191
+189
+189
+191
+189
+189
+191
+189
+189
+191
+192
+192
+195
+193
+195
+197
+171
+175
+188
+118
+131
+173
+105
+121
+170
+123
+140
+181
+127
+143
+182
+134
+150
+190
+122
+138
+181
+109
+123
+169
+99
+115
+164
+105
+121
+170
+145
+152
+186
+210
+216
+228
+120
+133
+172
+113
+125
+165
+117
+128
+167
+116
+126
+167
+113
+125
+165
+113
+125
+165
+120
+133
+172
+120
+133
+172
+116
+126
+167
+113
+125
+165
+117
+128
+167
+117
+128
+167
+117
+128
+167
+117
+128
+167
+110
+128
+172
+117
+128
+167
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+195
+197
+199
+196
+198
+200
+199
+201
+202
+199
+201
+202
+199
+201
+202
+199
+201
+202
+193
+195
+197
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+177
+179
+181
+185
+187
+189
+185
+187
+189
+185
+187
+189
+189
+189
+191
+189
+189
+191
+189
+189
+191
+189
+189
+191
+192
+192
+195
+189
+189
+191
+163
+169
+184
+119
+135
+179
+112
+128
+175
+127
+141
+181
+177
+180
+190
+193
+195
+197
+181
+184
+191
+120
+132
+173
+101
+118
+168
+109
+123
+169
+112
+125
+170
+119
+134
+175
+127
+141
+181
+134
+150
+190
+127
+141
+181
+112
+128
+175
+127
+143
+182
+207
+210
+224
+116
+126
+167
+111
+123
+164
+117
+128
+167
+113
+125
+165
+117
+128
+167
+120
+133
+172
+121
+135
+173
+121
+135
+173
+120
+133
+172
+117
+128
+167
+116
+126
+167
+113
+125
+165
+117
+128
+167
+117
+128
+167
+110
+128
+172
+117
+128
+167
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+199
+201
+202
+196
+198
+200
+196
+198
+200
+196
+198
+200
+199
+201
+202
+199
+201
+202
+193
+195
+197
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+177
+179
+181
+185
+187
+189
+185
+187
+189
+185
+187
+189
+185
+187
+189
+185
+187
+189
+185
+187
+189
+189
+189
+191
+185
+187
+189
+163
+169
+184
+108
+123
+172
+121
+146
+192
+191
+196
+216
+131
+147
+192
+119
+135
+179
+178
+179
+191
+181
+184
+191
+117
+130
+172
+99
+115
+164
+107
+121
+168
+107
+121
+168
+107
+121
+168
+112
+125
+170
+118
+131
+173
+122
+138
+181
+127
+143
+182
+160
+166
+188
+207
+210
+224
+111
+123
+164
+111
+123
+164
+120
+133
+172
+120
+133
+172
+120
+133
+172
+117
+128
+167
+117
+128
+167
+117
+128
+167
+117
+128
+167
+120
+133
+172
+121
+135
+173
+120
+133
+172
+117
+128
+167
+113
+125
+165
+110
+128
+172
+117
+128
+167
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+192
+181
+183
+199
+201
+202
+202
+203
+205
+202
+203
+205
+200
+201
+203
+196
+198
+200
+196
+198
+200
+193
+195
+197
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+173
+172
+176
+182
+182
+186
+182
+182
+186
+182
+184
+187
+185
+187
+189
+185
+187
+189
+185
+187
+189
+185
+187
+189
+182
+182
+186
+127
+141
+181
+99
+115
+164
+149
+161
+191
+244
+244
+245
+191
+196
+216
+108
+137
+186
+134
+150
+190
+174
+177
+185
+120
+133
+172
+98
+114
+161
+105
+121
+166
+107
+121
+168
+107
+121
+168
+107
+121
+168
+107
+121
+168
+107
+121
+168
+111
+125
+171
+145
+152
+186
+210
+216
+228
+127
+141
+181
+117
+128
+167
+120
+133
+172
+117
+128
+167
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+117
+128
+167
+117
+128
+167
+120
+133
+172
+121
+135
+173
+121
+135
+173
+114
+129
+171
+113
+125
+165
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+180
+177
+178
+193
+195
+197
+196
+198
+200
+200
+201
+203
+202
+203
+205
+206
+206
+208
+202
+203
+205
+192
+192
+195
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+173
+172
+176
+182
+182
+186
+180
+182
+185
+182
+182
+186
+182
+182
+186
+182
+184
+187
+182
+184
+187
+185
+187
+189
+189
+189
+191
+145
+152
+186
+99
+118
+168
+140
+156
+194
+239
+240
+242
+211
+218
+231
+109
+140
+191
+108
+137
+186
+161
+167
+184
+121
+135
+173
+98
+114
+161
+105
+121
+166
+105
+121
+166
+107
+121
+168
+107
+121
+168
+105
+120
+168
+104
+119
+165
+111
+125
+171
+145
+152
+186
+207
+210
+224
+120
+133
+172
+113
+125
+165
+117
+128
+167
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+116
+126
+167
+117
+128
+167
+120
+133
+172
+121
+135
+173
+121
+135
+173
+188
+87
+73
+221
+43
+35
+227
+48
+37
+227
+52
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+54
+40
+224
+18
+34
+207
+87
+66
+180
+177
+178
+192
+192
+195
+193
+195
+197
+196
+198
+200
+196
+198
+200
+195
+197
+199
+195
+197
+199
+196
+198
+200
+109
+140
+191
+17
+32
+49
+0
+0
+0
+144
+28
+23
+220
+49
+37
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+227
+51
+39
+210
+47
+36
+109
+140
+191
+173
+172
+176
+182
+182
+186
+180
+182
+185
+180
+182
+185
+180
+182
+185
+185
+187
+189
+192
+192
+195
+192
+192
+195
+185
+187
+189
+165
+170
+188
+108
+137
+186
+109
+140
+191
+211
+218
+231
+191
+196
+216
+109
+140
+191
+109
+140
+191
+157
+164
+189
+120
+133
+172
+98
+114
+161
+104
+119
+165
+104
+119
+165
+104
+119
+165
+104
+119
+165
+104
+119
+165
+109
+124
+168
+114
+129
+172
+145
+152
+186
+206
+208
+218
+113
+125
+165
+112
+117
+158
+117
+128
+167
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
+165
+113
+125
[... several thousand "+<value>" lines omitted here: the remainder of a PPM image added by this patch (apparently a boot logo; one RGB component value per line). The raw pixel data carries no reviewable content. ...]
diff -upr linux-2.6.32-504.3.3.el6.orig/drivers/video/vga16fb.c linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/vga16fb.c
--- linux-2.6.32-504.3.3.el6.orig/drivers/video/vga16fb.c	2014-12-12 23:29:00.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/drivers/video/vga16fb.c	2015-01-21 12:02:41.353276046 +0300
@@ -1266,9 +1266,11 @@ static void vga16fb_imageblit(struct fb_
 
 static void vga16fb_destroy(struct fb_info *info)
 {
+	struct platform_device *dev = container_of(info->device, struct platform_device, dev);
 	iounmap(info->screen_base);
 	fb_dealloc_cmap(&info->cmap);
 	/* XXX unshare VGA regions */
+	platform_set_drvdata(dev, NULL);
 	framebuffer_release(info);
 }
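
The destroy fix above recovers the owning platform_device by walking back from
the struct device embedded in fb_info->device with container_of(). A minimal
standalone sketch of that idiom; the struct definitions here are illustrative
stand-ins, not the kernel's own:

	#include <stddef.h>
	#include <stdio.h>

	/* same definition the kernel uses, reduced to plain C */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct device { int id; };	/* illustrative stand-in */
	struct platform_device { char name[16]; struct device dev; };

	int main(void)
	{
		struct platform_device pdev = { "vga16fb", { 7 } };
		struct device *d = &pdev.dev;	/* only the member is passed around */

		/* walk back from the member to its enclosing structure */
		struct platform_device *back =
			container_of(d, struct platform_device, dev);

		printf("%s %d\n", back->name, back->dev.id);	/* vga16fb 7 */
		return 0;
	}
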
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/fs/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/fs/Kconfig	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/Kconfig	2015-01-21 12:02:53.037965840 +0300
@@ -63,6 +63,14 @@ source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 
+config SIM_FS
+	tristate "VPS filesystem"
+	default m
+	help
+	  This file system is a part of Virtuozzo. It introduces a fake
+	  superblock and block device into a VE to hide the real device
+	  and to report statfs results taken from quota.
+
 config CUSE
 	tristate "Character device in Userpace support"
 	depends on FUSE_FS
@@ -82,6 +90,14 @@ menu "Caches"
 source "fs/fscache/Kconfig"
 source "fs/cachefiles/Kconfig"
 
+config PRAMCACHE
+	bool "Persistent FS cache"
+	depends on PRAM
+
+config PRAMCACHE_FEATURE_NOSYNC
+	int
+	default "1"
+
 endmenu
 
 if BLOCK
@@ -120,6 +136,10 @@ config TMPFS
 
 	  See <file:Documentation/filesystems/tmpfs.txt> for details.
 
+config PRAMFS
+	bool "Persistent in-memory file system"
+	depends on PRAM
+
 config TMPFS_POSIX_ACL
 	bool "Tmpfs POSIX Access Control Lists"
 	depends on TMPFS
@@ -222,7 +242,7 @@ config LOCKD_V4
 	default y
 
 config EXPORTFS
-	tristate
+	bool
 
 config NFS_ACL_SUPPORT
 	tristate
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/fs/Makefile
--- linux-2.6.32-504.3.3.el6.orig/fs/Makefile	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/Makefile	2015-01-21 12:02:52.675975448 +0300
@@ -48,8 +48,12 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 
+obj-$(CONFIG_FHANDLE)		+= fhandle.o
+
 obj-y				+= quota/
 
+obj-$(CONFIG_SIM_FS)		+= simfs.o
+
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
 obj-$(CONFIG_SYSFS)		+= sysfs/
@@ -122,3 +126,4 @@ obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_EXOFS_FS)          += exofs/
 obj-$(CONFIG_PSTORE)		+= pstore/
+obj-$(CONFIG_PRAMCACHE)		+= pramcache.o
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/aio.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/aio.c
--- linux-2.6.32-504.3.3.el6.orig/fs/aio.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/aio.c	2015-01-21 12:02:58.219828291 +0300
@@ -48,14 +48,9 @@
 #define dprintk(x...)	do { ; } while (0)
 #endif
 
-/*------ sysctl variables----*/
-static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr;		/* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
-/*----end sysctl variables---*/
-
 static struct kmem_cache	*kiocb_cachep;
-static struct kmem_cache	*kioctx_cachep;
+struct kmem_cache		*kioctx_cachep;
+EXPORT_SYMBOL(kioctx_cachep);
 
 static struct workqueue_struct *aio_wq;
 
@@ -74,7 +69,8 @@ struct aio_batch_entry {
 };
 mempool_t *abe_pool;
 
-static void aio_kick_handler(struct work_struct *);
+void aio_kick_handler(struct work_struct *);
+EXPORT_SYMBOL(aio_kick_handler);
 static void aio_queue_work(struct kioctx *);
 
 /* aio_setup
@@ -96,6 +92,12 @@ static int __init aio_setup(void)
 }
 __initcall(aio_setup);
 
+static inline void aio_kunmap_atomic(void *kvaddr, enum km_type type)
+{
+	ClearPageCheckpointed(kmap_atomic_to_page(kvaddr));
+	kunmap_atomic(kvaddr, type);
+}
+
 static void aio_free_ring(struct kioctx *ctx)
 {
 	struct aio_ring_info *info = &ctx->ring_info;
@@ -180,7 +182,7 @@ static int aio_setup_ring(struct kioctx 
 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
 	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
 	ring->header_length = sizeof(struct aio_ring);
-	kunmap_atomic(ring, KM_USER0);
+	aio_kunmap_atomic(ring, KM_USER0);
 
 	return 0;
 }
@@ -205,22 +207,44 @@ static int aio_setup_ring(struct kioctx 
 #define put_aio_ring_event(event, km) do {	\
 	struct io_event *__event = (event);	\
 	(void)__event;				\
-	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
+	aio_kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
 } while(0)
 
+static int aio_nr_charge(struct ve_struct *ve, unsigned nr_events)
+{
+	int err = 0;
+
+	spin_lock_bh(&ve->aio_nr_lock);
+	if (ve->aio_nr + nr_events > ve->aio_max_nr ||
+	    ve->aio_nr + nr_events < ve->aio_nr)
+		err = -EAGAIN;
+	else
+		ve->aio_nr += nr_events;
+	spin_unlock_bh(&ve->aio_nr_lock);
+
+	return err;
+}
+
+static void aio_nr_discharge(struct ve_struct *ve, unsigned nr_events)
+{
+	spin_lock(&ve->aio_nr_lock);
+	BUG_ON(ve->aio_nr - nr_events > ve->aio_nr);
+	ve->aio_nr -= nr_events;
+	spin_unlock(&ve->aio_nr_lock);
+}
+
 static void ctx_rcu_free(struct rcu_head *head)
 {
 	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+	struct ve_struct *ve = ctx->ve;
 	unsigned nr_events = ctx->max_reqs;
 
 	kmem_cache_free(kioctx_cachep, ctx);
 
 	if (nr_events) {
-		spin_lock(&aio_nr_lock);
-		BUG_ON(aio_nr - nr_events > aio_nr);
-		aio_nr -= nr_events;
-		spin_unlock(&aio_nr_lock);
+		aio_nr_discharge(ve, nr_events);
 	}
+	put_ve(ve);
 }
 
 /* __put_ioctx
@@ -260,6 +284,7 @@ static struct kioctx *ioctx_alloc(unsign
 	struct mm_struct *mm;
 	struct kioctx *ctx;
 	int did_sync = 0;
+	struct ve_struct *ve = get_exec_env();
 
 	/* Prevent overflows */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -268,7 +293,7 @@ static struct kioctx *ioctx_alloc(unsign
 		return ERR_PTR(-EINVAL);
 	}
 
-	if ((unsigned long)nr_events > aio_max_nr)
+	if ((unsigned long)nr_events > ve->aio_max_nr)
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -278,6 +303,7 @@ static struct kioctx *ioctx_alloc(unsign
 	ctx->max_reqs = nr_events;
 	mm = ctx->mm = current->mm;
 	atomic_inc(&mm->mm_count);
+	ctx->ve = get_ve(ve);
 
 	atomic_set(&ctx->users, 2);
 	spin_lock_init(&ctx->ctx_lock);
@@ -293,13 +319,8 @@ static struct kioctx *ioctx_alloc(unsign
 
 	/* limit the number of system wide aios */
 	do {
-		spin_lock_bh(&aio_nr_lock);
-		if (aio_nr + nr_events > aio_max_nr ||
-		    aio_nr + nr_events < aio_nr)
+		if (aio_nr_charge(ctx->ve, nr_events))
 			ctx->max_reqs = 0;
-		else
-			aio_nr += ctx->max_reqs;
-		spin_unlock_bh(&aio_nr_lock);
 		if (ctx->max_reqs || did_sync)
 			break;
 
@@ -326,6 +347,7 @@ out_cleanup:
 	return ERR_PTR(-EAGAIN);
 
 out_freectx:
+	put_ve(ctx->ve);
 	mmdrop(mm);
 	kmem_cache_free(kioctx_cachep, ctx);
 	ctx = ERR_PTR(-ENOMEM);
@@ -361,7 +383,7 @@ static void aio_cancel_all(struct kioctx
 	spin_unlock_irq(&ctx->ctx_lock);
 }
 
-static void wait_for_all_aios(struct kioctx *ctx)
+void wait_for_all_aios(struct kioctx *ctx)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -384,6 +406,7 @@ static void wait_for_all_aios(struct kio
 out:
 	spin_unlock_irq(&ctx->ctx_lock);
 }
+EXPORT_SYMBOL(wait_for_all_aios);
 
 /* wait_on_sync_kiocb:
  *	Waits on the given sync kiocb to complete.
@@ -475,7 +498,7 @@ static struct kiocb *__aio_get_req(struc
 		ctx->reqs_active++;
 		okay = 1;
 	}
-	kunmap_atomic(ring, KM_USER0);
+	aio_kunmap_atomic(ring, KM_USER0);
 	spin_unlock_irq(&ctx->ctx_lock);
 
 	if (!okay) {
@@ -845,7 +868,7 @@ static inline void aio_run_all_iocbs(str
  *      space.
  * Run on aiod's context.
  */
-static void aio_kick_handler(struct work_struct *work)
+void aio_kick_handler(struct work_struct *work)
 {
 	struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
 	mm_segment_t oldfs = get_fs();
@@ -945,6 +968,10 @@ int aio_complete(struct kiocb *iocb, lon
 		iocb->ki_users = 0;
 		wake_up_process(iocb->ki_obj.tsk);
 		return 1;
+	} else if (is_kernel_kiocb(iocb)) {
+		iocb->ki_obj.complete(iocb->ki_user_data, res);
+		aio_kernel_free(iocb);
+		return 0;
 	}
 
 	info = &ctx->ring_info;
@@ -992,7 +1019,7 @@ int aio_complete(struct kiocb *iocb, lon
 	ring->tail = tail;
 
 	put_aio_ring_event(event, KM_IRQ0);
-	kunmap_atomic(ring, KM_IRQ1);
+	aio_kunmap_atomic(ring, KM_IRQ1);
 
 	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
 
@@ -1060,7 +1087,7 @@ static int aio_read_evt(struct kioctx *i
 	spin_unlock(&info->ring_lock);
 
 out:
-	kunmap_atomic(ring, KM_USER0);
+	aio_kunmap_atomic(ring, KM_USER0);
 	dprintk("leaving aio_read_evt: %d  h%lu t%lu\n", ret,
 		 (unsigned long)ring->head, (unsigned long)ring->tail);
 	return ret;
@@ -1338,27 +1365,30 @@ static void aio_advance_iovec(struct kio
 static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
+	int may_seek = (!S_ISFIFO(file->f_mapping->host->i_mode) &&
+			!S_ISSOCK(file->f_mapping->host->i_mode));
 	ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
 			 unsigned long, loff_t);
 	ssize_t ret = 0;
 	unsigned short opcode;
 
 	if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
-		(iocb->ki_opcode == IOCB_CMD_PREAD)) {
-		rw_op = file->f_op->aio_read;
+		(iocb->ki_opcode == IOCB_CMD_PREAD))
 		opcode = IOCB_CMD_PREADV;
-	} else {
-		rw_op = file->f_op->aio_write;
+	else
 		opcode = IOCB_CMD_PWRITEV;
-	}
 
 	/* This matches the pread()/pwrite() logic */
 	if (iocb->ki_pos < 0)
 		return -EINVAL;
 
 	do {
+		/* Stack fs may change ->ki_filp, so we have to update rw_op */
+		if (opcode == IOCB_CMD_PREADV)
+			rw_op = iocb->ki_filp->f_op->aio_read;
+		else
+			rw_op = iocb->ki_filp->f_op->aio_write;
+
 		ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
 			    iocb->ki_nr_segs - iocb->ki_cur_seg,
 			    iocb->ki_pos);
@@ -1368,8 +1398,7 @@ static ssize_t aio_rw_vect_retry(struct 
 	/* retry all partial writes.  retry partial reads as long as its a
 	 * regular file. */
 	} while (ret > 0 && iocb->ki_left > 0 &&
-		 (opcode == IOCB_CMD_PWRITEV ||
-		  (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
+		 (opcode == IOCB_CMD_PWRITEV || may_seek));
 
 	/* This means we must have transferred all that we could */
 	/* No need to retry anymore */
@@ -1446,6 +1475,26 @@ static ssize_t aio_setup_single_vector(s
 	return 0;
 }
 
+static ssize_t aio_read_iter(struct kiocb *iocb)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret = -EINVAL;
+
+	if (file->f_op->read_iter)
+		ret = file->f_op->read_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+	return ret;
+}
+
+static ssize_t aio_write_iter(struct kiocb *iocb)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret = -EINVAL;
+
+	if (file->f_op->write_iter)
+		ret = file->f_op->write_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+	return ret;
+}
+
 /*
  * aio_setup_iocb:
  *	Performs the initial checks and aio retry method
@@ -1521,6 +1570,34 @@ static ssize_t aio_setup_iocb(struct kio
 		if (file->f_op->aio_write)
 			kiocb->ki_retry = aio_rw_vect_retry;
 		break;
+	case IOCB_CMD_READ_ITER:
+		ret = -EINVAL;
+		if (unlikely(!is_kernel_kiocb(kiocb)))
+			break;
+		ret = -EBADF;
+		if (unlikely(!(file->f_mode & FMODE_READ)))
+			break;
+		ret = security_file_permission(file, MAY_READ);
+		if (unlikely(ret))
+			break;
+		ret = -EINVAL;
+		if (file->f_op->read_iter)
+			kiocb->ki_retry = aio_read_iter;
+		break;
+	case IOCB_CMD_WRITE_ITER:
+		ret = -EINVAL;
+		if (unlikely(!is_kernel_kiocb(kiocb)))
+			break;
+		ret = -EBADF;
+		if (unlikely(!(file->f_mode & FMODE_WRITE)))
+			break;
+		ret = security_file_permission(file, MAY_WRITE);
+		if (unlikely(ret))
+			break;
+		ret = -EINVAL;
+		if (file->f_op->write_iter)
+			kiocb->ki_retry = aio_write_iter;
+		break;
 	case IOCB_CMD_FDSYNC:
 		ret = -EINVAL;
 		if (file->f_op->aio_fsync)
@@ -1543,6 +1620,95 @@ static ssize_t aio_setup_iocb(struct kio
 }
 
 /*
+ * This allocates an iocb that will be used to submit and track completion of
+ * an IO that is issued from kernel space.
+ *
+ * The caller is expected to call the appropriate aio_kernel_init_() functions
+ * and then call aio_kernel_submit().  From that point forward progress is
+ * guaranteed by the file system aio method.  Eventually the caller's
+ * completion callback will be called.
+ *
+ * These iocbs are special.  They don't have a context, we don't limit the
+ * number pending, they can't be canceled, and can't be retried.  In the short
+ * term callers must avoid operations that might retry, by calling only new
+ * ops that never add retry support.  In the long term
+ * retry-based AIO should be removed.
+ */
+struct kiocb *aio_kernel_alloc(gfp_t gfp)
+{
+	struct kiocb *iocb = kzalloc(sizeof(struct kiocb), gfp);
+	if (iocb)
+		iocb->ki_key = KIOCB_KERNEL_KEY;
+	return iocb;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_alloc);
+
+void aio_kernel_free(struct kiocb *iocb)
+{
+	kfree(iocb);
+}
+EXPORT_SYMBOL_GPL(aio_kernel_free);
+
+/*
+ * The iter count must be set before calling here.  Some filesystems use
+ * iocb->ki_left as an indicator of the size of an IO.
+ */
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+			  unsigned short op, struct iov_iter *iter, loff_t off)
+{
+	iocb->ki_filp = filp;
+	iocb->ki_iter = iter;
+	iocb->ki_opcode = op;
+	iocb->ki_pos = off;
+	iocb->ki_nbytes = iov_iter_count(iter);
+	iocb->ki_left = iocb->ki_nbytes;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_iter);
+
+void aio_kernel_init_callback(struct kiocb *iocb,
+			      void (*complete)(u64 user_data, long res),
+			      u64 user_data)
+{
+	iocb->ki_obj.complete = complete;
+	iocb->ki_user_data = user_data;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_callback);
+
+/*
+ * The iocb is our responsibility once this is called.  The caller must not
+ * reference it.  This comes from aio_setup_iocb() modifying the iocb.
+ *
+ * Callers must be prepared for their iocb completion callback to be called the
+ * moment they enter this function.  The completion callback may be called from
+ * any context.
+ *
+ * Returns: 0: the iocb completion callback will be called with the op result
+ * negative errno: the operation was not submitted and the iocb was freed
+ */
+int aio_kernel_submit(struct kiocb *iocb)
+{
+	int ret;
+
+	BUG_ON(!is_kernel_kiocb(iocb));
+	BUG_ON(!iocb->ki_obj.complete);
+	BUG_ON(!iocb->ki_filp);
+
+	ret = aio_setup_iocb(iocb, 0);
+	if (ret) {
+		aio_kernel_free(iocb);
+		return ret;
+	}
+
+	ret = iocb->ki_retry(iocb);
+	BUG_ON(ret == -EIOCBRETRY);
+	if (ret != -EIOCBQUEUED)
+		aio_complete(iocb, ret, 0);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_submit);
+
+/*
  * aio_wake_function:
  * 	wait queue callback function for aio notification,
  * 	Simply triggers a retry of the operation via kick_iocb.
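
Taken together, the exported helpers above form a small in-kernel AIO
submission API: allocate a kernel kiocb, point it at a file and an iov_iter,
attach a completion callback, and submit. A minimal sketch of a hypothetical
caller, following the comments on aio_kernel_alloc() and aio_kernel_submit();
my_io_done(), my_submit_read(), and the caller-prepared iov_iter are
illustrative, not names from this patch:

	/* res is the transferred byte count or a negative errno */
	static void my_io_done(u64 user_data, long res)
	{
		pr_info("kernel aio finished: tag=%llu res=%ld\n",
			user_data, res);
	}

	/* iter must stay valid until my_io_done() runs; filp is already open */
	static int my_submit_read(struct file *filp, struct iov_iter *iter,
				  loff_t pos)
	{
		struct kiocb *iocb = aio_kernel_alloc(GFP_KERNEL);

		if (!iocb)
			return -ENOMEM;

		aio_kernel_init_iter(iocb, filp, IOCB_CMD_READ_ITER, iter, pos);
		aio_kernel_init_callback(iocb, my_io_done, 0 /* tag */);

		/*
		 * Past this point the iocb is not ours: on success the
		 * callback may fire before we return; on error the iocb
		 * has already been freed by aio_kernel_submit().
		 */
		return aio_kernel_submit(iocb);
	}
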
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/anon_inodes.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/anon_inodes.c
--- linux-2.6.32-504.3.3.el6.orig/fs/anon_inodes.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/anon_inodes.c	2015-01-21 12:02:55.330904974 +0300
@@ -24,7 +24,8 @@
 #include <asm/uaccess.h>
 
 static struct vfsmount *anon_inode_mnt __read_mostly;
-static struct inode *anon_inode_inode;
+struct inode *anon_inode_inode;
+EXPORT_SYMBOL(anon_inode_inode);
 static const struct file_operations anon_inode_fops;
 
 static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
@@ -83,9 +84,10 @@ static const struct address_space_operat
  * hence saving memory and avoiding code duplication for the file/inode/dentry
  * setup.  Returns the newly created file* or an error pointer.
  */
-struct file *anon_inode_getfile(const char *name,
-				const struct file_operations *fops,
-				void *priv, int flags)
+static struct file *__anon_inode_getfile(const char *name,
+					 const struct file_operations *fops,
+					 void *priv, int flags,
+					 const struct dentry_operations *dops)
 {
 	struct qstr this;
 	struct path path;
@@ -118,7 +120,7 @@ struct file *anon_inode_getfile(const ch
 	 */
 	atomic_inc(&anon_inode_inode->i_count);
 
-	path.dentry->d_op = &anon_inodefs_dentry_operations;
+	path.dentry->d_op = dops;
 	/* Do not publish this dentry inside the global dentry hash table */
 	path.dentry->d_flags &= ~DCACHE_UNHASHED;
 	d_instantiate(path.dentry, anon_inode_inode);
@@ -142,6 +144,13 @@ err_module:
 	module_put(fops->owner);
 	return ERR_PTR(error);
 }
+struct file *anon_inode_getfile(const char *name,
+				const struct file_operations *fops,
+				void *priv, int flags)
+{
+	return __anon_inode_getfile(name, fops, priv, flags,
+				    &anon_inodefs_dentry_operations);
+}
 EXPORT_SYMBOL_GPL(anon_inode_getfile);
 
 /**
@@ -160,8 +169,9 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile);
  * hence saving memory and avoiding code duplication for the file/inode/dentry
  * setup.  Returns new descriptor or an error code.
  */
-int anon_inode_getfd(const char *name, const struct file_operations *fops,
-		     void *priv, int flags)
+int __anon_inode_getfd(const char *name, const struct file_operations *fops,
+		       void *priv, int flags,
+		       const struct dentry_operations *dops)
 {
 	int error, fd;
 	struct file *file;
@@ -171,7 +181,7 @@ int anon_inode_getfd(const char *name, c
 		return error;
 	fd = error;
 
-	file = anon_inode_getfile(name, fops, priv, flags);
+	file = __anon_inode_getfile(name, fops, priv, flags, dops);
 	if (IS_ERR(file)) {
 		error = PTR_ERR(file);
 		goto err_put_unused_fd;
@@ -184,6 +194,13 @@ err_put_unused_fd:
 	put_unused_fd(fd);
 	return error;
 }
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags)
+{
+	return __anon_inode_getfd(name, fops, priv, flags,
+				  &anon_inodefs_dentry_operations);
+}
+EXPORT_SYMBOL(__anon_inode_getfd);
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /*
@@ -209,7 +226,7 @@ static struct inode *anon_inode_mkinode(
 	 * that it already _is_ on the dirty list.
 	 */
 	inode->i_state = I_DIRTY;
-	inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
+	inode->i_mode = S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
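
The refactoring above keeps anon_inode_getfile()/anon_inode_getfd() behaving
as before while letting new callers supply their own dentry_operations. A
hedged sketch of such a caller; my_fops, my_dops, my_create_fd, and priv are
placeholder names, not part of this patch:

	static const struct file_operations my_fops = {
		.owner	= THIS_MODULE,
		/* .read, .poll, .release, ... as the backing object needs */
	};

	static const struct dentry_operations my_dops = {
		/* e.g. a custom .d_release for the private payload */
	};

	static int my_create_fd(void *priv)
	{
		/* a new descriptor backed by the shared anonymous inode,
		 * or a negative errno */
		return __anon_inode_getfd("[my-object]", &my_fops, priv,
					  O_CLOEXEC, &my_dops);
	}
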
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/autofs/init.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs/init.c
--- linux-2.6.32-504.3.3.el6.orig/fs/autofs/init.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs/init.c	2015-01-21 12:02:43.649215090 +0300
@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs
 	.name		= "autofs",
 	.get_sb		= autofs_get_sb,
 	.kill_sb	= autofs_kill_sb,
+	.fs_flags	= FS_VIRTUALIZED,
 };
 
 static int __init init_autofs_fs(void)
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/autofs/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/autofs/inode.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs/inode.c	2015-01-21 12:02:43.642215275 +0300
@@ -78,7 +78,7 @@ static int parse_options(char *options, 
 
 	*uid = current_uid();
 	*gid = current_gid();
-	*pgrp = task_pgrp_nr(current);
+	*pgrp = task_pgrp_vnr(current);
 
 	*minproto = *maxproto = AUTOFS_PROTO_VERSION;
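
The one-line change above matters only inside a pid namespace: both helpers
name the same process group, but from different viewpoints. In short:

	pid_t global = task_pgrp_nr(current);	/* number in the initial (host) pid namespace */
	pid_t local  = task_pgrp_vnr(current);	/* number as current's own pid namespace sees it */

Recording the global number from inside a container would later be meaningless
to in-container userspace, hence the switch to the _vnr variant.
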
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/autofs/root.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs/root.c
--- linux-2.6.32-504.3.3.el6.orig/fs/autofs/root.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs/root.c	2015-01-21 12:02:43.649215090 +0300
@@ -362,7 +362,7 @@ static int autofs_root_unlink(struct ino
 
 	/* This allows root to remove symlinks */
 	lock_kernel();
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
+	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) {
 		unlock_kernel();
 		return -EACCES;
 	}
@@ -556,7 +556,7 @@ static int autofs_root_ioctl(struct inod
 	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
 		return -ENOTTY;
 	
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
 		return -EPERM;
 	
 	switch(cmd) {
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/autofs4/autofs_i.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/autofs_i.h
--- linux-2.6.32-504.3.3.el6.orig/fs/autofs4/autofs_i.h	2014-12-12 23:29:13.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/autofs_i.h	2015-01-21 12:02:48.781078846 +0300
@@ -121,7 +121,8 @@ struct autofs_sb_info {
 	u32 magic;
 	int pipefd;
 	struct file *pipe;
-	pid_t oz_pgrp;
+	struct pid *oz_pgrp;
+	pid_t pipe_pid;
 	int catatonic;
 	int version;
 	int sub_version;
@@ -139,6 +140,22 @@ struct autofs_sb_info {
 	spinlock_t lookup_lock;
 	struct list_head active_list;
 	struct list_head expiring_list;
+	unsigned is32bit:1;
+};
+
+struct autofs_mount_data {
+	__u32	i_uid;
+	__u32	i_gid;
+	__u32	oz_pgrp;
+	__u32	type;
+	__u32	min_proto;
+	__u32	max_proto;
+	__u32	exp_timeout;
+	__u32	pipefd;
+	__u32	pipe_pid;
+	__u32	is32bit;
+	/* see comment in check_autofs */
+	__u64	pipe_fd_id;
 };
 
 static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
@@ -156,7 +173,7 @@ static inline struct autofs_info *autofs
    filesystem without "magic".) */
 
 static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
-	return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp;
+	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
 }
 
 /* Does a dentry have some pending activity? */
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/autofs4/dev-ioctl.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/dev-ioctl.c
--- linux-2.6.32-504.3.3.el6.orig/fs/autofs4/dev-ioctl.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/dev-ioctl.c	2015-01-21 12:02:48.512085986 +0300
@@ -385,9 +385,11 @@ static int autofs_dev_ioctl_setpipefd(st
 			fput(pipe);
 			goto out;
 		}
-		sbi->oz_pgrp = task_pgrp_nr(current);
+		put_pid(sbi->oz_pgrp);
+		sbi->oz_pgrp = get_pid(task_pgrp(current));
 		sbi->pipefd = pipefd;
 		sbi->pipe = pipe;
+		sbi->pipe_pid = task_pid_vnr(current);
 		sbi->catatonic = 0;
 	}
 out:
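
Storing a refcounted struct pid instead of a bare pid_t keeps the reference
valid even if the daemon exits and its number is recycled, and the number can
be rendered per-namespace on demand. The idiom used by this hunk, distilled:

	struct pid *pgrp = get_pid(task_pgrp(current));	/* reference taken at attach */

	/* later: pid_vnr(pgrp) for a namespace-relative number,
	 * pid_task(pgrp, PIDTYPE_PGID) to resolve back to a task */

	put_pid(pgrp);	/* dropped at detach / kill_sb time */
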
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/autofs4/init.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/init.c
--- linux-2.6.32-504.3.3.el6.orig/fs/autofs4/init.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/init.c	2015-01-21 12:02:48.503086225 +0300
@@ -12,6 +12,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/ve_proto.h>
 #include "autofs_i.h"
 
 static int autofs_get_sb(struct file_system_type *fs_type,
@@ -25,6 +26,20 @@ static struct file_system_type autofs_fs
 	.name		= "autofs",
 	.get_sb		= autofs_get_sb,
 	.kill_sb	= autofs4_kill_sb,
+	.fs_flags	= FS_VIRTUALIZED,
+};
+
+static void ve_autofs_stop(void *data)
+{
+	struct ve_struct *ve = data;
+
+	umount_ve_fs_type(&autofs_fs_type, ve->veid);
+}
+
+static struct ve_hook autofs4_hook = {
+	.fini	  = ve_autofs_stop,
+	.owner	  = THIS_MODULE,
+	.priority = HOOK_PRIO_FS,
 };
 
 static int __init init_autofs4_fs(void)
@@ -36,12 +51,14 @@ static int __init init_autofs4_fs(void)
 		return err;
 
 	autofs_dev_ioctl_init();
+	ve_hook_register(VE_INIT_EXIT_CHAIN, &autofs4_hook);
 
 	return err;
 }
 
 static void __exit exit_autofs4_fs(void)
 {
+	ve_hook_unregister(&autofs4_hook);
 	autofs_dev_ioctl_exit();
 	unregister_filesystem(&autofs_fs_type);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/autofs4/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/autofs4/inode.c	2014-12-12 23:29:13.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/inode.c	2015-01-21 12:02:48.781078846 +0300
@@ -19,6 +19,7 @@
 #include <linux/parser.h>
 #include <linux/bitops.h>
 #include <linux/magic.h>
+#include <linux/nsproxy.h>
 #include "autofs_i.h"
 #include <linux/module.h>
 
@@ -160,6 +161,8 @@ void autofs4_kill_sb(struct super_block 
 	/* Clean up and release dangling references */
 	autofs4_force_release(sbi);
 
+	put_pid(sbi->oz_pgrp);
+
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 
@@ -181,7 +184,7 @@ static int autofs4_show_options(struct s
 		seq_printf(m, ",uid=%u", root_inode->i_uid);
 	if (root_inode->i_gid != 0)
 		seq_printf(m, ",gid=%u", root_inode->i_gid);
-	seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
+	seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
 	seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
 	seq_printf(m, ",maxproto=%d", sbi->max_proto);
@@ -226,7 +229,7 @@ static int parse_options(char *options, 
 
 	*uid = current_uid();
 	*gid = current_gid();
-	*pgrp = task_pgrp_nr(current);
+	*pgrp = task_pgrp_vnr(current);
 
 	*minproto = AUTOFS_MIN_PROTO_VERSION;
 	*maxproto = AUTOFS_MAX_PROTO_VERSION;
@@ -299,6 +302,18 @@ static struct autofs_info *autofs4_mkroo
 	return ino;
 }
 
+static int autofs_open_pipe(struct autofs_sb_info *sbi)
+{
+	struct file *f;
+
+	f = get_task_file(sbi->pipe_pid, sbi->pipefd);
+	if (IS_ERR(f))
+		return PTR_ERR(f);
+
+	sbi->pipe = f;
+	return 0;
+}
+
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode * root_inode;
@@ -319,13 +334,16 @@ int autofs4_fill_super(struct super_bloc
 	sbi->pipe = NULL;
 	sbi->catatonic = 1;
 	sbi->exp_timeout = 0;
-	sbi->oz_pgrp = task_pgrp_nr(current);
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
 	set_autofs_type_indirect(&sbi->type);
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
+#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32))
+		sbi->is32bit = 1;
+#endif
 	mutex_init(&sbi->wq_mutex);
 	mutex_init(&sbi->pipe_mutex);
 	spin_lock_init(&sbi->fs_lock);
@@ -357,12 +375,68 @@ int autofs4_fill_super(struct super_bloc
 	root->d_op = &autofs4_dentry_operations;
 	root->d_fsdata = ino;
 
-	/* Can this call block? */
-	if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
-				&sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
-				&sbi->max_proto)) {
-		printk("autofs: called with bogus options\n");
-		goto fail_dput;
+	if (s->s_flags & MS_CPTMOUNT) {
+		struct autofs_mount_data *kd;
+		int err;
+
+		kd = (struct autofs_mount_data *)data;
+
+		root_inode->i_uid = kd->i_uid;
+		root_inode->i_gid = kd->i_gid;
+		rcu_read_lock();
+		sbi->oz_pgrp = get_pid(find_pid_ns(kd->oz_pgrp,
+					get_exec_env()->ve_ns->pid_ns));
+		rcu_read_unlock();
+		if (!sbi->oz_pgrp) {
+			printk("autofs: could not find process with group %d\n", kd->oz_pgrp);
+			goto fail_dput;
+		}
+		sbi->type = kd->type;
+		sbi->min_proto = kd->min_proto;
+		sbi->max_proto = kd->max_proto;
+		sbi->exp_timeout = kd->exp_timeout;
+		sbi->pipefd = kd->pipefd;
+#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION
+		sbi->is32bit = kd->is32bit;
+#endif
+		sbi->pipe_pid = kd->pipe_pid;
+
+		err = autofs_open_pipe(sbi);
+		if (err < 0) {
+			printk("autofs: can't open file %d of %d - %d\n",
+					sbi->pipefd, sbi->pipe_pid, err);
+			dump_stack();
+			goto fail_dput;
+		}
+	} else {
+		pid_t pgrp;
+		if (parse_options(data, &pipefd,
+					&root_inode->i_uid, &root_inode->i_gid,
+					&pgrp, &sbi->type,
+					&sbi->min_proto, &sbi->max_proto)) {
+			printk("autofs: called with bogus options\n");
+			goto fail_dput;
+		}
+		DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pgrp);
+
+		sbi->oz_pgrp = find_get_pid(pgrp);
+		if (!sbi->oz_pgrp) {
+			printk("autofs: could not find process with group %d\n", pgrp);
+			goto fail_dput;
+		}
+
+		pipe = fget(pipefd);
+
+		if ( !pipe ) {
+			printk("autofs: could not open pipe file descriptor\n");
+			goto fail_dput;
+		}
+		if ( !pipe->f_op || !pipe->f_op->write )
+			goto fail_fput;
+
+		sbi->pipe = pipe;
+		sbi->pipefd = pipefd;
+		sbi->pipe_pid = task_pid_vnr(current);
 	}
 
 	if (autofs_type_trigger(sbi->type))
@@ -378,7 +452,7 @@ int autofs4_fill_super(struct super_bloc
 		       "daemon (%d, %d) kernel (%d, %d)\n",
 			sbi->min_proto, sbi->max_proto,
 			AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
-		goto fail_dput;
+		goto fail_fput;
 	}
 
 	/* Establish highest kernel protocol version */
@@ -388,17 +462,6 @@ int autofs4_fill_super(struct super_bloc
 		sbi->version = sbi->max_proto;
 	sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
 
-	DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
-	pipe = fget(pipefd);
-	
-	if (!pipe) {
-		printk("autofs: could not open pipe file descriptor\n");
-		goto fail_dput;
-	}
-	if (!pipe->f_op || !pipe->f_op->write)
-		goto fail_fput;
-	sbi->pipe = pipe;
-	sbi->pipefd = pipefd;
 	sbi->catatonic = 0;
 
 	/*
@@ -414,6 +477,7 @@ fail_fput:
 	printk("autofs: pipe file descriptor does not contain proper ops\n");
 	fput(pipe);
 	/* fall through */
+	put_pid(sbi->oz_pgrp);
 fail_dput:
 	dput(root);
 	goto fail_free;
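
With MS_CPTMOUNT set, the mount data is no longer an option string but a
binary struct autofs_mount_data, consumed directly by autofs4_fill_super()
above. A hedged sketch of how a restore component might fill it in; "saved"
is a hypothetical snapshot record, not a name from this patch:

	struct autofs_mount_data d = {
		.i_uid		= saved->uid,
		.i_gid		= saved->gid,
		.oz_pgrp	= saved->oz_pgrp,	/* looked up in the VE's pid ns */
		.type		= saved->type,
		.min_proto	= saved->min_proto,
		.max_proto	= saved->max_proto,
		.exp_timeout	= saved->exp_timeout,
		.pipefd		= saved->pipefd,
		.pipe_pid	= saved->pipe_pid,	/* whose fd table holds pipefd */
		.is32bit	= saved->is32bit,
	};
	/* passed as the mount 'data' with MS_CPTMOUNT set in the flags */
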
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/autofs4/root.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/root.c
--- linux-2.6.32-504.3.3.el6.orig/fs/autofs4/root.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/root.c	2015-01-21 12:02:43.649215090 +0300
@@ -579,7 +579,7 @@ static int autofs4_dir_unlink(struct ino
 	struct autofs_info *p_ino;
 	
 	/* This allows root to remove symlinks */
-	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
 		return -EACCES;
 
 	if (atomic_dec_and_test(&ino->count)) {
@@ -817,7 +817,7 @@ static int autofs4_root_ioctl(struct ino
 	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
 		return -ENOTTY;
 	
-	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
 		return -EPERM;
 	
 	switch(cmd) {
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/autofs4/waitq.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/waitq.c
--- linux-2.6.32-504.3.3.el6.orig/fs/autofs4/waitq.c	2014-12-12 23:29:13.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/autofs4/waitq.c	2015-01-21 12:02:42.076256851 +0300
@@ -156,6 +156,16 @@ static void autofs4_notify_daemon(struct
 		struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
 
 		pktsz = sizeof(*packet);
+#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION
+		/*
+		 * On x86_64 the autofs_v5_packet struct is padded with 4
+		 * extra bytes, which broke autofs daemons running in ia32
+		 * emulation mode.
+		 *
+		 * Reduce the size when running in 32-bit mode so the packet
+		 * matches what 32-bit userspace expects.
+		 */
+		if (sbi->is32bit)
+			pktsz -= 4;
+#endif
 
 		packet->wait_queue_token = wq->wait_queue_token;
 		packet->len = wq->name.len;
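
The 4-byte fixup above exists because a trailing 32-bit member after a 64-bit
one leaves tail padding on x86_64 but not on i386, where 64-bit members are
only 4-byte aligned inside structs. A standalone illustration (not the real
autofs_v5_packet layout):

	#include <stdint.h>
	#include <stdio.h>

	struct pkt {
		uint64_t token;	/* 8-byte alignment on x86_64, 4-byte on i386 */
		uint32_t len;	/* leaves a 4-byte tail on x86_64 */
	};

	int main(void)
	{
		/* prints 16 when built -m64 (4 bytes of tail padding), 12 with -m32 */
		printf("sizeof(struct pkt) = %zu\n", sizeof(struct pkt));
		return 0;
	}
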
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/bad_inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/bad_inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/bad_inode.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/bad_inode.c	2015-01-21 12:02:50.800025249 +0300
@@ -80,6 +80,11 @@ static int bad_file_mmap(struct file *fi
 
 static int bad_file_open(struct inode *inode, struct file *filp)
 {
+#if IS_ENABLED(CONFIG_VZ_CHECKPOINT)
+	/* Nobody but CPT uses bad_file_ops on anon_inode */
+	if (inode == anon_inode_inode)
+		return 0;
+#endif
 	return -EIO;
 }
 
@@ -127,7 +132,7 @@ static unsigned long bad_file_get_unmapp
 	return -EIO;
 }
 
-static int bad_file_check_flags(int flags)
+static int bad_file_set_flags(struct file *file, int flags)
 {
 	return -EIO;
 }
@@ -151,7 +156,7 @@ static ssize_t bad_file_splice_read(stru
 	return -EIO;
 }
 
-static const struct file_operations bad_file_ops =
+const struct file_operations bad_file_ops =
 {
 	.llseek		= bad_file_llseek,
 	.read		= bad_file_read,
@@ -173,12 +178,16 @@ static const struct file_operations bad_
 	.lock		= bad_file_lock,
 	.sendpage	= bad_file_sendpage,
 	.get_unmapped_area = bad_file_get_unmapped_area,
-	.check_flags	= bad_file_check_flags,
+	.set_flags	= bad_file_set_flags,
 	.flock		= bad_file_flock,
 	.splice_write	= bad_file_splice_write,
 	.splice_read	= bad_file_splice_read,
 };
 
+#if IS_ENABLED(CONFIG_VZ_CHECKPOINT)
+EXPORT_SYMBOL(bad_file_ops);
+#endif
+
 static int bad_inode_create (struct inode *dir, struct dentry *dentry,
 		int mode, struct nameidata *nd)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/binfmt_aout.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/binfmt_aout.c
--- linux-2.6.32-504.3.3.el6.orig/fs/binfmt_aout.c	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/binfmt_aout.c	2015-01-21 12:02:43.837210097 +0300
@@ -298,12 +298,12 @@ static int load_aout_binary(struct linux
 		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
 		    (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
 		{
-			printk(KERN_NOTICE "executable not page aligned\n");
+			ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n");
 		}
 
 		if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
 		{
-			printk(KERN_WARNING 
+			ve_printk(VE_LOG, KERN_WARNING 
 			       "fd_offset is not page aligned. Please convert program: %s\n",
 			       bprm->file->f_path.dentry->d_name.name);
 		}
@@ -412,7 +412,7 @@ static int load_aout_library(struct file
 
 		if (printk_ratelimit())
 		{
-			printk(KERN_WARNING 
+			ve_printk(VE_LOG, KERN_WARNING 
 			       "N_TXTOFF is not page aligned. Please convert library: %s\n",
 			       file->f_path.dentry->d_name.name);
 		}
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/binfmt_elf.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/binfmt_elf.c
--- linux-2.6.32-504.3.3.el6.orig/fs/binfmt_elf.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/binfmt_elf.c	2015-01-21 12:02:49.337064085 +0300
@@ -65,7 +65,7 @@ static int elf_core_dump(struct coredump
 #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
 #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
 
-static struct linux_binfmt elf_format = {
+struct linux_binfmt elf_format = {
 		.module		= THIS_MODULE,
 		.load_binary	= load_elf_binary,
 		.load_shlib	= load_elf_library,
@@ -73,6 +73,9 @@ static struct linux_binfmt elf_format = 
 		.min_coredump	= ELF_EXEC_PAGESIZE,
 		.hasvdso	= 1
 };
+#ifndef CONFIG_COMPAT_BINFMT_ELF
+EXPORT_SYMBOL(elf_format);
+#endif
 
 #define BAD_ADDR(x) IS_ERR_VALUE(x)
 
@@ -438,7 +441,7 @@ static unsigned long load_elf_interp(str
 	eppnt = elf_phdata;
 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
 		if (eppnt->p_type == PT_LOAD) {
-			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
+			int elf_type = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECPRIO;
 			int elf_prot = 0;
 			unsigned long vaddr = 0;
 			unsigned long k, map_addr;
@@ -805,7 +808,8 @@ static int load_elf_binary(struct linux_
 		if (elf_ppnt->p_flags & PF_X)
 			elf_prot |= PROT_EXEC;
 
-		elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
+		elf_flags = MAP_PRIVATE | MAP_DENYWRITE |
+				MAP_EXECUTABLE | MAP_EXECPRIO;
 
 		vaddr = elf_ppnt->p_vaddr;
 		if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
@@ -938,7 +942,7 @@ static int load_elf_binary(struct linux_
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
-	retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
+	retval = arch_setup_additional_pages(bprm, !!elf_interpreter, 0);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
 		goto out;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/binfmt_misc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/binfmt_misc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/binfmt_misc.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/binfmt_misc.c	2015-01-21 12:02:44.217200009 +0300
@@ -28,6 +28,7 @@
 #include <linux/mount.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/ve_proto.h>
 
 #include <asm/uaccess.h>
 
@@ -35,8 +36,15 @@ enum {
 	VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
 };
 
+#ifdef CONFIG_VE
+#define bm_entries(ve)		((ve)->bm_entries)
+#define bm_enabled(ve)		((ve)->bm_enabled)
+#else
 static LIST_HEAD(entries);
 static int enabled = 1;
+#define bm_entries(ve)		(entries)
+#define bm_enabled(ve)		(enabled)
+#endif
 
 enum {Enabled, Magic};
 #define MISC_FMT_PRESERVE_ARGV0 (1<<31)
@@ -56,21 +64,30 @@ typedef struct {
 } Node;
 
 static DEFINE_RWLOCK(entries_lock);
+#ifdef CONFIG_VE
+#define bm_fs_type(ve)		(*(ve)->bm_fs_type)
+#define bm_mnt(ve)		((ve)->bm_mnt)
+#define bm_entry_count(ve)	((ve)->bm_entry_count)
+#else
 static struct file_system_type bm_fs_type;
 static struct vfsmount *bm_mnt;
 static int entry_count;
+#define bm_fs_type(ve)		(bm_fs_type)
+#define bm_mnt(ve)		(bm_mnt)
+#define bm_entry_count(ve)	(bm_entry_count)
+#endif
 
 /* 
  * Check if we support the binfmt
  * if we do, return the node, else NULL
  * locking is done in load_misc_binary
  */
-static Node *check_file(struct linux_binprm *bprm)
+static Node *check_file(struct ve_struct *ve, struct linux_binprm *bprm)
 {
 	char *p = strrchr(bprm->interp, '.');
 	struct list_head *l;
 
-	list_for_each(l, &entries) {
+	list_for_each(l, &bm_entries(ve)) {
 		Node *e = list_entry(l, Node, list);
 		char *s;
 		int j;
@@ -111,14 +128,15 @@ static int load_misc_binary(struct linux
 	char *iname_addr = iname;
 	int retval;
 	int fd_binary = -1;
+	struct ve_struct *ve = get_exec_env();
 
 	retval = -ENOEXEC;
-	if (!enabled)
+	if (!bm_enabled(ve))
 		goto _ret;
 
 	/* to keep locking time low, we copy the interpreter string */
 	read_lock(&entries_lock);
-	fmt = check_file(bprm);
+	fmt = check_file(ve, bprm);
 	if (fmt)
 		strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
 	read_unlock(&entries_lock);
@@ -504,7 +522,7 @@ static void bm_clear_inode(struct inode 
 	kfree(inode->i_private);
 }
 
-static void kill_node(Node *e)
+static void kill_node(struct ve_struct *ve, Node *e)
 {
 	struct dentry *dentry;
 
@@ -520,7 +538,7 @@ static void kill_node(Node *e)
 		dentry->d_inode->i_nlink--;
 		d_drop(dentry);
 		dput(dentry);
-		simple_release_fs(&bm_mnt, &entry_count);
+		simple_release_fs(&bm_mnt(ve), &bm_entry_count(ve));
 	}
 }
 
@@ -559,7 +577,7 @@ static ssize_t bm_entry_write(struct fil
 		case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
 			mutex_lock(&root->d_inode->i_mutex);
 
-			kill_node(e);
+			kill_node(get_exec_env(), e);
 
 			mutex_unlock(&root->d_inode->i_mutex);
 			dput(root);
@@ -584,6 +602,7 @@ static ssize_t bm_register_write(struct 
 	struct dentry *root, *dentry;
 	struct super_block *sb = file->f_path.mnt->mnt_sb;
 	int err = 0;
+	struct ve_struct *ve = get_exec_env();
 
 	e = create_entry(buffer, count);
 
@@ -607,7 +626,7 @@ static ssize_t bm_register_write(struct 
 	if (!inode)
 		goto out2;
 
-	err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
+	err = simple_pin_fs(&bm_fs_type(ve), &bm_mnt(ve), &bm_entry_count(ve));
 	if (err) {
 		iput(inode);
 		inode = NULL;
@@ -620,7 +639,7 @@ static ssize_t bm_register_write(struct 
 
 	d_instantiate(dentry, inode);
 	write_lock(&entries_lock);
-	list_add(&e->list, &entries);
+	list_add(&e->list, &bm_entries(ve));
 	write_unlock(&entries_lock);
 
 	err = 0;
@@ -646,26 +665,31 @@ static const struct file_operations bm_r
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-	char *s = enabled ? "enabled\n" : "disabled\n";
+	struct ve_struct *ve = get_exec_env();
+	char *s = bm_enabled(ve) ? "enabled\n" : "disabled\n";
 
 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
 
+static void dm_genocide(struct ve_struct *ve)
+{
+	while (!list_empty(&bm_entries(ve)))
+		kill_node(ve, list_entry(bm_entries(ve).next, Node, list));
+}
+
 static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 		size_t count, loff_t *ppos)
 {
+	struct ve_struct *ve = get_exec_env();
 	int res = parse_command(buffer, count);
 	struct dentry *root;
 
 	switch (res) {
-		case 1: enabled = 0; break;
-		case 2: enabled = 1; break;
+		case 1: bm_enabled(ve) = 0; break;
+		case 2: bm_enabled(ve) = 1; break;
 		case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
 			mutex_lock(&root->d_inode->i_mutex);
-
-			while (!list_empty(&entries))
-				kill_node(list_entry(entries.next, Node, list));
-
+			dm_genocide(ve);
 			mutex_unlock(&root->d_inode->i_mutex);
 			dput(root);
 		default: return res;
@@ -716,6 +740,52 @@ static struct file_system_type bm_fs_typ
 	.kill_sb	= kill_litter_super,
 };
 
+#ifdef CONFIG_VE
+static void __ve_binfmt_init(struct ve_struct *ve, struct file_system_type *fs)
+{
+	ve->bm_fs_type = fs;
+	INIT_LIST_HEAD(&ve->bm_entries);
+	ve->bm_enabled = 1;
+	ve->bm_mnt = NULL;
+	ve->bm_entry_count = 0;
+}
+
+static int ve_binfmt_init(void *x)
+{
+	struct ve_struct *ve = x;
+	struct file_system_type *fs_type;
+	int err;
+
+	err = register_ve_fs_type(ve, &bm_fs_type, &fs_type, NULL);
+	if (err == 0)
+		__ve_binfmt_init(ve, fs_type);
+
+	return err;
+}
+
+static void ve_binfmt_fini(void *x)
+{
+	struct ve_struct *ve = x;
+
+	/*
+	 * no locks since exec_ve is dead and no one will
+	 * mess with bm_xxx fields any longer
+	 */
+	if (!ve->bm_fs_type)
+		return;
+	dm_genocide(ve);
+	unregister_ve_fs_type(ve->bm_fs_type, NULL);
+	/* bm_fs_type is freed in real_put_ve -> free_ve_filesystems */
+}
+
+static struct ve_hook ve_binfmt_hook = {
+	.init		= ve_binfmt_init,
+	.fini		= ve_binfmt_fini,
+	.priority	= HOOK_PRIO_FS,
+	.owner		= THIS_MODULE,
+};
+#endif
+
 static int __init init_misc_binfmt(void)
 {
 	int err = register_filesystem(&bm_fs_type);
@@ -724,11 +794,17 @@ static int __init init_misc_binfmt(void)
 		if (err)
 			unregister_filesystem(&bm_fs_type);
 	}
+
+	if (!err) {
+		__ve_binfmt_init(get_ve0(), &bm_fs_type);
+		ve_hook_register(VE_SS_CHAIN, &ve_binfmt_hook);
+	}
 	return err;
 }
 
 static void __exit exit_misc_binfmt(void)
 {
+	ve_hook_unregister(&ve_binfmt_hook);
 	unregister_binfmt(&misc_format);
 	unregister_filesystem(&bm_fs_type);
 }
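
The hunks above make the binfmt_misc registry per-VE: each container gets its own entry list (bm_entries(ve)), enable flag (bm_enabled(ve)) and pinned mount. For reference, the control files being virtualized are driven from user space as in the minimal sketch below, assuming the usual /proc/sys/fs/binfmt_misc mount point; the entry string and interpreter path are illustrative, not taken from the patch.

/*
 * Sketch: drive the binfmt_misc control files that the patch above
 * makes per-container.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, s, strlen(s)) < 0) {
		perror("write");
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* format: :name:type:offset:magic:mask:interpreter:flags */
	const char *entry = ":dosexe:M::MZ::/usr/bin/dosemu:";

	/* registering now adds to the caller's own bm_entries(ve) list */
	write_str("/proc/sys/fs/binfmt_misc/register", entry);

	/* "0"/"1" flip bm_enabled(ve); "-1" clears all entries, which
	 * is what the new dm_genocide() helper implements */
	write_str("/proc/sys/fs/binfmt_misc/status", "0");
	return 0;
}
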
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/block_dev.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/block_dev.c
--- linux-2.6.32-504.3.3.el6.orig/fs/block_dev.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/block_dev.c	2015-01-21 12:02:58.271826911 +0300
@@ -194,6 +194,7 @@ static struct super_block *freeze_bdev_o
 	down_write(&sb->s_umount);
 	if (sb->s_flags & MS_RDONLY) {
 		sb->s_frozen = SB_FREEZE_TRANS;
+		smp_wmb();
 		up_write(&sb->s_umount);
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 		return sb;
@@ -340,6 +341,12 @@ static int thaw_bdev_old(struct block_de
 
 	BUG_ON(sb->s_bdev != bdev);
 	down_write(&sb->s_umount);
+	if (sb->s_frozen == SB_UNFROZEN) {
+		up_write(&sb->s_umount);
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
 	if (sb->s_flags & MS_RDONLY)
 		goto out_unfrozen;
 
@@ -350,6 +357,7 @@ static int thaw_bdev_old(struct block_de
 				"VFS:Filesystem thaw failed\n");
 			sb->s_frozen = SB_FREEZE_TRANS;
 			bdev->bd_fsfreeze_count++;
+			up_write(&sb->s_umount);
 			mutex_unlock(&bdev->bd_fsfreeze_mutex);
 			return error;
 		}
@@ -396,6 +404,7 @@ int thaw_bdev(struct block_device *bdev,
 			printk(KERN_ERR
 				"VFS:Filesystem thaw failed\n");
 			bdev->bd_fsfreeze_count++;
+			up_write(&sb->s_umount);
 			mutex_unlock(&bdev->bd_fsfreeze_mutex);
 			return error;
 		}
@@ -1269,6 +1278,8 @@ static int __blkdev_get(struct block_dev
 		perm |= MAY_READ;
 	if (mode & FMODE_WRITE)
 		perm |= MAY_WRITE;
+	if (mode & FMODE_EXCLUSIVE)
+		perm |= MAY_MOUNT;
 	/*
 	 * hooks: /n/, see "layering violations".
 	 */
@@ -1670,7 +1681,7 @@ struct block_device *open_bdev_exclusive
 	if (IS_ERR(bdev))
 		return bdev;
 
-	error = blkdev_get(bdev, mode);
+	error = blkdev_get(bdev, mode | FMODE_EXCLUSIVE);
 	if (error)
 		return ERR_PTR(error);
 	error = -EACCES;
@@ -1718,7 +1729,7 @@ int __invalidate_device(struct block_dev
 		 * hold).
 		 */
 		shrink_dcache_sb(sb);
-		res = invalidate_inodes(sb, kill_dirty);
+		res = invalidate_inodes_check(sb, kill_dirty, 1);
 		drop_super(sb);
 	}
 	invalidate_bdev(bdev);
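
The thaw_bdev()/thaw_bdev_old() hunks above add an SB_UNFROZEN check so an unbalanced thaw returns -EINVAL instead of operating on an unfrozen superblock, and they add the up_write() calls that were missing on the failed-thaw error paths. From user space the pair is reachable through the FIFREEZE/FITHAW ioctls; a small sketch of the double-thaw case (run against a mount point, as root):

/*
 * Sketch: freeze and thaw a mounted filesystem via ioctl.  With the
 * SB_UNFROZEN check added above, the second FITHAW is expected to
 * fail with EINVAL instead of acting on an unfrozen superblock.
 */
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <mount point>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FIFREEZE, 0))
		perror("FIFREEZE");
	if (ioctl(fd, FITHAW, 0))
		perror("FITHAW");		/* balanced thaw */
	if (ioctl(fd, FITHAW, 0))
		perror("FITHAW (again)");	/* now fails: EINVAL */
	close(fd);
	return 0;
}
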
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/buffer.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/buffer.c
--- linux-2.6.32-504.3.3.el6.orig/fs/buffer.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/buffer.c	2015-01-21 12:02:58.292826354 +0300
@@ -288,7 +288,7 @@ static void free_more_memory(void)
 	struct zone *zone;
 	int nid;
 
-	wakeup_flusher_threads(1024);
+	wakeup_flusher_threads(NULL, 1024);
 	yield();
 
 	for_each_online_node(nid) {
@@ -678,6 +678,11 @@ static void __set_page_dirty(struct page
 		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
+		if (mapping_cap_account_dirty(mapping) &&
+				!radix_tree_prev_tag_get(
+					&mapping->page_tree,
+					PAGECACHE_TAG_DIRTY))
+			ub_io_account_dirty(mapping);
 	}
 	spin_unlock_irq(&mapping->tree_lock);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1029,7 +1034,8 @@ grow_dev_page(struct block_device *bdev,
 	int ret = 0;		/* Will call free_more_memory() */
 
 	page = find_or_create_page(inode->i_mapping, index,
-		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
+		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) |
+		__GFP_MOVABLE | __GFP_NOFAIL);
 	if (!page)
 		return ret;
 
@@ -1633,6 +1639,22 @@ static struct buffer_head *create_page_b
 	return page_buffers(page);
 }
 
+static void bdi_congestion_wait(struct backing_dev_info *bdi)
+{
+	DEFINE_WAIT(_wait);
+
+	for (;;) {
+		prepare_to_wait(&bdi->cong_waitq, &_wait,
+				TASK_UNINTERRUPTIBLE);
+		if (!bdi_write_congested2(bdi))
+			break;
+
+		io_schedule();
+	}
+
+	finish_wait(&bdi->cong_waitq, &_wait);
+}
+
 /*
  * NOTE! All mapped/uptodate combinations are valid:
  *
@@ -1670,6 +1692,7 @@ static struct buffer_head *create_page_b
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
 			get_block_t *get_block, struct writeback_control *wbc,
+			bh_submit_io_t *submit_handler,
 			bh_end_io_t *handler)
 {
 	int err;
@@ -1765,10 +1788,14 @@ static int __block_write_full_page(struc
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);
 
+	if (!wbc->for_reclaim &&
+	    bdi_write_congested2(page->mapping->backing_dev_info))
+		bdi_congestion_wait(page->mapping->backing_dev_info);
+
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(write_op, bh);
+			submit_handler(write_op, bh, wbc->fsdata);
 			nr_underway++;
 		}
 		bh = next;
@@ -1822,7 +1849,7 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh(write_op, bh);
+			submit_handler(write_op, bh, wbc->fsdata);
 			nr_underway++;
 		}
 		bh = next;
@@ -2475,20 +2502,19 @@ EXPORT_SYMBOL(block_commit_write);
  * using sb_start_write() - sb_end_write() functions.
  */
 int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-			 get_block_t get_block)
+		   get_block_t get_block)
 {
-	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
 	int ret;
+	struct file *file = vma->vm_file;
+	struct inode *inode;
+	struct page *page = vmf->page;
 
-	/*
-	 * Update file times before taking page lock. We may end up failing the
-	 * fault so this update may be superfluous but who really cares...
-	 */
-	file_update_time(vma->vm_file);
+	if (file->f_op->get_host)
+		file = file->f_op->get_host(file);
 
+	inode = file->f_path.dentry->d_inode;
 	lock_page(page);
 	size = i_size_read(inode);
 	if ((page->mapping != inode->i_mapping) ||
@@ -2535,7 +2561,8 @@ int block_page_mkwrite(struct vm_area_st
 		   get_block_t get_block)
 {
 	int ret;
-	__attribute__ ((unused)) struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	__attribute__ ((unused)) struct super_block *sb = inode->i_sb;
 
 	/*
 	 *  OLD FREEZE PATH:
@@ -2546,6 +2573,13 @@ int block_page_mkwrite(struct vm_area_st
 		vfs_check_frozen(sb, SB_FREEZE_WRITE);
 
 	sb_start_pagefault(sb);
+
+	/*
+	 * Update file times before taking page lock. We may end up failing the
+	 * fault so this update may be superfluous but who really cares...
+	 */
+	file_update_time(vma->vm_file);
+
 	ret = __block_page_mkwrite(vma, vmf, get_block);
 	sb_end_pagefault(sb);
 	return block_page_mkwrite_return(ret);
@@ -2844,6 +2878,7 @@ out:
 	ret = mpage_writepage(page, get_block, wbc);
 	if (ret == -EAGAIN)
 		ret = __block_write_full_page(inode, page, get_block, wbc,
+					      generic_submit_bh_handler,
 					      end_buffer_async_write);
 	return ret;
 }
@@ -3007,8 +3042,10 @@ EXPORT_SYMBOL(block_truncate_page);
  * The generic ->writepage function for buffer-backed address_spaces
  * this form passes in the end_io handler used to finish the IO.
  */
-int block_write_full_page_endio(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc, bh_end_io_t *handler)
+int generic_block_write_full_page(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc,
+				 bh_submit_io_t *submit_handler,
+				 bh_end_io_t *handler)
 {
 	struct inode * const inode = page->mapping->host;
 	loff_t i_size = i_size_read(inode);
@@ -3018,6 +3055,7 @@ int block_write_full_page_endio(struct p
 	/* Is the page fully inside i_size? */
 	if (page->index < end_index)
 		return __block_write_full_page(inode, page, get_block, wbc,
+					       submit_handler,
 					       handler);
 
 	/* Is the page fully outside i_size? (truncate in progress) */
@@ -3041,7 +3079,16 @@ int block_write_full_page_endio(struct p
 	 * writes to that region are not written out to the file."
 	 */
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-	return __block_write_full_page(inode, page, get_block, wbc, handler);
+	return __block_write_full_page(inode, page, get_block, wbc,
+				       submit_handler, handler);
+}
+EXPORT_SYMBOL(generic_block_write_full_page);
+
+int block_write_full_page_endio(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc, bh_end_io_t *handler)
+{
+	return generic_block_write_full_page(page, get_block, wbc,
+				     generic_submit_bh_handler, handler);
 }
 EXPORT_SYMBOL(block_write_full_page_endio);
 
@@ -3192,6 +3239,11 @@ int submit_bh(int rw, struct buffer_head
 }
 EXPORT_SYMBOL(submit_bh);
 
+int generic_submit_bh_handler(int rw, struct buffer_head * bh, void *fsdata)
+{
+	return submit_bh(rw, bh);
+}
+EXPORT_SYMBOL(generic_submit_bh_handler);
 /**
  * ll_rw_block: low-level access to block devices (DEPRECATED)
  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
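
bdi_congestion_wait() above is the standard open-coded wait loop: publish the waiter with prepare_to_wait(), re-test the condition, sleep in io_schedule(), and clean up with finish_wait(), so a wakeup between the test and the sleep is never lost. A user-space analogue of the same shape with a pthread condition variable, purely illustrative and not patch code, with "congested" standing in for bdi_write_congested2():

/*
 * Sketch: the recheck-under-wait shape of bdi_congestion_wait(),
 * transplanted to user space with pthreads.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool congested = true;

static void congestion_wait(void)
{
	pthread_mutex_lock(&lock);
	while (congested)	/* re-test after every wakeup, like the for (;;) loop */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void congestion_clear(void)	/* the wakeup side */
{
	pthread_mutex_lock(&lock);
	congested = false;
	pthread_mutex_unlock(&lock);
	pthread_cond_broadcast(&cond);
}

static void *unclog(void *arg)
{
	sleep(1);
	congestion_clear();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, unclog, NULL);
	congestion_wait();	/* blocks about a second, then returns */
	puts("uncongested");
	pthread_join(t, NULL);
	return 0;
}
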
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/cachefiles/daemon.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/cachefiles/daemon.c
--- linux-2.6.32-504.3.3.el6.orig/fs/cachefiles/daemon.c	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/cachefiles/daemon.c	2015-01-21 12:02:42.152254833 +0300
@@ -552,8 +552,7 @@ static int cachefiles_daemon_tag(struct 
  */
 static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
 {
-	struct fs_struct *fs;
-	struct dentry *dir;
+	struct path path;
 	const struct cred *saved_cred;
 	int ret;
 
@@ -573,24 +572,21 @@ static int cachefiles_daemon_cull(struct
 	}
 
 	/* extract the directory dentry from the cwd */
-	fs = current->fs;
-	read_lock(&fs->lock);
-	dir = dget(fs->pwd.dentry);
-	read_unlock(&fs->lock);
+	get_fs_pwd(current->fs, &path);
 
-	if (!S_ISDIR(dir->d_inode->i_mode))
+	if (!S_ISDIR(path.dentry->d_inode->i_mode))
 		goto notdir;
 
 	cachefiles_begin_secure(cache, &saved_cred);
-	ret = cachefiles_cull(cache, dir, args);
+	ret = cachefiles_cull(cache, path.dentry, args);
 	cachefiles_end_secure(cache, saved_cred);
 
-	dput(dir);
+	path_put(&path);
 	_leave(" = %d", ret);
 	return ret;
 
 notdir:
-	dput(dir);
+	path_put(&path);
 	kerror("cull command requires dirfd to be a directory");
 	return -ENOTDIR;
 
@@ -628,8 +624,7 @@ inval:
  */
 static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
 {
-	struct fs_struct *fs;
-	struct dentry *dir;
+	struct path path;
 	const struct cred *saved_cred;
 	int ret;
 
@@ -649,24 +644,21 @@ static int cachefiles_daemon_inuse(struc
 	}
 
 	/* extract the directory dentry from the cwd */
-	fs = current->fs;
-	read_lock(&fs->lock);
-	dir = dget(fs->pwd.dentry);
-	read_unlock(&fs->lock);
+	get_fs_pwd(current->fs, &path);
 
-	if (!S_ISDIR(dir->d_inode->i_mode))
+	if (!S_ISDIR(path.dentry->d_inode->i_mode))
 		goto notdir;
 
 	cachefiles_begin_secure(cache, &saved_cred);
-	ret = cachefiles_check_in_use(cache, dir, args);
+	ret = cachefiles_check_in_use(cache, path.dentry, args);
 	cachefiles_end_secure(cache, saved_cred);
 
-	dput(dir);
+	path_put(&path);
 	//_leave(" = %d", ret);
 	return ret;
 
 notdir:
-	dput(dir);
+	path_put(&path);
 	kerror("inuse command requires dirfd to be a directory");
 	return -ENOTDIR;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/char_dev.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/char_dev.c
--- linux-2.6.32-504.3.3.el6.orig/fs/char_dev.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/char_dev.c	2015-01-21 12:02:44.394195310 +0300
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
+#include <linux/device_cgroup.h>
 
 #include "internal.h"
 
@@ -69,8 +70,12 @@ void chrdev_show(struct seq_file *f, off
 
 	if (offset < CHRDEV_MAJOR_HASH_SIZE) {
 		mutex_lock(&chrdevs_lock);
-		for (cd = chrdevs[offset]; cd; cd = cd->next)
+		for (cd = chrdevs[offset]; cd; cd = cd->next) {
+			if (!devcgroup_device_visible(S_IFCHR, cd->major,
+						cd->baseminor, cd->minorct))
+				continue;
 			seq_printf(f, "%3d %s\n", cd->major, cd->name);
+		}
 		mutex_unlock(&chrdevs_lock);
 	}
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/cifs/file.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/cifs/file.c
--- linux-2.6.32-504.3.3.el6.orig/fs/cifs/file.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/cifs/file.c	2015-01-21 12:02:58.172829538 +0300
@@ -1986,7 +1986,8 @@ cifs_readdata_to_iov(struct cifs_readdat
 		/* go while there's data to be copied and no errors */
 		if (copy && !rc) {
 			pdata = kmap(page);
-			rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset,
+			rc = memcpy_toiovecend((struct iovec *)ii.data,
+					       pdata, ii.iov_offset,
 						(int)copy);
 			kunmap(page);
 			if (!rc) {
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/compat.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/compat.c
--- linux-2.6.32-504.3.3.el6.orig/fs/compat.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/compat.c	2015-01-21 12:02:52.018992889 +0300
@@ -50,6 +50,7 @@
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
 #include <linux/pagemap.h>
+#include <linux/ve_proto.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -72,6 +73,18 @@ int compat_printk(const char *fmt, ...)
 
 #include "read_write.h"
 
+int ve_compat_printk(int dst, const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	if (!compat_log)
+		return 0;
+	va_start(ap, fmt);
+	ret = ve_vprintk(dst, fmt, ap);
+	va_end(ap);
+	return ret;
+}
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -90,6 +103,21 @@ asmlinkage long compat_sys_utime(char __
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
 }
 
+asmlinkage long compat_sys_lutime(char __user * filename,
+		struct compat_utimbuf __user *t)
+{
+	struct timespec tv[2];
+
+	if (t) {
+		if (get_user(tv[0].tv_sec, &t->actime) ||
+		    get_user(tv[1].tv_sec, &t->modtime))
+			return -EFAULT;
+		tv[0].tv_nsec = 0;
+		tv[1].tv_nsec = 0;
+	}
+	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, AT_SYMLINK_NOFOLLOW);
+}
+
 asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags)
 {
 	struct timespec tv[2];
@@ -245,11 +273,8 @@ static int put_compat_statfs(struct comp
 	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
 	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
 	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
-	    __put_user(0, &ubuf->f_spare[0]) || 
-	    __put_user(0, &ubuf->f_spare[1]) || 
-	    __put_user(0, &ubuf->f_spare[2]) || 
-	    __put_user(0, &ubuf->f_spare[3]) || 
-	    __put_user(0, &ubuf->f_spare[4]))
+	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
 		return -EFAULT;
 	return 0;
 }
@@ -302,7 +327,9 @@ static int put_compat_statfs64(struct co
 	    __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
 	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
 	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
-	    __put_user(kbuf->f_frsize, &ubuf->f_frsize))
+	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
 		return -EFAULT;
 	return 0;
 }
@@ -342,12 +369,18 @@ asmlinkage long compat_sys_fstatfs64(uns
  */
 asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
 {
+	dev_t kdev;
 	struct super_block *sb;
 	struct compat_ustat tmp;
 	struct kstatfs sbuf;
 	int err;
 
-	sb = user_get_super(new_decode_dev(dev));
+	kdev = new_decode_dev(dev);
+	err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ);
+	if (err)
+		return err;
+
+	sb = user_get_super(kdev);
 	if (!sb)
 		return -EINVAL;
 	err = statfs_by_dentry(sb->s_root, &sbuf);
@@ -2288,3 +2321,16 @@ asmlinkage long compat_sys_timerfd_getti
 }
 
 #endif /* CONFIG_TIMERFD */
+
+#ifdef CONFIG_FHANDLE
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open_by_handle_at(int mountdirfd,
+			     struct file_handle __user *handle, int flags)
+{
+	return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
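
The put_compat_statfs()/put_compat_statfs64() hunks above start reporting f_flags to compat callers and clear the remaining spare words with one __clear_user() instead of five __put_user() calls. A short consumer-side sketch, assuming a glibc whose struct statfs already exposes the f_flags member; ST_RDONLY is written as the literal 1 because <sys/statfs.h> does not define the ST_* names:

/*
 * Sketch: read the f_flags word that the hunks above now fill in
 * for compat callers.
 */
#include <sys/statfs.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	struct statfs st;

	if (argc < 2 || statfs(argv[1], &st)) {
		perror("statfs");
		return 1;
	}
	printf("f_flags=%#lx%s\n", (unsigned long)st.f_flags,
	       (st.f_flags & 1 /* ST_RDONLY */) ? " (read-only)" : "");
	return 0;
}
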
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/compat_binfmt_elf.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/compat_binfmt_elf.c
--- linux-2.6.32-504.3.3.el6.orig/fs/compat_binfmt_elf.c	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/compat_binfmt_elf.c	2015-01-21 12:02:48.897075766 +0300
@@ -131,3 +131,5 @@ static void cputime_to_compat_timeval(co
  * We share all the actual code with the native (64-bit) version.
  */
 #include "binfmt_elf.c"
+
+EXPORT_SYMBOL(compat_elf_format);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/compat_ioctl.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/compat_ioctl.c
--- linux-2.6.32-504.3.3.el6.orig/fs/compat_ioctl.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/compat_ioctl.c	2015-01-21 12:02:43.839210045 +0300
@@ -1877,6 +1877,7 @@ COMPATIBLE_IOCTL(TIOCLINUX)
 COMPATIBLE_IOCTL(TIOCSBRK)
 COMPATIBLE_IOCTL(TIOCCBRK)
 ULONG_IOCTL(TIOCMIWAIT)
+COMPATIBLE_IOCTL(TIOCGSID)
 COMPATIBLE_IOCTL(TIOCGICOUNT)
 /* Little t */
 COMPATIBLE_IOCTL(TIOCGETD)
@@ -2741,7 +2742,7 @@ static void compat_ioctl_error(struct fi
 	 sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
 	if (!isprint(buf[1]))
 		sprintf(buf, "%02x", buf[1]);
-	compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+	ve_compat_printk(VE_LOG, "ioctl32(%s:%d): Unknown cmd fd(%d) "
 			"cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
 			current->comm, current->pid,
 			(int)fd, (unsigned int)cmd, buf,
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/configfs/dir.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/configfs/dir.c
--- linux-2.6.32-504.3.3.el6.orig/fs/configfs/dir.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/configfs/dir.c	2015-01-21 12:02:41.343276313 +0300
@@ -692,7 +692,8 @@ static int create_default_group(struct c
 			sd = child->d_fsdata;
 			sd->s_type |= CONFIGFS_USET_DEFAULT;
 		} else {
-			d_delete(child);
+			BUG_ON(child->d_inode);
+			d_drop(child);
 			dput(child);
 		}
 	}
@@ -1684,7 +1685,8 @@ int configfs_register_subsystem(struct c
 		err = configfs_attach_group(sd->s_element, &group->cg_item,
 					    dentry);
 		if (err) {
-			d_delete(dentry);
+			BUG_ON(dentry->d_inode);
+			d_drop(dentry);
 			dput(dentry);
 		} else {
 			spin_lock(&configfs_dirent_lock);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/dcache.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/dcache.c
--- linux-2.6.32-504.3.3.el6.orig/fs/dcache.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/dcache.c	2015-01-21 12:02:58.808812658 +0300
@@ -33,6 +33,8 @@
 #include <linux/bootmem.h>
 #include <linux/fs_struct.h>
 #include <linux/hardirq.h>
+#include <bc/beancounter.h>
+#include <bc/dcache.h>
 #include "internal.h"
 
 int sysctl_vfs_cache_pressure __read_mostly = 100;
@@ -43,9 +45,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOC
 
 EXPORT_SYMBOL(dcache_lock);
 
-static struct kmem_cache *dentry_cache __read_mostly;
-
-#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
+struct kmem_cache *dentry_cache __read_mostly;
 
 /*
  * This is the single most critical data structure when it comes
@@ -122,20 +122,70 @@ static void dentry_iput(struct dentry * 
 	}
 }
 
+static unsigned int dcache_time_shift = 32;
+static u64 jif_off;
+
+static __init void dcache_time_init(void)
+{
+	unsigned long jiff;
+	struct timespec ts;
+
+	ts.tv_sec = 0;
+	ts.tv_nsec = 0;
+	jiff = timespec_to_jiffies(&ts);
+	ts.tv_sec++;
+	jiff = timespec_to_jiffies(&ts) - jiff;
+
+	dcache_time_shift = ilog2(jiff);
+	jif_off = get_jiffies_64();
+
+	printk("Dcache time values: %lu -> %u , %Lu\n", jiff, dcache_time_shift, (unsigned long long)jif_off);
+}
+
+unsigned int dcache_update_time(void)
+{
+	u64 ctime;
+
+	ctime = (get_jiffies_64() - jif_off) >> dcache_time_shift;
+	if (unlikely(ctime > (u64)(unsigned int)-1))
+		printk("Strange DC timestamp %Lu %Lu %u -> %Lu\n",
+				get_jiffies_64(), jif_off, dcache_time_shift, ctime);
+	return ctime;
+}
+
 /*
  * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
  */
 static void dentry_lru_add(struct dentry *dentry)
 {
+	struct user_beancounter *bc = dentry->d_ub;
+
+	if (ub_dcache_lru_popup)
+		dentry->d_lru_time = dcache_update_time();
 	list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	dentry->d_sb->s_nr_dentry_unused++;
+	ub_dcache_insert(bc, dentry->d_lru_time);
+	list_add(&dentry->d_bclru, &bc->ub_dentry_lru);
+	bc->ub_dentry_unused++;
 	dentry_stat.nr_unused++;
 }
 
+static void dentry_lru_popup(struct dentry *dentry)
+{
+	BUG_ON(dentry->d_flags & DCACHE_BCTOP);
+	dentry->d_lru_time = dcache_update_time();
+	list_move(&dentry->d_bclru, &dentry->d_ub->ub_dentry_lru);
+}
+
 static void dentry_lru_add_tail(struct dentry *dentry)
 {
+	struct user_beancounter *bc = dentry->d_ub;
+
 	list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	dentry->d_sb->s_nr_dentry_unused++;
+	ub_dcache_insert(bc, 0);
+	list_add_tail(&dentry->d_bclru, &bc->ub_dentry_lru);
+	bc->ub_dentry_unused++;
 	dentry_stat.nr_unused++;
 }
 
@@ -144,6 +194,8 @@ static void dentry_lru_del(struct dentry
 	if (!list_empty(&dentry->d_lru)) {
 		list_del(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
+		list_del(&dentry->d_bclru);
+		dentry->d_ub->ub_dentry_unused--;
 		dentry_stat.nr_unused--;
 	}
 }
@@ -153,6 +205,8 @@ static void dentry_lru_del_init(struct d
 	if (likely(!list_empty(&dentry->d_lru))) {
 		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
+		list_del(&dentry->d_bclru);
+		dentry->d_ub->ub_dentry_unused--;
 		dentry_stat.nr_unused--;
 	}
 }
@@ -173,6 +227,11 @@ static struct dentry *d_kill(struct dent
 
 	list_del(&dentry->d_u.d_child);
 	dentry_stat.nr_dentry--;	/* For d_free, below */
+
+	ub_dcache_uncharge(dentry->d_ub, dentry->d_name.len);
+	if (dentry->d_flags & DCACHE_BCTOP)
+		list_del(&dentry->d_bclru);
+
 	/*drops the locks, at that point nobody can reach this dentry */
 	dentry_iput(dentry);
 	if (IS_ROOT(dentry))
@@ -212,7 +271,7 @@ static struct dentry *d_kill(struct dent
  * no dcache lock, please.
  */
 
-void dput(struct dentry *dentry)
+void dput_nocache(struct dentry *dentry, int nocache)
 {
 	if (!dentry)
 		return;
@@ -238,12 +297,18 @@ repeat:
 			goto unhash_it;
 	}
 	/* Unreachable? Get rid of it */
+	if (unlikely(nocache))
+		goto unhash_it;
  	if (d_unhashed(dentry))
 		goto kill_it;
   	if (list_empty(&dentry->d_lru)) {
   		dentry->d_flags |= DCACHE_REFERENCED;
+		if (dentry->d_flags & DCACHE_BCTOP)
+			ub_dcache_clear_owner(dentry);
 		dentry_lru_add(dentry);
-  	}
+  	} else if (ub_dcache_lru_popup)
+		dentry_lru_popup(dentry);
+
  	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	return;
@@ -258,6 +323,11 @@ kill_it:
 		goto repeat;
 }
 
+void dput(struct dentry *dentry)
+{
+	dput_nocache(dentry, 0);
+}
+
 /**
  * d_invalidate - invalidate a dentry
  * @dentry: dentry to invalidate
@@ -440,6 +510,91 @@ static void prune_one_dentry(struct dent
 	}
 }
 
+int __shrink_dcache_ub(struct user_beancounter *ub, int count, int popup)
+{
+	LIST_HEAD(referenced);
+	LIST_HEAD(tmp);
+	struct dentry *dentry;
+	struct super_block *sb = NULL;
+	int pruned = 0;
+	unsigned int d_time = 0;
+
+	while (!list_empty(&ub->ub_dentry_lru)) {
+		dentry = list_entry(ub->ub_dentry_lru.prev,
+				struct dentry, d_bclru);
+
+		if (popup && dentry->d_lru_time) {
+			if (!d_time)
+				d_time = dentry->d_lru_time;
+			else if (dentry->d_lru_time - d_time > ub_dcache_time_thresh)
+				break;
+		}
+
+		spin_lock(&dentry->d_lock);
+		if (!popup && (dentry->d_flags & DCACHE_REFERENCED)) {
+			dentry->d_flags &= ~DCACHE_REFERENCED;
+			list_move(&dentry->d_bclru, &referenced);
+		} else
+			list_move_tail(&dentry->d_bclru, &tmp);
+		spin_unlock(&dentry->d_lock);
+		cond_resched_lock(&dcache_lock);
+
+		count--;
+		if (!count)
+			break;
+	}
+
+	while (!list_empty(&tmp)) {
+		dentry = list_entry(tmp.prev, struct dentry, d_bclru);
+		dentry_lru_del_init(dentry);
+		spin_lock(&dentry->d_lock);
+		/*
+		 * We found an inuse dentry which was not removed from
+		 * the LRU because of laziness during lookup.  Do not free
+		 * it - just keep it off the LRU list.
+		 */
+		if (atomic_read(&dentry->d_count)) {
+			spin_unlock(&dentry->d_lock);
+			continue;
+		}
+
+		pruned++;
+
+		/*
+		 * Do not prune dentry if the filesystem is being unmounted,
+		 * otherwise we could race with shrink_dcache_for_umount() and
+		 * end up holding a reference to a dentry while the filesystem
+		 * is unmounted.
+		 */
+		if (dentry->d_sb != sb) {
+			if (!down_read_trylock(&dentry->d_sb->s_umount)) {
+				spin_unlock(&dentry->d_lock);
+				continue;
+			}
+			if (sb)
+				up_read(&sb->s_umount);
+			sb = dentry->d_sb;
+		}
+
+		prune_one_dentry(dentry);
+		/* dentry->d_lock was dropped in prune_one_dentry() */
+		cond_resched_lock(&dcache_lock);
+	}
+
+	list_splice(&referenced, &ub->ub_dentry_lru);
+
+	ub->ub_dentry_pruned += pruned;
+
+	/* report fake progress if lru isn't empty */
+	if (!pruned && !list_empty(&ub->ub_dentry_lru))
+		pruned = 1;
+
+	if (sb)
+		up_read(&sb->s_umount);
+
+	return pruned;
+}
+
 /*
  * Shrink the dentry LRU on a given superblock.
  * @sb   : superblock to shrink dentry LRU.
@@ -526,75 +681,97 @@ restart:
  *
  * This function may fail to free any resources if all the dentries are in use.
  */
-static void prune_dcache(int count)
+static void prune_dcache_rr(int count, gfp_t gfp_mask)
 {
-	struct super_block *sb;
+	struct user_beancounter *ub;
 	int w_count;
 	int unused = dentry_stat.nr_unused;
 	int prune_ratio;
-	int pruned;
 
 	if (unused == 0 || count == 0)
 		return;
 	spin_lock(&dcache_lock);
-restart:
 	if (count >= unused)
 		prune_ratio = 1;
 	else
 		prune_ratio = unused / count;
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_nr_dentry_unused == 0)
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (!get_beancounter_rcu(ub))
 			continue;
-		sb->s_count++;
-		/* Now, we reclaim unused dentrins with fairness.
-		 * We reclaim them same percentage from each superblock.
-		 * We calculate number of dentries to scan on this sb
-		 * as follows, but the implementation is arranged to avoid
-		 * overflows:
-		 * number of dentries to scan on this sb =
-		 * count * (number of dentries on this sb /
-		 * number of dentries in the machine)
-		 */
-		spin_unlock(&sb_lock);
+		rcu_read_unlock();
+
 		if (prune_ratio != 1)
-			w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
+			w_count = (ub->ub_dentry_unused / prune_ratio) + 1;
 		else
-			w_count = sb->s_nr_dentry_unused;
-		pruned = w_count;
-		/*
-		 * We need to be sure this filesystem isn't being unmounted,
-		 * otherwise we could race with generic_shutdown_super(), and
-		 * end up holding a reference to an inode while the filesystem
-		 * is unmounted.  So we try to get s_umount, and make sure
-		 * s_root isn't NULL.
-		 */
-		if (down_read_trylock(&sb->s_umount)) {
-			if ((sb->s_root != NULL) &&
-			    (!list_empty(&sb->s_dentry_lru))) {
-				spin_unlock(&dcache_lock);
-				__shrink_dcache_sb(sb, &w_count,
-						DCACHE_REFERENCED);
-				pruned -= w_count;
-				spin_lock(&dcache_lock);
-			}
-			up_read(&sb->s_umount);
-		}
-		spin_lock(&sb_lock);
-		count -= pruned;
-		/*
-		 * restart only when sb is no longer on the list and
-		 * we have more work to do.
-		 */
-		if (__put_super_and_need_restart(sb) && count > 0) {
-			spin_unlock(&sb_lock);
-			goto restart;
+			w_count = ub->ub_dentry_unused;
+
+		if (!(gfp_mask & __GFP_REPEAT)) {
+			int delta;
+
+			delta = ub->ub_dentry_unused - ub->ub_dcache_threshold;
+			if (delta <= 0)
+				goto skip;
+			if (w_count > delta)
+				w_count = delta;
 		}
+
+		__shrink_dcache_ub(ub, w_count, 0);
+
+skip:
+		rcu_read_lock();
+		put_beancounter(ub);
 	}
-	spin_unlock(&sb_lock);
+	rcu_read_unlock();
+
 	spin_unlock(&dcache_lock);
 }
 
+static void prune_dcache_popup(int count, gfp_t gfp_mask)
+{
+	spin_lock(&dcache_lock);
+	rcu_read_lock();
+	while (1) {
+		struct user_beancounter *ub;
+		unsigned int cnt;
+
+		if (count <= 0)
+			break;
+
+		ub = ub_dcache_next();
+		if (!ub)
+			break;
+		rcu_read_unlock();
+
+		cnt = __shrink_dcache_ub(ub, count, 1);
+		count -= cnt;
+
+		rcu_read_lock();
+		put_beancounter(ub);
+	}
+	rcu_read_unlock();
+	spin_unlock(&dcache_lock);
+
+	/*
+	 * As an optimization we do not keep beancounters whose dcache usage is
+	 * below threshold on the priority queue. As a result the code above
+	 * will not shrink them even if we are really short on memory, which is
+	 * not good. So if the caller insists by passing __GFP_REPEAT, let's
+	 * reclaim as much as possible by using the RR algorithm.
+	 */
+	if (count > 0 && (gfp_mask & __GFP_REPEAT))
+		prune_dcache_rr(count, gfp_mask);
+}
+
+static void prune_dcache(int count, gfp_t gfp_mask)
+{
+	if (ub_dcache_lru_popup)
+		prune_dcache_popup(count, gfp_mask);
+	else
+		prune_dcache_rr(count, gfp_mask);
+}
+
 /**
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
@@ -687,6 +864,13 @@ static void shrink_dcache_for_umount_sub
 					iput(inode);
 			}
 
+			ub_dcache_uncharge(dentry->d_ub, dentry->d_name.len);
+			if (dentry->d_flags & DCACHE_BCTOP) {
+				spin_lock(&dcache_lock);
+				list_del(&dentry->d_bclru);
+				spin_unlock(&dcache_lock);
+			}
+
 			d_free(dentry);
 
 			/* finished when we fall off the top of the tree,
@@ -888,12 +1072,20 @@ void shrink_dcache_parent(struct dentry 
  */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
+	int res = -1;
+
+	KSTAT_PERF_ENTER(shrink_dcache)
+	if (!ub_dcache_shrinkable(gfp_mask))
+		goto out;
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
-			return -1;
-		prune_dcache(nr);
+			goto out;
+		prune_dcache(nr, gfp_mask);
 	}
-	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+	res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+out:
+	KSTAT_PERF_LEAVE(shrink_dcache)
+	return res;
 }
 
 static struct shrinker dcache_shrinker = {
@@ -911,18 +1103,25 @@ static struct shrinker dcache_shrinker =
  * copied and the copy passed in may be reused after this call.
  */
  
-struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
+static struct dentry *__d_alloc(struct dentry *parent, const struct qstr *name,
+				struct user_beancounter *ub)
 {
 	struct dentry *dentry;
 	char *dname;
 
+	if (ub_dcache_charge(ub, name->len))
+		return NULL;
+
 	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
-	if (!dentry)
+	if (!dentry) {
+		ub_dcache_uncharge(ub, name->len);
 		return NULL;
+	}
 
 	if (name->len > DNAME_INLINE_LEN-1) {
 		dname = kmalloc(name->len + 1, GFP_KERNEL);
 		if (!dname) {
+			ub_dcache_uncharge(ub, name->len);
 			kmem_cache_free(dentry_cache, dentry); 
 			return NULL;
 		}
@@ -945,22 +1144,40 @@ struct dentry *d_alloc(struct dentry * p
 	dentry->d_op = NULL;
 	dentry->d_fsdata = NULL;
 	dentry->d_mounted = 0;
+	dentry->d_ub = ub;
+	dentry->d_lru_time = 0;
 	INIT_HLIST_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
+	INIT_LIST_HEAD(&dentry->d_bclru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
 	INIT_LIST_HEAD(&dentry->d_alias);
+	INIT_LIST_HEAD(&dentry->d_u.d_child);
+
+	spin_lock(&dcache_lock);
+	dentry_stat.nr_dentry++;
+	spin_unlock(&dcache_lock);
+
+	return dentry;
+}
+
+struct dentry *d_alloc(struct dentry *parent, const struct qstr *name)
+{
+	struct dentry *dentry;
+	
+	dentry = __d_alloc(parent, name,
+			   parent ? parent->d_ub : get_exec_ub());
+	if (!dentry)
+		return NULL;
 
+	spin_lock(&dcache_lock);
 	if (parent) {
 		dentry->d_parent = dget(parent);
 		dentry->d_sb = parent->d_sb;
+		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	} else {
-		INIT_LIST_HEAD(&dentry->d_u.d_child);
+		dentry->d_flags |= DCACHE_BCTOP;
+		list_add_tail(&dentry->d_bclru, &dentry->d_ub->ub_dentry_top);
 	}
-
-	spin_lock(&dcache_lock);
-	if (parent)
-		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-	dentry_stat.nr_dentry++;
 	spin_unlock(&dcache_lock);
 
 	return dentry;
@@ -968,7 +1185,7 @@ struct dentry *d_alloc(struct dentry * p
 
 struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
 {
-	struct dentry *dentry = d_alloc(NULL, name);
+	struct dentry *dentry = __d_alloc(NULL, name, get_ub0());
 	if (dentry) {
 		dentry->d_sb = sb;
 		dentry->d_parent = dentry;
@@ -1705,6 +1922,10 @@ already_unhashed:
 	switch_names(dentry, target);
 	swap(dentry->d_name.hash, target->d_name.hash);
 
+	if (dentry->d_ub != target->d_parent->d_ub &&
+			!(dentry->d_flags & DCACHE_BCTOP))
+		ub_dcache_change_owner(dentry, target->d_parent->d_ub);
+
 	/* ... and switch the parents */
 	if (IS_ROOT(dentry)) {
 		dentry->d_parent = target->d_parent;
@@ -1922,6 +2143,16 @@ static int prepend_name(char **buffer, i
 }
 
 /**
+ * d_root_check - checks whether a dentry is reachable from
+ * current's fs root
+ * @path: the dentry/vfsmount pair to be verified
+ */
+int d_root_check(struct path *path)
+{
+	return PTR_ERR(d_path(path, NULL, 0));
+}
+
+/**
  * __d_path - return the path of a dentry
  * @path: the dentry/vfsmount to report
  * @root: root vfsmnt/dentry (may be modified by this function)
@@ -1946,18 +2177,21 @@ char *__d_path(const struct path *path, 
 	struct vfsmount *vfsmnt = path->mnt;
 	char *end = buffer + buflen;
 	char *retval;
+	int deleted;
+	struct vfsmount *oldmnt = vfsmnt;
 
 	spin_lock(&vfsmount_lock);
-	prepend(&end, &buflen, "\0", 1);
-	if (d_unlinked(dentry) &&
-		(prepend(&end, &buflen, " (deleted)", 10) != 0))
+	if (buffer) {
+		prepend(&end, &buflen, "\0", 1);
+		if (buflen < 1)
 			goto Elong;
+	}
+	deleted = (!IS_ROOT(dentry) && d_unhashed(dentry));
 
-	if (buflen < 1)
-		goto Elong;
 	/* Get '/' right */
 	retval = end-1;
-	*retval = '/';
+	if (buffer)
+		*retval = '/';
 
 	for (;;) {
 		struct dentry * parent;
@@ -1975,20 +2209,44 @@ char *__d_path(const struct path *path, 
 		}
 		parent = dentry->d_parent;
 		prefetch(parent);
-		if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
-		    (prepend(&end, &buflen, "/", 1) != 0))
+		if (buffer && ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
+		    (prepend(&end, &buflen, "/", 1) != 0)))
 			goto Elong;
 		retval = end;
 		dentry = parent;
 	}
 
 out:
+	if (deleted && buffer &&
+			prepend(&retval, &buflen, " (deleted)", 10) != 0)
+		goto Elong;
+
 	spin_unlock(&vfsmount_lock);
-	return retval;
+	return buffer ? retval : NULL;
 
 global_root:
+	/*
+	 * We traversed the tree upward and reached a root, but the given
+	 * lookup terminal point wasn't encountered.  It means either that the
+	 * dentry is out of our scope or belongs to an abstract space like
+	 * sock_mnt or pipe_mnt.  Check for it.
+	 *
+	 * There are different options to check it.
+	 * We may assume that any dentry tree is unreachable unless it's
+	 * connected to `root' (defined as fs root of init aka child reaper)
+	 * and expose all paths that are not connected to it.
+	 * The other option is to allow exposing of known abstract spaces
+	 * explicitly and hide the path information for other cases.
+	 * This approach is safer, so let's take it.  2001/04/22  SAW
+	 */
+	if (!(oldmnt->mnt_sb->s_flags & MS_NOUSER) &&
+	    !ve_accessible_veid(vfsmnt->owner, get_exec_env()->veid)) {
+		retval = ERR_PTR(-EINVAL);
+		goto out_err;
+	}
+
 	retval += 1;	/* hit the slash */
-	if (prepend_name(&retval, &buflen, &dentry->d_name) != 0)
+	if (buffer && prepend_name(&retval, &buflen, &dentry->d_name) != 0)
 		goto Elong;
 	root->mnt = vfsmnt;
 	root->dentry = dentry;
@@ -1996,8 +2254,12 @@ global_root:
 
 Elong:
 	retval = ERR_PTR(-ENAMETOOLONG);
-	goto out;
+out_err:
+	spin_unlock(&vfsmount_lock);
+	return retval;
+
 }
+EXPORT_SYMBOL(__d_path);
 
 /**
  * d_path - return the path of a dentry
@@ -2027,14 +2289,14 @@ char *d_path(const struct path *path, ch
 	 * thus don't need to be hashed.  They also don't need a name until a
 	 * user wants to identify the object in /proc/pid/fd/.  The little hack
 	 * below allows us to generate a name for these objects on demand:
+	 * pipefs and socketfs methods assume a valid buffer; d_root_check()
+	 * supplies a NULL one for access checks.
+	 * supplies NULL one for access checks.
 	 */
-	if (path->dentry->d_op && path->dentry->d_op->d_dname)
+	if (buf && path->dentry->d_op && path->dentry->d_op->d_dname)
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
-	read_lock(&current->fs->lock);
-	root = current->fs->root;
-	path_get(&root);
-	read_unlock(&current->fs->lock);
+	get_fs_root(current->fs, &root);
 	spin_lock(&dcache_lock);
 	tmp = root;
 	res = __d_path(path, &tmp, buf, buflen);
@@ -2128,12 +2390,15 @@ SYSCALL_DEFINE2(getcwd, char __user *, b
 	if (!page)
 		return -ENOMEM;
 
-	read_lock(&current->fs->lock);
-	pwd = current->fs->pwd;
-	path_get(&pwd);
-	root = current->fs->root;
-	path_get(&root);
-	read_unlock(&current->fs->lock);
+	get_fs_root_and_pwd(current->fs, &root, &pwd);
+
+	if (pwd.dentry->d_inode->i_op &&
+			pwd.dentry->d_inode->i_op->permission) {
+		error = pwd.dentry->d_inode->i_op->permission(
+				pwd.dentry->d_inode, 0);
+		if (error == -ERESTARTSYS)
+			goto out;
+	}
 
 	error = -ENOENT;
 	spin_lock(&dcache_lock);
@@ -2143,6 +2408,16 @@ SYSCALL_DEFINE2(getcwd, char __user *, b
 		char * cwd;
 
 		cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE);
+
+		error = PTR_ERR(cwd);
+		if (error == -EINVAL) {
+			struct ve_struct *ve;
+
+			ve = get_exec_env();
+			tmp = ve->root_path;
+			cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE);
+		}
+
 		spin_unlock(&dcache_lock);
 
 		error = PTR_ERR(cwd);
@@ -2362,6 +2637,7 @@ void __init vfs_caches_init(unsigned lon
 	mnt_init();
 	bdev_cache_init();
 	chrdev_init();
+	dcache_time_init();
 }
 
 EXPORT_SYMBOL(d_alloc);
@@ -2381,6 +2657,7 @@ EXPORT_SYMBOL(d_add_ci);
 EXPORT_SYMBOL(d_validate);
 EXPORT_SYMBOL(dget_locked);
 EXPORT_SYMBOL(dput);
+EXPORT_SYMBOL(dput_nocache);
 EXPORT_SYMBOL(find_inode_number);
 EXPORT_SYMBOL(have_submounts);
 EXPORT_SYMBOL(names_cachep);
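
The dcache hunks above thread a user_beancounter through every dentry: __d_alloc() charges the owner (ub_dcache_charge) before the allocation, every failure path and d_kill() uncharge with the same name length, and each beancounter keeps its own d_bclru list so __shrink_dcache_ub() can reclaim per container. A compact user-space analogue of that pairing follows; the names are hypothetical, and it mirrors only the bookkeeping invariant, not the kernel's locking or LRU ordering:

/*
 * Sketch: per-tenant charge + LRU, mirroring the d_ub/d_bclru
 * bookkeeping above.  One rule matters: whoever charged must
 * uncharge with the same length on every exit path.
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct tenant {
	size_t charged;		/* ~ ub_dcache_charge() total    */
	size_t limit;
	struct entry *lru;	/* ~ ub_dentry_lru (singly here) */
};

struct entry {
	struct tenant *owner;	/* ~ dentry->d_ub */
	size_t len;
	struct entry *next;	/* ~ d_bclru link */
	char name[];
};

struct entry *entry_alloc(struct tenant *t, const char *name)
{
	size_t len = strlen(name);

	if (t->charged + len > t->limit)	/* charge first... */
		return NULL;
	t->charged += len;

	struct entry *e = malloc(sizeof(*e) + len + 1);
	if (!e) {
		t->charged -= len;	/* ...uncharge on failure */
		return NULL;
	}
	e->owner = t;
	e->len = len;
	memcpy(e->name, name, len + 1);
	e->next = t->lru;
	t->lru = e;
	return e;
}

void tenant_shrink(struct tenant *t)	/* ~ __shrink_dcache_ub() */
{
	while (t->lru) {
		struct entry *e = t->lru;

		t->lru = e->next;
		t->charged -= e->len;	/* uncharge on the kill path */
		free(e);
	}
}

int main(void)
{
	struct tenant t = { .limit = 64 };

	entry_alloc(&t, "etc");
	entry_alloc(&t, "passwd");
	printf("charged=%zu\n", t.charged);
	tenant_shrink(&t);
	printf("charged=%zu\n", t.charged);
	return 0;
}
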
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/debugfs/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/debugfs/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/debugfs/inode.c	2014-12-12 23:28:51.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/debugfs/inode.c	2015-01-21 12:02:43.338223345 +0300
@@ -228,9 +228,12 @@ struct dentry *debugfs_create_file(const
 {
 	struct dentry *dentry = NULL;
 	int error;
+	struct user_beancounter *ub;
 
 	pr_debug("debugfs: creating file '%s'\n",name);
 
+	ub = set_exec_ub(get_ub0());
+
 	error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
 			      &debugfs_mount_count);
 	if (error)
@@ -244,6 +247,7 @@ struct dentry *debugfs_create_file(const
 		goto exit;
 	}
 exit:
+	set_exec_ub(ub);
 	return dentry;
 }
 EXPORT_SYMBOL_GPL(debugfs_create_file);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/devpts/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/devpts/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/devpts/inode.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/devpts/inode.c	2015-01-21 12:02:47.714107171 +0300
@@ -88,7 +88,11 @@ static struct ctl_table pty_root_table[]
 
 static DEFINE_MUTEX(allocated_ptys_lock);
 
+#ifndef CONFIG_VE
 static struct vfsmount *devpts_mnt;
+#else
+# define devpts_mnt	(get_exec_env()->devpts_mnt)
+#endif
 
 struct pts_mount_opts {
 	int setuid;
@@ -350,7 +354,7 @@ devpts_fill_super(struct super_block *s,
 
 	inode = new_inode(s);
 	if (!inode)
-		goto free_fsi;
+		goto fail;
 	inode->i_ino = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
@@ -365,8 +369,6 @@ devpts_fill_super(struct super_block *s,
 	printk(KERN_ERR "devpts: get root dentry failed\n");
 	iput(inode);
 
-free_fsi:
-	kfree(s->s_fs_info);
 fail:
 	return -ENOMEM;
 }
@@ -471,11 +473,13 @@ static void devpts_kill_sb(struct super_
 	kill_litter_super(sb);
 }
 
-static struct file_system_type devpts_fs_type = {
+struct file_system_type devpts_fs_type = {
 	.name		= "devpts",
 	.get_sb		= devpts_get_sb,
 	.kill_sb	= devpts_kill_sb,
+	.fs_flags	= FS_VIRTUALIZED,
 };
+EXPORT_SYMBOL(devpts_fs_type);
 
 /*
  * The normal naming convention is simply /dev/pts/<number>; this conforms
@@ -535,6 +539,7 @@ int devpts_pty_new(struct inode *ptmx_in
 	struct dentry *root = sb->s_root;
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
+	int ret = 0;
 	char s[12];
 
 	/* We're supposed to be given the slave end of a pty */
@@ -550,21 +555,25 @@ int devpts_pty_new(struct inode *ptmx_in
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	init_special_inode(inode, S_IFCHR|opts->mode, device);
 	inode->i_private = tty;
-	tty->driver_data = inode;
 
 	sprintf(s, "%d", number);
 
 	mutex_lock(&root->d_inode->i_mutex);
 
 	dentry = d_alloc_name(root, s);
-	if (!IS_ERR(dentry)) {
+	if (dentry) {
 		d_add(dentry, inode);
 		fsnotify_create(root->d_inode, dentry);
+	} else {
+		iput(inode);
+		ret = -ENOMEM;
 	}
 
 	mutex_unlock(&root->d_inode->i_mutex);
 
-	return 0;
+	if (!ret)
+		tty->driver_data = inode;
+	return ret;
 }
 
 struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
@@ -600,17 +609,12 @@ void devpts_pty_kill(struct tty_struct *
 	mutex_lock(&root->d_inode->i_mutex);
 
 	dentry = d_find_alias(inode);
-	if (IS_ERR(dentry))
-		goto out;
-
-	if (dentry) {
-		inode->i_nlink--;
-		d_delete(dentry);
-		dput(dentry);	/* d_alloc_name() in devpts_pty_new() */
-	}
 
+	inode->i_nlink--;
+	d_delete(dentry);
+	dput(dentry);	/* d_alloc_name() in devpts_pty_new() */
 	dput(dentry);		/* d_find_alias above */
-out:
+
 	mutex_unlock(&root->d_inode->i_mutex);
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/direct-io.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/direct-io.c
--- linux-2.6.32-504.3.3.el6.orig/fs/direct-io.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/direct-io.c	2015-01-21 12:02:43.338223345 +0300
@@ -724,6 +724,8 @@ submit_page_section(struct dio *dio, str
 {
 	int ret = 0;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	if (dio->rw & WRITE) {
 		/*
 		 * Read accounting is performed in submit_bio()
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/drop_caches.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/drop_caches.c
--- linux-2.6.32-504.3.3.el6.orig/fs/drop_caches.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/drop_caches.c	2015-01-21 12:02:43.338223345 +0300
@@ -58,7 +58,7 @@ static void drop_slab(void)
 	int nr_objects;
 
 	do {
-		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+		nr_objects = shrink_slab(1000, GFP_KERNEL|__GFP_REPEAT, 1000);
 	} while (nr_objects > 10);
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/eventfd.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/eventfd.c
--- linux-2.6.32-504.3.3.el6.orig/fs/eventfd.c	2014-12-12 23:28:52.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/eventfd.c	2015-01-21 12:02:48.123096314 +0300
@@ -19,21 +19,6 @@
 #include <linux/kref.h>
 #include <linux/eventfd.h>
 
-struct eventfd_ctx {
-	struct kref kref;
-	wait_queue_head_t wqh;
-	/*
-	 * Every time that a write(2) is performed on an eventfd, the
-	 * value of the __u64 being written is added to "count" and a
-	 * wakeup is performed on "wqh". A read(2) will return the "count"
-	 * value to userspace, and will reset "count" to zero. The kernel
-	 * side eventfd_signal() also, adds to the "count" counter and
-	 * issue a wakeup.
-	 */
-	__u64 count;
-	unsigned int flags;
-};
-
 /**
  * eventfd_signal - Adds @n to the eventfd counter.
  * @ctx: [in] Pointer to the eventfd context.
@@ -263,12 +248,13 @@ static ssize_t eventfd_write(struct file
 	return res;
 }
 
-static const struct file_operations eventfd_fops = {
+const struct file_operations eventfd_fops = {
 	.release	= eventfd_release,
 	.poll		= eventfd_poll,
 	.read		= eventfd_read,
 	.write		= eventfd_write,
 };
+EXPORT_SYMBOL(eventfd_fops);
 
 /**
  * eventfd_fget - Acquire a reference of an eventfd file descriptor.
@@ -405,6 +391,7 @@ err_put_unused_fd:
 
 	return error;
 }
+EXPORT_SYMBOL_GPL(sys_eventfd2);
 
 SYSCALL_DEFINE1(eventfd, unsigned int, count)
 {
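
The eventfd hunks mostly un-static struct eventfd_ctx and eventfd_fops and export sys_eventfd2, presumably for the checkpoint/restore code elsewhere in this patch (the struct definition is removed from this file, apparently into a shared header). For reference, the counter semantics of the exported object, as described in the removed comment block:

/*
 * Sketch: eventfd counter semantics — writes add to the counter,
 * a read returns the total and resets it to zero.
 */
#include <sys/eventfd.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = eventfd(0, 0);
	uint64_t v = 3;

	if (fd < 0) {
		perror("eventfd");
		return 1;
	}
	write(fd, &v, sizeof(v));	/* count += 3 */
	v = 4;
	write(fd, &v, sizeof(v));	/* count += 4 */
	read(fd, &v, sizeof(v));	/* returns 7, count resets to 0 */
	printf("count=%llu\n", (unsigned long long)v);
	close(fd);
	return 0;
}
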
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/eventpoll.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/eventpoll.c
--- linux-2.6.32-504.3.3.el6.orig/fs/eventpoll.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/eventpoll.c	2015-01-21 12:02:57.963835087 +0300
@@ -31,6 +31,7 @@
 #include <linux/eventpoll.h>
 #include <linux/mount.h>
 #include <linux/bitops.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/anon_inodes.h>
 #include <asm/uaccess.h>
@@ -100,12 +101,6 @@
 
 #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
 
-struct epoll_filefd {
-	struct file *file;
-	int fd;
-	int added;
-};
-
 /*
  * Structure used to track possible nested calls, for too deep recursions
  * and loop cycles.
@@ -125,92 +120,6 @@ struct nested_calls {
 	spinlock_t lock;
 };
 
-/*
- * Each file descriptor added to the eventpoll interface will
- * have an entry of this type linked to the "rbr" RB tree.
- */
-struct epitem {
-	union {
-		/* RB tree node links this structure to the eventpoll RB tree */
-		struct rb_node rbn;
-		/* Used to free the struct epitem */
-		struct rcu_head rcu;
-	};
-
-	/* List header used to link this structure to the eventpoll ready list */
-	struct list_head rdllink;
-
-	/*
-	 * Works together "struct eventpoll"->ovflist in keeping the
-	 * single linked chain of items.
-	 */
-	struct epitem *next;
-
-	/* The file descriptor information this item refers to */
-	struct epoll_filefd ffd;
-
-	/* Number of active wait queue attached to poll operations */
-	int nwait;
-
-	/* List containing poll wait queues */
-	struct list_head pwqlist;
-
-	/* The "container" of this item */
-	struct eventpoll *ep;
-
-	/* List header used to link this item to the "struct file" items list */
-	struct list_head fllink;
-
-	/* The structure that describe the interested events and the source fd */
-	struct epoll_event event;
-};
-
-/*
- * This structure is stored inside the "private_data" member of the file
- * structure and rapresent the main data sructure for the eventpoll
- * interface.
- */
-struct eventpoll {
-	/* Protect the this structure access */
-	spinlock_t lock;
-
-	/*
-	 * This mutex is used to ensure that files are not removed
-	 * while epoll is using them. This is held during the event
-	 * collection loop, the file cleanup path, the epoll file exit
-	 * code and the ctl operations.
-	 */
-	struct mutex mtx;
-
-	/* Wait queue used by sys_epoll_wait() */
-	wait_queue_head_t wq;
-
-	/* Wait queue used by file->poll() */
-	wait_queue_head_t poll_wait;
-
-	/* List of ready file descriptors */
-	struct list_head rdllist;
-
-	/* RB tree root used to store monitored fd structs */
-	struct rb_root rbr;
-
-	/*
-	 * This is a single linked list that chains all the "struct epitem" that
-	 * happened while transfering ready events to userspace w/out
-	 * holding ->lock.
-	 */
-	struct epitem *ovflist;
-
-	/* The user that created the eventpoll descriptor */
-	struct user_struct *user;
-
-	struct file *file;
-
-	/* used to optimize loop detection check */
-	int visited;
-	struct list_head visited_list_link;
-};
-
 /* Wait structure used by the poll hooks */
 struct eppoll_entry {
 	/* List header used to link this structure to the "struct epitem" */
@@ -250,7 +159,8 @@ static int max_user_watches __read_mostl
 /*
  * This mutex is used to serialize ep_free() and eventpoll_release_file().
  */
-static DEFINE_MUTEX(epmutex);
+DEFINE_MUTEX(epmutex);
+EXPORT_SYMBOL(epmutex);
 
 /* Used to check for epoll file descriptor inclusion loops */
 static struct nested_calls poll_loop_ncalls;
@@ -316,7 +226,7 @@ static void clear_added_flag(struct tfil
 		tfile_check_iter->tfile_arr[i]->added = 0;
 }
 
-static void clear_tfile_check_list(void)
+void clear_tfile_check_list(void)
 {
 	struct tfile_check *tfile_check_iter, *tmp;
 
@@ -333,6 +243,7 @@ static void clear_tfile_check_list(void)
 	}
 	current_tfile_check = &base_tfile_check;
 }
+EXPORT_SYMBOL_GPL(clear_tfile_check_list);
 
 
 
@@ -355,7 +266,7 @@ ctl_table epoll_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static const struct file_operations eventpoll_fops;
+const struct file_operations eventpoll_fops;
 
 /* Setup the structure that is used as key for the RB tree */
 static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -739,6 +650,7 @@ static void ep_free(struct eventpoll *ep
 	mutex_unlock(&epmutex);
 	mutex_destroy(&ep->mtx);
 	free_uid(ep->user);
+	ep->user = NULL;
 	kfree(ep);
 }
 
@@ -819,10 +731,11 @@ static unsigned int ep_eventpoll_poll(st
 }
 
 /* File callbacks that implement the eventpoll file behaviour */
-static const struct file_operations eventpoll_fops = {
+const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_release,
 	.poll		= ep_eventpoll_poll
 };
+EXPORT_SYMBOL(eventpoll_fops);
 
 /* Fast test to see if the file is an evenpoll file */
 static inline int is_file_epoll(struct file *f)
@@ -898,7 +811,7 @@ free_uid:
  * are protected by the "mtx" mutex, and ep_find() must be called with
  * "mtx" held.
  */
-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
+struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 {
 	int kcmp;
 	struct rb_node *rbp;
@@ -921,6 +834,7 @@ static struct epitem *ep_find(struct eve
 
 	return epir;
 }
+EXPORT_SYMBOL(ep_find);
 
 /*
  * This is the callback that is passed to the wait queue wakeup
@@ -1139,7 +1053,7 @@ static int reverse_path_check(void)
 /*
  * Must be called with "mtx" held.
  */
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		     struct file *tfile, int fd, int full_check)
 {
 	int error, revents, pwake = 0;
@@ -1256,6 +1170,7 @@ error_unregister:
 
 	return error;
 }
+EXPORT_SYMBOL(ep_insert);
 
 /*
  * Modify the interest event mask by dropping an event if the new mask
@@ -1624,6 +1539,7 @@ SYSCALL_DEFINE1(epoll_create, int, size)
 
 	return sys_epoll_create1(0);
 }
+EXPORT_SYMBOL(sys_epoll_create);
 
 /*
  * The following function implements the controller interface for
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/exec.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/exec.c
--- linux-2.6.32-504.3.3.el6.orig/fs/exec.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/exec.c	2015-01-21 12:02:49.154068944 +0300
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/mm.h>
+#include <linux/virtinfo.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/smp_lock.h>
@@ -62,6 +63,9 @@
 #include <asm/tlb.h>
 #include "internal.h"
 
+#include <bc/vmpages.h>
+#include <bc/kmem.h>
+
 int core_uses_pid;
 char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
@@ -257,9 +261,14 @@ static int __bprm_mm_init(struct linux_b
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
 
-	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	err = -ENOMEM;
+	if (ub_memory_charge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags,
+				NULL, UB_SOFT))
+		goto err_charge;
+
+	bprm->vma = vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
 	if (!vma)
-		return -ENOMEM;
+		goto err_alloc;
 
 	down_write(&mm->mmap_sem);
 	vma->vm_mm = mm;
@@ -292,7 +301,10 @@ static int __bprm_mm_init(struct linux_b
 err:
 	up_write(&mm->mmap_sem);
 	bprm->vma = NULL;
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(mm, vma);
+err_alloc:
+	ub_memory_uncharge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, NULL);
+err_charge:
 	return err;
 }
 
@@ -551,6 +563,8 @@ static int shift_arg_pages(struct vm_are
 	unsigned long new_start = old_start - shift;
 	unsigned long new_end = old_end - shift;
 	struct mmu_gather *tlb;
+	unsigned long moved;
+	struct vm_area_struct *prev;
 
 	BUG_ON(new_start > new_end);
 
@@ -568,12 +582,11 @@ static int shift_arg_pages(struct vm_are
 		return -ENOMEM;
 
 	/*
-	 * move the page tables downwards, on failure we rely on
-	 * process cleanup to remove whatever mess we made.
+	 * move the page tables downwards, on failure undo changes.
 	 */
-	if (length != move_page_tables(vma, old_start,
-				       vma, new_start, length))
-		return -ENOMEM;
+	moved = move_page_tables(vma, old_start, vma, new_start, length);
+	if (length != moved)
+		goto undo;
 
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
@@ -601,6 +614,36 @@ static int shift_arg_pages(struct vm_are
 	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
 
 	return 0;
+
+undo:
+	/*
+	 * move the page tables back.
+	 */
+	length = move_page_tables(vma, new_start, vma, old_start, moved);
+	if (WARN_ON(length != moved))
+		return -EFAULT;
+
+	/*
+	 * release unused page tables.
+	 */
+	find_vma_prev(mm, vma->vm_start, &prev);
+	tlb = tlb_gather_mmu(mm, 0);
+	if (new_end > old_start)
+		free_pgd_range(tlb, new_start, old_start,
+				prev ? prev->vm_end : FIRST_USER_ADDRESS,
+				old_start);
+	else
+		free_pgd_range(tlb, new_start, new_end,
+				prev ? prev->vm_end : FIRST_USER_ADDRESS,
+				old_start);
+	tlb_finish_mmu(tlb, new_start, new_end);
+
+	/*
+	 * shrink the vma to the old range.
+	 */
+	vma_adjust(vma, old_start, old_end, vma->vm_pgoff, NULL);
+
+	return -ENOMEM;
 }
 
 #define EXTRA_STACK_VM_PAGES	20	/* random */
@@ -770,10 +813,10 @@ int kernel_read(struct file *file, loff_
 
 EXPORT_SYMBOL(kernel_read);
 
-static int exec_mmap(struct mm_struct *mm)
+static int exec_mmap(struct linux_binprm *bprm)
 {
 	struct task_struct *tsk;
-	struct mm_struct * old_mm, *active_mm;
+	struct mm_struct *old_mm, *active_mm, *mm;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
@@ -793,17 +836,18 @@ static int exec_mmap(struct mm_struct *m
 			return -EINTR;
 		}
 	}
+
+	mm = bprm->mm;
+	mm->vps_dumpable = VD_PTRACE_COREDUMP;
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
-	if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-		atomic_dec(&old_mm->oom_disable_count);
-		atomic_inc(&tsk->mm->oom_disable_count);
-	}
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
+	bprm->mm = NULL;		/* We're using it now */
+
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
 		BUG_ON(active_mm != old_mm);
@@ -907,6 +951,12 @@ static int de_thread(struct task_struct 
 		transfer_pid(leader, tsk, PIDTYPE_PGID);
 		transfer_pid(leader, tsk, PIDTYPE_SID);
 		list_replace_rcu(&leader->tasks, &tsk->tasks);
+#ifdef CONFIG_VE
+		list_replace_rcu(&leader->ve_task_info.vetask_list,
+				&tsk->ve_task_info.vetask_list);
+		list_replace(&leader->ve_task_info.aux_list,
+			     &tsk->ve_task_info.aux_list);
+#endif
 
 		tsk->group_leader = tsk;
 		leader->group_leader = tsk;
@@ -1036,12 +1086,10 @@ int flush_old_exec(struct linux_binprm *
 	 * Release all of the old mmap stuff
 	 */
 	acct_arg_size(bprm, 0);
-	retval = exec_mmap(bprm->mm);
+	retval = exec_mmap(bprm);
 	if (retval)
 		goto out;
 
-	bprm->mm = NULL;		/* We're using it now */
-
 	current->flags &= ~PF_RANDOMIZE;
 	flush_thread();
 	current->personality &= ~bprm->per_clear;
@@ -1191,7 +1239,7 @@ int check_unsafe_exec(struct linux_binpr
 	bprm->unsafe = tracehook_unsafe_exec(p);
 
 	n_fs = 1;
-	write_lock(&p->fs->lock);
+	spin_lock(&p->fs->lock);
 	rcu_read_lock();
 	for (t = next_thread(p); t != p; t = next_thread(t)) {
 		if (t->fs == p->fs)
@@ -1208,7 +1256,7 @@ int check_unsafe_exec(struct linux_binpr
 			res = 1;
 		}
 	}
-	write_unlock(&p->fs->lock);
+	spin_unlock(&p->fs->lock);
 
 	return res;
 }
@@ -1713,7 +1761,7 @@ static int zap_process(struct task_struc
 			signal_wake_up(t, 1);
 			nr++;
 		}
-	} while_each_thread(start, t);
+	} while_each_thread_ve(start, t);
 
 	return nr;
 }
@@ -1771,7 +1819,7 @@ static int zap_threads(struct task_struc
 	 *	next_thread().
 	 */
 	rcu_read_lock();
-	for_each_process(g) {
+	for_each_process_all(g) {
 		if (g == tsk->group_leader)
 			continue;
 		if (g->flags & PF_KTHREAD)
@@ -1787,7 +1835,7 @@ static int zap_threads(struct task_struc
 				}
 				break;
 			}
-		} while_each_thread(g, p);
+		} while_each_thread_all(g, p);
 	}
 	rcu_read_unlock();
 done:
@@ -1908,6 +1956,7 @@ void set_dumpable(struct mm_struct *mm, 
 		break;
 	}
 }
+EXPORT_SYMBOL_GPL(set_dumpable);
 
 /*
  * This returns the actual value of the suid_dumpable flag. For things
@@ -1970,7 +2019,7 @@ static void wait_for_dump_helpers(struct
  * is a special value that we use to trap recursive
  * core dumps
  */
-static int umh_pipe_setup(struct subprocess_info *info)
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct file *rp, *wp;
 	struct fdtable *fdt;
@@ -2042,7 +2091,8 @@ void do_coredump(long signr, int exit_co
 	/*
 	 * If another thread got here first, or we are not dumpable, bail out.
 	 */
-	if (mm->core_state || !get_dumpable(mm)) {
+	if (mm->core_state || !get_dumpable(mm) ||
+	    mm->vps_dumpable != VD_PTRACE_COREDUMP) {
 		up_write(&mm->mmap_sem);
 		put_cred(cred);
 		goto fail;
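
The undo path added to shift_arg_pages() above replaces "rely on process cleanup" with an explicit unwind: move_page_tables() reports how much it actually moved, the same helper is then run in reverse over exactly that amount, the now-unused page tables are freed, and the VMA is shrunk back to its old range. Below is a minimal user-space sketch of that unwind discipline; move_range() and everything else here are hypothetical stand-ins, not kernel APIs.

#include <stdio.h>
#include <string.h>

/* Hypothetical mover: copies up to fail_at bytes and returns how much it
 * moved, mirroring move_page_tables() returning the moved length. */
static size_t move_range(char *dst, const char *src, size_t len, size_t fail_at)
{
	size_t done = (fail_at < len) ? fail_at : len;

	memmove(dst, src, done);
	return done;
}

int main(void)
{
	/* bytes 4..13 are "in use"; bytes 0..3 are the empty target area,
	 * like the new stack location below old_start */
	char buf[32] = "....ARGV-PAGES";
	size_t length = 10, shift = 4;
	size_t moved;

	moved = move_range(buf, buf + shift, length, 6);	/* fails early */
	if (moved != length) {
		/* undo: move back only what was actually moved */
		memmove(buf + shift, buf, moved);
		printf("undone after %zu of %zu bytes: %s\n",
		       moved, length, buf + shift);
		return 1;	/* the kernel path returns -ENOMEM */
	}
	return 0;
}
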
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/exportfs/expfs.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/exportfs/expfs.c
--- linux-2.6.32-504.3.3.el6.orig/fs/exportfs/expfs.c	2014-12-12 23:28:51.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/exportfs/expfs.c	2015-01-21 12:02:58.048832830 +0300
@@ -319,9 +319,14 @@ static int export_encode_fh(struct dentr
 	struct inode * inode = dentry->d_inode;
 	int len = *max_len;
 	int type = FILEID_INO32_GEN;
-	
-	if (len < 2 || (connectable && len < 4))
+
+	if (connectable && (len < 4)) {
+		*max_len = 4;
+		return 255;
+	} else if (len < 2) {
+		*max_len = 2;
 		return 255;
+	}
 
 	len = 2;
 	fid->i32.ino = inode->i_ino;
@@ -354,7 +359,7 @@ int exportfs_encode_fh(struct dentry *de
 
 	return error;
 }
-EXPORT_SYMBOL_GPL(exportfs_encode_fh);
+EXPORT_SYMBOL(exportfs_encode_fh);
 
 struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 		int fh_len, int fileid_type,
@@ -368,6 +373,8 @@ struct dentry *exportfs_decode_fh(struct
 	/*
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
+	if (!nop || !nop->fh_to_dentry)
+		return ERR_PTR(-ESTALE);
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
 	if (!result)
 		result = ERR_PTR(-ESTALE);
@@ -484,6 +491,6 @@ struct dentry *exportfs_decode_fh(struct
 	dput(result);
 	return ERR_PTR(err);
 }
-EXPORT_SYMBOL_GPL(exportfs_decode_fh);
+EXPORT_SYMBOL(exportfs_decode_fh);
 
 MODULE_LICENSE("GPL");
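
The export_encode_fh() change above tightens the overflow contract: a too-small buffer still yields 255, but the required length (in 32-bit words) is now reported back through *max_len so the caller can retry. A user-space sketch of that handshake follows; the encoder, its constants, and the retry are illustrative assumptions, not the kernel's code.

#include <stdio.h>

static int encode_fh(unsigned *fh, int *max_len, int connectable)
{
	int need = connectable ? 4 : 2;	/* parent info doubles the size */

	if (*max_len < need) {
		*max_len = need;	/* report the required length... */
		return 255;		/* ...and the "invalid" type */
	}
	*max_len = need;
	fh[0] = 0x1234;			/* fake ino */
	fh[1] = 1;			/* fake generation */
	return 1;			/* fake FILEID type */
}

int main(void)
{
	unsigned fh[8];
	int len = 1;			/* deliberately too small */
	int type = encode_fh(fh, &len, 1);

	if (type == 255) {
		printf("short buffer, retrying with %d words\n", len);
		type = encode_fh(fh, &len, 1);	/* len was fixed up */
	}
	printf("type=%d len=%d\n", type, len);
	return 0;
}
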
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext2/namei.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext2/namei.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext2/namei.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext2/namei.c	2015-01-21 12:02:53.101964142 +0300
@@ -31,6 +31,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -262,6 +263,8 @@ static int ext2_unlink(struct inode * di
 	struct page * page;
 	int err = -ENOENT;
 
+	vfs_dq_init(inode);
+
 	de = ext2_find_entry (dir, &dentry->d_name, &page);
 	if (!de)
 		goto out;
@@ -304,6 +307,9 @@ static int ext2_rename (struct inode * o
 	struct ext2_dir_entry_2 * old_de;
 	int err = -ENOENT;
 
+	if (new_inode)
+		vfs_dq_init(new_inode);
+
 	old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
 	if (!old_de)
 		goto out;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext2/super.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext2/super.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext2/super.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext2/super.c	2015-01-21 12:02:43.617215938 +0300
@@ -117,6 +117,8 @@ static void ext2_put_super (struct super
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
+	vfs_dq_off(sb, 0);
+
 	lock_kernel();
 
 	if (sb->s_dirt)
@@ -1483,7 +1485,8 @@ static struct file_system_type ext2_fs_t
 	.name		= "ext2",
 	.get_sb		= ext2_get_sb,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_NEW_FREEZE | FS_HANDLE_QUOTA,
+	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_NEW_FREEZE |
+			  FS_HANDLE_QUOTA | FS_VIRTUALIZED,
 };
 
 static int __init init_ext2_fs(void)
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext3/fsync.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/fsync.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext3/fsync.c	2014-12-12 23:29:08.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/fsync.c	2015-01-21 12:02:52.549978794 +0300
@@ -56,9 +56,13 @@ int ext3_sync_file(struct file * file, s
 
 	trace_ext3_sync_file_enter(file, datasync);
 
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read the updated state */
+		smp_rmb();
+		if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+			return -EROFS;
 		return 0;
-
+	}
 
 	/*
 	 * data=writeback,ordered:
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext3/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext3/inode.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/inode.c	2015-01-21 12:02:53.287959204 +0300
@@ -3273,7 +3273,9 @@ int ext3_setattr(struct dentry *dentry, 
 
 	rc = inode_setattr(inode, attr);
 
-	if (!rc && (ia_valid & ATTR_MODE))
+	/* OpenVZ wants to change permissions for symlinks, but is not
+	 * interested in ACL support -dmon */
+	if (!rc && (ia_valid & ATTR_MODE) && !S_ISLNK(inode->i_mode))
 		rc = ext3_acl_chmod(inode);
 
 err_out:
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext3/ioctl.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/ioctl.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext3/ioctl.c	2014-12-12 23:29:14.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/ioctl.c	2015-01-21 12:02:43.969206593 +0300
@@ -78,7 +78,7 @@ long ext3_ioctl(struct file *filp, unsig
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!capable(CAP_SYS_ADMIN))
 				goto flags_out;
 		}
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext3/namei.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/namei.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext3/namei.c	2014-12-12 23:29:08.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/namei.c	2015-01-21 12:02:52.018992889 +0300
@@ -1341,7 +1341,7 @@ static int add_dirent_to_buf(handle_t *h
 	if (err)
 		ext3_std_error(dir->i_sb, err);
 	brelse(bh);
-	return 0;
+	return err;
 }
 
 /*
@@ -2229,13 +2229,6 @@ static int ext3_link (struct dentry * ol
 
 	if (inode->i_nlink >= EXT3_LINK_MAX)
 		return -EMLINK;
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 */
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext3/super.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/super.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext3/super.c	2014-12-12 23:29:27.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/super.c	2015-01-21 12:02:52.944968308 +0300
@@ -189,6 +189,11 @@ static void ext3_handle_error(struct sup
 			journal_abort(journal, -EIO);
 	}
 	if (test_opt (sb, ERRORS_RO)) {
+		/*
+		 * Make sure the updated value of ->s_mount_state is visible
+		 * before the ->s_flags update.
+		 */
+		smp_wmb();
 		ext3_msg(sb, KERN_CRIT,
 			"error: remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
@@ -297,9 +302,13 @@ void ext3_abort (struct super_block * sb
 
 	ext3_msg(sb, KERN_CRIT,
 		"error: remounting filesystem read-only");
-	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS | EXT3_MOUNT_ABORT;
+	/*
+	 * Make sure the updated value of ->s_mount_state is visible
+	 * before the ->s_flags update.
+	 */
+	smp_wmb();
 	sb->s_flags |= MS_RDONLY;
-	EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
 	if (EXT3_SB(sb)->s_journal)
 		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
@@ -413,6 +422,7 @@ static void ext3_put_super (struct super
 	struct ext3_super_block *es = sbi->s_es;
 	int i, err;
 
+	vfs_dq_off(sb, 0);
 	ext3_xattr_put_super(sb);
 	err = journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
@@ -1304,7 +1314,7 @@ static int ext3_setup_super(struct super
 		res = MS_RDONLY;
 	}
 	if (read_only)
-		return res;
+		goto out;
 	if (!(sbi->s_mount_state & EXT3_VALID_FS))
 		ext3_msg(sb, KERN_WARNING,
 			"warning: mounting unchecked fs, "
@@ -1356,6 +1366,8 @@ static int ext3_setup_super(struct super
 	} else {
 		ext3_msg(sb, KERN_INFO, "using internal journal");
 	}
+out:
+	sb->s_mnt_count = le16_to_cpu(es->s_mnt_count);
 	return res;
 }
 
@@ -1944,6 +1956,7 @@ static int ext3_fill_super (struct super
 	sb->s_qcop = &ext3_qctl_operations;
 	sb->dq_op = &ext3_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_resize_lock);
 
@@ -3082,7 +3095,8 @@ static struct file_system_type ext3_fs_t
 	.name		= "ext3",
 	.get_sb		= ext3_get_sb,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV  | FS_HAS_NEW_FREEZE | FS_HANDLE_QUOTA,
+	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_NEW_FREEZE |
+			  FS_HANDLE_QUOTA | FS_VIRTUALIZED,
 };
 
 static int __init init_ext3_fs(void)
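
The smp_wmb() calls added to ext3_handle_error() and ext3_abort() pair with the smp_rmb() added to ext3_sync_file(): the writer publishes EXT3_ERROR_FS in ->s_mount_state before setting MS_RDONLY in ->s_flags, and the reader checks the flag first and only then trusts the state. A C11 user-space sketch of that publish/observe pattern, with explicit fences standing in for the kernel barriers (names and values here are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static int mount_state;			/* EXT3_ERROR_FS lives here */
static _Atomic int flags_rdonly;	/* stands in for the MS_RDONLY bit */

static void writer_mark_error(void)	/* ~ ext3_handle_error() */
{
	mount_state |= 1;			/* EXT3_ERROR_FS */
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */
	atomic_store_explicit(&flags_rdonly, 1, memory_order_relaxed);
}

static int reader_sync_file(void)	/* ~ ext3_sync_file() */
{
	if (atomic_load_explicit(&flags_rdonly, memory_order_relaxed)) {
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
		if (mount_state & 1)
			return -30;	/* -EROFS */
		return 0;
	}
	return 0;	/* fall through to the real sync path */
}

int main(void)
{
	writer_mark_error();
	printf("sync returns %d\n", reader_sync_file());
	return 0;
}
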
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext3/symlink.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/symlink.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext3/symlink.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext3/symlink.c	2015-01-21 12:02:53.279959416 +0300
@@ -34,6 +34,7 @@ const struct inode_operations ext3_symli
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext3_symli
 const struct inode_operations ext3_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext3_follow_link,
+	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/Kconfig	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/Kconfig	2015-01-21 12:02:52.033992490 +0300
@@ -2,6 +2,7 @@ config EXT4_FS
 	tristate "The Extended 4 (ext4) filesystem"
 	select JBD2
 	select CRC16
+	select FHANDLE
 	help
 	  This is the next generation of the ext3 filesystem.
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/Makefile
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/Makefile	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/Makefile	2015-01-21 12:02:51.915995623 +0300
@@ -12,3 +12,4 @@ ext4-y	:= balloc.o bitmap.o dir.o file.o
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
+ext4-y					+= csum.o
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/balloc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/balloc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/balloc.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/balloc.c	2015-01-21 12:02:58.093831635 +0300
@@ -229,6 +229,7 @@ struct ext4_group_desc * ext4_get_group_
 		*bh = sbi->s_group_desc[group_desc];
 	return desc;
 }
+EXPORT_SYMBOL(ext4_get_group_desc);
 
 static int ext4_valid_block_bitmap(struct super_block *sb,
 					struct ext4_group_desc *desc,
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/csum.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/csum.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/csum.c	2015-01-21 12:02:51.915995623 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/csum.c	2015-01-21 12:02:52.432981899 +0300
@@ -0,0 +1,753 @@
+/*
+ * linux/fs/ext4/csum.c
+ *
+ * Automatic SHA-1 (FIPS 180-1) data checksumming
+ *
+ * Copyright (C) 2012 Parallels, inc.
+ *
+ * Author: Konstantin Khlebnikov
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/cryptohash.h>
+#include <linux/namei.h>
+#include <linux/init_task.h>	/* for init_cred */
+#include "ext4.h"
+#include "xattr.h"
+
+#include <trace/events/ext4.h>
+
+#define PFCACHE_MAX_PATH	(EXT4_DATA_CSUM_SIZE * 2 + 2)
+static void pfcache_path(struct inode *inode, char *path)
+{
+	char *p;
+	int i;
+
+	/* like .git/objects hex[0]/hex[1..] */
+	p = pack_hex_byte(path, EXT4_I(inode)->i_data_csum[0]);
+	*p++ = '/';
+	for (i = 1; i < EXT4_DATA_CSUM_SIZE; i++)
+		p = pack_hex_byte(p, EXT4_I(inode)->i_data_csum[i]);
+	*p = 0;
+}
+
+/* require inode->i_mutex held or unreachable inode */
+int ext4_open_pfcache(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	const struct cred *cur_cred;
+	char path[PFCACHE_MAX_PATH];
+	struct filename filename = { .name = path };
+	struct nameidata nd = {
+		.flags = LOOKUP_STRICT,
+		.last_type = LAST_ROOT,
+	};
+	int ret;
+
+	if (!(ext4_test_inode_state(inode, EXT4_STATE_CSUM) &&
+	      EXT4_I(inode)->i_data_csum_end < 0))
+		return -ENODATA;
+
+	if (!EXT4_SB(sb)->s_pfcache_root.mnt)
+		return -ENODEV;
+
+	spin_lock(&EXT4_SB(sb)->s_pfcache_lock);
+	nd.path = EXT4_SB(sb)->s_pfcache_root;
+	path_get(&nd.path);
+	spin_unlock(&EXT4_SB(sb)->s_pfcache_lock);
+
+	if (!nd.path.mnt)
+		return -ENODEV;
+
+	pfcache_path(inode, path);
+
+	cur_cred = override_creds(&init_cred);
+	/*
+	 * Files in the cache area must not have csum attributes, or pfcache
+	 * must be disabled for the underlying filesystem; otherwise real
+	 * lock recursion can happen on i_mutex.
+	 * Here we disable lockdep to avoid false-positive reports.
+	 */
+	lockdep_off();
+	ret = path_walk(&filename, &nd);
+	lockdep_on();
+	revert_creds(cur_cred);
+	if (ret)
+		return ret;
+
+	ret = open_inode_peer(inode, &nd.path, &init_cred);
+	if (!ret)
+		percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_pfcache_peers);
+	return ret;
+}
+
+/* require inode->i_mutex held or unreachable inode */
+int ext4_close_pfcache(struct inode *inode)
+{
+	if (!inode->i_peer_file)
+		return -ENOENT;
+	close_inode_peer(inode);
+	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_pfcache_peers);
+	return 0;
+}
+
+/* under sb->s_umount write lock */
+int ext4_relink_pfcache(struct super_block *sb, char *new_root, bool new_sb)
+{
+	int old_root = !!EXT4_SB(sb)->s_pfcache_root.mnt;
+	struct inode *inode, *old_inode = NULL;
+	char path[PFCACHE_MAX_PATH];
+	struct filename filename = { .name = path };
+	struct nameidata nd;
+	struct file *file;
+	long nr_opened = 0, nr_closed = 0, nr_total;
+	bool reload_csum = false;
+	struct path old_path;
+
+	if (new_root) {
+		int err = path_lookup(new_root, LOOKUP_FOLLOW |
+				LOOKUP_DIRECTORY, &nd);
+		if (err) {
+			printk(KERN_ERR"PFCache: lookup \"%s\" failed %d\n",
+					new_root, err);
+			return new_sb ? 0 : err;
+		}
+		if (!test_opt2(sb, CSUM)) {
+			set_opt2(sb, CSUM);
+			reload_csum = true;
+		}
+	} else {
+		nd.path.mnt = NULL;
+		nd.path.dentry = NULL;
+	}
+
+	if (new_sb) {
+		path_put(&EXT4_SB(sb)->s_pfcache_root);
+		EXT4_SB(sb)->s_pfcache_root = nd.path;
+		return 0;
+	}
+
+	spin_lock(&EXT4_SB(sb)->s_pfcache_lock);
+	old_path = EXT4_SB(sb)->s_pfcache_root;
+	EXT4_SB(sb)->s_pfcache_root = nd.path;
+	spin_unlock(&EXT4_SB(sb)->s_pfcache_lock);
+	path_put(&old_path);
+
+	spin_lock(&inode_lock);
+
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+			continue;
+		if (!ext4_test_inode_state(inode, EXT4_STATE_CSUM)) {
+			if (!reload_csum)
+				continue;
+		} else if (!(EXT4_I(inode)->i_data_csum_end < 0))
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_lock);
+		iput(old_inode);
+		old_inode = inode;
+
+		nd.path.mnt = NULL;
+		nd.path.dentry = NULL;
+
+		mutex_lock(&inode->i_mutex);
+
+		if (!ext4_test_inode_state(inode, EXT4_STATE_CSUM)) {
+			if (!reload_csum)
+				goto next;
+			if (S_ISDIR(inode->i_mode)) {
+				ext4_load_dir_csum(inode);
+				goto next;
+			}
+			if (ext4_load_data_csum(inode))
+				goto next;
+		} else if (!(EXT4_I(inode)->i_data_csum_end < 0) ||
+				S_ISDIR(inode->i_mode))
+			goto next;
+
+		if (new_root) {
+			const struct cred *cur_cred;
+			int err;
+
+			nd.flags = LOOKUP_STRICT;
+			nd.last_type = LAST_ROOT;
+			nd.depth = 0;
+			nd.path = EXT4_SB(sb)->s_pfcache_root;
+			path_get(&nd.path);
+
+			pfcache_path(inode, path);
+			cur_cred = override_creds(&init_cred);
+			err = path_walk(&filename, &nd);
+			revert_creds(cur_cred);
+			if (err) {
+				nd.path.mnt = NULL;
+				nd.path.dentry = NULL;
+			}
+		}
+
+		file = inode->i_peer_file;
+		if ((!nd.path.mnt && !file) || (nd.path.mnt && file &&
+		     file->f_mapping == nd.path.dentry->d_inode->i_mapping))
+			goto next;
+
+		if (file) {
+			close_inode_peer(inode);
+			nr_closed++;
+		}
+
+		if (nd.path.mnt) {
+			path_get(&nd.path);
+			if (!open_inode_peer(inode, &nd.path, &init_cred))
+				nr_opened++;
+		}
+next:
+		mutex_unlock(&inode->i_mutex);
+		path_put(&nd.path);
+		cond_resched();
+		spin_lock(&inode_lock);
+	}
+	spin_unlock(&inode_lock);
+	iput(old_inode);
+
+	percpu_counter_add(&EXT4_SB(sb)->s_pfcache_peers,
+			   nr_opened - nr_closed);
+	nr_total = percpu_counter_sum(&EXT4_SB(sb)->s_pfcache_peers);
+
+	if (new_root && (old_root || nr_total))
+		printk(KERN_INFO"PFCache: relink %u:%u to \"%s\""
+				" +%ld -%ld =%ld peers\n",
+				MAJOR(sb->s_dev), MINOR(sb->s_dev), new_root,
+				nr_opened, nr_closed, nr_total);
+	if (!new_root && nr_total)
+		printk(KERN_ERR"PFCache: %ld peers lost\n", nr_total);
+
+	return 0;
+}
+
+#define MAX_LOCK_BATCH	256
+
+long ext4_dump_pfcache(struct super_block *sb,
+		      struct pfcache_dump_request __user *user_req)
+{
+	struct inode *inode, *old_inode = NULL;
+	struct pfcache_dump_request req;
+	u8 __user *user_buffer;
+	u64 state, *x;
+	void *buffer, *p;
+	long ret, size;
+	int lock_batch = 0;
+
+	if (copy_from_user(&req, user_req, sizeof(req)))
+		return -EFAULT;
+
+	if (!access_ok(VERIFY_WRITE, user_req,
+		       req.header_size + req.buffer_size))
+		return -EFAULT;
+
+	/* check for unknown flags */
+	if ((req.filter & ~PFCACHE_FILTER_MASK) ||
+	    (req.payload & ~PFCACHE_PAYLOAD_MASK))
+		return -EINVAL;
+
+	buffer = kzalloc(PFCACHE_PAYLOAD_MAX_SIZE, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	ret = 0;
+	/* skip all new fields in the user request header */
+	user_buffer = (void*)user_req + req.header_size;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		if (!S_ISREG(inode->i_mode) ||
+		    inode == EXT4_SB(sb)->s_balloon_ino)
+			goto next;
+
+		/* evaluate the inode state */
+		state = 0;
+
+		if (ext4_test_inode_state(inode, EXT4_STATE_CSUM) &&
+		    EXT4_I(inode)->i_data_csum_end < 0)
+			state |= PFCACHE_FILTER_WITH_CSUM;
+		else
+			state |= PFCACHE_FILTER_WITHOUT_CSUM;
+
+		if (inode->i_peer_file)
+			state |= PFCACHE_FILTER_WITH_PEER;
+		else
+			state |= PFCACHE_FILTER_WITHOUT_PEER;
+
+		/* check state-filter */
+		if (req.filter & state)
+			goto next;
+
+		/* check csum-filter */
+		if ((req.filter & PFCACHE_FILTER_COMPARE_CSUM) &&
+		    memcmp(EXT4_I(inode)->i_data_csum,
+			    req.csum_filter, EXT4_DATA_CSUM_SIZE))
+			goto next;
+
+		/* -- add new filters above this line -- */
+
+		/* check offset-filter at the last */
+		if (req.offset > 0) {
+			req.offset--;
+			goto next;
+		}
+
+		/* construct the payload */
+		p = buffer;
+
+		if (req.payload & PFCACHE_PAYLOAD_CSUM) {
+			BUILD_BUG_ON(PFCACHE_CSUM_SIZE != EXT4_DATA_CSUM_SIZE);
+			if (state & PFCACHE_FILTER_WITH_CSUM)
+				memcpy(p, EXT4_I(inode)->i_data_csum,
+						EXT4_DATA_CSUM_SIZE);
+			else
+				memset(p, 0, EXT4_DATA_CSUM_SIZE);
+			p += ALIGN(PFCACHE_CSUM_SIZE, sizeof(u64));
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_FHANDLE) {
+			int fh_ret = vfs_inode_fhandle(inode, p,
+					PFCACHE_FHANDLE_MAX);
+			if (fh_ret < 0) {
+				ret = fh_ret;
+				goto out;
+			}
+			p += ALIGN(fh_ret, sizeof(u64));
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_STATE) {
+			x = p;
+			*x = state;
+			p += sizeof(u64);
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_FSIZE) {
+			x = p;
+			*x = i_size_read(inode);
+			p += sizeof(u64);
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_PAGES) {
+			x = p;
+			*x = inode->i_mapping->nrpages;
+			p += sizeof(u64);
+		}
+
+		/* -- add new payloads above this line -- */
+
+		size = p - buffer;
+		BUG_ON(!IS_ALIGNED(size, sizeof(u64)));
+		BUG_ON(size > PFCACHE_PAYLOAD_MAX_SIZE);
+
+		if (size > req.buffer_size)
+			goto out;
+
+		pagefault_disable();
+		if (!__copy_to_user_inatomic(user_buffer, buffer, size)) {
+			pagefault_enable();
+		} else {
+			pagefault_enable();
+			__iget(inode);
+			spin_unlock(&inode_lock);
+			iput(old_inode);
+			old_inode = inode;
+			if (copy_to_user(user_buffer, buffer, size)) {
+				ret = -EFAULT;
+				goto out_nolock;
+			}
+			cond_resched();
+			lock_batch = 0;
+			spin_lock(&inode_lock);
+		}
+
+		ret++;
+		user_buffer += size;
+		req.buffer_size -= size;
+next:
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -EINTR;
+			goto out;
+		}
+		if (++lock_batch > MAX_LOCK_BATCH || need_resched() ||
+				spin_needbreak(&inode_lock)) {
+			__iget(inode);
+			spin_unlock(&inode_lock);
+			iput(old_inode);
+			old_inode = inode;
+			cond_resched();
+			lock_batch = 0;
+			spin_lock(&inode_lock);
+		}
+	}
+out:
+	spin_unlock(&inode_lock);
+out_nolock:
+	iput(old_inode);
+
+	kfree(buffer);
+
+	return ret;
+}
+
+static void ext4_init_data_csum(struct inode *inode)
+{
+	EXT4_I(inode)->i_data_csum_end = 0;
+	sha_init((__u32 *)EXT4_I(inode)->i_data_csum);
+	ext4_set_inode_state(inode, EXT4_STATE_CSUM);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_csum_partial);
+}
+
+void ext4_clear_data_csum(struct inode *inode)
+{
+	ext4_clear_inode_state(inode, EXT4_STATE_CSUM);
+	if (!S_ISREG(inode->i_mode))
+		return;
+	if (EXT4_I(inode)->i_data_csum_end < 0)
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_csum_complete);
+	else
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_csum_partial);
+}
+
+void ext4_start_data_csum(struct inode *inode)
+{
+	if (!ext4_test_inode_state(inode, EXT4_STATE_CSUM)) {
+		spin_lock(&inode->i_lock);
+		if (!ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+			ext4_init_data_csum(inode);
+		spin_unlock(&inode->i_lock);
+	}
+	trace_ext4_start_data_csum(inode, inode->i_size);
+}
+
+int ext4_load_data_csum(struct inode *inode)
+{
+	int ret;
+
+	ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
+			EXT4_DATA_CSUM_NAME, EXT4_I(inode)->i_data_csum,
+			EXT4_DATA_CSUM_SIZE);
+	if (ret < 0)
+		return ret;
+	if (ret != EXT4_DATA_CSUM_SIZE)
+		return -EIO;
+
+	EXT4_I(inode)->i_data_csum_end = -1;
+	ext4_set_inode_state(inode, EXT4_STATE_CSUM);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_csum_complete);
+	return 0;
+}
+
+static int ext4_save_data_csum(struct inode *inode, u8 *csum)
+{
+	int ret;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM) &&
+	    EXT4_I(inode)->i_data_csum_end < 0 &&
+	    memcmp(EXT4_I(inode)->i_data_csum, csum, EXT4_DATA_CSUM_SIZE))
+		ext4_close_pfcache(inode);
+
+	spin_lock(&inode->i_lock);
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_clear_data_csum(inode);
+	memcpy(EXT4_I(inode)->i_data_csum, csum, EXT4_DATA_CSUM_SIZE);
+	EXT4_I(inode)->i_data_csum_end = -1;
+	ext4_set_inode_state(inode, EXT4_STATE_CSUM);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_csum_complete);
+	spin_unlock(&inode->i_lock);
+	trace_ext4_save_data_csum(inode, inode->i_size);
+
+	ext4_open_pfcache(inode);
+
+	/* In order to guarantee csum consistency, force block allocation first */
+	ret = ext4_alloc_da_blocks(inode);
+	if (ret)
+		return ret;
+
+	WARN_ON(journal_current_handle());
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+			EXT4_DATA_CSUM_NAME, EXT4_I(inode)->i_data_csum,
+			EXT4_DATA_CSUM_SIZE, 0);
+}
+
+void ext4_load_dir_csum(struct inode *inode)
+{
+	char value[EXT4_DIR_CSUM_VALUE_LEN];
+	int ret;
+
+	ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
+			     EXT4_DATA_CSUM_NAME, value, sizeof(value));
+	if (ret == EXT4_DIR_CSUM_VALUE_LEN &&
+	    !strncmp(value, EXT4_DIR_CSUM_VALUE, sizeof(value)))
+		ext4_set_inode_state(inode, EXT4_STATE_CSUM);
+}
+
+void ext4_save_dir_csum(struct inode *inode)
+{
+	ext4_set_inode_state(inode, EXT4_STATE_CSUM);
+	ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+			EXT4_DATA_CSUM_NAME,
+			EXT4_DIR_CSUM_VALUE,
+			EXT4_DIR_CSUM_VALUE_LEN, 0);
+}
+
+void ext4_truncate_data_csum(struct inode *inode, loff_t pos)
+{
+	if (!S_ISREG(inode->i_mode))
+		return;
+
+	trace_ext4_truncate_data_csum(inode, pos);
+
+	if (EXT4_I(inode)->i_data_csum_end < 0) {
+		WARN_ON(journal_current_handle());
+		ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+				EXT4_DATA_CSUM_NAME, NULL, 0, 0);
+		ext4_close_pfcache(inode);
+	}
+
+	spin_lock(&inode->i_lock);
+	ext4_clear_data_csum(inode);
+	if (!pos && test_opt2(inode->i_sb, CSUM))
+		ext4_init_data_csum(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+void ext4_check_pos_data_csum(struct inode *inode, loff_t pos)
+{
+	if ((pos & ~(loff_t)(SHA_MESSAGE_BYTES-1)) !=
+	     EXT4_I(inode)->i_data_csum_end)
+		ext4_truncate_data_csum(inode, pos);
+}
+
+void ext4_update_data_csum(struct inode *inode, loff_t pos,
+			   unsigned len, struct page *page)
+{
+	__u32 *digest = (__u32 *)EXT4_I(inode)->i_data_csum;
+	const u8 *kaddr, *data;
+
+	if (!len)
+		return;
+
+	len += pos & (SHA_MESSAGE_BYTES-1);
+	len &= ~(SHA_MESSAGE_BYTES-1);
+	pos &= ~(loff_t)(SHA_MESSAGE_BYTES-1);
+
+	BUG_ON(pos != EXT4_I(inode)->i_data_csum_end);
+
+	EXT4_I(inode)->i_data_csum_end += len;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	data = kaddr + (pos & (PAGE_CACHE_SIZE - 1));
+	sha_batch_transform(digest, data, len / SHA_MESSAGE_BYTES);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	trace_ext4_update_data_csum(inode, pos);
+}
+
+static int ext4_finish_data_csum(struct inode *inode, u8 *csum)
+{
+	__u32 *digest = (__u32 *)csum;
+	__u8 data[SHA_MESSAGE_BYTES * 2];
+	loff_t end;
+	unsigned tail;
+	__be64 bits;
+
+	BUILD_BUG_ON(EXT4_DATA_CSUM_SIZE != SHA_DIGEST_WORDS * 4);
+
+	memcpy(csum, EXT4_I(inode)->i_data_csum, EXT4_DATA_CSUM_SIZE);
+
+	end = EXT4_I(inode)->i_data_csum_end;
+	if (end < 0)
+		return 0;
+	if (!inode->i_size)
+		return -ENODATA;
+
+	tail = inode->i_size - end;
+	if (tail >= SHA_MESSAGE_BYTES)
+		return -EIO;
+
+	if (tail) {
+		struct page *page;
+		const u8 *kaddr;
+
+		page = read_cache_page_gfp(inode->i_mapping,
+					   end >> PAGE_CACHE_SHIFT,
+					   GFP_NOFS);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(data, kaddr + (end & (PAGE_CACHE_SIZE-1)), tail);
+		kunmap_atomic(kaddr, KM_USER0);
+		page_cache_release(page);
+	}
+
+	memset(data + tail, 0, sizeof(data) - tail);
+	data[tail] = 0x80;
+
+	bits = cpu_to_be64((end + tail) << 3);
+	if (tail >= SHA_MESSAGE_BYTES - sizeof(bits)) {
+		memcpy(data + SHA_MESSAGE_BYTES * 2 - sizeof(bits),
+				&bits, sizeof(bits));
+		sha_batch_transform(digest, data, 2);
+	} else {
+		memcpy(data + SHA_MESSAGE_BYTES - sizeof(bits),
+				&bits, sizeof(bits));
+		sha_batch_transform(digest, data, 1);
+	}
+
+	for (tail = 0; tail < SHA_DIGEST_WORDS; tail++)
+		digest[tail] = cpu_to_be32(digest[tail]);
+
+	return 0;
+}
+
+void ext4_commit_data_csum(struct inode *inode)
+{
+	u8 csum[EXT4_DATA_CSUM_SIZE];
+
+	if (!S_ISREG(inode->i_mode) || EXT4_I(inode)->i_data_csum_end < 0)
+		return;
+
+	mutex_lock(&inode->i_mutex);
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM) &&
+	    !ext4_finish_data_csum(inode, csum))
+		ext4_save_data_csum(inode, csum);
+	else
+		ext4_truncate_data_csum(inode, 0);
+	mutex_unlock(&inode->i_mutex);
+}
+
+static int ext4_xattr_trusted_csum_get(struct inode *inode, const char *name,
+				       void *buffer, size_t size)
+{
+	u8 csum[EXT4_DATA_CSUM_SIZE];
+	int i;
+
+	if (strcmp(name, ""))
+		return -ENODATA;
+
+	if (!test_opt2(inode->i_sb, CSUM))
+		return -EOPNOTSUPP;
+
+	if (S_ISDIR(inode->i_mode))
+		return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
+				      EXT4_DATA_CSUM_NAME, buffer, size);
+
+	if (!S_ISREG(inode->i_mode))
+		return -ENODATA;
+
+	if (!buffer)
+		return EXT4_DATA_CSUM_SIZE * 2;
+
+	spin_lock(&inode->i_lock);
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM) &&
+	    EXT4_I(inode)->i_data_csum_end < 0) {
+		memcpy(csum, EXT4_I(inode)->i_data_csum, EXT4_DATA_CSUM_SIZE);
+	} else {
+		spin_unlock(&inode->i_lock);
+		return -ENODATA;
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (size == EXT4_DATA_CSUM_SIZE) {
+		memcpy(buffer, csum, EXT4_DATA_CSUM_SIZE);
+		return EXT4_DATA_CSUM_SIZE;
+	}
+
+	if (size >= EXT4_DATA_CSUM_SIZE * 2) {
+		for (i = 0; i < EXT4_DATA_CSUM_SIZE; i++)
+			buffer = pack_hex_byte(buffer, csum[i]);
+		return EXT4_DATA_CSUM_SIZE * 2;
+	}
+
+	return -ERANGE;
+}
+
+static int ext4_xattr_trusted_csum_set(struct inode *inode, const char *name,
+				const void *value, size_t size, int flags)
+{
+	const char *text = value;
+	u8 csum[EXT4_DATA_CSUM_SIZE];
+	int i;
+
+	if (strcmp(name, ""))
+		return -ENODATA;
+
+	if (!test_opt2(inode->i_sb, CSUM))
+		return -EOPNOTSUPP;
+
+	if (S_ISDIR(inode->i_mode)) {
+		if (!value)
+			ext4_clear_inode_state(inode, EXT4_STATE_CSUM);
+		else if (size == EXT4_DIR_CSUM_VALUE_LEN &&
+			 !strncmp(value, EXT4_DIR_CSUM_VALUE, size))
+			ext4_set_inode_state(inode, EXT4_STATE_CSUM);
+		else
+			return -EINVAL;
+
+		return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+				      EXT4_DATA_CSUM_NAME, value, size, flags);
+	}
+
+	if (!S_ISREG(inode->i_mode))
+		return -ENODATA;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM)) {
+		if (flags & XATTR_CREATE)
+			return -EEXIST;
+	} else {
+		if (flags & XATTR_REPLACE)
+			return -ENODATA;
+	}
+
+	if (!value) {
+		ext4_truncate_data_csum(inode, 1);
+		return 0;
+	}
+
+	if (size == EXT4_DATA_CSUM_SIZE) {
+		memcpy(csum, value, EXT4_DATA_CSUM_SIZE);
+	} else if (size == EXT4_DATA_CSUM_SIZE * 2) {
+		for (i = 0; i < EXT4_DATA_CSUM_SIZE; i++) {
+			int hi = hex_to_bin(text[i*2]);
+			int lo = hex_to_bin(text[i*2+1]);
+			if ((hi < 0) || (lo < 0))
+				return -EINVAL;
+			csum[i] = (hi << 4) | lo;
+		}
+	} else
+		return -EINVAL;
+
+	if (mapping_writably_mapped(inode->i_mapping))
+		return -EBUSY;
+
+	return ext4_save_data_csum(inode, csum);
+}
+
+#define XATTR_TRUSTED_CSUM_PREFIX XATTR_TRUSTED_PREFIX EXT4_DATA_CSUM_NAME
+#define XATTR_TRUSTED_CSUM_PREFIX_LEN (sizeof (XATTR_TRUSTED_CSUM_PREFIX) - 1)
+
+static size_t
+ext4_xattr_trusted_csum_list(struct inode *inode, char *list, size_t list_size,
+			     const char *name, size_t name_len)
+{
+	return 0;
+}
+
+struct xattr_handler ext4_xattr_trusted_csum_handler = {
+	.prefix = XATTR_TRUSTED_CSUM_PREFIX,
+	.list   = ext4_xattr_trusted_csum_list,
+	.get    = ext4_xattr_trusted_csum_get,
+	.set    = ext4_xattr_trusted_csum_set,
+};
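
ext4_finish_data_csum() above hand-rolls SHA-1 finalization: the unhashed tail is followed by a 0x80 byte, zero padding, and the total message length in bits as a big-endian 64-bit value, using one or two 64-byte blocks depending on whether the length still fits after the tail. A user-space sketch of just that padding layout (sha1_pad() is a hypothetical helper, not the kernel's):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define SHA_MESSAGE_BYTES 64

/* Returns how many 64-byte blocks finalization needs (1 or 2). */
static int sha1_pad(uint8_t block[2 * SHA_MESSAGE_BYTES],
		    const uint8_t *tail_data, unsigned tail,
		    uint64_t total_bytes)
{
	uint64_t bits = total_bytes * 8;
	int blocks = (tail >= SHA_MESSAGE_BYTES - 8) ? 2 : 1;
	int i;

	memset(block, 0, 2 * SHA_MESSAGE_BYTES);
	memcpy(block, tail_data, tail);
	block[tail] = 0x80;	/* mandatory terminator bit */

	/* big-endian length in the last 8 bytes of the final block */
	for (i = 0; i < 8; i++)
		block[blocks * SHA_MESSAGE_BYTES - 1 - i] =
			(uint8_t)(bits >> (8 * i));
	return blocks;
}

int main(void)
{
	uint8_t block[2 * SHA_MESSAGE_BYTES];
	uint8_t tail[60] = { 0 };

	/* 3 aligned blocks already hashed plus a 60-byte tail: 60 >= 56,
	 * so the length no longer fits and a second block is needed -
	 * exactly the two sha_batch_transform() cases above. */
	printf("blocks=%d\n", sha1_pad(block, tail, 60, 3 * 64 + 60));
	return 0;
}
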
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/dir.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/dir.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/dir.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/dir.c	2015-01-21 12:02:43.022231736 +0300
@@ -98,6 +98,14 @@ int ext4_check_dir_entry(const char *fun
 	return 0;
 }
 
+static inline int ext4_balloon(struct super_block *sb, unsigned ino)
+{
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+	return sbi->s_balloon_ino && (sbi->s_balloon_ino->i_ino == ino);
+}
+
 static int ext4_readdir(struct file *filp,
 			 void *dirent, filldir_t filldir)
 {
@@ -208,7 +216,8 @@ revalidate:
 			}
 			offset += ext4_rec_len_from_disk(de->rec_len,
 					sb->s_blocksize);
-			if (le32_to_cpu(de->inode)) {
+			if (le32_to_cpu(de->inode) &&
+					!ext4_balloon(sb, le32_to_cpu(de->inode))) {
 				/* We might block in the next section
 				 * if the data destination is
 				 * currently swapped out.  So, use a
@@ -482,6 +491,9 @@ static int call_filldir(struct file *fil
 	}
 	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
 	while (fname) {
+		if (ext4_balloon(sb, fname->inode))
+			goto skip;
+
 		error = filldir(dirent, fname->name,
 				fname->name_len, curr_pos,
 				fname->inode,
@@ -491,6 +503,7 @@ static int call_filldir(struct file *fil
 			info->extra_fname = fname;
 			return error;
 		}
+skip:
 		fname = fname->next;
 	}
 	return 0;
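
The two dir.c hunks hide one reserved inode, the per-super "balloon" inode, from both the linear readdir path and the htree call_filldir() path. A toy user-space sketch of the same filtering, assuming a fixed inode number (no VFS types involved):

#include <stdio.h>

#define BALLOON_INO 12345u	/* ~ sbi->s_balloon_ino->i_ino */

struct dent { const char *name; unsigned ino; };

static int is_balloon(unsigned ino)	/* ~ ext4_balloon() */
{
	return ino == BALLOON_INO;
}

int main(void)
{
	struct dent dir[] = {
		{ ".", 2 }, { "file", 17 }, { ".balloon", BALLOON_INO },
	};
	unsigned i;

	for (i = 0; i < sizeof(dir) / sizeof(dir[0]); i++) {
		if (!dir[i].ino || is_balloon(dir[i].ino))
			continue;	/* hidden from the listing */
		printf("%s\n", dir[i].name);
	}
	return 0;
}
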
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/ext4.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ext4.h
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/ext4.h	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ext4.h	2015-01-21 12:02:53.180962045 +0300
@@ -29,6 +29,7 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#include <linux/pfcache.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -182,6 +183,12 @@ typedef struct ext4_io_end {
 	int			result;		/* error value for AIO */
 } ext4_io_end_t;
 
+struct ext4_io_submit {
+	int			rw;
+	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* current bio */
+};
+
 /*
  * Special inodes numbers
  */
@@ -452,6 +459,11 @@ struct compat_ext4_new_group_input {
 };
 #endif
 
+struct ext4_ioc_mfsync_info {
+	__u32 size;
+	__u32 fd[0];
+};
+
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
 	__u32 group;
@@ -523,6 +535,10 @@ struct ext4_new_group_data {
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
+#define EXT4_IOC_OPEN_BALLOON		_IO('f', 42)
+#define EXT4_IOC_MFSYNC			_IO('f', 43)
+#define EXT4_IOC_SET_RSV_BLOCKS		_IOW('f', 44, __u64)
 
 /*
  * ioctl commands in 32 bit emulation
@@ -542,11 +558,20 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 
 
+/* Indexes used to index group tables in ext4_new_group_data */
+enum {
+	BLOCK_BITMAP = 0,	/* block bitmap */
+	INODE_BITMAP,		/* inode bitmap */
+	INODE_TABLE,		/* inode tables */
+	GROUP_TABLE_COUNT,
+};
+
 /*
  *  Mount options
  */
 struct ext4_mount_options {
 	unsigned long s_mount_opt;
+	unsigned long s_mount_opt2;
 	uid_t s_resuid;
 	gid_t s_resgid;
 	unsigned long s_commit_interval;
@@ -597,7 +622,7 @@ struct ext4_inode {
 			__le16	l_i_file_acl_high;
 			__le16	l_i_uid_high;	/* these 2 fields */
 			__le16	l_i_gid_high;	/* were reserved2[0] */
-			__u32	l_i_reserved2;
+			__u32	l_i_dq_cookie;	/* till we have treeid on inode */
 		} linux2;
 		struct {
 			__le16	h_i_reserved1;	/* Obsoleted fragment number/size which are removed in ext4 */
@@ -635,6 +660,12 @@ struct move_extent {
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
 #define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)
 
+#define EXT4_DATA_CSUM_SIZE	20
+#define EXT4_DATA_CSUM_NAME	"pfcache"
+
+#define EXT4_DIR_CSUM_VALUE	"auto"
+#define EXT4_DIR_CSUM_VALUE_LEN	4
+
 /*
  * Extended fields will fit into an inode if the filesystem was formatted
  * with large inodes (-I 256 or larger) and there are not currently any EAs
@@ -711,7 +742,7 @@ do {									       \
 #define i_gid_low	i_gid
 #define i_uid_high	osd2.linux2.l_i_uid_high
 #define i_gid_high	osd2.linux2.l_i_gid_high
-#define i_reserved2	osd2.linux2.l_i_reserved2
+#define i_dqcookie	osd2.linux2.l_i_dq_cookie
 
 #elif defined(__GNU__)
 
@@ -746,6 +777,7 @@ struct ext4_inode_info {
 	__le32	i_data[15];	/* unconverted */
 	__u32	i_dtime;
 	ext4_fsblk_t	i_file_acl;
+	__u32	i_dq_cookie;
 
 	/*
 	 * i_block_group is the number of the block group which contains
@@ -838,6 +870,8 @@ struct ext4_inode_info {
 	struct list_head i_aio_dio_complete_list;
 	spinlock_t i_completed_io_lock;
 	atomic_t i_unwritten; /* Number of inflight conversions pending */
+	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
+	atomic_t i_flush_tag;
 	struct mutex i_aio_mutex; /* big hammer for unaligned AIO */
 
 	/*
@@ -846,6 +880,11 @@ struct ext4_inode_info {
 	 */
 	tid_t i_sync_tid;
 	tid_t i_datasync_tid;
+
+	/* SHA-1 rolling data checksum state */
+	loff_t i_data_csum_end;
+	/* FIPS 180-1 digest if i_data_csum_end == -1, partial SHA-1 otherwise */
+	u8 i_data_csum[EXT4_DATA_CSUM_SIZE];
 };
 
 /*
@@ -896,11 +935,22 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
+#define EXT4_MOUNT2_CSUM		0x10000 /* Data-checksumming enabled */
+#define EXT4_MOUNT2_PRAMCACHE		0x20000 /* Save page cache to PRAM on umount */
+#define EXT4_MOUNT2_PRAMCACHE_NOSYNC	0x40000 /* Do not sync dirty pages saved to PRAM */
+
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
 #define test_opt(sb, opt)		(EXT4_SB(sb)->s_mount_opt & \
 					 EXT4_MOUNT_##opt)
 
+#define clear_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 &= \
+						~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 |= \
+						EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
+					 EXT4_MOUNT2_##opt)
+
 #define ext4_set_bit			ext2_set_bit
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
 #define ext4_clear_bit			ext2_clear_bit
@@ -1064,6 +1114,7 @@ struct ext4_sb_info {
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head **s_group_desc;
 	unsigned int s_mount_opt;
+	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
 	ext4_fsblk_t s_sb_block;
 	atomic64_t s_resv_blocks;
@@ -1086,6 +1137,7 @@ struct ext4_sb_info {
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
 	struct percpu_counter s_dirtyblocks_counter;
+	struct percpu_counter s_fsync_counter;
 	struct blockgroup_lock *s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
 	struct kobject s_kobj;
@@ -1096,7 +1148,8 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
-	struct mutex s_resize_lock;
+	unsigned long s_resize_flags;		/* Flags indicating if there
+						   is a resizer */
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
@@ -1141,6 +1194,7 @@ struct ext4_sb_info {
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
 	unsigned int s_max_writeback_mb_bump;
+	unsigned int s_bd_full_ratelimit;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
@@ -1162,6 +1216,8 @@ struct ext4_sb_info {
 	atomic_t s_mb_discarded;
 	atomic_t s_lock_busy;
 
+	struct inode *s_balloon_ino;
+
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
 
@@ -1185,6 +1241,14 @@ struct ext4_sb_info {
 
 	/* record the last minlen when FITRIM is called. */
 	atomic_t s_last_trim_minblks;
+
+	/* data checksumming */
+	struct percpu_counter s_csum_partial;
+	struct percpu_counter s_csum_complete;
+
+	spinlock_t  s_pfcache_lock;
+	struct path s_pfcache_root;
+	struct percpu_counter s_pfcache_peers;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1232,6 +1296,7 @@ enum {
 	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+	EXT4_STATE_CSUM,		/* Data-checksumming enabled */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field)					\
@@ -1704,7 +1769,7 @@ extern void ext4_htree_free_dir_info(str
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, struct dentry *, int);
-
+extern int ext4_sync_files(struct file **, unsigned int *, unsigned int);
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
 			  dx_hash_info *hinfo);
@@ -1744,6 +1809,7 @@ extern int ext4_mb_add_groupinfo(struct 
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+extern void mb_set_bits(void *bm, int cur, int len);
 
 /* inode.c */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
@@ -1791,6 +1857,7 @@ extern qsize_t *ext4_get_reserved_space(
 extern int ext4_flush_unwritten_io(struct inode *);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
+
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
 extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1810,8 +1877,35 @@ extern int ext4_group_add(struct super_b
 extern int ext4_group_extend(struct super_block *sb,
 				struct ext4_super_block *es,
 				ext4_fsblk_t n_blocks_count);
+extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
+
+/* csum.c */
+extern int ext4_open_pfcache(struct inode *inode);
+extern int ext4_close_pfcache(struct inode *inode);
+extern int ext4_relink_pfcache(struct super_block *sb, char *new_root, bool new_sb);
+extern long ext4_dump_pfcache(struct super_block *sb,
+					struct pfcache_dump_request __user *dump);
+extern int ext4_load_data_csum(struct inode *inode);
+extern void ext4_start_data_csum(struct inode *inode);
+extern void ext4_check_pos_data_csum(struct inode *inode, loff_t pos);
+extern void ext4_update_data_csum(struct inode *inode, loff_t pos,
+				  unsigned len, struct page *page);
+extern void ext4_commit_data_csum(struct inode *inode);
+extern void ext4_clear_data_csum(struct inode *inode);
+extern void ext4_truncate_data_csum(struct inode *inode, loff_t end);
+extern void ext4_load_dir_csum(struct inode *inode);
+extern void ext4_save_dir_csum(struct inode *inode);
+static inline int ext4_want_data_csum(struct inode *dir)
+{
+	return test_opt2(dir->i_sb, CSUM) &&
+		(ext4_test_inode_state(dir, EXT4_STATE_CSUM) ||
+		 current->data_csum_enabled);
+}
+extern struct xattr_handler ext4_xattr_trusted_csum_handler;
 
 /* super.c */
+extern unsigned int attr_batched_writeback;
+extern unsigned int attr_batched_sync;
 extern void __ext4_error(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 #define ext4_error(sb, message...)	__ext4_error(sb, __func__, ## message)
@@ -2099,6 +2193,7 @@ extern int ext4_data_block_valid(struct 
 				 unsigned int count);
 
 /* extents.c */
+struct ext4_ext_path;
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
@@ -2118,6 +2213,11 @@ extern int ext4_convert_unwritten_extent
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   sector_t block, unsigned int max_blocks,
 			   struct buffer_head *bh, int flags);
+extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
+extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
+			     struct inode *inode2, ext4_lblk_t lblk1,
+			     ext4_lblk_t lblk2,  ext4_lblk_t count,
+			     int mark_unwritten,int *err);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 /* move_extent.c */
@@ -2151,7 +2251,55 @@ static inline void set_bitmap_uptodate(s
 extern wait_queue_head_t aio_wq[];
 #define to_aio_wq(v) (&aio_wq[((unsigned long)v) % WQ_HASH_SZ])
 extern void ext4_aio_wait(struct inode *inode);
+extern wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+#define to_ioend_wq(v)	(&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+extern void ext4_ioend_wait(struct inode *inode);
+
+#define EXT4_RESIZING	0
+extern int ext4_resize_begin(struct super_block *sb);
+extern void ext4_resize_end(struct super_block *sb);
+
+/*
+ * Ploop support
+ */
+DECLARE_PER_CPU(unsigned long, ext4_bd_full_ratelimits);
+
+static inline int check_bd_full(struct inode *inode, long long nblocks)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int (*bd_full_fn) (struct backing_dev_info *, long long, int);
+	unsigned long ratelimit;
+	unsigned long *p;
+
+	bd_full_fn = inode->i_sb->s_bdi->bd_full_fn;
+	if (likely(!bd_full_fn))
+		return 0;
+
+	if (unlikely(inode->i_sb->s_bdi->bd_full))
+		ratelimit = 0;
+	else
+		ratelimit = sbi->s_bd_full_ratelimit;
 
+	preempt_disable();
+
+	p =  &__get_cpu_var(ext4_bd_full_ratelimits);
+	*p += nblocks;
+	if (unlikely(*p >= ratelimit)) {
+		*p = 0;
+		preempt_enable();
+		if (unlikely(bd_full_fn(inode->i_sb->s_bdi,
+					nblocks << inode->i_blkbits,
+					sbi->s_resuid == current_fsuid()))) {
+			inode->i_sb->s_bdi->bd_full = 1;
+			return 1;
+		}
+		inode->i_sb->s_bdi->bd_full = 0;
+		return 0;
+	}
+
+	preempt_enable();
+	return 0;
+}
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
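
check_bd_full() batches the expensive bd_full_fn() callback behind a per-CPU counter: cheap increments accumulate until they cross s_bd_full_ratelimit, and while the device last looked full the limit drops to zero so every call re-checks. A single-threaded user-space sketch of that ratelimiting idea, with hypothetical names in place of the per-CPU machinery:

#include <stdio.h>

static unsigned long pending;		/* ~ per-CPU ext4_bd_full_ratelimits */
static int bd_full;			/* ~ bdi->bd_full */
static unsigned long threshold = 1024;	/* ~ sbi->s_bd_full_ratelimit */
static int expensive_calls;

static int expensive_is_full(long long nblocks)	/* ~ bdi->bd_full_fn() */
{
	(void)nblocks;
	expensive_calls++;
	return 0;		/* pretend the device never fills up */
}

static int check_full(long long nblocks)	/* ~ check_bd_full() */
{
	/* if the device looked full last time, re-check on every call */
	unsigned long limit = bd_full ? 0 : threshold;

	pending += nblocks;
	if (pending >= limit) {
		pending = 0;
		bd_full = expensive_is_full(nblocks);
		return bd_full;
	}
	return 0;	/* cheap path between batched checks */
}

int main(void)
{
	int i;

	for (i = 0; i < 10000; i++)
		check_full(1);
	printf("10000 writes, %d expensive checks\n", expensive_calls);
	return 0;
}
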
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/ext4_extents.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ext4_extents.h
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/ext4_extents.h	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ext4_extents.h	2015-01-21 12:02:41.878262107 +0300
@@ -281,5 +281,11 @@ extern int ext4_ext_search_right(struct 
 						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
+extern int __ext4_ext_check(const char *function, struct inode *inode,
+			    struct ext4_extent_header *eh,
+			    int depth);
+#define ext4_ext_check(inode, eh, depth)	\
+	__ext4_ext_check(__func__, inode, eh, depth)
+
 #endif /* _EXT4_EXTENTS */
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/ext4_jbd2.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ext4_jbd2.h
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/ext4_jbd2.h	2014-12-12 23:29:12.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ext4_jbd2.h	2015-01-21 12:02:52.551978740 +0300
@@ -268,6 +268,13 @@ static inline void ext4_update_inode_fsy
 		ei->i_sync_tid = handle->h_transaction->t_tid;
 		if (datasync)
 			ei->i_datasync_tid = handle->h_transaction->t_tid;
+	} else {
+		struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev);
+		if (q)
+			atomic_set(&EXT4_I(inode)->i_flush_tag,
+				   atomic_read(&q->flush_tag));
+		else
+			atomic_set(&EXT4_I(inode)->i_flush_tag, UINT_MAX);
 	}
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/extents.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/extents.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/extents.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/extents.c	2015-01-21 12:02:58.094831609 +0300
@@ -265,6 +265,19 @@ static inline int ext4_ext_space_root_id
 	return size;
 }
 
+static inline int
+ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
+			   struct ext4_ext_path *path, ext4_lblk_t lblk,
+			   int nofail)
+{
+	int unwritten = ext4_ext_is_uninitialized(path[path->p_depth].p_ext);
+
+	return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
+			EXT4_EXT_MARK_UNINIT1|EXT4_EXT_MARK_UNINIT2 : 0,
+			EXT4_GET_BLOCKS_DIO |
+			(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL : 0));
+}
+
 /*
  * Calculate the number of metadata blocks needed
  * to allocate @blocks
@@ -381,7 +394,7 @@ static int ext4_valid_extent_entries(str
 	return 1;
 }
 
-static int __ext4_ext_check(const char *function, struct inode *inode,
+int __ext4_ext_check(const char *function, struct inode *inode,
 					struct ext4_extent_header *eh,
 					int depth)
 {
@@ -425,9 +438,7 @@ corrupted:
 
 	return -EIO;
 }
-
-#define ext4_ext_check(inode, eh, depth)	\
-	__ext4_ext_check(__func__, inode, eh, depth)
+EXPORT_SYMBOL (__ext4_ext_check);
 
 int ext4_ext_check_inode(struct inode *inode)
 {
@@ -548,6 +559,7 @@ void ext4_ext_drop_refs(struct ext4_ext_
 			path->p_bh = NULL;
 		}
 }
+EXPORT_SYMBOL(ext4_ext_drop_refs);
 
 /*
  * ext4_ext_binsearch_idx:
@@ -1425,7 +1437,7 @@ got_index:
  * allocated block. Thus, index entries have to be consistent
  * with leaves.
  */
-static ext4_lblk_t
+ext4_lblk_t
 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 {
 	int depth;
@@ -2574,22 +2586,14 @@ again:
 		 */
 		if (end >= ee_block &&
 		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
-			int split_flag = 0;
-
-			if (ext4_ext_is_uninitialized(ex))
-				split_flag = EXT4_EXT_MARK_UNINIT1 |
-					     EXT4_EXT_MARK_UNINIT2;
-
 			/*
 			 * Split the extent in two so that 'end' is the last
 			 * block in the first new extent. Also we should not
 			 * fail removing space due to ENOSPC so try to use
 			 * reserved block if that happens.
 			 */
-			err = ext4_split_extent_at(handle, inode, path,
-					end + 1, split_flag,
-					EXT4_GET_BLOCKS_DIO |
-					EXT4_GET_BLOCKS_METADATA_NOFAIL);
+			err = ext4_force_split_extent_at(handle, inode, path,
+							 end + 1, 1);
 
 			if (err < 0)
 				goto out;
@@ -4082,6 +4086,127 @@ static void ext4_falloc_update_inode(str
 
 }
 
+static int ext4_convert_and_extend_locked(struct inode *inode, loff_t offset,
+					  loff_t len)
+{
+	struct ext4_ext_path *path = NULL;
+	loff_t new_size = offset + len;
+	ext4_lblk_t iblock = offset >> inode->i_blkbits;
+	ext4_lblk_t new_iblock = new_size >> inode->i_blkbits;
+	unsigned int max_blocks = new_iblock - iblock;
+	handle_t *handle;
+	unsigned int credits;
+	int err = 0;
+	int ret = 0;
+
+	if ((loff_t)iblock << inode->i_blkbits != offset ||
+	    (loff_t)new_iblock << inode->i_blkbits != new_size)
+		return -EINVAL;
+
+	while (max_blocks > 0) {
+		struct ext4_extent *ex;
+		ext4_lblk_t ee_block;
+		ext4_fsblk_t ee_start;
+		unsigned short ee_len;
+		int depth;
+
+		/*
+		 * credits to insert 1 extents into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+		handle = ext4_journal_start(inode, credits);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+
+		down_write((&EXT4_I(inode)->i_data_sem));
+
+		/* find extent for this block */
+		path = ext4_ext_find_extent(inode, iblock, NULL);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto done;
+		}
+
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+		BUG_ON(ex == NULL && depth != 0);
+
+		if (ex == NULL) {
+			err = -ENOENT;
+			goto done;
+		}
+
+		ee_block = le32_to_cpu(ex->ee_block);
+		ee_start = ext4_ext_pblock(ex);
+		ee_len = ext4_ext_get_actual_len(ex);
+		if (!in_range(iblock, ee_block, ee_len)) {
+			err = -ERANGE;
+			goto done;
+		}
+
+		if (ext4_ext_is_uninitialized(ex)) {
+			err = ext4_convert_unwritten_extents_dio(handle, inode,
+								 iblock,
+								 max_blocks,
+								 path);
+			if (err < 0)
+				goto done;
+
+			ext4_update_inode_fsync_trans(handle, inode, 1);
+			err = check_eofblocks_fl(handle, inode, iblock, path,
+						 max_blocks);
+			if (err)
+				goto done;
+		}
+
+
+		up_write((&EXT4_I(inode)->i_data_sem));
+
+		iblock += ee_len;
+		max_blocks -= (ee_len < max_blocks) ? ee_len : max_blocks;
+
+		if (!max_blocks && new_size > i_size_read(inode)) {
+			i_size_write(inode, new_size);
+			ext4_update_i_disksize(inode, new_size);
+		}
+
+		ret = ext4_mark_inode_dirty(handle, inode);
+done:
+		if (err)
+			up_write((&EXT4_I(inode)->i_data_sem));
+		else
+			err = ret;
+
+		if (path) {
+			ext4_ext_drop_refs(path);
+			kfree(path);
+		}
+
+		ret = ext4_journal_stop(handle);
+		if (!err && ret)
+			err = ret;
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int ext4_convert_and_extend(struct inode *inode, loff_t offset,
+				   loff_t len)
+{
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	mutex_lock(&inode->i_mutex);
+	err = ext4_convert_and_extend_locked(inode, offset, len);
+	mutex_unlock(&inode->i_mutex);
+
+	return err;
+}
+
 /*
  * preallocate space for a file. This implements ext4's fallocate inode
  * operation, which gets called from sys_fallocate system call.
@@ -4107,11 +4232,17 @@ long ext4_fallocate(struct inode *inode,
 	 */
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
- 
+
 	/* Return error if mode is not supported */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_CONVERT_AND_EXTEND))
 		return -EOPNOTSUPP;
 
+	/* If data is about to change we must drop csum */
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM) &&
+	    ((mode & ~FALLOC_FL_KEEP_SIZE) || !(mode & FALLOC_FL_KEEP_SIZE)))
+		ext4_truncate_data_csum(inode, -1);
+
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		return ext4_punch_hole(inode, offset, len);
 
@@ -4119,6 +4250,9 @@ long ext4_fallocate(struct inode *inode,
 	if (S_ISDIR(inode->i_mode))
 		return -ENODEV;
 
+	if (mode & FALLOC_FL_CONVERT_AND_EXTEND)
+		return ext4_convert_and_extend(inode, offset, len);
+
 	block = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
@@ -4186,6 +4320,206 @@ retry:
 	return ret > 0 ? ret2 : ret;
 }
 
+/**
+ * ext4_swap_extents - Swap extents between two inodes
+ *
+ * @inode1:	First inode
+ * @inode2:	Second inode
+ * @lblk1:	Start block for first inode
+ * @lblk2:	Start block for second inode
+ * @count:	Number of blocks to swap
+ * @unwritten:	Mark second inode's extents as unwritten after swap
+ * @erp:	Pointer to save error value
+ *
+ * This helper routine does exactly what it promises: swap extents. All other
+ * work, such as page-cache locking consistency, bh mapping consistency, or
+ * copying the extents' data, must be performed by the caller.
+ * Locking:
+ * 		i_mutex is held for both inodes
+ * 		i_data_sem is locked for write for both inodes
+ * Assumptions:
+ *		All pages from requested range are locked for both inodes
+ */
+int
+ext4_swap_extents(handle_t *handle, struct inode *inode1,
+		     struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+		  ext4_lblk_t count, int unwritten, int *erp)
+{
+	struct ext4_ext_path *path1 = NULL;
+	struct ext4_ext_path *path2 = NULL;
+	int replaced_count = 0;
+
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
+	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+	BUG_ON(!mutex_is_locked(&inode2->i_mutex));
+
+	while (count) {
+		struct ext4_extent *ex1, *ex2, tmp_ex;
+		ext4_lblk_t e1_blk, e2_blk;
+		int e1_len, e2_len, len;
+		int split = 0;
+
+		path1 = ext4_ext_find_extent(inode1, lblk1, NULL);
+		if (IS_ERR(path1)) {
+			*erp = PTR_ERR(path1);
+			break;
+		}
+		path2 = ext4_ext_find_extent(inode2, lblk2, NULL);
+		if (IS_ERR(path2)) {
+			*erp = PTR_ERR(path2);
+			break;
+		}
+		ex1 = path1[path1->p_depth].p_ext;
+		ex2 = path2[path2->p_depth].p_ext;
+		/* Do we have something to swap? */
+		if (unlikely(!ex2 || !ex1))
+			break;
+
+		e1_blk = le32_to_cpu(ex1->ee_block);
+		e2_blk = le32_to_cpu(ex2->ee_block);
+		e1_len = ext4_ext_get_actual_len(ex1);
+		e2_len = ext4_ext_get_actual_len(ex2);
+
+		/* Hole handling */
+		if (!in_range(lblk1, e1_blk, e1_len) ||
+		    !in_range(lblk2, e2_blk, e2_len)) {
+			ext4_lblk_t next1, next2;
+
+			/* if hole after extent, then go to next extent */
+			next1 = ext4_ext_next_allocated_block(path1);
+			next2 = ext4_ext_next_allocated_block(path2);
+			/* If hole before extent, then shift to that extent */
+			if (e1_blk > lblk1)
+				next1 = e1_blk;
+			if (e2_blk > lblk2)
+				next2 = e2_blk;
+			/* Do we have something to swap? */
+			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
+				break;
+			/* Move to the rightmost boundary */
+			len = next1 - lblk1;
+			if (len < next2 - lblk2)
+				len = next2 - lblk2;
+			if (len > count)
+				len = count;
+			lblk1 += len;
+			lblk2 += len;
+			count -= len;
+			goto repeat;
+		}
+
+		/* Prepare left boundary */
+		if (e1_blk < lblk1) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						path1, lblk1, 0);
+			if (*erp)
+				break;
+		}
+		if (e2_blk < lblk2) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						path2,  lblk2, 0);
+			if (*erp)
+				break;
+		}
+		/* ext4_split_extent_at() may result in a leaf extent split,
+		 * so the path must be revalidated. */
+		if (split)
+			goto repeat;
+
+		/* Prepare right boundary */
+		len = count;
+		if (len > e1_blk + e1_len - lblk1)
+			len = e1_blk + e1_len - lblk1;
+		if (len > e2_blk + e2_len - lblk2)
+			len = e2_blk + e2_len - lblk2;
+
+		if (len != e1_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						path1, lblk1 + len, 0);
+			if (*erp)
+				break;
+		}
+		if (len != e2_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						path2, lblk2 + len, 0);
+			if (*erp)
+				break;
+		}
+		/* ext4_split_extent_at() may result in a leaf extent split,
+		 * so the path must be revalidated. */
+		if (split)
+			goto repeat;
+
+		BUG_ON(e2_len != e1_len);
+		ext4_ext_invalidate_cache(inode1);
+		ext4_ext_invalidate_cache(inode2);
+		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
+		if (*erp)
+			break;
+		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
+		if (*erp)
+			break;
+
+		/* Both extents are fully inside boundaries. Swap it now */
+		tmp_ex = *ex1;
+		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
+		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
+		ex1->ee_len = cpu_to_le16(e2_len);
+		ex2->ee_len = cpu_to_le16(e1_len);
+		if (unwritten)
+			ext4_ext_mark_uninitialized(ex2);
+		if (ext4_ext_is_uninitialized(&tmp_ex))
+			ext4_ext_mark_uninitialized(ex1);
+
+		ext4_ext_try_to_merge(inode2, path2, ex2);
+		ext4_ext_try_to_merge(inode1, path1, ex1);
+		*erp = ext4_ext_dirty(handle, inode2, path2 +
+				      path2->p_depth);
+		if (*erp)
+			break;
+		*erp = ext4_ext_dirty(handle, inode1, path1 +
+				      path1->p_depth);
+		/*
+		 * Looks scary, doesn't it? The second inode already points
+		 * to the new blocks and was successfully dirtied. Luckily
+		 * an error here can only be caused by a journal failure,
+		 * in which case the whole transaction will be aborted anyway.
+		 */
+		if (*erp)
+			break;
+		lblk1 += len;
+		lblk2 += len;
+		replaced_count += len;
+		count -= len;
+
+	repeat:
+		if (path1) {
+			ext4_ext_drop_refs(path1);
+			kfree(path1);
+			path1 = NULL;
+		}
+		if (path2) {
+			ext4_ext_drop_refs(path2);
+			kfree(path2);
+			path2 = NULL;
+		}
+	}
+	if (path1) {
+		ext4_ext_drop_refs(path1);
+		kfree(path1);
+	}
+	if (path2) {
+		ext4_ext_drop_refs(path2);
+		kfree(path2);
+	}
+	return replaced_count;
+}
+
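
The locking contract is easiest to see from the caller's side. A minimal sketch (not part of the patch) of driving ext4_swap_extents(), assuming the caller already holds i_mutex on both inodes and has locked the affected page ranges as documented above:

	/* Sketch only: swap [lblk, lblk + count) between two files. */
	static int swap_range_sketch(handle_t *handle, struct inode *a,
				     struct inode *b, ext4_lblk_t lblk,
				     ext4_lblk_t count)
	{
		struct rw_semaphore *s1 = &EXT4_I(a)->i_data_sem;
		struct rw_semaphore *s2 = &EXT4_I(b)->i_data_sem;
		int err = 0, replaced;

		/* Stable lock order by address, as double_down_write_sem()
		 * in move_extent.c does further below. */
		if (s1 > s2)
			swap(s1, s2);
		down_write(s1);
		down_write_nested(s2, SINGLE_DEPTH_NESTING);

		/* Extents landing in the second inode are marked unwritten
		 * (last flag == 1). */
		replaced = ext4_swap_extents(handle, a, b, lblk, lblk,
					     count, 1, &err);

		up_write(s2);
		up_write(s1);
		return err ? err : replaced;
	}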
 /*
  * This function converts a range of blocks to written extents
  * The caller of this function will pass the start offset and the size.
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/file.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/file.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/file.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/file.c	2015-01-21 12:02:52.551978740 +0300
@@ -39,14 +39,19 @@ static int ext4_release_file(struct inod
 		ext4_alloc_da_blocks(inode);
 		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 	}
-	/* if we are the last writer on the inode, drop the block reservation */
+	/*
+	 * if we are the last writer on the inode,
+	 * update data csum and drop the block reservation
+	 */
 	if ((filp->f_mode & FMODE_WRITE) &&
-			(atomic_read(&inode->i_writecount) == 1) &&
-		        !EXT4_I(inode)->i_reserved_data_blocks)
-	{
-		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_preallocations(inode);
-		up_write(&EXT4_I(inode)->i_data_sem);
+	    (atomic_read(&inode->i_writecount) == 1)) {
+		if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+			ext4_commit_data_csum(inode);
+		if (!EXT4_I(inode)->i_reserved_data_blocks) {
+			down_write(&EXT4_I(inode)->i_data_sem);
+			ext4_discard_preallocations(inode);
+			up_write(&EXT4_I(inode)->i_data_sem);
+		}
 	}
 	if (is_dx(inode) && filp->private_data)
 		ext4_htree_free_dir_info(filp->private_data);
@@ -61,6 +66,13 @@ void ext4_unwritten_wait(struct inode *i
 	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
 }
 
+void ext4_ioend_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = to_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
+
 /*
  * This tests whether the IO in question is block-aligned or not.
  * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -142,6 +154,16 @@ static int ext4_file_mmap(struct file *f
 
 	if (!mapping->a_ops->readpage)
 		return -ENOEXEC;
+
+	if (!vma) {
+		if (ext4_test_inode_state(mapping->host, EXT4_STATE_CSUM)) {
+			mutex_lock(&mapping->host->i_mutex);
+			ext4_truncate_data_csum(mapping->host, -1);
+			mutex_unlock(&mapping->host->i_mutex);
+		}
+		return 0;
+	}
+
 	file_accessed(file);
 	vma->vm_ops = &ext4_file_vm_ops;
 	vma->vm_flags |= VM_CAN_NONLINEAR;
@@ -177,6 +199,7 @@ static int ext4_file_open(struct inode *
 			sb->s_dirt = 1;
 		}
 	}
+
 	return generic_file_open(inode, filp);
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/fsync.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/fsync.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/fsync.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/fsync.c	2015-01-21 12:02:52.578978024 +0300
@@ -53,16 +53,22 @@ int ext4_sync_file(struct file *file, st
 	struct inode *inode = dentry->d_inode;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret;
+	int ret, err = 0;
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
 	trace_ext4_sync_file(file, dentry, datasync);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_fsync_counter);
 
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read the updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			return -EROFS;
 		return 0;
+	}
 
 	ret = ext4_flush_unwritten_io(inode);
 	if (ret < 0)
@@ -93,7 +99,145 @@ int ext4_sync_file(struct file *file, st
 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
 		needs_barrier = true;
 	ret = jbd2_complete_transaction(journal, commit_tid);
+
+	/* Even if we had to wait for commit completion, it does not mean a
+	 * flush has been issued after the data required by this fsync was
+	 * written back. The commit could already be in a state where it is
+	 * done, but not yet in the state where we would skip waiting.
+	 */
 	if (needs_barrier)
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		err = blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+	if (!ret)
+		ret = err;
 	return ret;
 }
+
+int ext4_sync_files(struct file **files, unsigned int *flags, unsigned int nr_files)
+{
+	struct super_block *sb;
+	journal_t *journal;
+	int err = 0, err2 = 0, i = 0, j = 0;
+	int force_commit = 0, datawriteback = 0;
+	tid_t commit_tid = 0;
+	int fdsync_cnt = 0, fsync_cnt = 0;
+	int need_barrier = 0;
+	struct user_beancounter *ub;
+
+	J_ASSERT(ext4_journal_current_handle() == NULL);
+	if (!nr_files)
+		return 0;
+
+	sb = files[0]->f_mapping->host->i_sb;
+	journal = EXT4_SB(sb)->s_journal;
+	ub = get_exec_ub();
+	if (sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read the updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			return -EROFS;
+		return 0;
+	}
+	for (i = 0; i < nr_files; i++) {
+		struct address_space * mapping = files[i]->f_mapping;
+		struct inode *inode = mapping->host;
+		unsigned int datasync = flags[i];
+
+		BUG_ON(sb != inode->i_sb);
+		trace_ext4_sync_file(files[i], files[i]->f_path.dentry, flags[i]);
+
+		if (datasync) {
+			ub_percpu_inc(ub, fdsync);
+			fdsync_cnt++;
+		} else {
+			ub_percpu_inc(ub, fsync);
+			fsync_cnt++;
+		}
+
+		if (!mapping->nrpages)
+			continue;
+
+		err = filemap_fdatawrite(mapping);
+		if (err)
+			break;
+	}
+	/*
+	 * Even if the above returned an error, the pages may have been
+	 * written partially (e.g. -ENOSPC), so we wait for them.
+	 * But -EIO is a special case: it may indicate that the worst
+	 * thing (e.g. a bug) happened, so we avoid waiting for it.
+	 */
+	if (err == -EIO)
+		goto out;
+
+	for (j = 0; j < i; j++) {
+		struct address_space * mapping = files[j]->f_mapping;
+		struct inode *inode = mapping->host;
+		struct ext4_inode_info *ei = EXT4_I(inode);
+		unsigned int datasync = flags[j];
+		tid_t tid;
+
+		if (mapping->nrpages) {
+			err2 = filemap_fdatawait(mapping);
+			if (!err || err2 == -EIO)
+				err = err2;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		err2 = ext4_flush_unwritten_io(inode);
+		if (!err || err2 == -EIO)
+			err = err2;
+		force_commit  |= ext4_should_journal_data(inode);
+		datawriteback |= ext4_should_writeback_data(inode);
+		tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+		mutex_unlock(&inode->i_mutex);
+		trace_ext4_sync_files_iterate(files[j]->f_path.dentry, tid, datasync);
+		if (j == 0 || !tid_geq(commit_tid, tid))
+			commit_tid = tid;
+	}
+
+	/* Ext4 specific stuff starts here */
+	if (!journal) {
+		for (j = 0; j < i; j++) {
+			err2 = simple_fsync(files[j], files[j]->f_path.dentry, flags[j]);
+			if (!err)
+				err = err2;
+		}
+	} else if (force_commit) {
+		/* data=journal:
+		 *  filemap_fdatawrite won't do anything (the buffers are clean).
+		 *  ext4_force_commit will write the file data into the journal and
+		 *  will wait on that.
+		 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
+		 *  (they were dirtied by commit).  But that's OK - the blocks are
+		 *  safe in-journal, which is all fsync() needs to ensure.
+		 */
+		err2 = ext4_force_commit(sb);
+	} else {
+		/*
+		 * data=writeback,ordered:
+		 * The caller's filemap_fdatawrite()/wait will sync the data.
+		 * Metadata is in the journal, we wait for proper transaction to
+		 * commit here.
+		 */
+		if (journal->j_flags & JBD2_BARRIER &&
+		    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+			need_barrier = true;
+
+		err2 = jbd2_complete_transaction(journal, commit_tid);
+		/* Even if we had to wait for commit completion, it does not
+		 * mean a flush has been issued after the data required by
+		 * this fsync was written back. The commit could already be
+		 * in a state where it is done, but not yet in the state
+		 * where we would skip waiting.
+		 */
+		if (need_barrier)
+			err2 = blkdev_issue_flush(sb->s_bdev, NULL);
+	}
+out:
+	trace_ext4_sync_files_exit(files[0]->f_path.dentry, commit_tid, need_barrier);
+	ub_percpu_add(ub, fdsync_done, fdsync_cnt);
+	ub_percpu_add(ub, fsync_done, fsync_cnt);
+	percpu_counter_add(&EXT4_SB(sb)->s_fsync_counter, i);
+	if (!err || err2 == -EIO)
+		err = err2;
+	return err;
+}
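
Net effect of the batching above: instead of one journal commit and one cache flush per file, the loop records the newest relevant transaction id across all files, waits once on that commit_tid and issues at most one blkdev_issue_flush(), which is the point of the mfsync interface when a container fsyncs many files at once.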
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/ialloc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ialloc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/ialloc.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ialloc.c	2015-01-21 12:02:43.089229957 +0300
@@ -1049,6 +1049,11 @@ got:
 		goto fail_drop;
 	}
 
+	if (check_bd_full(inode, 1)) {
+		err = -ENOSPC;
+		goto fail_free_drop;
+	}
+
 	err = ext4_init_acl(handle, inode, dir);
 	if (err)
 		goto fail_free_drop;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/inode.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/inode.c	2015-01-21 12:02:58.289826434 +0300
@@ -48,6 +48,8 @@
 
 #define MPAGE_DA_EXTENT_TAIL 0x01
 
+DEFINE_PER_CPU(unsigned long, ext4_bd_full_ratelimits) = 0;
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
@@ -225,6 +227,8 @@ void ext4_delete_inode(struct inode *ino
 
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
+
+	ext4_ioend_wait(inode);
 	truncate_inode_pages(&inode->i_data, 0);
 
 	if (is_bad_inode(inode))
@@ -235,6 +239,8 @@ void ext4_delete_inode(struct inode *ino
 	 * protection against it
 	 */
 	sb_start_intwrite(inode->i_sb);
+	if (inode->i_blocks && ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_truncate_data_csum(inode, inode->i_size);
 	handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
@@ -275,6 +281,7 @@ void ext4_delete_inode(struct inode *ino
 				     "couldn't extend journal (err %d)", err);
 		stop_handle:
 			ext4_journal_stop(handle);
+			ext4_orphan_del(NULL, inode);
 			sb_end_intwrite(inode->i_sb);
 			goto no_delete;
 		}
@@ -1652,6 +1659,8 @@ static int do_journal_get_write_access(h
 static void ext4_truncate_failed_write(struct inode *inode)
 {
 	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_truncate_data_csum(inode, inode->i_size);
 	ext4_truncate(inode);
 }
 
@@ -1678,6 +1687,10 @@ static int ext4_write_begin(struct file 
 	to = from + len;
 
 retry:
+	/* Check csum window position before journal_start */
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_check_pos_data_csum(inode, pos);
+
 	handle = ext4_journal_start(inode, needed_blocks);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -1781,6 +1794,10 @@ static int ext4_generic_write_end(struct
 		ext4_update_i_disksize(inode, (pos + copied));
 		i_size_changed = 1;
 	}
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_update_data_csum(inode, pos, copied, page);
+
 	unlock_page(page);
 	page_cache_release(page);
 
@@ -1929,6 +1946,9 @@ static int ext4_journalled_write_end(str
 			ret = ret2;
 	}
 
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_update_data_csum(inode, pos, copied, page);
+
 	unlock_page(page);
 	page_cache_release(page);
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
@@ -1981,6 +2001,11 @@ static int ext4_da_reserve_space(struct 
 	if (vfs_dq_reserve_block(inode, 1))
 		return -EDQUOT;
 
+	if (check_bd_full(inode, 1)) {
+		vfs_dq_release_reservation_block(inode, 1);
+		return -ENOSPC;
+	}
+
 	/*
 	 * We do still charge estimated metadata to the sb though;
 	 * we cannot afford to run out of free blocks.
@@ -2064,6 +2089,24 @@ static void ext4_da_page_release_reserva
 	ext4_da_release_space(page->mapping->host, to_release);
 }
 
+static void ext4_io_submit(struct ext4_io_submit *io)
+{
+	if (io->bio) {
+		/* Back up the original bio, since the issued one changes. */
+		struct bio *orig_bio;
+		do {
+			orig_bio = bio_clone(io->bio, GFP_NOIO);
+		} while (!orig_bio);
+		orig_bio->bi_private = io->inode;
+		io->bio->bi_private = orig_bio;
+		bio_get(io->bio);
+		submit_bio(io->rw, io->bio);
+		BUG_ON(bio_flagged(io->bio, BIO_EOPNOTSUPP));
+		bio_put(io->bio);
+	}
+	io->bio = NULL;
+}
+
 /*
  * Delayed allocation stuff
  */
@@ -2134,6 +2177,7 @@ static int mpage_da_submit_io(struct mpa
 		}
 		pagevec_release(&pvec);
 	}
+	ext4_io_submit((struct ext4_io_submit*)mpd->wbc->fsdata);
 	return ret;
 }
 
@@ -2412,16 +2456,6 @@ static void mpage_da_map_and_submit(stru
 	    (mpd->b_state & (1 << BH_Unwritten)))
 		mpage_put_bnr_to_bhs(mpd, next, &new);
 
-	if (buffer_new(&new) &&
-	    ext4_should_order_data(mpd->inode)) {
-		err = ext4_jbd2_file_inode(handle, mpd->inode);
-		if (err) {
-			/* This only happens if the journal is aborted */
-			mpd->retval = err;
-			goto submit_io;
-		}
-	}
-
 	/*
 	 * Update on-disk size along with block allocation.
 	 */
@@ -2773,6 +2807,126 @@ out:
 	return ret;
 }
 
+
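+/*
+ * Completion handler for bios built via ext4_submit_bh()/ext4_io_submit().
+ * The pristine clone stashed in bi_private is walked bvec by bvec so that
+ * every buffer_head covered by the write gets its b_end_io() called with
+ * the correct uptodate state, even though the issued bio has been consumed
+ * by the block layer.
+ */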
+static void ext4_end_bio(struct bio *bio, int error)
+{
+	ext4_fsblk_t err_block;
+	unsigned int idx, blocks;
+	struct bio *orig_bio = (struct bio*)bio->bi_private;
+	struct inode *inode = (struct inode *)orig_bio->bi_private;
+
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = 0;
+
+	err_block = bio->bi_sector >> (inode->i_blkbits - 9);
+
+	if (error) {
+		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+			     "(size %ld starting block %llu)",
+			     inode->i_ino, (long) bio->bi_size,
+			     (unsigned long long) err_block);
+	}
+	idx = 0;
+	blocks = orig_bio->bi_size >> inode->i_blkbits;
+	do {
+		struct buffer_head *bh, *head;
+		struct page *page = bio_iovec_idx(orig_bio, idx)->bv_page;
+		unsigned int pg_off = bio_iovec_idx(orig_bio, idx)->bv_offset;
+		unsigned int pg_end = bio_iovec_idx(orig_bio, idx)->bv_len +
+			pg_off;
+		unsigned int offset = 0;
+
+		head = page_buffers(page);
+		BUG_ON(!head);
+		bh = head;
+
+		if (error)
+			SetPageError(page);
+		do {
+			if (offset < pg_off)
+				goto next_bh;
+			if (offset >= pg_end)
+				break;
+			if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
+				set_bit(BH_Quiet, &bh->b_state);
+			bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
+			blocks--;
+next_bh:
+			offset += bh->b_size;
+			bh = bh->b_this_page;
+		} while (bh != head && blocks);
+		idx++;
+	} while (blocks);
+	bio_put(orig_bio);
+	bio_put(bio);
+}
+
+/* Try to merge adjacent bh's into one bio */
+static int ext4_bh_add_or_submit(struct ext4_io_submit *io, struct buffer_head *bh,
+	int rw)
+{
+	int ret;
+retry:
+	if (!io->bio) {
+		struct bio *bio;
+		int nvecs = bio_get_nr_vecs(bh->b_bdev);
+		do {
+			bio = bio_alloc(GFP_NOIO, nvecs);
+			nvecs >>= 1;
+		} while (bio == NULL);
+
+		bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+		bio->bi_bdev = bh->b_bdev;
+		bio->bi_end_io = ext4_end_bio;
+		io->bio = bio;
+		io->rw = rw;
+	}
+	/* Merge the bh into the current bio only if it is physically
+	 * adjacent and has the same rw flags */
+	if (rw != io->rw || io->bio->bi_sector + bio_sectors(io->bio) !=
+		bh->b_blocknr * (bh->b_size >> 9))
+		goto submit_and_retry;
+
+	ret = bio_add_page(io->bio, bh->b_page, bh->b_size, bh_offset(bh));
+	if (ret != bh->b_size)
+		goto submit_and_retry;
+	return 0;
+submit_and_retry:
+	ext4_io_submit(io);
+	goto retry;
+}
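
To make the adjacency test above concrete (numbers illustrative): with 4 KiB blocks, b_size >> 9 is 8 sectors per buffer, so a bio that starts at sector 800 and already carries 16 sectors ends at sector 816. A bh at block 102 (sector 816) is merged, while a bh at block 103 (sector 824), or one issued with different rw flags, forces ext4_io_submit() and a fresh bio.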
+
+/* Per inode io requests tracker helpers */
+static int ext4_submit_bh(int rw, struct buffer_head *bh, void *fsdata)
+{
+	/* For now, we track only write requests */
+	if (rw) {
+		struct inode *inode = bh->b_page->mapping->host;
+		struct ext4_io_submit *io = (struct ext4_io_submit*)fsdata;
+		atomic_inc(&EXT4_I(inode)->i_ioend_count);
+		/*
+		 * If we have a writepages context we can perform a big io
+		 * optimization, since the caller guarantees that any pending
+		 * bio will be issued at the end.
+		 */
+		if (io && attr_batched_writeback)
+			return ext4_bh_add_or_submit(io, bh, rw);
+	}
+	return submit_bh(rw, bh);
+}
+
+/* Page is under writeback so no one can truncate it */
+void ext4_end_buffer_async_write(struct buffer_head *bh, int uptodate)
+{
+	struct inode *inode = bh->b_page->mapping->host;
+	wait_queue_head_t *wq = to_ioend_wq(inode);
+	BUG_ON(!PageWriteback(bh->b_page));
+	ext4_update_inode_fsync_trans(NULL, inode, 1);
+	if (atomic_dec_and_test(&EXT4_I(inode)->i_ioend_count))
+		wake_up_all(wq);
+	end_buffer_async_write(bh, uptodate);
+}
+
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2822,7 +2976,6 @@ static int ext4_writepage(struct page *p
 	unsigned int len;
 	struct buffer_head *page_bufs;
 	struct inode *inode = page->mapping->host;
-
 	trace_ext4_writepage(inode, page);
 	size = i_size_read(inode);
 	if (page->index == size >> PAGE_CACHE_SHIFT)
@@ -2901,12 +3054,43 @@ static int ext4_writepage(struct page *p
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 		ret = nobh_writepage(page, noalloc_get_block_write, wbc);
 	else
-		ret = block_write_full_page(page, noalloc_get_block_write,
-					    wbc);
+		ret = generic_block_write_full_page(page, noalloc_get_block_write,
+						    wbc,
+						    ext4_submit_bh,
+						    ext4_end_buffer_async_write);
 
 	return ret;
 }
 
+static void init_io_submit(struct ext4_io_submit *io, struct inode *inode)
+{
+	io->inode = inode;
+	io->bio = NULL;
+}
+
+static int do_writepage(struct page *page, struct writeback_control *wbc,
+		       void *data)
+{
+	struct address_space *mapping = data;
+	int ret = mapping->a_ops->writepage(page, wbc);
+	mapping_set_error(mapping, ret);
+	return ret;
+}
+
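+/*
+ * Batched writeback entry point: a single ext4_io_submit context is
+ * stashed in wbc->fsdata so that ext4_writepage() and ext4_submit_bh()
+ * can keep appending adjacent buffers to one bio across page boundaries;
+ * whatever is still pending once write_cache_pages() returns is issued
+ * here.
+ */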
+int ext4_writepages(struct address_space *mapping,
+		       struct writeback_control *wbc)
+{
+	int ret;
+	struct ext4_io_submit io;
+	init_io_submit(&io, mapping->host);
+	wbc->fsdata = &io;
+	ret =  write_cache_pages(mapping, wbc, do_writepage, mapping);
+	/* io may still be pending after ext4_writepage(), issue it now */
+	ext4_io_submit(&io);
+	wbc->fsdata = NULL;
+	return ret;
+}
+
 /*
  * This is called via ext4_da_writepages() to
  * calculate the total number of credits to reserve to fit
@@ -2914,7 +3098,6 @@ static int ext4_writepage(struct page *p
  * ext4_da_writepages() will loop calling this before
  * the block allocation.
  */
-
 static int ext4_da_writepages_trans_blocks(struct inode *inode)
 {
 	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
@@ -3075,6 +3258,7 @@ static int ext4_da_writepages(struct add
 	long desired_nr_to_write, nr_to_writebump = 0;
 	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+	struct ext4_io_submit io;
 	pgoff_t done_index = 0;
 	pgoff_t end;
 
@@ -3150,9 +3334,10 @@ static int ext4_da_writepages(struct add
 		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
 		wbc->nr_to_write = desired_nr_to_write;
 	}
-
 	mpd.wbc = wbc;
 	mpd.inode = mapping->host;
+	init_io_submit(&io, mpd.inode);
+	mpd.wbc->fsdata = &io;
 
 	pages_skipped = wbc->pages_skipped;
 
@@ -3264,6 +3449,9 @@ retry:
 out_writepages:
 	wbc->nr_to_write -= nr_to_writebump;
 	wbc->range_start = range_start;
+	/* io may still be pending after ext4_writepage(), issue it now */
+	ext4_io_submit(&io);
+	wbc->fsdata = NULL;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
 	return ret;
 }
@@ -3281,11 +3469,17 @@ static int ext4_nonda_switch(struct supe
 	 * accumulated on each CPU without updating global counters
 	 * Delalloc need an accurate free block accounting. So switch
 	 * to non delalloc when we are near to error range.
+	 *
+	 * NOTE: Delalloc makes data=writeback mode safer, similar to ordered
+	 * mode, so stale blocks after a power failure are no longer an
+	 * issue. Do not disable delalloc, to guarantee data security in
+	 * data=writeback mode.
+	 *                                                              -dmon
 	 */
 	free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
-	if (2 * free_blocks < 3 * dirty_blocks ||
-		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+	if (test_opt(sb, DATA_FLAGS) != EXT4_MOUNT_WRITEBACK_DATA && (
+		2 * free_blocks < 3 * dirty_blocks ||
+		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK))) {
 		/*
 		 * free block count is less than 150% of dirty blocks
 		 * or free blocks is less than watermark
@@ -3296,8 +3490,12 @@ static int ext4_nonda_switch(struct supe
 	 * Even if we don't switch but are nearing capacity,
 	 * start pushing delalloc when 1/2 of free blocks are dirty.
 	 */
-	if (free_blocks < 2 * dirty_blocks)
-		writeback_inodes_sb_if_idle(sb);
+	if (free_blocks < 2 * dirty_blocks &&
+	    !writeback_in_progress(sb->s_bdi) &&
+	    down_read_trylock(&sb->s_umount)) {
+		writeback_inodes_sb(sb);
+		up_read(&sb->s_umount);
+	}
 
 	return 0;
 }
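
A worked instance of the watermark test above (numbers purely illustrative): with free_blocks = 3000 and dirty_blocks = 2100 the first condition fires, since 2 * 3000 = 6000 < 6300 = 3 * 2100, i.e. free space has dropped below 150% of the outstanding dirty estimate, so a filesystem that is not mounted data=writeback falls back to non-delalloc allocation for this write.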
@@ -3325,6 +3523,9 @@ static int ext4_da_write_begin(struct fi
 	*fsdata = (void *)0;
 	trace_ext4_da_write_begin(inode, pos, len, flags);
 retry:
+	/* Check csum window position before journal_start */
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_check_pos_data_csum(inode, pos);
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
 	 * if there is delayed block allocation. But we still need
@@ -3431,17 +3632,8 @@ static int ext4_da_write_end(struct file
 	if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
 		if (ext4_da_should_update_i_disksize(page, end)) {
 			down_write(&EXT4_I(inode)->i_data_sem);
-			if (new_i_size > EXT4_I(inode)->i_disksize) {
-				/*
-				 * Updating i_disksize when extending file
-				 * without needing block allocation
-				 */
-				if (ext4_should_order_data(inode))
-					ret = ext4_jbd2_file_inode(handle,
-								   inode);
-
+			if (new_i_size > EXT4_I(inode)->i_disksize)
 				EXT4_I(inode)->i_disksize = new_i_size;
-			}
 			up_write(&EXT4_I(inode)->i_data_sem);
 			/* We need to mark inode dirty even if
 			 * new_i_size is less that inode->i_size
@@ -3450,6 +3642,10 @@ static int ext4_da_write_end(struct file
 			ext4_mark_inode_dirty(handle, inode);
 		}
 	}
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_update_data_csum(inode, pos, copied, page);
+
 	ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	copied = ret2;
@@ -3775,6 +3971,10 @@ static void ext4_free_io_end(ext4_io_end
 	BUG_ON(!io);
 	BUG_ON(!list_empty(&io->list));
 	BUG_ON(io->flag & DIO_AIO_UNWRITTEN);
+	/* The only waiter on i_ioend_count is ext4_delete_inode; since
+	 * we hold an inode ref, it is safe to skip the explicit wakeup */
+	atomic_dec(&EXT4_I(io->inode)->i_ioend_count);
+	ext4_update_inode_fsync_trans(NULL, io->inode, 1);
 
 	iput(io->inode);
 	kfree(io);
@@ -3827,6 +4027,7 @@ static int ext4_end_aio_dio(ext4_io_end_
 			 "extents -- potential data loss!  "
 			 "(inode %lu, offset %llu, size %zd, error %d)",
 			 inode->i_ino, offset, size, ret);
+		io->result = ret;
 	}
 
 	if (io->iocb)
@@ -3949,6 +4150,7 @@ static ext4_io_end_t *ext4_init_io_end (
 		io->result = 0;
 		INIT_WORK(&io->work, ext4_end_io_work);
 		INIT_LIST_HEAD(&io->list);
+		atomic_inc(&EXT4_I(inode)->i_ioend_count);
 	}
 
 	return io;
@@ -4108,6 +4310,9 @@ static ssize_t ext4_direct_IO(int rw, st
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
+	if (rw == WRITE && ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_truncate_data_csum(inode, -1);
+
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
 
@@ -4137,6 +4342,7 @@ static const struct address_space_operat
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_ordered_write_end,
@@ -4153,6 +4359,7 @@ static const struct address_space_operat
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_writeback_write_end,
@@ -4169,6 +4376,7 @@ static const struct address_space_operat
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_journalled_write_end,
@@ -5012,6 +5220,9 @@ void ext4_truncate(struct inode *inode)
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+		ext4_truncate_data_csum(inode, inode->i_size);
+
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		ext4_ext_truncate(inode);
 		return;
@@ -5408,6 +5619,7 @@ struct inode *ext4_iget(struct super_blo
 	ei->i_state_flags = 0;
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+	ei->i_dq_cookie = le32_to_cpu(raw_inode->i_dqcookie);
 	/* We now have enough fields to check if the inode was active or not.
 	 * This is needed because nfsd might try to access dead inodes
 	 * the test is that same one that e2fsck uses
@@ -5531,9 +5743,13 @@ struct inode *ext4_iget(struct super_blo
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
+		if (test_opt2(sb, CSUM) && !ext4_load_data_csum(inode))
+			ext4_open_pfcache(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext4_dir_inode_operations;
 		inode->i_fop = &ext4_dir_operations;
+		if (test_opt2(sb, CSUM))
+			ext4_load_dir_csum(inode);
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (ext4_inode_is_fast_symlink(inode)) {
 			inode->i_op = &ext4_fast_symlink_inode_operations;
@@ -5666,6 +5882,7 @@ static int ext4_do_update_inode(handle_t
 	if (ext4_inode_blocks_set(handle, raw_inode, ei))
 		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+	raw_inode->i_dqcookie = cpu_to_le32(ei->i_dq_cookie);
 	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD))
@@ -5787,6 +6004,12 @@ int ext4_write_inode(struct inode *inode
 		if (wbc->sync_mode != WB_SYNC_ALL)
 			return 0;
 
+		/* If the caller is going to call sync_fs() after completing
+		 * a batch of inode syncs, we should skip the commit.
+		 */
+		if (wbc->for_sync && attr_batched_sync)
+			return 0;
+
 		err = ext4_force_commit(inode->i_sb);
 	} else {
 		struct ext4_iloc iloc;
@@ -5882,6 +6105,9 @@ int ext4_setattr(struct dentry *dentry, 
 	    (attr->ia_size < inode->i_size)) {
 		handle_t *handle;
 
+		if (ext4_test_inode_state(inode, EXT4_STATE_CSUM))
+			ext4_truncate_data_csum(inode, attr->ia_size);
+
 		handle = ext4_journal_start(inode, 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
@@ -5910,7 +6136,6 @@ int ext4_setattr(struct dentry *dentry, 
 				goto err_out;
 			}
 		}
-		ext4_truncate(inode);
 	}
 
 	rc = inode_setattr(inode, attr);
@@ -5920,8 +6145,9 @@ int ext4_setattr(struct dentry *dentry, 
 	 * orphan list manually. */
 	if (inode->i_nlink)
 		ext4_orphan_del(NULL, inode);
-
-	if (!rc && (ia_valid & ATTR_MODE))
+	/* OpenVZ wants to change permissions on symlinks, but is not
+	 * interested in ACL support -dmon */
+	if (!rc && (ia_valid & ATTR_MODE) && !S_ISLNK(inode->i_mode))
 		rc = ext4_acl_chmod(inode);
 
 err_out:
@@ -6353,12 +6579,19 @@ int ext4_page_mkwrite(struct vm_area_str
 	unsigned long len;
 	int ret;
 	struct file *file = vma->vm_file;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
+	struct inode *inode;
+	struct address_space *mapping;
 	handle_t *handle;
 	int retries = 0;
 
+	if (file->f_op->get_host)
+		file = file->f_op->get_host(file);
+
+	inode = file->f_path.dentry->d_inode;
+	mapping = inode->i_mapping;
+
 	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
 	/* Delalloc case is easy... */
 	if (test_opt(inode->i_sb, DELALLOC) &&
 	    !ext4_should_journal_data(inode) &&
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/ioctl.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ioctl.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/ioctl.c	2014-12-12 23:29:14.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/ioctl.c	2015-01-21 12:02:55.838891488 +0300
@@ -18,6 +18,61 @@
 #include "ext4_jbd2.h"
 #include "ext4.h"
 
+#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+
+static int ext4_open_balloon(struct super_block *sb, struct vfsmount *mnt)
+{
+	struct inode *balloon_ino;
+	int err, fd;
+	struct file *filp;
+	struct dentry *de;
+	struct path path;
+	fmode_t mode;
+
+	balloon_ino = EXT4_SB(sb)->s_balloon_ino;
+	err = -ENOENT;
+	if (balloon_ino == NULL)
+		goto err;
+
+	err = fd = get_unused_fd();
+	if (err < 0)
+		goto err_fd;
+
+	__iget(balloon_ino);
+	de = d_obtain_alias(balloon_ino);
+	err = PTR_ERR(de);
+	if (IS_ERR(de))
+		goto err_de;
+
+	path.dentry = de;
+	path.mnt = mntget(mnt);
+	err = mnt_want_write(path.mnt);
+	if (err)
+		mode = FMODE_READ;
+	else
+		mode = FMODE_READ | FMODE_WRITE;
+	filp = alloc_file(&path, mode,
+			&ext4_file_operations);
+	if (mode & FMODE_WRITE)
+		mnt_drop_write(path.mnt);
+	err = -ENOMEM;
+	if (filp == NULL)
+		goto err_filp;
+
+	filp->f_flags |= O_LARGEFILE;
+	fd_install(fd, filp);
+	return fd;
+
+err_filp:
+	path_put(&path);
+err_de:
+	put_unused_fd(fd);
+err_fd:
+	/* nothing */
+err:
+	return err;
+}
+
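
A hypothetical userspace view of the balloon interface: the EXT4_IOC_OPEN_BALLOON number comes from the patched ext4 headers, and growing the balloon with ordinary file syscalls afterwards is an assumption, not something this hunk defines.

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	/* Sketch: obtain an fd for the hidden balloon inode (needs
	 * CAP_SYS_ADMIN, see the ioctl case below), then, as an
	 * assumption, inflate it with plain file operations such as
	 * ftruncate(2). */
	int inflate_balloon(int fs_fd, off_t bytes)
	{
		int err, bfd = ioctl(fs_fd, EXT4_IOC_OPEN_BALLOON);

		if (bfd < 0)
			return -1;
		err = ftruncate(bfd, bytes);
		close(bfd);
		return err;
	}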
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
@@ -77,7 +132,7 @@ long ext4_ioctl(struct file *filp, unsig
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!capable(CAP_SYS_ADMIN))
 				goto flags_out;
 		}
 		if (oldflags & EXT4_EXTENTS_FL) {
@@ -202,15 +257,18 @@ setversion_out:
 		struct super_block *sb = inode->i_sb;
 		int err, err2=0;
 
-		if (!capable(CAP_SYS_RESOURCE))
-			return -EPERM;
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
 
-		if (get_user(n_blocks_count, (__u32 __user *)arg))
-			return -EFAULT;
+		if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+			err = -EFAULT;
+			goto group_extend_out;
+		}
 
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
-			return err;
+			goto group_extend_out;
 
 		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
 		if (EXT4_SB(sb)->s_journal) {
@@ -221,6 +279,8 @@ setversion_out:
 		if (err == 0)
 			err = err2;
 		mnt_drop_write(filp->f_path.mnt);
+group_extend_out:
+		ext4_resize_end(sb);
 
 		return err;
 	}
@@ -271,8 +331,9 @@ mext_out:
 		struct super_block *sb = inode->i_sb;
 		int err, err2=0;
 
-		if (!capable(CAP_SYS_RESOURCE))
-			return -EPERM;
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
 
 		if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
 				sizeof(input)))
@@ -291,6 +352,7 @@ mext_out:
 		if (err == 0)
 			err = err2;
 		mnt_drop_write(filp->f_path.mnt);
+		ext4_resize_end(sb);
 
 		return err;
 	}
@@ -331,6 +393,113 @@ mext_out:
 		return err;
 	}
 
+	case EXT4_IOC_RESIZE_FS: {
+		ext4_fsblk_t n_blocks_count;
+		struct super_block *sb = inode->i_sb;
+		int err = 0, err2 = 0;
+
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+			       EXT4_FEATURE_INCOMPAT_META_BG)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not (yet) supported with meta_bg");
+			return -EOPNOTSUPP;
+		}
+
+		if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+				   sizeof(__u64))) {
+			return -EFAULT;
+		}
+
+		if (n_blocks_count > MAX_32_NUM &&
+		    !EXT4_HAS_INCOMPAT_FEATURE(sb,
+					       EXT4_FEATURE_INCOMPAT_64BIT)) {
+			ext4_msg(sb, KERN_ERR,
+				 "File system only supports 32-bit block numbers");
+			return -EOPNOTSUPP;
+		}
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			goto resizefs_out;
+
+		err = ext4_resize_fs(sb, n_blocks_count);
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+		mnt_drop_write(filp->f_path.mnt);
+resizefs_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+	case EXT4_IOC_SET_RSV_BLOCKS: {
+		ext4_fsblk_t n_blocks_count;
+		struct super_block *sb = inode->i_sb;
+		handle_t *handle;
+		int err = 0, err2 = 0;
+
+		if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+				   sizeof(__u64))) {
+			return -EFAULT;
+		}
+
+		if (n_blocks_count > MAX_32_NUM &&
+		    !EXT4_HAS_INCOMPAT_FEATURE(sb,
+					       EXT4_FEATURE_INCOMPAT_64BIT)) {
+			ext4_msg(sb, KERN_ERR,
+				 "File system only supports 32-bit block numbers");
+			return -EOPNOTSUPP;
+		}
+
+		if (n_blocks_count > ext4_blocks_count(EXT4_SB(sb)->s_es))
+			return -EINVAL;
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			goto resize_out;
+
+		handle = ext4_journal_start_sb(sb, 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto mnt_out;
+		}
+		err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+		if (err)
+			goto journal_out;
+		ext4_r_blocks_count_set(EXT4_SB(sb)->s_es, n_blocks_count);
+		ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+journal_out:
+		err2 = ext4_journal_stop(handle);
+		if (err == 0)
+			err = err2;
+
+		if (!err && EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+mnt_out:
+		mnt_drop_write(filp->f_path.mnt);
+resize_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
 	case FITRIM:
 	{
 		struct super_block *sb = inode->i_sb;
@@ -361,6 +530,98 @@ mext_out:
 		return 0;
 	}
 
+	case EXT4_IOC_OPEN_BALLOON:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return ext4_open_balloon(inode->i_sb, filp->f_vfsmnt);
+
+	case FS_IOC_PFCACHE_OPEN:
+	{
+		int err;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		mutex_lock(&inode->i_mutex);
+		err = ext4_open_pfcache(inode);
+		mutex_unlock(&inode->i_mutex);
+
+		return err;
+	}
+	case FS_IOC_PFCACHE_CLOSE:
+	{
+		int err;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		mutex_lock(&inode->i_mutex);
+		err = ext4_close_pfcache(inode);
+		mutex_unlock(&inode->i_mutex);
+
+		return err;
+	}
+	case FS_IOC_PFCACHE_DUMP:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		return ext4_dump_pfcache(inode->i_sb,
+				(struct pfcache_dump_request __user *) arg);
+	case EXT4_IOC_MFSYNC:
+	{
+		struct ext4_ioc_mfsync_info mfsync;
+		struct file **filpp;
+		unsigned int *flags;
+		int i, err;
+		__u32 __user *usr_fd;
+
+		if (copy_from_user(&mfsync, (struct ext4_ioc_mfsync_info *)arg,
+				   sizeof(mfsync)))
+			return -EFAULT;
+
+		usr_fd = (__u32 __user*) (arg + sizeof(__u32));
+		if (mfsync.size == 0)
+			return 0;
+		filpp = kzalloc(mfsync.size * sizeof(*filpp), GFP_KERNEL);
+		if (!filpp)
+			return -ENOMEM;
+		flags = kzalloc(mfsync.size * sizeof(*flags), GFP_KERNEL);
+		if (!flags) {
+			kfree(filpp);
+			return -ENOMEM;
+		}
+		for (i = 0; i < mfsync.size; i++) {
+			int fd;
+			int ret;
+
+			err = -EFAULT;
+			ret = get_user(fd, usr_fd + i);
+			if (ret)
+				goto mfsync_fput;
+
+			/* negative fd means fdatasync */
+			flags[i] = (fd & (1 << 31)) != 0;
+			fd &= ~(1 << 31);
+
+			err = -EBADF;
+			filpp[i] = fget(fd);
+			if (!filpp[i])
+				goto mfsync_fput;
+			if (filpp[i]->f_mapping->host->i_sb != filp->f_mapping->host->i_sb) {
+				err = -EXDEV;
+				goto mfsync_fput;
+			}
+		}
+		err = ext4_sync_files(filpp, flags, mfsync.size);
+mfsync_fput:
+		for (i = 0; i < mfsync.size; i++)
+			if (filpp[i])
+				fput(filpp[i]);
+		kfree(filpp);
+		kfree(flags);
+		return err;
+	}
 	default:
 		return -ENOTTY;
 	}
@@ -426,6 +687,10 @@ long ext4_compat_ioctl(struct file *file
 		set_fs(old_fs);
 		return err;
 	}
+	case FS_IOC_PFCACHE_OPEN:
+	case FS_IOC_PFCACHE_CLOSE:
+	case FS_IOC_PFCACHE_DUMP:
+		break;
 	case FITRIM:
 		break;
 	default:
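
The EXT4_IOC_MFSYNC handler above expects a __u32 count immediately followed by the fd array, with bit 31 of an entry requesting fdatasync rather than fsync semantics for that descriptor; all descriptors must live on the same ext4 superblock, or the loop bails out with -EXDEV. A hedged userspace sketch (the struct layout is inferred from the usr_fd arithmetic in the handler, and EXT4_IOC_MFSYNC itself comes from the patched headers):

	#include <stdint.h>
	#include <sys/ioctl.h>

	/* Assumed layout, matching "usr_fd = arg + sizeof(__u32)" in the
	 * handler: a __u32 count directly followed by the fd array. */
	struct mfsync_req {
		uint32_t size;
		uint32_t fds[2];
	};

	#define MFSYNC_FDATASYNC (UINT32_C(1) << 31)	/* bit 31: fdatasync */

	/* Sketch: fsync fd1 and fdatasync fd2 in one trip to the kernel. */
	static int mfsync_pair(int fd1, int fd2)
	{
		struct mfsync_req req = {
			.size = 2,
			.fds  = { (uint32_t)fd1,
				  (uint32_t)fd2 | MFSYNC_FDATASYNC },
		};

		return ioctl(fd1, EXT4_IOC_MFSYNC, &req);
	}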
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/mballoc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/mballoc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/mballoc.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/mballoc.c	2015-01-21 12:02:43.090229930 +0300
@@ -1258,7 +1258,7 @@ static void mb_clear_bits(void *bm, int 
 	}
 }
 
-static void mb_set_bits(void *bm, int cur, int len)
+void mb_set_bits(void *bm, int cur, int len)
 {
 	__u32 *addr;
 
@@ -2199,7 +2199,7 @@ int ext4_mb_add_groupinfo(struct super_b
 	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
 		metalen = sizeof(*meta_group_info) <<
 			EXT4_DESC_PER_BLOCK_BITS(sb);
-		meta_group_info = kmalloc(metalen, GFP_KERNEL);
+		meta_group_info = kmalloc(metalen, GFP_NOFS);
 		if (meta_group_info == NULL) {
 			printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
 			       "buddy group\n");
@@ -2220,7 +2220,7 @@ int ext4_mb_add_groupinfo(struct super_b
 		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
 	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 
-	meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+	meta_group_info[i] = kzalloc(len, GFP_NOFS);
 	if (meta_group_info[i] == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
 		goto exit_group_info;
@@ -2249,7 +2249,7 @@ int ext4_mb_add_groupinfo(struct super_b
 	{
 		struct buffer_head *bh;
 		meta_group_info[i]->bb_bitmap =
-			kmalloc(sb->s_blocksize, GFP_KERNEL);
+			kmalloc(sb->s_blocksize, GFP_NOFS);
 		BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
 		bh = ext4_read_block_bitmap(sb, group);
 		BUG_ON(bh == NULL);
@@ -4268,6 +4268,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t
 			*errp = -EDQUOT;
 			goto out;
 		}
+
+		if (check_bd_full(ar->inode, inquota)) {
+			ar->len = 0;
+			*errp = -ENOSPC;
+			goto out;
+		}
 	}
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
@@ -4291,14 +4297,19 @@ repeat:
 		/* allocate space in core */
 		*errp = ext4_mb_regular_allocator(ac);
 		if (*errp)
-			goto errout;
+			goto discard_and_exit;
 
 		/* as we've just preallocated more space than
-		 * user requested orinally, we store allocated
+		 * user requested originally, we store allocated
 		 * space in a special descriptor */
 		if (ac->ac_status == AC_STATUS_FOUND &&
-				ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
-			ext4_mb_new_preallocation(ac);
+		    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+			*errp = ext4_mb_new_preallocation(ac);
+		if (*errp) {
+		discard_and_exit:
+			ext4_discard_allocated_blocks(ac);
+			goto errout;
+		}
 	}
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
 		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
@@ -4313,10 +4324,10 @@ repeat:
 			ac->ac_b_ex.fe_len = 0;
 			ac->ac_status = AC_STATUS_CONTINUE;
 			goto repeat;
-		} else if (*errp)
-		errout:
+		} else if (*errp) {
 			ext4_discard_allocated_blocks(ac);
-		else {
+			goto errout;
+		} else {
 			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
 			ar->len = ac->ac_b_ex.fe_len;
 		}
@@ -4327,6 +4338,7 @@ repeat:
 		*errp = -ENOSPC;
 	}
 
+errout:
 	if (*errp) {
 		ac->ac_b_ex.fe_len = 0;
 		ar->len = 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/move_extent.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/move_extent.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/move_extent.c	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/move_extent.c	2015-01-21 12:02:41.910261259 +0300
@@ -31,601 +31,70 @@
  */
 static inline int
 get_ext_path(struct inode *inode, ext4_lblk_t lblock,
-		struct ext4_ext_path **path)
+		struct ext4_ext_path **orig_path)
 {
 	int ret = 0;
+	struct ext4_ext_path *path;
 
-	*path = ext4_ext_find_extent(inode, lblock, *path);
-	if (IS_ERR(*path)) {
-		ret = PTR_ERR(*path);
-		*path = NULL;
-	} else if ((*path)[ext_depth(inode)].p_ext == NULL)
+	path = ext4_ext_find_extent(inode, lblock, *orig_path);
+	if (IS_ERR(path))
+		ret = PTR_ERR(path);
+	else if (path[ext_depth(inode)].p_ext == NULL)
 		ret = -ENODATA;
-
-	return ret;
-}
-
-/**
- * copy_extent_status - Copy the extent's initialization status
- *
- * @src:	an extent for getting initialize status
- * @dest:	an extent to be set the status
- */
-static void
-copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
-{
-	if (ext4_ext_is_uninitialized(src))
-		ext4_ext_mark_uninitialized(dest);
 	else
-		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
-}
+		*orig_path = path;
 
-/**
- * mext_next_extent - Search for the next extent and set it to "extent"
- *
- * @inode:	inode which is searched
- * @path:	this will obtain data for the next extent
- * @extent:	pointer to the next extent we have just gotten
- *
- * Search the next extent in the array of ext4_ext_path structure (@path)
- * and set it to ext4_extent structure (@extent). In addition, the member of
- * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
- * ext4_ext_path structure refers to the last extent, or a negative error
- * value on failure.
- */
-static int
-mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
-		      struct ext4_extent **extent)
-{
-	struct ext4_extent_header *eh;
-	int ppos, leaf_ppos = path->p_depth;
-
-	ppos = leaf_ppos;
-	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
-		/* leaf block */
-		*extent = ++path[ppos].p_ext;
-		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
-		return 0;
-	}
-
-	while (--ppos >= 0) {
-		if (EXT_LAST_INDEX(path[ppos].p_hdr) >
-		    path[ppos].p_idx) {
-			int cur_ppos = ppos;
-
-			/* index block */
-			path[ppos].p_idx++;
-			path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
-			if (path[ppos+1].p_bh)
-				brelse(path[ppos+1].p_bh);
-			path[ppos+1].p_bh =
-				sb_bread(inode->i_sb, path[ppos].p_block);
-			if (!path[ppos+1].p_bh)
-				return -EIO;
-			path[ppos+1].p_hdr =
-				ext_block_hdr(path[ppos+1].p_bh);
-
-			/* Halfway index block */
-			while (++cur_ppos < leaf_ppos) {
-				path[cur_ppos].p_idx =
-					EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
-				path[cur_ppos].p_block =
-					ext4_idx_pblock(path[cur_ppos].p_idx);
-				if (path[cur_ppos+1].p_bh)
-					brelse(path[cur_ppos+1].p_bh);
-				path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
-					path[cur_ppos].p_block);
-				if (!path[cur_ppos+1].p_bh)
-					return -EIO;
-				path[cur_ppos+1].p_hdr =
-					ext_block_hdr(path[cur_ppos+1].p_bh);
-			}
-
-			path[leaf_ppos].p_ext = *extent = NULL;
-
-			eh = path[leaf_ppos].p_hdr;
-			if (le16_to_cpu(eh->eh_entries) == 0)
-				/* empty leaf is found */
-				return -ENODATA;
-
-			/* leaf block */
-			path[leaf_ppos].p_ext = *extent =
-				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
-			path[leaf_ppos].p_block =
-					ext4_ext_pblock(path[leaf_ppos].p_ext);
-			return 0;
-		}
-	}
-	/* We found the last extent */
-	return 1;
-}
-
-/**
- * mext_check_null_inode - NULL check for two inodes
- *
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
- */
-static int
-mext_check_null_inode(struct inode *inode1, struct inode *inode2,
-		const char *function)
-{
-	int ret = 0;
-
-	if (inode1 == NULL) {
-		__ext4_error(inode2->i_sb, function,
-			"Both inodes should not be NULL: "
-			"inode1 NULL inode2 %lu", inode2->i_ino);
-		ret = -EIO;
-	} else if (inode2 == NULL) {
-		__ext4_error(inode1->i_sb, function,
-			"Both inodes should not be NULL: "
-			"inode1 %lu inode2 NULL", inode1->i_ino);
-		ret = -EIO;
-	}
 	return ret;
 }
 
-/**
- * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
- *
- * @orig_inode:		original inode structure
- * @donor_inode:	donor inode structure
- * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
- * i_ino order.
- */
-static void
-double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
-{
-	struct inode *first = orig_inode, *second = donor_inode;
-
-	/*
-	 * Use the inode number to provide the stable locking order instead
-	 * of its address, because the C language doesn't guarantee you can
-	 * compare pointers that don't come from the same array.
-	 */
-	if (donor_inode->i_ino < orig_inode->i_ino) {
-		first = donor_inode;
-		second = orig_inode;
-	}
-
-	down_write(&EXT4_I(first)->i_data_sem);
-	down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
-}
-
-/**
- * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
- *
- * @orig_inode:		original inode structure to be released its lock first
- * @donor_inode:	donor inode structure to be released its lock second
- * Release write lock of i_data_sem of two inodes (orig and donor).
- */
 static void
-double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_sem(struct rw_semaphore *first, struct rw_semaphore *second)
 {
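+	/*
+	 * Take both semaphores in a stable global order (by address), so
+	 * that two tasks locking the same pair from opposite argument
+	 * orders cannot deadlock; the lower address is always locked
+	 * first and the second acquisition uses SINGLE_DEPTH_NESTING to
+	 * keep lockdep quiet.
+	 */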
-	up_write(&EXT4_I(orig_inode)->i_data_sem);
-	up_write(&EXT4_I(donor_inode)->i_data_sem);
+	if (first > second)
+		swap(first, second);
+	down_write(first);
+	down_write_nested(second, SINGLE_DEPTH_NESTING);
 }
 
 /**
- * mext_insert_across_blocks - Insert extents across leaf block
+ * mext_check_coverage - Check that all extents in range have the same type
  *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @o_start:		first original extent to be changed
- * @o_end:		last original extent to be changed
- * @start_ext:		first new extent to be inserted
- * @new_ext:		middle of new extent to be inserted
- * @end_ext:		last new extent to be inserted
+ * @inode:		inode in question
+ * @from:		block offset of inode
+ * @count:		block count to be checked
+ * @unwritten:		extents expected to be unwritten
+ * @err:		pointer to save error value
  *
- * Allocate a new leaf block and insert extents into it. Return 0 on success,
- * or a negative error value on failure.
+ * Return 1 if all extents in range have the expected type, and zero otherwise.
  */
 static int
-mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
-		struct ext4_extent *o_start, struct ext4_extent *o_end,
-		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
-		struct ext4_extent *end_ext)
+mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
+		    int unwritten, int *err)
 {
-	struct ext4_ext_path *orig_path = NULL;
-	ext4_lblk_t eblock = 0;
-	int new_flag = 0;
-	int end_flag = 0;
-	int err = 0;
-
-	if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
-		if (o_start == o_end) {
-
-			/*       start_ext   new_ext    end_ext
-			 * donor |---------|-----------|--------|
-			 * orig  |------------------------------|
-			 */
-			end_flag = 1;
-		} else {
-
-			/*       start_ext   new_ext   end_ext
-			 * donor |---------|----------|---------|
-			 * orig  |---------------|--------------|
-			 */
-			o_end->ee_block = end_ext->ee_block;
-			o_end->ee_len = end_ext->ee_len;
-			ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
-		}
-
-		o_start->ee_len = start_ext->ee_len;
-		new_flag = 1;
-
-	} else if (start_ext->ee_len && new_ext->ee_len &&
-		   !end_ext->ee_len && o_start == o_end) {
-
-		/*	 start_ext	new_ext
-		 * donor |--------------|---------------|
-		 * orig  |------------------------------|
-		 */
-		o_start->ee_len = start_ext->ee_len;
-		new_flag = 1;
-
-	} else if (!start_ext->ee_len && new_ext->ee_len &&
-		   end_ext->ee_len && o_start == o_end) {
-
-		/*	  new_ext	end_ext
-		 * donor |--------------|---------------|
-		 * orig  |------------------------------|
-		 */
-		o_end->ee_block = end_ext->ee_block;
-		o_end->ee_len = end_ext->ee_len;
-		ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
-
-		/*
-		 * Set 0 to the extent block if new_ext was
-		 * the first block.
-		 */
-		if (new_ext->ee_block)
-			eblock = le32_to_cpu(new_ext->ee_block);
-
-		new_flag = 1;
-	} else {
-		ext4_debug("ext4 move extent: Unexpected insert case\n");
-		return -EIO;
-	}
-
-	if (new_flag) {
-		err = get_ext_path(orig_inode, eblock, &orig_path);
-		if (err)
-			goto out;
-
-		if (ext4_ext_insert_extent(handle, orig_inode,
-					orig_path, new_ext, 0))
-			goto out;
-	}
-
-	if (end_flag) {
-		err = get_ext_path(orig_inode,
-				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
-		if (err)
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ext;
+	int ret = 0;
+	ext4_lblk_t last = from + count;
+	while (from < last) {
+		*err = get_ext_path(inode, from, &path);
+		if (*err)
 			goto out;
-
-		if (ext4_ext_insert_extent(handle, orig_inode,
-					   orig_path, end_ext, 0))
+		ext = path[ext_depth(inode)].p_ext;
+		if (unwritten != ext4_ext_is_uninitialized(ext))
 			goto out;
+		from += ext4_ext_get_actual_len(ext);
+		ext4_ext_drop_refs(path);
 	}
+	ret = 1;
 out:
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-
-	return err;
-
-}
-
-/**
- * mext_insert_inside_block - Insert new extent to the extent block
- *
- * @o_start:		first original extent to be moved
- * @o_end:		last original extent to be moved
- * @start_ext:		first new extent to be inserted
- * @new_ext:		middle of new extent to be inserted
- * @end_ext:		last new extent to be inserted
- * @eh:			extent header of target leaf block
- * @range_to_move:	used to decide how to insert extent
- *
- * Insert extents into the leaf block. The extent (@o_start) is overwritten
- * by inserted extents.
- */
-static void
-mext_insert_inside_block(struct ext4_extent *o_start,
-			      struct ext4_extent *o_end,
-			      struct ext4_extent *start_ext,
-			      struct ext4_extent *new_ext,
-			      struct ext4_extent *end_ext,
-			      struct ext4_extent_header *eh,
-			      int range_to_move)
-{
-	int i = 0;
-	unsigned long len;
-
-	/* Move the existing extents */
-	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
-		len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
-			(unsigned long)(o_end + 1);
-		memmove(o_end + 1 + range_to_move, o_end + 1, len);
-	}
-
-	/* Insert start entry */
-	if (start_ext->ee_len)
-		o_start[i++].ee_len = start_ext->ee_len;
-
-	/* Insert new entry */
-	if (new_ext->ee_len) {
-		o_start[i] = *new_ext;
-		ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
-	}
-
-	/* Insert end entry */
-	if (end_ext->ee_len)
-		o_start[i] = *end_ext;
-
-	/* Increment the total entries counter on the extent block */
-	le16_add_cpu(&eh->eh_entries, range_to_move);
-}
-
-/**
- * mext_insert_extents - Insert new extent
- *
- * @handle:	journal handle
- * @orig_inode:	original inode
- * @orig_path:	path indicates first extent to be changed
- * @o_start:	first original extent to be changed
- * @o_end:	last original extent to be changed
- * @start_ext:	first new extent to be inserted
- * @new_ext:	middle of new extent to be inserted
- * @end_ext:	last new extent to be inserted
- *
- * Call the function to insert extents. If we cannot add more extents into
- * the leaf block, we call mext_insert_across_blocks() to create a
- * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
- * on success, or a negative error value on failure.
- */
-static int
-mext_insert_extents(handle_t *handle, struct inode *orig_inode,
-			 struct ext4_ext_path *orig_path,
-			 struct ext4_extent *o_start,
-			 struct ext4_extent *o_end,
-			 struct ext4_extent *start_ext,
-			 struct ext4_extent *new_ext,
-			 struct ext4_extent *end_ext)
-{
-	struct  ext4_extent_header *eh;
-	unsigned long need_slots, slots_range;
-	int	range_to_move, depth, ret;
-
-	/*
-	 * The extents need to be inserted
-	 * start_extent + new_extent + end_extent.
-	 */
-	need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
-		(new_ext->ee_len ? 1 : 0);
-
-	/* The number of slots between start and end */
-	slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
-		/ sizeof(struct ext4_extent);
-
-	/* Range to move the end of extent */
-	range_to_move = need_slots - slots_range;
-	depth = orig_path->p_depth;
-	orig_path += depth;
-	eh = orig_path->p_hdr;
-
-	if (depth) {
-		/* Register to journal */
-		ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
-		if (ret)
-			return ret;
-	}
-
-	/* Expansion */
-	if (range_to_move > 0 &&
-		(range_to_move > le16_to_cpu(eh->eh_max)
-			- le16_to_cpu(eh->eh_entries))) {
-
-		ret = mext_insert_across_blocks(handle, orig_inode, o_start,
-					o_end, start_ext, new_ext, end_ext);
-		if (ret < 0)
-			return ret;
-	} else
-		mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
-						end_ext, eh, range_to_move);
-
-	if (depth) {
-		ret = ext4_handle_dirty_metadata(handle, orig_inode,
-						 orig_path->p_bh);
-		if (ret)
-			return ret;
-	} else {
-		ret = ext4_mark_inode_dirty(handle, orig_inode);
-		if (ret < 0)
-			return ret;
-	}
-
-	return 0;
-}
-
-/**
- * mext_leaf_block - Move one leaf extent block into the inode.
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @orig_path:		path indicates first extent to be changed
- * @dext:		donor extent
- * @from:		start offset on the target file
- *
- * In order to insert extents into the leaf block, we must divide the extent
- * in the leaf block into three extents. The one is located to be inserted
- * extents, and the others are located around it.
- *
- * Therefore, this function creates structures to save extents of the leaf
- * block, and inserts extents by calling mext_insert_extents() with
- * created extents. Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_leaf_block(handle_t *handle, struct inode *orig_inode,
-		     struct ext4_ext_path *orig_path, struct ext4_extent *dext,
-		     ext4_lblk_t *from)
-{
-	struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
-	struct ext4_extent new_ext, start_ext, end_ext;
-	ext4_lblk_t new_ext_end;
-	ext4_fsblk_t new_phys_end;
-	int oext_alen, new_ext_alen, end_ext_alen;
-	int depth = ext_depth(orig_inode);
-	int ret;
-
-	o_start = o_end = oext = orig_path[depth].p_ext;
-	oext_alen = ext4_ext_get_actual_len(oext);
-	start_ext.ee_len = end_ext.ee_len = 0;
-
-	new_ext.ee_block = cpu_to_le32(*from);
-	ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
-	new_ext.ee_len = dext->ee_len;
-	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
-	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
-	new_phys_end = ext4_ext_pblock(&new_ext) + new_ext_alen - 1;
-
-	/*
-	 * Case: original extent is first
-	 * oext      |--------|
-	 * new_ext      |--|
-	 * start_ext |--|
-	 */
-	if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
-		le32_to_cpu(new_ext.ee_block) <
-		le32_to_cpu(oext->ee_block) + oext_alen) {
-		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
-					       le32_to_cpu(oext->ee_block));
-		copy_extent_status(oext, &start_ext);
-	} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
-		prev_ext = oext - 1;
-		/*
-		 * We can merge new_ext into previous extent,
-		 * if these are contiguous and same extent type.
-		 */
-		if (ext4_can_extents_be_merged(orig_inode, prev_ext,
-					       &new_ext)) {
-			o_start = prev_ext;
-			start_ext.ee_len = cpu_to_le16(
-				ext4_ext_get_actual_len(prev_ext) +
-				new_ext_alen);
-			copy_extent_status(prev_ext, &start_ext);
-			new_ext.ee_len = 0;
-		}
-	}
-
-	/*
-	 * Case: new_ext_end must be less than oext
-	 * oext      |-----------|
-	 * new_ext       |-------|
-	 */
-	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		ext4_error(orig_inode->i_sb,
-			"new_ext_end(%u) should be less than or equal to "
-			"oext->ee_block(%u) + oext_alen(%d) - 1",
-			new_ext_end, le32_to_cpu(oext->ee_block),
-			oext_alen);
-		ret = -EIO;
-		goto out;
-	}
-
-	/*
-	 * Case: new_ext is smaller than original extent
-	 * oext    |---------------|
-	 * new_ext |-----------|
-	 * end_ext             |---|
-	 */
-	if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
-		new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
-		end_ext.ee_len =
-			cpu_to_le16(le32_to_cpu(oext->ee_block) +
-			oext_alen - 1 - new_ext_end);
-		copy_extent_status(oext, &end_ext);
-		end_ext_alen = ext4_ext_get_actual_len(&end_ext);
-		ext4_ext_store_pblock(&end_ext,
-			(ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
-		end_ext.ee_block =
-			cpu_to_le32(le32_to_cpu(o_end->ee_block) +
-			oext_alen - end_ext_alen);
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
 	}
-
-	ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
-				o_end, &start_ext, &new_ext, &end_ext);
-out:
 	return ret;
 }
 
 /**
- * mext_calc_swap_extents - Calculate extents for extent swapping.
- *
- * @tmp_dext:		the extent that will belong to the original inode
- * @tmp_oext:		the extent that will belong to the donor inode
- * @orig_off:		block offset of original inode
- * @donor_off:		block offset of donor inode
- * @max_count:		the maximun length of extents
- *
- * Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_calc_swap_extents(struct ext4_extent *tmp_dext,
-			      struct ext4_extent *tmp_oext,
-			      ext4_lblk_t orig_off, ext4_lblk_t donor_off,
-			      ext4_lblk_t max_count)
-{
-	ext4_lblk_t diff, orig_diff;
-	struct ext4_extent dext_old, oext_old;
-
-	BUG_ON(orig_off != donor_off);
-
-	/* original and donor extents have to cover the same block offset */
-	if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
-	    le32_to_cpu(tmp_oext->ee_block) +
-			ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
-		return -ENODATA;
-
-	if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
-	    le32_to_cpu(tmp_dext->ee_block) +
-			ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
-		return -ENODATA;
-
-	dext_old = *tmp_dext;
-	oext_old = *tmp_oext;
-
-	/* When tmp_dext is too large, pick up the target range. */
-	diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
-
-	ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
-	tmp_dext->ee_block =
-			cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
-	tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
-
-	if (max_count < ext4_ext_get_actual_len(tmp_dext))
-		tmp_dext->ee_len = cpu_to_le16(max_count);
-
-	orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
-	ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
-
-	/* Adjust extent length if donor extent is larger than orig */
-	if (ext4_ext_get_actual_len(tmp_dext) >
-	    ext4_ext_get_actual_len(tmp_oext) - orig_diff)
-		tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
-						orig_diff);
-
-	tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
-
-	copy_extent_status(&oext_old, tmp_dext);
-	copy_extent_status(&dext_old, tmp_oext);
-
-	return 0;
-}
-
-/**
  * mext_replace_branches - Replace original extents with new extents
  *
  * @handle:		journal handle
@@ -646,124 +115,124 @@ mext_calc_swap_extents(struct ext4_exten
  *
  * Return replaced block count.
  */
+
+/**
+ * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
+ *
+ * @inode1:	the inode structure
+ * @inode2:	the inode structure
+ * @index1:	page index of the first page
+ * @index2:	page index of the second page
+ * @page:	result page vector
+ *
+ * Grab two locked pages for the inodes, in inode order
+ */
 static int
-mext_replace_branches(handle_t *handle, struct inode *orig_inode,
-			   struct inode *donor_inode, ext4_lblk_t from,
-			   ext4_lblk_t count, int *err)
+mext_page_double_lock(struct inode *inode1, struct inode *inode2,
+		      pgoff_t index1, pgoff_t index2, struct page *page[2])
 {
-	struct ext4_ext_path *orig_path = NULL;
-	struct ext4_ext_path *donor_path = NULL;
-	struct ext4_extent *oext, *dext;
-	struct ext4_extent tmp_dext, tmp_oext;
-	ext4_lblk_t orig_off = from, donor_off = from;
-	int depth;
-	int replaced_count = 0;
-	int dext_alen;
+	struct address_space *mapping[2];
+	unsigned fl = AOP_FLAG_NOFS;
 
-	/* Protect extent trees against block allocations via delalloc */
-	double_down_write_data_sem(orig_inode, donor_inode);
+	BUG_ON(!inode1 || !inode2);
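+	/* Take the page of the lower-addressed inode first to avoid ABBA deadlocks */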
+	if (inode1 < inode2) {
+		mapping[0] = inode1->i_mapping;
+		mapping[1] = inode2->i_mapping;
+	} else {
+		pgoff_t tmp = index1;
+		index1 = index2;
+		index2 = tmp;
+		mapping[0] = inode2->i_mapping;
+		mapping[1] = inode1->i_mapping;
+	}
+
+	page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
+	if (!page[0])
+		return -ENOMEM;
+
+	page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
+	if (!page[1]) {
+		unlock_page(page[0]);
+		page_cache_release(page[0]);
+		return -ENOMEM;
+	}
+	/*
+	 * grab_cache_page_write_begin() may not wait on the page's writeback
+	 * if the BDI does not demand it. But it is reasonable to be very
+	 * conservative here and explicitly wait on the page's writeback.
+	 */
+	wait_on_page_writeback(page[0]);
+	wait_on_page_writeback(page[1]);
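+	/* Restore the caller's order: page[0] must belong to inode1 */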
+	if (inode1 > inode2) {
+		struct page *tmp;
+		tmp = page[0];
+		page[0] = page[1];
+		page[1] = tmp;
+	}
+	return 0;
+}
 
-	/* Get the original extent for the block "orig_off" */
-	*err = get_ext_path(orig_inode, orig_off, &orig_path);
-	if (*err)
-		goto out;
+/* Force page buffers uptodate w/o dropping page's lock */
+static int
+mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	sector_t block;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	unsigned int blocksize, block_start, block_end;
+	int i, err, nr = 0, partial = 0;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageWriteback(page));
 
-	/* Get the donor extent for the head */
-	*err = get_ext_path(donor_inode, donor_off, &donor_path);
-	if (*err)
-		goto out;
-	depth = ext_depth(orig_inode);
-	oext = orig_path[depth].p_ext;
-	tmp_oext = *oext;
-
-	depth = ext_depth(donor_inode);
-	dext = donor_path[depth].p_ext;
-	tmp_dext = *dext;
+	if (PageUptodate(page))
+		return 0;
 
-	*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-				      donor_off, count);
-	if (*err)
-		goto out;
+	blocksize = 1 << inode->i_blkbits;
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
 
-	/* Loop for the donor extents */
-	while (1) {
-		/* The extent for donor must be found. */
-		if (!dext) {
-			ext4_error(donor_inode->i_sb,
-				   "The extent for donor must be found");
-			*err = -EIO;
-			goto out;
-		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			ext4_error(donor_inode->i_sb,
-				"Donor offset(%u) and the first block of donor "
-				"extent(%u) should be equal",
-				donor_off,
-				le32_to_cpu(tmp_dext.ee_block));
-			*err = -EIO;
-			goto out;
+	head = page_buffers(page);
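+	/* Block number of the first block backing this page */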
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	for (bh = head, block_start = 0; bh != head || !block_start;
+	     block++, block_start = block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+			continue;
 		}
-
-		/* Set donor extent to orig extent */
-		*err = mext_leaf_block(handle, orig_inode,
-					   orig_path, &tmp_dext, &orig_off);
-		if (*err)
-			goto out;
-
-		/* Set orig extent to donor extent */
-		*err = mext_leaf_block(handle, donor_inode,
-					   donor_path, &tmp_oext, &donor_off);
-		if (*err)
-			goto out;
-
-		dext_alen = ext4_ext_get_actual_len(&tmp_dext);
-		replaced_count += dext_alen;
-		donor_off += dext_alen;
-		orig_off += dext_alen;
-
-		/* Already moved the expected blocks */
-		if (replaced_count >= count)
-			break;
-
-		if (orig_path)
-			ext4_ext_drop_refs(orig_path);
-		*err = get_ext_path(orig_inode, orig_off, &orig_path);
-		if (*err)
-			goto out;
-		depth = ext_depth(orig_inode);
-		oext = orig_path[depth].p_ext;
-		tmp_oext = *oext;
-
-		if (donor_path)
-			ext4_ext_drop_refs(donor_path);
-		*err = get_ext_path(donor_inode, donor_off, &donor_path);
-		if (*err)
-			goto out;
-		depth = ext_depth(donor_inode);
-		dext = donor_path[depth].p_ext;
-		tmp_dext = *dext;
-
-		*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-					   donor_off, count - replaced_count);
-		if (*err)
-			goto out;
+		if (buffer_uptodate(bh))
+			continue;
+		if (!buffer_mapped(bh)) {
+			err = ext4_get_block(inode, block, bh, 0);
+			if (err) {
+				SetPageError(page);
+				return err;
+			}
+			if (!buffer_mapped(bh)) {
+				zero_user(page, block_start, blocksize);
+				set_buffer_uptodate(bh);
+				continue;
+			}
+		}
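+		/* Collect mapped, not-uptodate buffers to be read below */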
+		BUG_ON(nr >= MAX_BUF_PER_PAGE);
+		arr[nr++] = bh;
 	}
+	/* No io required */
+	if (!nr)
+		goto out;
 
-out:
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-	if (donor_path) {
-		ext4_ext_drop_refs(donor_path);
-		kfree(donor_path);
+	for (i = 0; i < nr; i++) {
+		bh = arr[i];
+		if (!bh_uptodate_or_lock(bh)) {
+			err = bh_submit_read(bh);
+			if (err)
+				return err;
+		}
 	}
-
-	ext4_ext_invalidate_cache(orig_inode);
-	ext4_ext_invalidate_cache(donor_inode);
-
-	double_up_write_data_sem(orig_inode, donor_inode);
-
-	return replaced_count;
+out:
+	if (!partial)
+		SetPageUptodate(page);
+	return 0;
 }
 
 /**
@@ -774,7 +243,7 @@ out:
  * @orig_page_offset:		page index on original file
  * @data_offset_in_page:	block index where data swapping starts
  * @block_len_in_page:		the number of blocks to be swapped
- * @uninit:			orig extent is uninitialized or not
+ * @unwritten:			orig extent is unwritten or not
  * @err:			pointer to save return value
  *
  * Save the data in original inode blocks and replace original inode extents
@@ -784,30 +253,28 @@ out:
  */
 static int
 move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
-		  pgoff_t orig_page_offset, int data_offset_in_page,
-		  int block_len_in_page, int uninit, int *err)
+		     pgoff_t orig_page_offset, pgoff_t donor_page_offset,
+		     int data_offset_in_page,
+		     int block_len_in_page, int unwritten, int *err)
 {
 	struct inode *orig_inode = o_filp->f_dentry->d_inode;
-	struct address_space *mapping = orig_inode->i_mapping;
-	struct buffer_head *bh;
-	struct page *page = NULL;
-	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct page *pagep[2] = {NULL, NULL};
 	handle_t *handle;
-	ext4_lblk_t orig_blk_offset;
-	long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
+	ext4_lblk_t orig_blk_offset, donor_blk_offset;
 	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
 	unsigned int w_flags = 0;
 	unsigned int tmp_data_size, data_size, replaced_size;
-	void *fsdata;
-	int i, jblocks;
-	int err2 = 0;
+	int err2, jblocks, retries = 0;
 	int replaced_count = 0;
+	int from = data_offset_in_page << orig_inode->i_blkbits;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
 
 	/*
 	 * It needs twice the amount of ordinary journal buffers because
 	 * inode and donor_inode may change each different metadata blocks.
 	 */
+again:
+	*err = 0;
 	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
 	handle = ext4_journal_start(orig_inode, jblocks);
 	if (IS_ERR(handle)) {
@@ -821,20 +288,8 @@ move_extent_per_page(struct file *o_filp
 	orig_blk_offset = orig_page_offset * blocks_per_page +
 		data_offset_in_page;
 
-	/*
-	 * If orig extent is uninitialized one,
-	 * it's not necessary force the page into memory
-	 * and then force it to be written out again.
-	 * Just swap data blocks between orig and donor.
-	 */
-	if (uninit) {
-		replaced_count = mext_replace_branches(handle, orig_inode,
-						donor_inode, orig_blk_offset,
-						block_len_in_page, err);
-		goto out2;
-	}
-
-	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
+	donor_blk_offset = donor_page_offset * blocks_per_page +
+		data_offset_in_page;
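+	/* orig and donor use the same offset within their respective pages */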
 
 	/* Calculate data_size */
 	if ((orig_blk_offset + block_len_in_page - 1) ==
@@ -855,79 +310,137 @@ move_extent_per_page(struct file *o_filp
 
 	replaced_size = data_size;
 
-	*err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
-				 &page, &fsdata);
+	*err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
+				     donor_page_offset, pagep);
 	if (unlikely(*err < 0))
-		goto out;
-
-	if (!PageUptodate(page)) {
-		mapping->a_ops->readpage(o_filp, page);
-		lock_page(page);
-	}
-
+		goto stop_journal;
 	/*
-	 * try_to_release_page() doesn't call releasepage in writeback mode.
-	 * We should care about the order of writing to the same file
-	 * by multiple move extent processes.
-	 * It needs to call wait_on_page_writeback() to wait for the
-	 * writeback of the page.
+	 * If the orig extent was unwritten, it can become initialized at
+	 * any time after i_data_sem is dropped. To serialize with delalloc
+	 * we must recheck the extent while holding the page's lock; if it
+	 * is still unwritten, no data copy is necessary, just swap the data
+	 * blocks between orig and donor.
 	 */
-	wait_on_page_writeback(page);
+	if (unwritten) {
+		double_down_write_sem(&EXT4_I(orig_inode)->i_data_sem,
+				      &EXT4_I(donor_inode)->i_data_sem);
+		/* If any extent in the range became initialized we have to
+		 * fall back to data copying */
+		unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
+						block_len_in_page, 1, err);
+		if (*err)
+			goto drop_data_sem;
+
+		unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
+						 block_len_in_page, 1, err);
+		if (*err)
+			goto drop_data_sem;
+
+		if (!unwritten) {
+			up_write(&EXT4_I(orig_inode)->i_data_sem);
+			up_write(&EXT4_I(donor_inode)->i_data_sem);
+			goto data_copy;
+		}
+		if ((page_has_private(pagep[0]) &&
+		     !try_to_release_page(pagep[0], 0)) ||
+		    (page_has_private(pagep[1]) &&
+		     !try_to_release_page(pagep[1], 0))) {
+			*err = -EBUSY;
+			goto drop_data_sem;
+		}
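+		/* Both ranges are still unwritten: swap extents, no data copy */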
+		replaced_count = ext4_swap_extents(handle, orig_inode,
+						   donor_inode, orig_blk_offset,
+						   donor_blk_offset,
+						   block_len_in_page, 1, err);
+	drop_data_sem:
+		up_write(&EXT4_I(orig_inode)->i_data_sem);
+		up_write(&EXT4_I(donor_inode)->i_data_sem);
+		goto unlock_pages;
+	}
+data_copy:
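+	/* Bring orig page buffers uptodate before swapping the underlying blocks */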
+	*err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+	if (*err)
+		goto unlock_pages;
 
-	/* Release old bh and drop refs */
-	try_to_release_page(page, 0);
+	/* At this point all buffers in the range are uptodate and the old
+	 * mapping layout is no longer required; try to drop it now. */
+	if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
+	    (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
+		*err = -EBUSY;
+		goto unlock_pages;
+	}
+	double_down_write_sem(&EXT4_I(orig_inode)->i_data_sem,
+			      &EXT4_I(donor_inode)->i_data_sem);
+	replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
+					   orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 1, err);
+	up_write(&EXT4_I(orig_inode)->i_data_sem);
+	up_write(&EXT4_I(donor_inode)->i_data_sem);
 
-	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
-					orig_blk_offset, block_len_in_page,
-					&err2);
-	if (err2) {
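+	/* On a partial swap, shrink the copy range to the blocks actually swapped */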
+	if (*err) {
 		if (replaced_count) {
 			block_len_in_page = replaced_count;
 			replaced_size =
 				block_len_in_page << orig_inode->i_blkbits;
 		} else
-			goto out;
+			goto unlock_pages;
 	}
+	/* Perform all necessary steps similar to write_begin()/write_end(),
+	 * but keeping in mind that i_size will not change */
+
+	*err = __block_prepare_write(orig_inode, pagep[0], from, replaced_size,
+				     ext4_get_block);
+	if (!*err)
+		*err = block_commit_write(pagep[0], from, from + replaced_size);
 
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
-
-	bh = page_buffers(page);
-	for (i = 0; i < data_offset_in_page; i++)
-		bh = bh->b_this_page;
-
-	for (i = 0; i < block_len_in_page; i++) {
-		*err = ext4_get_block(orig_inode,
-				(sector_t)(orig_blk_offset + i), bh, 0);
-		if (*err < 0)
-			goto out;
-
-		if (bh->b_this_page != NULL)
-			bh = bh->b_this_page;
-	}
-
-	*err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
-			       page, fsdata);
-	page = NULL;
+	if (unlikely(*err < 0))
+		goto repair_branches;
 
-out:
-	if (unlikely(page)) {
-		if (PageLocked(page))
-			unlock_page(page);
-		page_cache_release(page);
-		ext4_journal_stop(handle);
-	}
-out2:
+	/* Even in case of data=writeback it is reasonable to pin
+	 * inode to transaction, to prevent unexpected data loss */
+	*err = ext4_jbd2_file_inode(handle, orig_inode);
+
+unlock_pages:
+	unlock_page(pagep[0]);
+	page_cache_release(pagep[0]);
+	unlock_page(pagep[1]);
+	page_cache_release(pagep[1]);
+stop_journal:
 	ext4_journal_stop(handle);
+	/* The buffer was busy, probably because it is pinned to a journal
+	 * transaction; forcing a transaction commit may help to free it. */
+	if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
+						      &retries))
+		goto again;
+	return replaced_count;
 
-	if (err2)
-		*err = err2;
+repair_branches:
+	/*
+	 * This should never ever happen!
+	 * Extents are swapped already, but we are not able to copy data.
+	 * Try to swap extents back to their original places.
+	 */
+	double_down_write_sem(&EXT4_I(orig_inode)->i_data_sem,
+			      &EXT4_I(donor_inode)->i_data_sem);
+	replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
+					   orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 0, &err2);
+	up_write(&EXT4_I(orig_inode)->i_data_sem);
+	up_write(&EXT4_I(donor_inode)->i_data_sem);
 
-	return replaced_count;
+	if (replaced_count != block_len_in_page) {
+		EXT4_ERROR_INODE(orig_inode, "Unable to copy data block %u,"
+				 " data will be lost.", orig_blk_offset);
+		*err = -EIO;
+	}
+	replaced_count = 0;
+	goto unlock_pages;
 }
 
 /**
- * mext_check_argumants - Check whether move extent can be done
+ * mext_check_arguments - Check whether move extent can be done
  *
  * @orig_inode:		original inode
  * @donor_inode:	donor inode
@@ -944,17 +457,13 @@ mext_check_arguments(struct inode *orig_
 		     struct inode *donor_inode, __u64 orig_start,
 		     __u64 donor_start, __u64 *len)
 {
-	ext4_lblk_t orig_blocks, donor_blocks;
+	__u64 orig_eof, donor_eof;
 	unsigned int blkbits = orig_inode->i_blkbits;
 	unsigned int blocksize = 1 << blkbits;
 
-	/* Regular file check */
-	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
-		ext4_debug("ext4 move extent: The argument files should be "
-			"regular file [ino:orig %lu, donor %lu]\n",
-			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
-	}
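+	/* Round each i_size up to a whole block to get the EOF block count */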
+	orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
+	donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
+
 
 	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
 		ext4_debug("ext4 move extent: suid or sgid is set"
@@ -971,15 +480,7 @@ mext_check_arguments(struct inode *orig_
 		ext4_debug("ext4 move extent: The argument files should "
 			"not be swapfile [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
-	}
-
-	/* Files should be in the same ext4 FS */
-	if (orig_inode->i_sb != donor_inode->i_sb) {
-		ext4_debug("ext4 move extent: The argument files "
-			"should be in same FS [ino:orig %lu, donor %lu]\n",
-			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
+		return -EBUSY;
 	}
 
 	/* Ext4 move extent supports only extent based file */
@@ -999,9 +500,10 @@ mext_check_arguments(struct inode *orig_
 	}
 
 	/* Start offset should be same */
-	if (orig_start != donor_start) {
+	if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
+	    (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
 		ext4_debug("ext4 move extent: orig and donor's start "
-			"offset are not same [ino:orig %lu, donor %lu]\n",
+			"offset are not alligned [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
@@ -1009,60 +511,19 @@ mext_check_arguments(struct inode *orig_
 	if ((orig_start >= EXT_MAX_BLOCKS) ||
 	    (donor_start >= EXT_MAX_BLOCKS) ||
 	    (*len > EXT_MAX_BLOCKS) ||
+	    (donor_start + *len >= EXT_MAX_BLOCKS) ||
 	    (orig_start + *len >= EXT_MAX_BLOCKS))  {
 		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
 			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
-
-	if (orig_inode->i_size > donor_inode->i_size) {
-		donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
-		/* TODO: eliminate this artificial restriction */
-		if (orig_start >= donor_blocks) {
-			ext4_debug("ext4 move extent: orig start offset "
-			"[%llu] should be less than donor file blocks "
-			"[%u] [ino:orig %lu, donor %lu]\n",
-			orig_start, donor_blocks,
-			orig_inode->i_ino, donor_inode->i_ino);
-			return -EINVAL;
-		}
-
-		/* TODO: eliminate this artificial restriction */
-		if (orig_start + *len > donor_blocks) {
-			ext4_debug("ext4 move extent: End offset [%llu] should "
-				"be less than donor file blocks [%u]."
-				"So adjust length from %llu to %llu "
-				"[ino:orig %lu, donor %lu]\n",
-				orig_start + *len, donor_blocks,
-				*len, donor_blocks - orig_start,
-				orig_inode->i_ino, donor_inode->i_ino);
-			*len = donor_blocks - orig_start;
-		}
-	} else {
-		orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
-		if (orig_start >= orig_blocks) {
-			ext4_debug("ext4 move extent: start offset [%llu] "
-				"should be less than original file blocks "
-				"[%u] [ino:orig %lu, donor %lu]\n",
-				 orig_start, orig_blocks,
-				orig_inode->i_ino, donor_inode->i_ino);
-			return -EINVAL;
-		}
-
-		if (orig_start + *len > orig_blocks) {
-			ext4_debug("ext4 move extent: Adjust length "
-				"from %llu to %llu. Because it should be "
-				"less than original file blocks "
-				"[ino:orig %lu, donor %lu]\n",
-				*len, orig_blocks - orig_start,
-				orig_inode->i_ino, donor_inode->i_ino);
-			*len = orig_blocks - orig_start;
-		}
-	}
-
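+	/* Clamp the requested length so neither range runs past its EOF */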
+	if (orig_eof < orig_start + *len - 1)
+		*len = orig_eof - orig_start;
+	if (donor_eof < donor_start + *len - 1)
+		*len = donor_eof - donor_start;
 	if (!*len) {
-		ext4_debug("ext4 move extent: len shoudld not be 0 "
+		ext4_debug("ext4 move extent: len should not be 0 "
 			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
 			donor_inode->i_ino);
 		return -EINVAL;
@@ -1078,34 +539,24 @@ mext_check_arguments(struct inode *orig_
  * @inode2:	the inode structure
  *
  * Lock two inodes' i_mutex by i_ino order.
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
- */
-static int
+ */
+static void
 mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 {
-	int ret = 0;
-
-	BUG_ON(inode1 == NULL && inode2 == NULL);
-
-	ret = mext_check_null_inode(inode1, inode2, __func__);
-	if (ret < 0)
-		goto out;
+	BUG_ON(inode1 == NULL || inode2 == NULL);
 
 	if (inode1 == inode2) {
 		mutex_lock(&inode1->i_mutex);
-		goto out;
+		return;
 	}
 
-	if (inode1->i_ino < inode2->i_ino) {
+	if (inode1 < inode2) {
 		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
 		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
 	} else {
 		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
 		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
 	}
-
-out:
-	return ret;
 }
 
 /**
@@ -1113,29 +564,16 @@ out:
  *
  * @inode1:     the inode that is released first
  * @inode2:     the inode that is released second
- *
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
  */
 
-static int
+static void
 mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 {
-	int ret = 0;
+	BUG_ON(inode1 == NULL || inode2 == NULL);
 
-	BUG_ON(inode1 == NULL && inode2 == NULL);
-
-	ret = mext_check_null_inode(inode1, inode2, __func__);
-	if (ret < 0)
-		goto out;
-
-	if (inode1)
-		mutex_unlock(&inode1->i_mutex);
-
-	if (inode2 && inode2 != inode1)
+	mutex_unlock(&inode1->i_mutex);
+	if (inode2 != inode1)
 		mutex_unlock(&inode2->i_mutex);
-
-out:
-	return ret;
 }
 
 /**
@@ -1180,155 +618,112 @@ out:
  * 7:Return 0 on success, or a negative error value on failure.
  */
 int
-ext4_move_extents(struct file *o_filp, struct file *d_filp,
-		 __u64 orig_start, __u64 donor_start, __u64 len,
-		 __u64 *moved_len)
+ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+		  __u64 donor_blk, __u64 len, __u64 *moved_len)
 {
 	struct inode *orig_inode = o_filp->f_dentry->d_inode;
 	struct inode *donor_inode = d_filp->f_dentry->d_inode;
-	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
-	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
-	ext4_lblk_t block_start = orig_start;
-	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
-	ext4_lblk_t rest_blocks;
-	pgoff_t orig_page_offset = 0, seq_end_page;
-	int ret1, ret2, depth, last_extent = 0;
+	struct ext4_ext_path *path = NULL;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
-	int data_offset_in_page;
-	int block_len_in_page;
-	int uninit;
+	ext4_lblk_t o_end, o_start = orig_blk;
+	ext4_lblk_t d_start = donor_blk;
+	int ret;
 
-	/* orig and donor should be different file */
-	if (orig_inode->i_ino == donor_inode->i_ino) {
+	if (orig_inode->i_sb != donor_inode->i_sb) {
+		ext4_debug("ext4 move extent: The argument files "
+			"should be in same FS [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* orig and donor should be different inodes */
+	if (orig_inode == donor_inode) {
 		ext4_debug("ext4 move extent: The argument files should not "
-			"be same file [ino:orig %lu, donor %lu]\n",
+			"be same inode [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+		ext4_debug("ext4 move extent: The argument files should be "
+			"regular file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+	/* TODO: Swapping blocks for inodes with full data journaling enabled
+	   is a non-obvious task */
+	if (ext4_should_journal_data(orig_inode) ||
+	    ext4_should_journal_data(donor_inode)) {
+		return -EINVAL;
+	}
 	/* Protect orig and donor inodes against a truncate */
-	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
-	if (ret1 < 0)
-		return ret1;
-
-	/* Protect extent tree against block allocations via delalloc */
-	double_down_write_data_sem(orig_inode, donor_inode);
-	/* Check the filesystem environment whether move_extent can be done */
-	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
-				    donor_start, &len);
-	if (ret1)
-		goto out;
-
-	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
-	block_end = block_start + len - 1;
-	if (file_end < block_end)
-		len -= block_end - file_end;
-
-	ret1 = get_ext_path(orig_inode, block_start, &orig_path);
-	if (ret1)
+	mext_inode_double_lock(orig_inode, donor_inode);
+	/* Wait for all existing dio workers */
+	ret = ext4_flush_unwritten_io(orig_inode);
+	if (ret)
 		goto out;
-
-	/* Get path structure to check the hole */
-	ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
-	if (ret1)
+	ret = ext4_flush_unwritten_io(donor_inode);
+	if (ret)
 		goto out;
+	double_down_write_sem(&orig_inode->i_alloc_sem,
+			      &donor_inode->i_alloc_sem);
+	/* Protect extent tree against block allocations via delalloc */
+	double_down_write_sem(&EXT4_I(orig_inode)->i_data_sem,
+			      &EXT4_I(donor_inode)->i_data_sem);
+	/* Check the filesystem environment whether move_extent can be done */
+	ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
+				    donor_blk, &len);
+	if (ret)
+		goto out;
+	o_end = o_start + len;
+
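+	/* Walk the original extent tree, swapping at most one page worth of
+	 * blocks per iteration */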
+	while (o_start < o_end) {
+		struct ext4_extent *ex;
+		ext4_lblk_t cur_blk, next_blk;
+		pgoff_t orig_page_index, donor_page_index;
+		int offset_in_page;
+		int unwritten, cur_len;
 
-	depth = ext_depth(orig_inode);
-	ext_cur = holecheck_path[depth].p_ext;
-
-	/*
-	 * Get proper starting location of block replacement if block_start was
-	 * within the hole.
-	 */
-	if (le32_to_cpu(ext_cur->ee_block) +
-		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
-		/*
-		 * The hole exists between extents or the tail of
-		 * original file.
-		 */
-		last_extent = mext_next_extent(orig_inode,
-					holecheck_path, &ext_cur);
-		if (last_extent < 0) {
-			ret1 = last_extent;
-			goto out;
-		}
-		last_extent = mext_next_extent(orig_inode, orig_path,
-							&ext_dummy);
-		if (last_extent < 0) {
-			ret1 = last_extent;
+		ret = get_ext_path(orig_inode, o_start, &path);
+		if (ret)
 			goto out;
-		}
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
-		/* The hole exists at the beginning of original file. */
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-	else
-		seq_start = block_start;
-
-	/* No blocks within the specified range. */
-	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
-		ext4_debug("ext4 move extent: The specified range of file "
-							"may be the hole\n");
-		ret1 = -EINVAL;
-		goto out;
-	}
-
-	/* Adjust start blocks */
-	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
-			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
-		     max(le32_to_cpu(ext_cur->ee_block), block_start);
-
-	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
-		seq_blocks += add_blocks;
-
-		/* Adjust tail blocks */
-		if (seq_start + seq_blocks - 1 > block_end)
-			seq_blocks = block_end - seq_start + 1;
-
-		ext_prev = ext_cur;
-		last_extent = mext_next_extent(orig_inode, holecheck_path,
-						&ext_cur);
-		if (last_extent < 0) {
-			ret1 = last_extent;
-			break;
-		}
-		add_blocks = ext4_ext_get_actual_len(ext_cur);
-
-		/*
-		 * Extend the length of contiguous block (seq_blocks)
-		 * if extents are contiguous.
-		 */
-		if (ext4_can_extents_be_merged(orig_inode,
-					       ext_prev, ext_cur) &&
-		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
-		    !last_extent)
-			continue;
-
-		/* Is original extent is uninitialized */
-		uninit = ext4_ext_is_uninitialized(ext_prev);
-
-		data_offset_in_page = seq_start % blocks_per_page;
-
-		/*
-		 * Calculate data blocks count that should be swapped
-		 * at the first page.
-		 */
-		if (data_offset_in_page + seq_blocks > blocks_per_page) {
-			/* Swapped blocks are across pages */
-			block_len_in_page =
-					blocks_per_page - data_offset_in_page;
-		} else {
-			/* Swapped blocks are in a page */
-			block_len_in_page = seq_blocks;
-		}
-
-		orig_page_offset = seq_start >>
-				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
-		seq_end_page = (seq_start + seq_blocks - 1) >>
-				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-		rest_blocks = seq_blocks;
-
+		ex = path[path->p_depth].p_ext;
+		next_blk = ext4_ext_next_allocated_block(path);
+		cur_blk = le32_to_cpu(ex->ee_block);
+		cur_len = ext4_ext_get_actual_len(ex);
+		/* Check hole before the start pos */
+		if (cur_blk + cur_len - 1 < o_start) {
+			if (next_blk == EXT_MAX_BLOCKS) {
+				o_start = o_end;
+				ret = -ENODATA;
+				goto out;
+			}
+			d_start += next_blk - o_start;
+			o_start = next_blk;
+			goto repeat;
+		/* Check hole after the start pos */
+		} else if (cur_blk > o_start) {
+			/* Skip hole */
+			d_start += cur_blk - o_start;
+			o_start = cur_blk;
+			/* Extent inside requested range? */
+			if (cur_blk >= o_end)
+				goto out;
+		} else { /* in_range(o_start, o_blk, o_len) */
+			cur_len += cur_blk - o_start;
+		}
+		unwritten = ext4_ext_is_uninitialized(ex);
+		if (o_end - o_start < cur_len)
+			cur_len = o_end - o_start;
+
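+		/* Translate block offsets into page cache indexes for both inodes */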
+		orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
+					       orig_inode->i_blkbits);
+		donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
+					       donor_inode->i_blkbits);
+		offset_in_page = o_start % blocks_per_page;
+		if (cur_len > blocks_per_page - offset_in_page)
+			cur_len = blocks_per_page - offset_in_page;
 		/*
 		 * Up semaphore to avoid following problems:
 		 * a. transaction deadlock among ext4_journal_start,
@@ -1336,85 +731,47 @@ ext4_move_extents(struct file *o_filp, s
 		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
 		 *    in move_extent_per_page
 		 */
-		double_up_write_data_sem(orig_inode, donor_inode);
-
-		while (orig_page_offset <= seq_end_page) {
-
-			/* Swap original branches with new branches */
-			block_len_in_page = move_extent_per_page(
-						o_filp, donor_inode,
-						orig_page_offset,
-						data_offset_in_page,
-						block_len_in_page, uninit,
-						&ret1);
-
-			/* Count how many blocks we have exchanged */
-			*moved_len += block_len_in_page;
-			if (ret1 < 0)
-				break;
-			if (*moved_len > len) {
-				ext4_error(orig_inode->i_sb,
-					"We replaced blocks too much! "
-					"sum of replaced: %llu requested: %llu",
-					*moved_len, len);
-				ret1 = -EIO;
-				break;
-			}
-
-			orig_page_offset++;
-			data_offset_in_page = 0;
-			rest_blocks -= block_len_in_page;
-			if (rest_blocks > blocks_per_page)
-				block_len_in_page = blocks_per_page;
-			else
-				block_len_in_page = rest_blocks;
-		}
-
-		double_down_write_data_sem(orig_inode, donor_inode);
-		if (ret1 < 0)
-			break;
-
-		/* Decrease buffer counter */
-		if (holecheck_path)
-			ext4_ext_drop_refs(holecheck_path);
-		ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
-		if (ret1)
-			break;
-		depth = holecheck_path->p_depth;
-
-		/* Decrease buffer counter */
-		if (orig_path)
-			ext4_ext_drop_refs(orig_path);
-		ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
-		if (ret1)
+		up_write(&EXT4_I(orig_inode)->i_data_sem);
+		up_write(&EXT4_I(donor_inode)->i_data_sem);
+		/* Swap original branches with new branches */
+		move_extent_per_page(o_filp, donor_inode,
+				     orig_page_index, donor_page_index,
+				     offset_in_page, cur_len,
+				     unwritten, &ret);
+		double_down_write_sem(&EXT4_I(orig_inode)->i_data_sem,
+				      &EXT4_I(donor_inode)->i_data_sem);
+		if (ret < 0)
 			break;
-
-		ext_cur = holecheck_path[depth].p_ext;
-		add_blocks = ext4_ext_get_actual_len(ext_cur);
-		seq_blocks = 0;
-
+		o_start += cur_len;
+		d_start += cur_len;
+	repeat:
+		if (path) {
+			ext4_ext_drop_refs(path);
+			kfree(path);
+			path = NULL;
+		}
 	}
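+	/* Report the number of blocks actually exchanged */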
+	*moved_len = o_start - orig_blk;
+	if (*moved_len > len)
+		*moved_len = len;
+
 out:
 	if (*moved_len) {
 		ext4_discard_preallocations(orig_inode);
 		ext4_discard_preallocations(donor_inode);
 	}
 
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-	if (holecheck_path) {
-		ext4_ext_drop_refs(holecheck_path);
-		kfree(holecheck_path);
-	}
-	double_up_write_data_sem(orig_inode, donor_inode);
-	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
-
-	if (ret1)
-		return ret1;
-	else if (ret2)
-		return ret2;
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	up_write(&EXT4_I(orig_inode)->i_data_sem);
+	up_write(&EXT4_I(donor_inode)->i_data_sem);
+	up_write(&orig_inode->i_alloc_sem);
+	up_write(&donor_inode->i_alloc_sem);
+
 
-	return 0;
+	mext_inode_double_unlock(orig_inode, donor_inode);
+
+	return ret;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/namei.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/namei.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/namei.c	2014-12-12 23:29:12.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/namei.c	2015-01-21 12:02:53.182961992 +0300
@@ -1055,6 +1055,9 @@ static struct dentry *ext4_lookup(struct
 			} else {
 				return ERR_CAST(inode);
 			}
+		} else if (inode == EXT4_SB(inode->i_sb)->s_balloon_ino) {
+			iput(inode);
+			return ERR_PTR(-EPERM);
 		}
 	}
 	return d_splice_alias(inode, dentry);
@@ -1768,6 +1771,8 @@ retry:
 	ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
+	if (!err && S_ISREG(mode) && ext4_want_data_csum(dir))
+		ext4_start_data_csum(inode);
 	return err;
 }
 
@@ -1874,6 +1879,8 @@ out_clear_inode:
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
+	if (ext4_test_inode_state(dir, EXT4_STATE_CSUM))
+		ext4_save_dir_csum(inode);
 	unlock_new_inode(inode);
 out_stop:
 	ext4_journal_stop(handle);
@@ -2009,6 +2016,7 @@ int ext4_orphan_add(handle_t *handle, st
 		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
 			goto mem_insert;
 
+	EXT4_I(inode)->i_dq_cookie = DQUOT_ORPHAN_COOKIE(inode);
 	/* Insert this inode at the head of the on-disk orphan list... */
 	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
 	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
@@ -2203,6 +2211,10 @@ static int ext4_unlink(struct inode *dir
 	if (le32_to_cpu(de->inode) != inode->i_ino)
 		goto end_unlink;
 
+	retval = -EPERM;
+	if (inode == EXT4_SB(dir->i_sb)->s_balloon_ino)
+		goto end_unlink;
+
 	if (!inode->i_nlink) {
 		ext4_warning(inode->i_sb,
 			     "Deleting nonexistent file (%lu), %d",
@@ -2297,13 +2309,6 @@ static int ext4_link(struct dentry *old_
 	if (inode->i_nlink >= EXT4_LINK_MAX)
 		return -EMLINK;
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 */
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
@@ -2345,7 +2350,7 @@ static int ext4_rename(struct inode *old
 	struct inode *old_inode, *new_inode;
 	struct buffer_head *old_bh, *new_bh, *dir_bh;
 	struct ext4_dir_entry_2 *old_de, *new_de;
-	int retval, force_da_alloc = 0;
+	int retval;
 
 	old_bh = new_bh = dir_bh = NULL;
 
@@ -2353,14 +2358,6 @@ static int ext4_rename(struct inode *old
 	 * in separate transaction */
 	if (new_dentry->d_inode)
 		vfs_dq_init(new_dentry->d_inode);
-	handle = ext4_journal_start(old_dir, 2 *
-					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
-					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
-		ext4_handle_sync(handle);
 
 	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
@@ -2372,7 +2369,7 @@ static int ext4_rename(struct inode *old
 	old_inode = old_dentry->d_inode;
 	retval = -ENOENT;
 	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
-		goto end_rename;
+		goto out_release;
 
 	new_inode = new_dentry->d_inode;
 	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
@@ -2382,6 +2379,19 @@ static int ext4_rename(struct inode *old
 			new_bh = NULL;
 		}
 	}
+
+	if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC) && new_inode)
+		ext4_alloc_da_blocks(old_inode);
+
+	handle = ext4_journal_start(old_dir, 2 *
+					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+		ext4_handle_sync(handle);
+
 	if (S_ISDIR(old_inode->i_mode)) {
 		if (new_inode) {
 			retval = -ENOTEMPTY;
@@ -2485,18 +2495,15 @@ static int ext4_rename(struct inode *old
 		ext4_mark_inode_dirty(handle, new_inode);
 		if (!new_inode->i_nlink)
 			ext4_orphan_add(handle, new_inode);
-		if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
-			force_da_alloc = 1;
 	}
 	retval = 0;
 
 end_rename:
+	ext4_journal_stop(handle);
+out_release:
 	brelse(dir_bh);
 	brelse(old_bh);
 	brelse(new_bh);
-	ext4_journal_stop(handle);
-	if (retval == 0 && force_da_alloc)
-		ext4_alloc_da_blocks(old_inode);
 	return retval;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/resize.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/resize.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/resize.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/resize.c	2015-01-21 12:02:52.566978342 +0300
@@ -16,6 +16,25 @@
 
 #include "ext4_jbd2.h"
 
+int ext4_resize_begin(struct super_block *sb)
+{
+	int ret = 0;
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
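+	/* The RESIZING bit serializes resizers: only one may run at a time */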
+	if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags))
+		ret = -EBUSY;
+
+	return ret;
+}
+
+void ext4_resize_end(struct super_block *sb)
+{
+	clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
+	smp_mb__after_clear_bit();
+}
+
 #define outside(b, first, last)	((b) < (first) || (b) >= (last))
 #define inside(b, first, last)	((b) >= (first) && (b) < (last))
 
@@ -105,6 +124,185 @@ static int verify_group_input(struct sup
 	return err;
 }
 
+/*
+ * ext4_new_flex_group_data is used by the 64bit-resize interface to add a
+ * flex group at a time.
+ */
+struct ext4_new_flex_group_data {
+	struct ext4_new_group_data *groups;	/* new_group_data for groups
+						   in the flex group */
+	__u16 *bg_flags;			/* block group flags of groups
+						   in @groups */
+	ext4_group_t count;			/* number of groups in @groups
+						 */
+};
+
+/*
+ * alloc_flex_gd() allocates an ext4_new_flex_group_data with a size of
+ * @flexbg_size.
+ *
+ * Returns NULL on failure, otherwise the address of the allocated structure.
+ */
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+{
+	struct ext4_new_flex_group_data *flex_gd;
+
+	flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
+	if (flex_gd == NULL)
+		goto out3;
+
+	flex_gd->count = flexbg_size;
+
+	flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
+				  flexbg_size, GFP_NOFS);
+	if (flex_gd->groups == NULL)
+		goto out2;
+
+	flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
+	if (flex_gd->bg_flags == NULL)
+		goto out1;
+
+	return flex_gd;
+
+out1:
+	kfree(flex_gd->groups);
+out2:
+	kfree(flex_gd);
+out3:
+	return NULL;
+}
+
+static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
+{
+	kfree(flex_gd->bg_flags);
+	kfree(flex_gd->groups);
+	kfree(flex_gd);
+}
+
+/*
+ * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
+ * and inode tables for a flex group.
+ *
+ * This function is used by 64bit-resize.  Note that this function allocates
+ * group tables from the 1st group of groups contained by @flexgd, which may
+ * be a partial flex group.
+ *
+ * @sb: super block of fs to which the groups belongs
+ *
+ * Returns 0 on a successful allocation of the metadata blocks in the
+ * block group.
+ */
+static int ext4_alloc_group_tables(struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd,
+				int flexbg_size)
+{
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	ext4_fsblk_t start_blk;
+	ext4_fsblk_t last_blk;
+	ext4_group_t src_group;
+	ext4_group_t bb_index = 0;
+	ext4_group_t ib_index = 0;
+	ext4_group_t it_index = 0;
+	ext4_group_t group;
+	ext4_group_t last_group;
+	unsigned overhead;
+	__u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+
+	src_group = group_data[0].group;
+	last_group  = src_group + flex_gd->count - 1;
+
+	BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
+	       (last_group & ~(flexbg_size - 1))));
+next_group:
+	group = group_data[0].group;
+	if (src_group >= group_data[0].group + flex_gd->count)
+		return -ENOSPC;
+	start_blk = ext4_group_first_block_no(sb, src_group);
+	last_blk = start_blk + group_data[src_group - group].blocks_count;
+
+	overhead = ext4_bg_has_super(sb, src_group) ?
+		   (1 + ext4_bg_num_gdb(sb, src_group) +
+		    le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+
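+	/* Skip the backup superblock, GDT and reserved GDT blocks, if present */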
+	start_blk += overhead;
+
+	/* We collect as many contiguous blocks as possible. */
+	src_group++;
+	for (; src_group <= last_group; src_group++)
+		if (!ext4_bg_has_super(sb, src_group))
+			last_blk += group_data[src_group - group].blocks_count;
+		else
+			break;
+
+	/* Allocate block bitmaps */
+	for (; bb_index < flex_gd->count; bb_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[bb_index].block_bitmap = start_blk++;
+		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count--;
+		flex_gd->bg_flags[group] &= uninit_mask;
+	}
+
+	/* Allocate inode bitmaps */
+	for (; ib_index < flex_gd->count; ib_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[ib_index].inode_bitmap = start_blk++;
+		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count--;
+		flex_gd->bg_flags[group] &= uninit_mask;
+	}
+
+	/* Allocate inode tables */
+	for (; it_index < flex_gd->count; it_index++) {
+		unsigned int itb = EXT4_SB(sb)->s_itb_per_group;
+		ext4_fsblk_t next_group_start;
+		if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+			goto next_group;
+		group_data[it_index].inode_table = start_blk;
+		ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+		next_group_start = ext4_group_first_block_no(sb, group + 1);
+		group -= group_data[0].group;
+
+		if (start_blk + itb > next_group_start) {
+			flex_gd->bg_flags[group + 1] &= uninit_mask;
+			overhead = start_blk + itb - next_group_start;
+			group_data[group + 1].free_blocks_count -= overhead;
+			itb -= overhead;
+		}
+
+		group_data[group].free_blocks_count -= itb;
+		flex_gd->bg_flags[group] &= uninit_mask;
+
+		start_blk += EXT4_SB(sb)->s_itb_per_group;
+	}
+
+	if (test_opt(sb, DEBUG)) {
+		int i;
+		group = group_data[0].group;
+
+		printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
+		       "%d groups, flexbg size is %d:\n", flex_gd->count,
+		       flexbg_size);
+
+		for (i = 0; i < flex_gd->count; i++) {
+			printk(KERN_DEBUG "adding %s group %u: %u "
+			       "blocks (%d free)\n",
+			       ext4_bg_has_super(sb, group + i) ? "normal" :
+			       "no-super", group + i,
+			       group_data[i].blocks_count,
+			       group_data[i].free_blocks_count);
+		}
+	}
+	return 0;
+}
+
 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
 				  ext4_fsblk_t blk)
 {
@@ -132,8 +330,7 @@ static struct buffer_head *bclean(handle
  * If that fails, restart the transaction & regain write access for the
  * buffer head which is used for block_bitmap modifications.
  */
-static int extend_or_restart_transaction(handle_t *handle, int thresh,
-					 struct buffer_head *bh)
+static int extend_or_restart_transaction(handle_t *handle, int thresh)
 {
 	int err;
 
@@ -144,139 +341,259 @@ static int extend_or_restart_transaction
 	if (err < 0)
 		return err;
 	if (err) {
-		if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+		err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
+		if (err)
 			return err;
-		if ((err = ext4_journal_get_write_access(handle, bh)))
+	}
+
+	return 0;
+}
+
+/*
+ * set_flexbg_block_bitmap() mark @count blocks starting from @block used.
+ *
+ * Helper function for setup_new_flex_group_blocks() which set .
+ *
+ * @sb: super block
+ * @handle: journal handle
+ * @flex_gd: flex group data
+ */
+static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
+			struct ext4_new_flex_group_data *flex_gd,
+			ext4_fsblk_t block, ext4_group_t count)
+{
+	ext4_group_t count2;
+
+	ext4_debug("mark blocks [%llu/%u] used\n", block, count);
+	for (count2 = count; count > 0; count -= count2, block += count2) {
+		ext4_fsblk_t start;
+		struct buffer_head *bh;
+		ext4_group_t group;
+		int err;
+
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+		start = ext4_group_first_block_no(sb, group);
+		group -= flex_gd->groups[0].group;
+
+		count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
+		if (count2 > count)
+			count2 = count;
+
+		if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
+			BUG_ON(flex_gd->count > 1);
+			continue;
+		}
+
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			return err;
+
+		bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
+		if (!bh)
+			return -EIO;
+
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err)
+			return err;
+		ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
+			   block - start, count2);
+		mb_set_bits(bh->b_data, block - start, count2);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (unlikely(err))
 			return err;
+		brelse(bh);
 	}
 
 	return 0;
 }
 
 /*
- * Set up the block and inode bitmaps, and the inode table for the new group.
+ * Set up the block and inode bitmaps, and the inode table for the new groups.
  * This doesn't need to be part of the main transaction, since we are only
  * changing blocks outside the actual filesystem.  We still do journaling to
  * ensure the recovery is correct in case of a failure just after resize.
  * If any part of this fails, we simply abort the resize.
+ *
+ * setup_new_flex_group_blocks handles a flex group as follows:
+ *  1. copy super block and GDT, and initialize group tables if necessary.
+ *     In this step, we only set bits in block bitmaps for blocks taken by
+ *     the super block and GDT.
+ *  2. allocate group tables in block bitmaps, that is, set bits in block
+ *     bitmap for blocks taken by group tables.
  */
-static int setup_new_group_blocks(struct super_block *sb,
-				  struct ext4_new_group_data *input)
+static int setup_new_flex_group_blocks(struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
 {
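+	/* Per-group block counts: block bitmap, inode bitmap, inode table */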
+	int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
+	ext4_fsblk_t start;
+	ext4_fsblk_t block;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
-	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
-		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
-	unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
-	struct buffer_head *bh;
+	struct ext4_super_block *es = sbi->s_es;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	__u16 *bg_flags = flex_gd->bg_flags;
 	handle_t *handle;
-	ext4_fsblk_t block;
-	ext4_grpblk_t bit;
-	int i;
-	int err = 0, err2;
+	ext4_group_t group, count;
+	struct buffer_head *bh = NULL;
+	int reserved_gdb, i, j, err = 0, err2;
+
+	BUG_ON(!flex_gd->count || !group_data ||
+	       group_data[0].group != sbi->s_groups_count);
+
+	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
 
 	/* This transaction may be extended/restarted along the way */
 	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
-
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
-	mutex_lock(&sbi->s_resize_lock);
-	if (input->group != sbi->s_groups_count) {
-		err = -EBUSY;
-		goto exit_journal;
-	}
-
-	if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
-		err = PTR_ERR(bh);
-		goto exit_journal;
-	}
-
-	if (ext4_bg_has_super(sb, input->group)) {
-		ext4_debug("mark backup superblock %#04llx (+0)\n", start);
-		ext4_set_bit(0, bh->b_data);
-	}
-
-	/* Copy all of the GDT blocks into the backup in this group */
-	for (i = 0, bit = 1, block = start + 1;
-	     i < gdblocks; i++, block++, bit++) {
-		struct buffer_head *gdb;
-
-		ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
+	group = group_data[0].group;
+	for (i = 0; i < flex_gd->count; i++, group++) {
+		unsigned long gdblocks;
+
+		gdblocks = ext4_bg_num_gdb(sb, group);
+		start = ext4_group_first_block_no(sb, group);
+
+		/* Copy all of the GDT blocks into the backup in this group */
+		for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
+			struct buffer_head *gdb;
+
+			ext4_debug("update backup group %#04llx\n", block);
+			err = extend_or_restart_transaction(handle, 1);
+			if (err)
+				goto out;
+
+			gdb = sb_getblk(sb, block);
+			if (!gdb) {
+				err = -EIO;
+				goto out;
+			}
+
+			err = ext4_journal_get_write_access(handle, gdb);
+			if (err) {
+				brelse(gdb);
+				goto out;
+			}
+			memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
+			       gdb->b_size);
+			set_buffer_uptodate(gdb);
+
+			err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+			if (unlikely(err)) {
+				brelse(gdb);
+				goto out;
+			}
+			brelse(gdb);
+		}
 
-		if ((err = extend_or_restart_transaction(handle, 1, bh)))
-			goto exit_bh;
+		/* Zero out all of the reserved backup group descriptor
+		 * table blocks
+		 */
+		if (ext4_bg_has_super(sb, group)) {
+			err = sb_issue_zeroout(sb, gdblocks + start + 1,
+					reserved_gdb, GFP_NOFS);
+			if (err)
+				goto out;
+		}
 
-		gdb = sb_getblk(sb, block);
-		if (!gdb) {
-			err = -EIO;
-			goto exit_bh;
+		/* Initialize group tables of the group @group */
+		if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
+			goto handle_bb;
+
+		/* Zero out all of the inode table blocks */
+		block = group_data[i].inode_table;
+		ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
+			   block, sbi->s_itb_per_group);
+		err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
+				       GFP_NOFS);
+		if (err)
+			goto out;
+
+handle_bb:
+		if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
+			goto handle_ib;
+
+		/* Initialize block bitmap of the @group */
+		block = group_data[i].block_bitmap;
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			goto out;
+
+		bh = bclean(handle, sb, block);
+		if (IS_ERR(bh)) {
+			err = PTR_ERR(bh);
+			goto out;
 		}
-		if ((err = ext4_journal_get_write_access(handle, gdb))) {
-			brelse(gdb);
-			goto exit_bh;
+		if (ext4_bg_has_super(sb, group)) {
+			ext4_debug("mark backup superblock %#04llx (+0)\n",
+				   start);
+			mb_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
+						     1);
 		}
-		lock_buffer(gdb);
-		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
-		set_buffer_uptodate(gdb);
-		unlock_buffer(gdb);
-		ext4_handle_dirty_metadata(handle, NULL, gdb);
-		ext4_set_bit(bit, bh->b_data);
-		brelse(gdb);
-	}
-
-	/* Zero out all of the reserved backup group descriptor table blocks */
-	ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
-			block, sbi->s_itb_per_group);
-	err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
-			       GFP_NOFS);
-	if (err)
-		goto exit_bh;
-	for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
-		ext4_set_bit(bit, bh->b_data);
+		mark_bitmap_end(group_data[i].blocks_count,
+				     sb->s_blocksize * 8, bh->b_data);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			goto out;
+		brelse(bh);
 
+handle_ib:
+		if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
+			continue;
+
+		/* Initialize inode bitmap of the @group */
+		block = group_data[i].inode_bitmap;
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			goto out;
+		/* Mark unused entries in inode bitmap used */
+		bh = bclean(handle, sb, block);
+		if (IS_ERR(bh)) {
+			err = PTR_ERR(bh);
+			goto out;
+		}
 
-	ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
-		   input->block_bitmap - start);
-	ext4_set_bit(input->block_bitmap - start, bh->b_data);
-	ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
-		   input->inode_bitmap - start);
-	ext4_set_bit(input->inode_bitmap - start, bh->b_data);
-
-	/* Zero out all of the inode table blocks */
-	block = input->inode_table;
-	ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
-			block, sbi->s_itb_per_group);
-	err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
-	if (err)
-		goto exit_bh;
-	for (i = 0, bit = input->inode_table - start;
-	     i < sbi->s_itb_per_group; i++, bit++)
-		ext4_set_bit(bit, bh->b_data);
+		mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+				     sb->s_blocksize * 8, bh->b_data);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			goto out;
+		brelse(bh);
+	}
+	bh = NULL;
 
-	if ((err = extend_or_restart_transaction(handle, 2, bh)))
-		goto exit_bh;
+	/* Mark group tables in block bitmap */
+	for (j = 0; j < GROUP_TABLE_COUNT; j++) {
+		count = group_table_count[j];
+		start = (&group_data[0].block_bitmap)[j];
+		block = start;
+		for (i = 1; i < flex_gd->count; i++) {
+			block += group_table_count[j];
+			if (block == (&group_data[i].block_bitmap)[j]) {
+				count += group_table_count[j];
+				continue;
+			}
+			err = set_flexbg_block_bitmap(sb, handle,
+						flex_gd, start, count);
+			if (err)
+				goto out;
+			count = group_table_count[j];
+			start = (&group_data[i].block_bitmap)[j];
+			block = start;
+		}
 
-	mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
-	ext4_handle_dirty_metadata(handle, NULL, bh);
-	brelse(bh);
-	/* Mark unused entries in inode bitmap used */
-	ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
-		   input->inode_bitmap, input->inode_bitmap - start);
-	if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
-		err = PTR_ERR(bh);
-		goto exit_journal;
+		if (count) {
+			err = set_flexbg_block_bitmap(sb, handle,
+						flex_gd, start, count);
+			if (err)
+				goto out;
+		}
 	}
 
-	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
-			bh->b_data);
-	ext4_handle_dirty_metadata(handle, NULL, bh);
-exit_bh:
+out:
 	brelse(bh);
-
-exit_journal:
-	mutex_unlock(&sbi->s_resize_lock);
-	if ((err2 = ext4_journal_stop(handle)) && !err)
+	err2 = ext4_journal_stop(handle);
+	if (err2 && !err)
 		err = err2;
 
 	return err;
@@ -324,10 +641,10 @@ static unsigned ext4_list_backups(struct
  * groups in current filesystem that have BACKUPS, or -ve error code.
  */
 static int verify_reserved_gdb(struct super_block *sb,
+			       ext4_group_t end,
 			       struct buffer_head *primary)
 {
 	const ext4_fsblk_t blk = primary->b_blocknr;
-	const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
 	unsigned three = 1;
 	unsigned five = 5;
 	unsigned seven = 7;
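/*
 * Illustration, not part of the patch: with sparse_super, superblock/GDT
 * backups live in group 1 and in groups that are powers of 3, 5 and 7 --
 * the three/five/seven walk used by verify_reserved_gdb() and
 * ext4_list_backups().  A standalone user-space sketch of that
 * enumeration; the limit of 1000 groups is arbitrary.
 */
#include <stdio.h>

int main(void)
{
	unsigned three = 1, five = 5, seven = 7, end = 1000;
	unsigned *min;

	for (;;) {
		/* the next backup group is the smallest of the three walks */
		min = three < five ? &three : &five;
		if (seven < *min)
			min = &seven;
		if (*min >= end)
			break;
		printf("%u ", *min);	/* 1 3 5 7 9 25 27 49 81 ... */
		*min *= (min == &three ? 3 : min == &five ? 5 : 7);
	}
	printf("\n");
	return 0;
}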
@@ -367,15 +684,15 @@ static int verify_reserved_gdb(struct su
  * fail once we start modifying the data on disk, because JBD has no rollback.
  */
 static int add_new_gdb(handle_t *handle, struct inode *inode,
-		       struct ext4_new_group_data *input,
-		       struct buffer_head **primary)
+		       ext4_group_t group)
 {
 	struct super_block *sb = inode->i_sb;
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-	unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
+	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
 	ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
 	struct buffer_head **o_group_desc, **n_group_desc;
 	struct buffer_head *dind;
+	struct buffer_head *gdb_bh;
 	int gdbackups;
 	struct ext4_iloc iloc;
 	__le32 *data;
@@ -398,11 +715,11 @@ static int add_new_gdb(handle_t *handle,
 		return -EPERM;
 	}
 
-	*primary = sb_bread(sb, gdblock);
-	if (!*primary)
+	gdb_bh = sb_bread(sb, gdblock);
+	if (!gdb_bh)
 		return -EIO;
 
-	if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
+	if ((gdbackups = verify_reserved_gdb(sb, group, gdb_bh)) < 0) {
 		err = gdbackups;
 		goto exit_bh;
 	}
@@ -417,7 +734,7 @@ static int add_new_gdb(handle_t *handle,
 	data = (__le32 *)dind->b_data;
 	if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
 		ext4_warning(sb, "new group %u GDT block %llu not reserved",
-			     input->group, gdblock);
+			     group, gdblock);
 		err = -EINVAL;
 		goto exit_dind;
 	}
@@ -425,7 +742,7 @@ static int add_new_gdb(handle_t *handle,
 	if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
 		goto exit_dind;
 
-	if ((err = ext4_journal_get_write_access(handle, *primary)))
+	if ((err = ext4_journal_get_write_access(handle, gdb_bh)))
 		goto exit_sbh;
 
 	if ((err = ext4_journal_get_write_access(handle, dind)))
@@ -458,13 +775,13 @@ static int add_new_gdb(handle_t *handle,
 	brelse(dind);
 	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
 	ext4_mark_iloc_dirty(handle, inode, &iloc);
-	memset((*primary)->b_data, 0, sb->s_blocksize);
-	ext4_handle_dirty_metadata(handle, NULL, *primary);
+	memset(gdb_bh->b_data, 0, sb->s_blocksize);
+	ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
 
 	o_group_desc = EXT4_SB(sb)->s_group_desc;
 	memcpy(n_group_desc, o_group_desc,
 	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
-	n_group_desc[gdb_num] = *primary;
+	n_group_desc[gdb_num] = gdb_bh;
 	EXT4_SB(sb)->s_group_desc = n_group_desc;
 	EXT4_SB(sb)->s_gdb_count++;
 	kfree(o_group_desc);
@@ -486,7 +803,7 @@ exit_sbh:
 exit_dind:
 	brelse(dind);
 exit_bh:
-	brelse(*primary);
+	brelse(gdb_bh);
 
 	ext4_debug("leaving with error %d\n", err);
 	return err;
@@ -506,7 +823,7 @@ exit_bh:
  * backup GDT blocks are stored in their reserved primary GDT block.
  */
 static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
-			      struct ext4_new_group_data *input)
+			      ext4_group_t group)
 {
 	struct super_block *sb = inode->i_sb;
 	int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
@@ -550,7 +867,8 @@ static int reserve_backup_gdb(handle_t *
 			err = -EIO;
 			goto exit_bh;
 		}
-		if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
+		if ((gdbackups = verify_reserved_gdb(sb, group,
+						     primary[res])) < 0) {
 			brelse(primary[res]);
 			err = gdbackups;
 			goto exit_bh;
@@ -577,7 +895,7 @@ static int reserve_backup_gdb(handle_t *
 	 * Finally we can add each of the reserved backup GDT blocks from
 	 * the new group to its reserved primary GDT block.
 	 */
-	blk = input->group * EXT4_BLOCKS_PER_GROUP(sb);
+	blk = group * EXT4_BLOCKS_PER_GROUP(sb);
 	for (i = 0; i < reserved_gdb; i++) {
 		int err2;
 		data = (__le32 *)primary[i]->b_data;
@@ -691,6 +1009,352 @@ exit_err:
 	}
 }
 
+/*
+ * ext4_add_new_descs() adds @count group descriptors of groups
+ * starting at @group
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @group: the group no. of the first group desc to be added
+ * @resize_inode: the resize inode
+ * @count: number of group descriptors to be added
+ */
+static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
+			      ext4_group_t group, struct inode *resize_inode,
+			      ext4_group_t count)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head *gdb_bh;
+	int i, gdb_off, gdb_num, err = 0;
+
+	for (i = 0; i < count; i++, group++) {
+		int reserved_gdb = ext4_bg_has_super(sb, group) ?
+			le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+
+		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * We will only either add reserved group blocks to a backup group
+		 * or remove reserved blocks for the first group in a new group block.
+		 * Doing both would mean more complex code, and sane people don't
+		 * use non-sparse filesystems anymore.  This is already checked above.
+		 */
+		if (gdb_off) {
+			gdb_bh = sbi->s_group_desc[gdb_num];
+			err = ext4_journal_get_write_access(handle, gdb_bh);
+
+			if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
+				err = reserve_backup_gdb(handle, resize_inode, group);
+		} else
+			err = add_new_gdb(handle, resize_inode, group);
+
+		if (err)
+			break;
+	}
+	return err;
+}
+
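/*
 * Illustration, not part of the patch: how ext4_add_new_descs() splits a
 * group number into a descriptor-block index (gdb_num) and a slot inside
 * that block (gdb_off).  Standalone sketch; 128 is an assumed value of
 * EXT4_DESC_PER_BLOCK(), e.g. a 4096-byte block with 32-byte descriptors.
 */
#include <stdio.h>

#define DESC_PER_BLOCK 128	/* assumed: blocksize / descriptor size */

int main(void)
{
	unsigned group = 300;
	unsigned gdb_num = group / DESC_PER_BLOCK;	/* which GDT block */
	unsigned gdb_off = group % DESC_PER_BLOCK;	/* slot inside it  */

	/* group 300 -> GDT block 2, slot 44.  gdb_off == 0 would mean the
	 * group opens a fresh GDT block, the add_new_gdb() case above. */
	printf("group %u: gdb_num=%u gdb_off=%u\n", group, gdb_num, gdb_off);
	return 0;
}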
+/*
+ * ext4_setup_new_descs() will set up the group descriptors of a flex bg
+ */
+static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_new_group_data	*group_data = flex_gd->groups;
+	struct ext4_group_desc		*gdp;
+	struct ext4_sb_info		*sbi = EXT4_SB(sb);
+	struct buffer_head		*gdb_bh;
+	ext4_group_t			group;
+	__u16				*bg_flags = flex_gd->bg_flags;
+	int				i, gdb_off, gdb_num, err = 0;
+
+	for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
+		group = group_data->group;
+
+		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * get_write_access() has been called on gdb_bh by ext4_add_new_descs().
+		 */
+		gdb_bh = sbi->s_group_desc[gdb_num];
+		/* Update group descriptor block for new group */
+		gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
+						 gdb_off * EXT4_DESC_SIZE(sb));
+
+		memset(gdp, 0, EXT4_DESC_SIZE(sb));
+		ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
+		ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
+		ext4_inode_table_set(sb, gdp, group_data->inode_table);
+		ext4_free_blks_set(sb, gdp, group_data->free_blocks_count);
+		ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+			ext4_itable_unused_set(sb, gdp,
+					       EXT4_INODES_PER_GROUP(sb));
+		gdp->bg_flags = cpu_to_le16(*bg_flags);
+		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+		if (unlikely(err)) {
+			ext4_std_error(sb, err);
+			break;
+		}
+
+		/*
+		 * We can allocate memory for mb_alloc based on the new group
+		 * descriptor
+		 */
+		err = ext4_mb_add_groupinfo(sb, group, gdp);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/*
+ * ext4_update_super() updates the super block so that the newly added
+ * groups can be seen by the filesystem.
+ *
+ * @sb: super block
+ * @flex_gd: new added groups
+ */
+static void ext4_update_super(struct super_block *sb,
+			     struct ext4_new_flex_group_data *flex_gd)
+{
+	ext4_fsblk_t blocks_count = 0;
+	ext4_fsblk_t free_blocks = 0;
+	ext4_fsblk_t reserved_blocks = 0;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int i;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+	/*
+	 * Make the new blocks and inodes valid next.  We do this before
+	 * increasing the group count so that once the group is enabled,
+	 * all of its blocks and inodes are already valid.
+	 *
+	 * We always allocate group-by-group, then block-by-block or
+	 * inode-by-inode within a group, so enabling these
+	 * blocks/inodes before the group is live won't actually let us
+	 * allocate the new space yet.
+	 */
+	for (i = 0; i < flex_gd->count; i++) {
+		blocks_count += group_data[i].blocks_count;
+		free_blocks += group_data[i].free_blocks_count;
+	}
+
+	reserved_blocks = ext4_r_blocks_count(es) * 100;
+	do_div(reserved_blocks, ext4_blocks_count(es));
+	reserved_blocks *= blocks_count;
+	do_div(reserved_blocks, 100);
+
+	ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+	ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
+	le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+		     flex_gd->count);
+	le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+		     flex_gd->count);
+
+	/*
+	 * We need to protect s_groups_count against other CPUs seeing
+	 * inconsistent state in the superblock.
+	 *
+	 * The precise rules we use are:
+	 *
+	 * * Writers must perform a smp_wmb() after updating all
+	 *   dependent data and before modifying the groups count
+	 *
+	 * * Readers must perform an smp_rmb() after reading the groups
+	 *   count and before reading any dependent data.
+	 *
+	 * NB. These rules can be relaxed when checking the group count
+	 * while freeing data, as we can only allocate from a block
+	 * group after serialising against the group count, and we can
+	 * only then free after serialising in turn against that
+	 * allocation.
+	 */
+	smp_wmb();
+
+	/* Update the global fs size fields */
+	sbi->s_groups_count += flex_gd->count;
+
+	/* Update the reserved block counts only once the new group is
+	 * active. */
+	ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
+				reserved_blocks);
+
+	/* Update the free space counts */
+	percpu_counter_add(&sbi->s_freeblocks_counter, free_blocks);
+	percpu_counter_add(&sbi->s_freeinodes_counter,
+			   EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+				      EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, group_data[0].group);
+		atomic_add(free_blocks,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
+		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+	}
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: added group %u:"
+		       "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
+		       blocks_count, free_blocks, reserved_blocks);
+}
+
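/*
 * Illustration, not part of the patch: the proportional reserved-blocks
 * update in ext4_update_super() above.  In the kernel, do_div() divides
 * a 64-bit value in place and returns the remainder; plain division is
 * used here to show the same arithmetic.  The figures are made up: a
 * filesystem keeping 5% reserved and growing by 100000 blocks gains
 * 5000 reserved blocks.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long r_blocks = 50000;	/* currently reserved   */
	unsigned long long total = 1000000;	/* current size, blocks */
	unsigned long long added = 100000;	/* blocks being added   */
	unsigned long long reserved;

	reserved = r_blocks * 100;
	reserved /= total;		/* reserved percentage: 5 */
	reserved *= added;
	reserved /= 100;

	printf("extra reserved blocks: %llu\n", reserved);	/* 5000 */
	return 0;
}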
+/* Add a flex group to an fs. Ensure we handle all possible error conditions
+ * _before_ we start modifying the filesystem, because we cannot abort the
+ * transaction and not have it write the data to disk.
+ */
+static int ext4_flex_group_add(struct super_block *sb,
+			       struct inode *resize_inode,
+			       struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t o_blocks_count;
+	ext4_grpblk_t last;
+	ext4_group_t group;
+	handle_t *handle;
+	unsigned reserved_gdb;
+	int err = 0, err2 = 0, credit;
+
+	BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
+
+	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+	o_blocks_count = ext4_blocks_count(es);
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+	BUG_ON(last);
+
+	err = setup_new_flex_group_blocks(sb, flex_gd);
+	if (err)
+		goto exit;
+	/*
+	 * We will always be modifying at least the superblock and GDT
+	 * block.  If we are adding a group past the last current GDT block,
+	 * we will also modify the inode and the dindirect block.  If we
+	 * are adding a group with superblock/GDT backups, we will also
+	 * modify each of the reserved GDT dindirect blocks.
+	 */
+	credit = flex_gd->count * 4 + reserved_gdb;
+	handle = ext4_journal_start_sb(sb, credit);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit;
+	}
+
+	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+	if (err)
+		goto exit_journal;
+
+	group = flex_gd->groups[0].group;
+	BUG_ON(group != EXT4_SB(sb)->s_groups_count);
+	err = ext4_add_new_descs(handle, sb, group,
+				resize_inode, flex_gd->count);
+	if (err)
+		goto exit_journal;
+
+	err = ext4_setup_new_descs(handle, sb, flex_gd);
+	if (err)
+		goto exit_journal;
+
+	ext4_update_super(sb, flex_gd);
+	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+exit_journal:
+	err2 = ext4_journal_stop(handle);
+	if (!err)
+		err = err2;
+
+	if (!err) {
+		int i;
+		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+			       sizeof(struct ext4_super_block));
+		for (i = 0; i < flex_gd->count; i++, group++) {
+			struct buffer_head *gdb_bh;
+			int gdb_num;
+			gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+			gdb_bh = sbi->s_group_desc[gdb_num];
+			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
+				       gdb_bh->b_size);
+		}
+	}
+exit:
+	return err;
+}
+
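/*
 * Illustration, not part of the patch: the credit estimate used by
 * ext4_flex_group_add() above.  Per the comment in the function, each
 * new group may cost up to four modified buffers, plus one per reserved
 * GDT block; the per-group breakdown is my gloss, the formula is the
 * one in the code.  Worked numbers for a 16-group flex group on a
 * filesystem with 256 reserved GDT blocks.
 */
#include <stdio.h>

int main(void)
{
	unsigned count = 16;		/* flex_gd->count            */
	unsigned reserved_gdb = 256;	/* es->s_reserved_gdt_blocks */
	unsigned credit = count * 4 + reserved_gdb;

	printf("journal credits requested: %u\n", credit);	/* 320 */
	return 0;
}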
+static int ext4_setup_next_flex_gd(struct super_block *sb,
+				    struct ext4_new_flex_group_data *flex_gd,
+				    ext4_fsblk_t n_blocks_count,
+				    unsigned long flexbg_size)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	ext4_fsblk_t o_blocks_count;
+	ext4_group_t n_group;
+	ext4_group_t group;
+	ext4_group_t last_group;
+	ext4_grpblk_t last;
+	ext4_grpblk_t blocks_per_group;
+	unsigned long i;
+
+	blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
+
+	o_blocks_count = ext4_blocks_count(es);
+
+	if (o_blocks_count == n_blocks_count)
+		return 0;
+
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+	BUG_ON(last);
+	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
+
+	last_group = group | (flexbg_size - 1);
+	if (last_group > n_group)
+		last_group = n_group;
+
+	flex_gd->count = last_group - group + 1;
+
+	for (i = 0; i < flex_gd->count; i++) {
+		int overhead;
+
+		group_data[i].group = group + i;
+		group_data[i].blocks_count = blocks_per_group;
+		overhead = ext4_bg_has_super(sb, group + i) ?
+			   (1 + ext4_bg_num_gdb(sb, group + i) +
+			    le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+		group_data[i].free_blocks_count = blocks_per_group - overhead;
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+			flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
+					       EXT4_BG_INODE_UNINIT;
+		else
+			flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
+	}
+
+	if (last_group == n_group &&
+	    EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+		/* We need to initialize the block bitmap of the last group. */
+		flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
+
+	if ((last_group == n_group) && (last != blocks_per_group - 1)) {
+		group_data[i - 1].blocks_count = last + 1;
+		group_data[i - 1].free_blocks_count -= blocks_per_group-
+					last - 1;
+	}
+
+	return 1;
+}
+
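/*
 * Illustration, not part of the patch: ext4_setup_next_flex_gd() finds
 * the last group of the current flex group with a bitwise OR.  Because
 * flexbg_size is a power of two, OR-ing in (flexbg_size - 1) fills the
 * low bits, i.e. rounds up to the end of the flex group; the result is
 * then clamped to the last group of the resized filesystem.
 */
#include <stdio.h>

int main(void)
{
	unsigned long flexbg_size = 16;	/* 1 << s_log_groups_per_flex */
	unsigned group = 35;		/* first group to be added    */
	unsigned n_group = 100;		/* last group after resize    */
	unsigned last_group = group | (flexbg_size - 1);

	if (last_group > n_group)
		last_group = n_group;

	/* 35 | 15 == 47: groups 35..47 complete flex group number 2 */
	printf("adding groups %u..%u (count %u)\n",
	       group, last_group, last_group - group + 1);
	return 0;
}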
 /* Add group descriptor data to an existing or new group descriptor block.
  * Ensure we handle all possible error conditions _before_ we start modifying
  * the filesystem, because we cannot abort the transaction and not have it
@@ -706,16 +1370,15 @@ exit_err:
  */
 int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 {
+	struct ext4_new_flex_group_data flex_gd;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
 		le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
-	struct buffer_head *primary = NULL;
-	struct ext4_group_desc *gdp;
 	struct inode *inode = NULL;
-	handle_t *handle;
 	int gdb_off, gdb_num;
-	int err, err2;
+	int err;
+	__u16 bg_flags = 0;
 
 	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
 	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
@@ -753,178 +1416,67 @@ int ext4_group_add(struct super_block *s
 		}
 	}
 
+	err = verify_group_input(sb, input);
+	if (err)
+		goto out;
 
-	if ((err = verify_group_input(sb, input)))
-		goto exit_put;
+	flex_gd.count = 1;
+	flex_gd.groups = input;
+	flex_gd.bg_flags = &bg_flags;
+	err = ext4_flex_group_add(sb, inode, &flex_gd);
+out:
+	iput(inode);
+	return err;
+} /* ext4_group_add */
 
-	if ((err = setup_new_group_blocks(sb, input)))
-		goto exit_put;
+/*
+ * Extend a group without checking, assuming that the checking has been done.
+ */
+static int ext4_group_extend_no_check(struct super_block *sb,
+				      ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	handle_t *handle;
+	int err = 0, err2;
 
-	/*
-	 * We will always be modifying at least the superblock and a GDT
-	 * block.  If we are adding a group past the last current GDT block,
-	 * we will also modify the inode and the dindirect block.  If we
-	 * are adding a group with superblock/GDT backups  we will also
-	 * modify each of the reserved GDT dindirect blocks.
+	/* We will update the superblock, one block bitmap, and
+	 * one group descriptor via ext4_add_groupblocks().
 	 */
-	handle = ext4_journal_start_sb(sb,
-				       ext4_bg_has_super(sb, input->group) ?
-				       3 + reserved_gdb : 4);
+	handle = ext4_journal_start_sb(sb, 3);
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
-		goto exit_put;
-	}
-
-	mutex_lock(&sbi->s_resize_lock);
-	if (input->group != sbi->s_groups_count) {
-		ext4_warning(sb, "multiple resizers run on filesystem!");
-		err = -EBUSY;
-		goto exit_journal;
+		ext4_warning(sb, "error %d on journal start", err);
+		return err;
 	}
 
-	if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
-		goto exit_journal;
-
-        /*
-         * We will only either add reserved group blocks to a backup group
-         * or remove reserved blocks for the first group in a new group block.
-         * Doing both would be mean more complex code, and sane people don't
-         * use non-sparse filesystems anymore.  This is already checked above.
-         */
-	if (gdb_off) {
-		primary = sbi->s_group_desc[gdb_num];
-		if ((err = ext4_journal_get_write_access(handle, primary)))
-			goto exit_journal;
-
-		if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) &&
-		    (err = reserve_backup_gdb(handle, inode, input)))
-			goto exit_journal;
-	} else if ((err = add_new_gdb(handle, inode, input, &primary)))
-		goto exit_journal;
-
-        /*
-         * OK, now we've set up the new group.  Time to make it active.
-         *
-         * We do not lock all allocations via s_resize_lock
-         * so we have to be safe wrt. concurrent accesses the group
-         * data.  So we need to be careful to set all of the relevant
-         * group descriptor data etc. *before* we enable the group.
-         *
-         * The key field here is sbi->s_groups_count: as long as
-         * that retains its old value, nobody is going to access the new
-         * group.
-         *
-         * So first we update all the descriptor metadata for the new
-         * group; then we update the total disk blocks count; then we
-         * update the groups count to enable the group; then finally we
-         * update the free space counts so that the system can start
-         * using the new disk blocks.
-         */
-
-	/* Update group descriptor block for new group */
-	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
-					 gdb_off * EXT4_DESC_SIZE(sb));
-
-	memset(gdp, 0, EXT4_DESC_SIZE(sb));
-	ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
-	ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
-	ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
-	ext4_free_blks_set(sb, gdp, input->free_blocks_count);
-	ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
-	gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
-
-	/*
-	 * We can allocate memory for mb_alloc based on the new group
-	 * descriptor
-	 */
-	err = ext4_mb_add_groupinfo(sb, input->group, gdp);
-	if (err)
-		goto exit_journal;
-
-	/*
-	 * Make the new blocks and inodes valid next.  We do this before
-	 * increasing the group count so that once the group is enabled,
-	 * all of its blocks and inodes are already valid.
-	 *
-	 * We always allocate group-by-group, then block-by-block or
-	 * inode-by-inode within a group, so enabling these
-	 * blocks/inodes before the group is live won't actually let us
-	 * allocate the new space yet.
-	 */
-	ext4_blocks_count_set(es, ext4_blocks_count(es) +
-		input->blocks_count);
-	le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb));
-
-	/*
-	 * We need to protect s_groups_count against other CPUs seeing
-	 * inconsistent state in the superblock.
-	 *
-	 * The precise rules we use are:
-	 *
-	 * * Writers of s_groups_count *must* hold s_resize_lock
-	 * AND
-	 * * Writers must perform a smp_wmb() after updating all dependent
-	 *   data and before modifying the groups count
-	 *
-	 * * Readers must hold s_resize_lock over the access
-	 * OR
-	 * * Readers must perform an smp_rmb() after reading the groups count
-	 *   and before reading any dependent data.
-	 *
-	 * NB. These rules can be relaxed when checking the group count
-	 * while freeing data, as we can only allocate from a block
-	 * group after serialising against the group count, and we can
-	 * only then free after serialising in turn against that
-	 * allocation.
-	 */
-	smp_wmb();
-
-	/* Update the global fs size fields */
-	sbi->s_groups_count++;
-	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
-			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
-
-	ext4_handle_dirty_metadata(handle, NULL, primary);
-
-	/* Update the reserved block counts only once the new group is
-	 * active. */
-	ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
-		input->reserved_blocks);
-
-	/* Update the free space counts */
-	percpu_counter_add(&sbi->s_freeblocks_counter,
-			   input->free_blocks_count);
-	percpu_counter_add(&sbi->s_freeinodes_counter,
-			   EXT4_INODES_PER_GROUP(sb));
-
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
-	    sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group;
-		flex_group = ext4_flex_group(sbi, input->group);
-		atomic_add(input->free_blocks_count,
-			   &sbi->s_flex_groups[flex_group].free_blocks);
-		atomic_add(EXT4_INODES_PER_GROUP(sb),
-			   &sbi->s_flex_groups[flex_group].free_inodes);
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (err) {
+		ext4_warning(sb, "error %d on journal write access", err);
+		goto errout;
 	}
 
-	ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
-	sb->s_dirt = 1;
-
-exit_journal:
-	mutex_unlock(&sbi->s_resize_lock);
-	if ((err2 = ext4_journal_stop(handle)) && !err)
+	ext4_blocks_count_set(es, o_blocks_count + add);
+	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+	/* We add the blocks to the bitmap and set the group's need-init bit */
+	ext4_add_groupblocks(handle, sb, o_blocks_count, add);
+	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+errout:
+	err2 = ext4_journal_stop(handle);
+	if (err2 && !err)
 		err = err2;
+
 	if (!err) {
-		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+		if (test_opt(sb, DEBUG))
+			printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
+			       "blocks\n", ext4_blocks_count(es));
+		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
 			       sizeof(struct ext4_super_block));
-		update_backups(sb, primary->b_blocknr, primary->b_data,
-			       primary->b_size);
 	}
-exit_put:
-	iput(inode);
 	return err;
-} /* ext4_group_add */
+}
 
 /*
  * Extend the filesystem to the new number of blocks specified.  This entry
@@ -944,13 +1496,9 @@ int ext4_group_extend(struct super_block
 	ext4_grpblk_t last;
 	ext4_grpblk_t add;
 	struct buffer_head *bh;
-	handle_t *handle;
 	int err;
 	ext4_group_t group;
 
-	/* We don't need to worry about locking wrt other resizers just
-	 * yet: we're going to revalidate es->s_blocks_count after
-	 * taking the s_resize_lock below. */
 	o_blocks_count = ext4_blocks_count(es);
 	o_groups_count = EXT4_SB(sb)->s_groups_count;
 
@@ -972,7 +1520,7 @@ int ext4_group_extend(struct super_block
 
 	if (n_blocks_count < o_blocks_count) {
 		ext4_warning(sb, "can't shrink FS - resize aborted");
-		return -EBUSY;
+		return -EINVAL;
 	}
 
 	/* Handle the remaining blocks in the last group only. */
@@ -1005,50 +1553,122 @@ int ext4_group_extend(struct super_block
 	}
 	brelse(bh);
 
-	/* We will update the superblock, one block bitmap, and
-	 * one group descriptor via ext4_free_blocks().
-	 */
-	handle = ext4_journal_start_sb(sb, 3);
-	if (IS_ERR(handle)) {
-		err = PTR_ERR(handle);
-		ext4_warning(sb, "error %d on journal start", err);
-		goto exit_put;
+	err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+	return err;
+} /* ext4_group_extend */
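/*
 * Illustration, not part of the patch: ext4_group_extend() only fills
 * out the tail of the last existing group; whole new groups are the
 * business of ext4_group_add()/ext4_flex_group_add().  A worked example
 * of the "remaining blocks in the last group" arithmetic, assuming
 * 32768 blocks per group and a first data block of 0.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long bpg = 32768;		/* EXT4_BLOCKS_PER_GROUP() */
	unsigned long long o_blocks = 100000;	/* current size            */
	unsigned long long n_blocks = 120000;	/* requested size          */
	unsigned long long last = o_blocks % bpg;	/* offset in last group */
	unsigned long long add = bpg - last;	/* room left in that group */

	if (o_blocks + add > n_blocks)
		add = n_blocks - o_blocks;	/* clamp to the request */

	/* last = 1696, room = 31072, clamped add = 20000 */
	printf("extending last group by %llu blocks\n", add);
	return 0;
}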
+
+/*
+ * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
+ *
+ * @sb: super block of the fs to be resized
+ * @n_blocks_count: the number of blocks residing in the resized fs
+ */
+int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
+{
+	struct ext4_new_flex_group_data *flex_gd = NULL;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head *bh;
+	struct inode *resize_inode;
+	ext4_fsblk_t o_blocks_count;
+	ext4_group_t o_group;
+	ext4_group_t n_group;
+	ext4_grpblk_t offset, add;
+	unsigned long n_desc_blocks;
+	unsigned long o_desc_blocks;
+	unsigned long desc_blocks;
+	int err = 0, flexbg_size = 1;
+
+	o_blocks_count = ext4_blocks_count(es);
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu "
+		       "upto %llu blocks\n", o_blocks_count, n_blocks_count);
+
+	if (n_blocks_count < o_blocks_count) {
+		/* On-line shrinking not supported */
+		ext4_warning(sb, "can't shrink FS - resize aborted");
+		return -EINVAL;
 	}
 
-	mutex_lock(&EXT4_SB(sb)->s_resize_lock);
-	if (o_blocks_count != ext4_blocks_count(es)) {
-		ext4_warning(sb, "multiple resizers run on filesystem!");
-		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
-		ext4_journal_stop(handle);
-		err = -EBUSY;
-		goto exit_put;
+	if (n_blocks_count == o_blocks_count)
+		/* Nothing to do */
+		return 0;
+
+	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
+	ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
+
+	n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
+			EXT4_DESC_PER_BLOCK(sb);
+	o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+			EXT4_DESC_PER_BLOCK(sb);
+	desc_blocks = n_desc_blocks - o_desc_blocks;
+
+	if (desc_blocks &&
+	    (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
+	     le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
+		ext4_warning(sb, "No reserved GDT blocks, can't resize");
+		return -EPERM;
 	}
 
-	if ((err = ext4_journal_get_write_access(handle,
-						 EXT4_SB(sb)->s_sbh))) {
-		ext4_warning(sb, "error %d on journal write access", err);
-		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
-		ext4_journal_stop(handle);
-		goto exit_put;
+	resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+	if (IS_ERR(resize_inode)) {
+		ext4_warning(sb, "Error opening resize inode");
+		return PTR_ERR(resize_inode);
 	}
-	ext4_blocks_count_set(es, o_blocks_count + add);
-	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
-	sb->s_dirt = 1;
-	mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
-	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
-		   o_blocks_count + add);
-	/* We add the blocks to the bitmap and set the group need init bit */
-	ext4_add_groupblocks(handle, sb, o_blocks_count, add);
-	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
-		   o_blocks_count + add);
-	if ((err = ext4_journal_stop(handle)))
-		goto exit_put;
 
+	/* See if the device is actually as big as what was requested */
+	bh = sb_bread(sb, n_blocks_count - 1);
+	if (!bh) {
+		ext4_warning(sb, "can't read last block, resize aborted");
+		err = -ENOSPC;
+		goto out;
+	}
+	brelse(bh);
+
+	/* extend the last group */
+	if (n_group == o_group)
+		add = n_blocks_count - o_blocks_count;
+	else
+		add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
+	if (add > 0) {
+		err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+		if (err)
+			goto out;
+	}
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    es->s_log_groups_per_flex)
+		flexbg_size = 1 << es->s_log_groups_per_flex;
+
+	o_blocks_count = ext4_blocks_count(es);
+	if (o_blocks_count == n_blocks_count)
+		goto out;
+
+	flex_gd = alloc_flex_gd(flexbg_size);
+	if (flex_gd == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Add flex groups. Note that a regular group is a
+	 * flex group with 1 group.
+	 */
+	while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
+					      flexbg_size)) {
+		if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
+			break;
+		err = ext4_flex_group_add(sb, resize_inode, flex_gd);
+		if (unlikely(err))
+			break;
+	}
+
+out:
+	if (flex_gd)
+		free_flex_gd(flex_gd);
+
+	iput(resize_inode);
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
-		       ext4_blocks_count(es));
-	update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
-		       sizeof(struct ext4_super_block));
-exit_put:
+		printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu "
+		       "upto %llu blocks\n", o_blocks_count, n_blocks_count);
 	return err;
-} /* ext4_group_extend */
+}
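/*
 * Illustration, not part of the patch: upstream, ext4_resize_fs() is the
 * backend of the EXT4_IOC_RESIZE_FS ioctl, which resize2fs issues when
 * growing a mounted filesystem; whether this backport wires up the same
 * ioctl is not shown here.  A hedged user-space sketch; the define is
 * spelled out in case older headers lack it.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#ifndef EXT4_IOC_RESIZE_FS
#define EXT4_IOC_RESIZE_FS	_IOW('f', 16, unsigned long long)
#endif

int main(void)
{
	unsigned long long n_blocks = 26214400;	/* target size, fs blocks */
	int fd = open("/mnt", O_RDONLY);	/* any path on the fs */

	if (fd < 0 || ioctl(fd, EXT4_IOC_RESIZE_FS, &n_blocks) < 0)
		perror("EXT4_IOC_RESIZE_FS");
	if (fd >= 0)
		close(fd);
	return 0;
}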
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/super.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/super.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/super.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/super.c	2015-01-21 12:02:58.095831583 +0300
@@ -39,6 +39,9 @@
 #include <linux/ctype.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
+#include <linux/pramcache.h>
+#include <linux/vzquota.h>
+#include <linux/virtinfo.h>
 #include <asm/uaccess.h>
 
 #include <linux/kthread.h>
@@ -80,6 +83,9 @@ static void ext4_clear_request_list(void
 static int ext4_reserve_blocks(struct ext4_sb_info *, ext4_fsblk_t);
 
 wait_queue_head_t aio_wq[WQ_HASH_SZ];
+wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+unsigned int attr_batched_writeback = 1;
+unsigned int attr_batched_sync = 1;
 
 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -88,6 +94,7 @@ ext4_fsblk_t ext4_block_bitmap(struct su
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 }
+EXPORT_SYMBOL(ext4_block_bitmap);
 
 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -96,6 +103,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 }
+EXPORT_SYMBOL(ext4_inode_bitmap);
 
 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 			      struct ext4_group_desc *bg)
@@ -104,6 +112,7 @@ ext4_fsblk_t ext4_inode_table(struct sup
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
+EXPORT_SYMBOL(ext4_inode_table);
 
 __u32 ext4_free_blks_count(struct super_block *sb,
 			      struct ext4_group_desc *bg)
@@ -339,6 +348,11 @@ static void ext4_handle_error(struct sup
 			jbd2_journal_abort(journal, -EIO);
 	}
 	if (test_opt(sb, ERRORS_RO)) {
+		/*
+		 * Make sure the updated value of ->s_mount_flags will be visible
+		 * before ->s_flags update
+		 */
+		smp_wmb();
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
 	}
@@ -482,9 +496,13 @@ void ext4_abort(struct super_block *sb, 
 		return;
 
 	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS | EXT4_MF_FS_ABORTED;
+	/*
+	 * Make sure the updated value of ->s_mount_flags will be visible
+	 * before ->s_flags update
+	 */
+	smp_wmb();
 	sb->s_flags |= MS_RDONLY;
-	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 	if (EXT4_SB(sb)->s_journal)
 		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
@@ -647,6 +665,7 @@ static void ext4_put_super(struct super_
 	int i, err;
 
 	ext4_unregister_li_request(sb);
+	vfs_dq_off(sb, 0);
 
 	flush_workqueue(sbi->dio_unwritten_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);
@@ -690,6 +709,10 @@ static void ext4_put_super(struct super_
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+	percpu_counter_destroy(&sbi->s_csum_partial);
+	percpu_counter_destroy(&sbi->s_csum_complete);
+	percpu_counter_destroy(&sbi->s_pfcache_peers);
+	percpu_counter_destroy(&sbi->s_fsync_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -704,6 +727,9 @@ static void ext4_put_super(struct super_
 		dump_orphan_list(sb, sbi);
 	J_ASSERT(list_empty(&sbi->s_orphan));
 
+	if (sbi->s_mount_opt2 & EXT4_MOUNT2_PRAMCACHE)
+		pramcache_save_bdev_cache(sb);
+
 	invalidate_bdev(sb->s_bdev);
 	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
 		/*
@@ -768,6 +794,8 @@ static struct inode *ext4_alloc_inode(st
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
 	atomic_set(&ei->i_unwritten, 0);
+	atomic_set(&ei->i_ioend_count, 0);
+	atomic_set(&ei->i_flush_tag, 0);
 
 	return &ei->vfs_inode;
 }
@@ -822,6 +850,10 @@ static void ext4_clear_inode(struct inod
 	if (EXT4_JOURNAL(inode))
 		jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
 				       &EXT4_I(inode)->jinode);
+	if (ext4_test_inode_state(inode, EXT4_STATE_CSUM)) {
+		ext4_close_pfcache(inode);
+		ext4_clear_data_csum(inode);
+	}
 }
 
 static inline void ext4_show_quota_options(struct seq_file *seq,
@@ -982,6 +1014,21 @@ static int ext4_show_options(struct seq_
 
 	if (test_opt(sb, NOLOAD))
 		seq_puts(seq, ",norecovery");
+	if (sbi->s_balloon_ino)
+		seq_printf(seq, ",balloon_ino=%ld", sbi->s_balloon_ino->i_ino);
+
+	if (ve_is_super(get_exec_env())) {
+		if (test_opt2(sb, CSUM))
+			seq_puts(seq, ",pfcache_csum");
+		if (sbi->s_pfcache_root.mnt) {
+			spin_lock(&sbi->s_pfcache_lock);
+			if (sbi->s_pfcache_root.mnt) {
+				seq_puts(seq, ",pfcache=");
+				seq_path(seq, &sbi->s_pfcache_root, "\\ \t\n");
+			}
+			spin_unlock(&sbi->s_pfcache_lock);
+		}
+	}
 
 	if (test_opt(sb, BLOCK_VALIDITY) &&
 	    !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
@@ -1072,11 +1119,16 @@ static int ext4_mark_dquot_dirty(struct 
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 				char *path, int remount);
+static int ext4_quota_off(struct super_block *sb, int type, int remount);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
+static ssize_t ext4_quota_read_ino(struct super_block *sb, struct inode *inode,
+				char *data, size_t len, loff_t off);
 static ssize_t ext4_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
+static ssize_t ext4_quota_write_ino_nojournal(struct super_block *sb, struct inode *ino,
+		const char *data, size_t len, loff_t off);
 
 static const struct dquot_operations ext4_quota_operations = {
 	.initialize	= dquot_initialize,
@@ -1103,7 +1155,7 @@ static const struct dquot_operations ext
 
 static const struct quotactl_ops ext4_qctl_operations = {
 	.quota_on	= ext4_quota_on,
-	.quota_off	= vfs_quota_off,
+	.quota_off	= ext4_quota_off,
 	.quota_sync	= vfs_quota_sync,
 	.get_info	= vfs_get_dqinfo,
 	.set_info	= vfs_set_dqinfo,
@@ -1128,7 +1180,9 @@ static const struct super_operations ext
 	.show_options	= ext4_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext4_quota_read,
+	.quota_read_ino = ext4_quota_read_ino,
 	.quota_write	= ext4_quota_write,
+	.quota_write_ino = ext4_quota_write_ino_nojournal,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
@@ -1176,7 +1230,11 @@ enum {
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
-	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+	Opt_discard, Opt_nodiscard, Opt_balloon_ino,
+	Opt_init_itable, Opt_noinit_itable,
+	Opt_csum, Opt_nocsum,
+	Opt_pfcache, Opt_nopfcache,
+	Opt_pramcache_nosync, Opt_pramcache_sync, Opt_nopramcache,
 };
 
 static const match_table_t tokens = {
@@ -1245,9 +1303,17 @@ static const match_table_t tokens = {
 	{Opt_noauto_da_alloc, "noauto_da_alloc"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
+	{Opt_balloon_ino, "balloon_ino=%u"},
 	{Opt_init_itable, "init_itable=%u"},
 	{Opt_init_itable, "init_itable"},
 	{Opt_noinit_itable, "noinit_itable"},
+	{Opt_csum, "pfcache_csum"},
+	{Opt_nocsum, "nopfcache_csum"},
+	{Opt_pfcache, "pfcache=%s"},
+	{Opt_nopfcache, "nopfcache"},
+	{Opt_pramcache_nosync, "pramcache_nosync"},
+	{Opt_pramcache_sync, "pramcache_sync"},
+	{Opt_nopramcache, "nopramcache"},
 	{Opt_err, NULL},
 };
 
@@ -1279,6 +1345,7 @@ static ext4_fsblk_t get_sb_block(void **
 static int parse_options(char *options, struct super_block *sb,
 			 unsigned long *journal_devnum,
 			 unsigned int *journal_ioprio,
+			 unsigned long *balloon_ino,
 			 ext4_fsblk_t *n_blocks_count, int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1290,6 +1357,7 @@ static int parse_options(char *options, 
 	int qtype, qfmt;
 	char *qname;
 #endif
+	char *pfcache;
 
 	if (!options)
 		return 1;
@@ -1333,10 +1401,15 @@ static int parse_options(char *options, 
 			/* *sb_block = match_int(&args[0]); */
 			break;
 		case Opt_err_panic:
-			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
-			clear_opt(sbi->s_mount_opt, ERRORS_RO);
-			set_opt(sbi->s_mount_opt, ERRORS_PANIC);
-			break;
+			/* Without CAP_SYS_ADMIN treat
+			 * errors=panic as errors=remount-ro */
+			if (capable(CAP_SYS_ADMIN)) {
+				clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+				clear_opt(sbi->s_mount_opt, ERRORS_RO);
+				set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+				break;
+			}
+			/* fall through */
 		case Opt_err_ro:
 			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
 			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -1686,6 +1759,11 @@ set_qf_format:
 		case Opt_nodiscard:
 			clear_opt(sbi->s_mount_opt, DISCARD);
 			break;
+		case Opt_balloon_ino:
+			if (match_int(&args[0], &option))
+				return 0;
+			*balloon_ino = option;
+			break;
 		case Opt_init_itable:
 			set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
 			if (args[0].from) {
@@ -1700,6 +1778,42 @@ set_qf_format:
 		case Opt_noinit_itable:
 			clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
 			break;
+		case Opt_csum:
+			if (capable(CAP_SYS_ADMIN))
+				set_opt2(sb, CSUM);
+			break;
+		case Opt_nocsum:
+			if (capable(CAP_SYS_ADMIN))
+				clear_opt2(sb, CSUM);
+			break;
+		case Opt_pfcache:
+			if (!capable(CAP_SYS_ADMIN))
+				break;
+			pfcache = match_strdup(&args[0]);
+			if (ext4_relink_pfcache(sb, pfcache, !is_remount)) {
+				kfree(pfcache);
+				return 0;
+			}
+			kfree(pfcache);
+			break;
+		case Opt_nopfcache:
+			if (!capable(CAP_SYS_ADMIN))
+				break;
+			if (ext4_relink_pfcache(sb, NULL, !is_remount))
+				return 0;
+			break;
+		case Opt_pramcache_nosync:
+			set_opt2(sb, PRAMCACHE);
+			set_opt2(sb, PRAMCACHE_NOSYNC);
+			break;
+		case Opt_pramcache_sync:
+			set_opt2(sb, PRAMCACHE);
+			clear_opt2(sb, PRAMCACHE_NOSYNC);
+			break;
+		case Opt_nopramcache:
+			clear_opt2(sb, PRAMCACHE);
+			clear_opt2(sb, PRAMCACHE_NOSYNC);
+			break;
 		default:
 			ext4_msg(sb, KERN_ERR,
 			       "Unrecognized mount option \"%s\" "
@@ -1755,7 +1869,7 @@ static int ext4_setup_super(struct super
 		res = MS_RDONLY;
 	}
 	if (read_only)
-		return res;
+		goto out;
 	if (!(sbi->s_mount_state & EXT4_VALID_FS))
 		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
 			 "running e2fsck is recommended");
@@ -1788,13 +1902,15 @@ static int ext4_setup_super(struct super
 	ext4_commit_super(sb, 1);
 	if (test_opt(sb, DEBUG))
 		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
-				"bpg=%lu, ipg=%lu, mo=%04x]\n",
+				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
 			sb->s_blocksize,
 			sbi->s_groups_count,
 			EXT4_BLOCKS_PER_GROUP(sb),
 			EXT4_INODES_PER_GROUP(sb),
-			sbi->s_mount_opt);
+			sbi->s_mount_opt, sbi->s_mount_opt2);
 
+out:
+	sb->s_mnt_count = le16_to_cpu(es->s_mnt_count);
 	return res;
 }
 
@@ -1987,6 +2103,7 @@ static void ext4_orphan_cleanup(struct s
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
+	struct virt_info_orphan vi;
 	if (!es->s_last_orphan) {
 		jbd_debug(4, "no orphan inodes to clean up\n");
 		return;
@@ -2027,8 +2144,10 @@ static void ext4_orphan_cleanup(struct s
 		}
 	}
 #endif
+	vi.super = sb;
 
 	while (es->s_last_orphan) {
+		int ret;
 		struct inode *inode;
 
 		inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
@@ -2038,6 +2157,15 @@ static void ext4_orphan_cleanup(struct s
 		}
 
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+		vi.cookie = EXT4_I(inode)->i_dq_cookie;
+		if (vi.cookie) {
+			ret = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_ORPHAN_CLEAN, &vi);
+			if (ret != NOTIFY_OK)
+				printk(KERN_ERR "BUG: Quota files for %d are broken: %s\n", vi.cookie,
+						ret == NOTIFY_BAD ?
+						"error starting orphan cleanup" :
+						"no quota engine running");
+		}
 		vfs_dq_init(inode);
 		if (inode->i_nlink) {
 			ext4_msg(sb, KERN_DEBUG,
@@ -2045,7 +2173,9 @@ static void ext4_orphan_cleanup(struct s
 				__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
+			mutex_lock(&inode->i_mutex);
 			ext4_truncate(inode);
+			mutex_unlock(&inode->i_mutex);
 			nr_truncates++;
 		} else {
 			ext4_msg(sb, KERN_DEBUG,
@@ -2072,6 +2202,8 @@ static void ext4_orphan_cleanup(struct s
 		if (sb_dqopt(sb)->files[i])
 			vfs_quota_off(sb, i, 0);
 	}
+
+	virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_ORPHAN_DONE, &vi);
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 }
@@ -2244,6 +2376,7 @@ struct ext4_attr {
 	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
 			 const char *, size_t);
 	int offset;
+	void *private;
 };
 
 static int parse_strtoull(const char *buf,
@@ -2299,6 +2432,38 @@ static ssize_t delayed_allocation_blocks
 			(s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
 }
 
+static ssize_t fsync_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(s64) percpu_counter_sum(&sbi->s_fsync_counter));
+}
+
+static ssize_t csum_partial_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(s64) percpu_counter_sum(&sbi->s_csum_partial));
+}
+
+static ssize_t csum_complete_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(s64) percpu_counter_sum(&sbi->s_csum_complete));
+}
+
+static ssize_t pfcache_peers_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(s64) percpu_counter_sum(&sbi->s_pfcache_peers));
+}
+
 static ssize_t session_write_kbytes_show(struct ext4_attr *a,
 					 struct ext4_sb_info *sbi, char *buf)
 {
@@ -2357,6 +2522,27 @@ static ssize_t sbi_ui_store(struct ext4_
 	return count;
 }
 
+static ssize_t global_ui_show(struct ext4_attr *a,
+			   struct ext4_sb_info *sbi, char *buf)
+{
+	unsigned int *ui = (unsigned int *) a->private;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t global_ui_store(struct ext4_attr *a,
+			    struct ext4_sb_info *sbi,
+			    const char *buf, size_t count)
+{
+	unsigned int *ui = (unsigned int *) a->private;
+	unsigned long t;
+
+	if (parse_strtoul(buf, 0xffffffff, &t))
+		return -EINVAL;
+	*ui = t;
+	return count;
+}
+
 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
@@ -2364,6 +2550,15 @@ static struct ext4_attr ext4_attr_##_nam
 	.store	= _store,					\
 	.offset = offsetof(struct ext4_sb_info, _elname),	\
 }
+
+#define EXT4_ATTR_GLOBAL(_name,_mode,_show,_store,_data) \
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.show	= _show,					\
+	.store	= _store,					\
+	.private = &(_data),					\
+}
+
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 
@@ -2372,11 +2567,17 @@ static struct ext4_attr ext4_attr_##name
 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
 #define EXT4_RW_ATTR_SBI_UI(name, elname)	\
 	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+#define EXT4_RW_ATTR_GLOBAL_UI(name)	\
+	EXT4_ATTR_GLOBAL(name, 0644, global_ui_show, global_ui_store, attr_##name)
 #define ATTR_LIST(name) &ext4_attr_##name.attr
 
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RO_ATTR(csum_partial);
+EXT4_RO_ATTR(csum_complete);
+EXT4_RO_ATTR(pfcache_peers);
+EXT4_RO_ATTR(fsync);
 EXT4_RW_ATTR(reserved_blocks);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
@@ -2388,6 +2589,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(bd_full_ratelimit, s_bd_full_ratelimit);
 
 static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
@@ -2403,16 +2605,25 @@ static struct attribute *ext4_attrs[] = 
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
 	ATTR_LIST(max_writeback_mb_bump),
+	ATTR_LIST(bd_full_ratelimit),
+	ATTR_LIST(csum_partial),
+	ATTR_LIST(csum_complete),
+	ATTR_LIST(pfcache_peers),
+	ATTR_LIST(fsync),
 	NULL,
 };
 
 /* Features this copy of ext4 supports */
 EXT4_INFO_ATTR(lazy_itable_init);
 EXT4_INFO_ATTR(batched_discard);
+EXT4_RW_ATTR_GLOBAL_UI(batched_writeback);
+EXT4_RW_ATTR_GLOBAL_UI(batched_sync);
 
 static struct attribute *ext4_feat_attrs[] = {
 	ATTR_LIST(lazy_itable_init),
 	ATTR_LIST(batched_discard),
+	ATTR_LIST(batched_writeback),
+	ATTR_LIST(batched_sync),
 	NULL,
 };
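/*
 * Illustration, not part of the patch: the EXT4_RW_ATTR_GLOBAL_UI()
 * attributes registered in ext4_feat_attrs back plain module-wide
 * unsigned ints, so they should appear as writable files next to the
 * read-only feature flags -- /sys/fs/ext4/features/ is where upstream
 * mounts this kobject, an assumption for this backport.  A user-space
 * sketch that reads the knob and then disables it.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/ext4/features/batched_writeback", "r+");
	unsigned val;

	if (!f || fscanf(f, "%u", &val) != 1)
		return 1;
	printf("batched_writeback = %u\n", val);
	rewind(f);
	fprintf(f, "0\n");	/* writes go through global_ui_store() */
	fclose(f);
	return 0;
}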
 
@@ -2455,6 +2666,54 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };
 
+static void ext4_load_balloon(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode;
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+
+	if (!ino) {
+		/* FIXME locking */
+		if (sbi->s_balloon_ino) {
+			iput(sbi->s_balloon_ino);
+			sbi->s_balloon_ino = NULL;
+		}
+
+		return;
+	}
+
+	if (ino < EXT4_FIRST_INO(sb)) {
+		ext4_msg(sb, KERN_WARNING, "bad balloon inode specified");
+		return;
+	}
+
+	inode = ext4_iget(sb, ino);
+	if (IS_ERR(inode)) {
+		ext4_msg(sb, KERN_WARNING, "can't load balloon inode (%ld)", PTR_ERR(inode));
+		return;
+	}
+
+	if (!S_ISREG(inode->i_mode)) {
+		iput(inode);
+		ext4_msg(sb, KERN_WARNING, "balloon should be regular");
+		return;
+	}
+
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+		iput(inode);
+		ext4_msg(sb, KERN_WARNING, "balloon should support extents");
+		return;
+	}
+
+	/* FIXME - locking */
+	if (sbi->s_balloon_ino)
+		iput(sbi->s_balloon_ino);
+	sbi->s_balloon_ino = inode;
+	ext4_msg(sb, KERN_INFO, "loaded balloon from %ld (%ld blocks)",
+			inode->i_ino, inode->i_blocks);
+}
+
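/*
 * Illustration, not part of the patch: the balloon inode is chosen with
 * the balloon_ino= mount option parsed above and can be changed on
 * remount, which ends up in ext4_load_balloon().  A hedged sketch of
 * setting it from user space; inode 12 is a made-up number for a
 * preallocated, extent-mapped regular file on that filesystem.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount(NULL, "/mnt", NULL, MS_REMOUNT, "balloon_ino=12") < 0)
		perror("remount with balloon_ino");
	return 0;
}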
 static void ext4_feat_release(struct kobject *kobj)
 {
 	complete(&ext4_feat->f_kobj_unregister);
@@ -2905,6 +3164,7 @@ static int ext4_fill_super(struct super_
 	__u64 blocks_count;
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+	unsigned long balloon_ino = 0;
 	ext4_group_t first_not_zeroed;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
@@ -2988,7 +3248,9 @@ static int ext4_fill_super(struct super_
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
 		sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
 
-	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+	/* Without CAP_SYS_ADMIN treat errors=panic as errors=remount-ro */
+	if (capable(CAP_SYS_ADMIN) &&
+	    le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
 		set_opt(sbi->s_mount_opt, ERRORS_CONT);
@@ -3021,14 +3283,17 @@ static int ext4_fill_super(struct super_
 	 */
 	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
 
+	/* enable pramcache for clean pages by default */
+	sbi->s_mount_opt2 |= EXT4_MOUNT2_PRAMCACHE;
+
 	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
-			   &journal_devnum, &journal_ioprio, NULL, 0)) {
+			   &journal_devnum, &journal_ioprio, &balloon_ino, NULL, 0)) {
 		ext4_msg(sb, KERN_WARNING,
 			 "failed to parse options in superblock: %s",
 			 sbi->s_es->s_mount_opts);
 	}
 	if (!parse_options((char *) data, sb, &journal_devnum,
-			   &journal_ioprio, NULL, 0))
+			   &journal_ioprio, &balloon_ino, NULL, 0))
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -3263,6 +3528,7 @@ static int ext4_fill_super(struct super_
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
+	spin_lock_init(&sbi->s_pfcache_lock);
 
 	err = percpu_counter_init(&sbi->s_freeblocks_counter,
 			ext4_count_free_blocks(sb));
@@ -3277,6 +3543,19 @@ static int ext4_fill_super(struct super_
 	if (!err) {
 		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
 	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_csum_partial, 0);
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_csum_complete, 0);
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_pfcache_peers, 0);
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_fsync_counter, 0);
+	}
+
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount3;
@@ -3284,6 +3563,7 @@ static int ext4_fill_super(struct super_
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
+	sbi->s_bd_full_ratelimit = 1024;
 
 	/*
 	 * set up enough so that it can read an inode
@@ -3299,9 +3579,11 @@ static int ext4_fill_super(struct super_
 	sb->s_qcop = &ext4_qctl_operations;
 	sb->dq_op = &ext4_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
-	mutex_init(&sbi->s_resize_lock);
+	sbi->s_resize_flags = 0;
 
 	sb->s_root = NULL;
 
@@ -3541,6 +3823,8 @@ no_journal:
 	} else
 		descr = "out journal";
 
+	ext4_load_balloon(sb, balloon_ino);
+
 	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
 		 "Opts: %s%s", descr, sbi->s_es->s_mount_opts,
 		 *sbi->s_es->s_mount_opts ? "; " : "");
@@ -3582,6 +3866,10 @@ failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+	percpu_counter_destroy(&sbi->s_csum_partial);
+	percpu_counter_destroy(&sbi->s_csum_complete);
+	percpu_counter_destroy(&sbi->s_pfcache_peers);
+	percpu_counter_destroy(&sbi->s_fsync_counter);
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
@@ -3592,6 +3880,8 @@ failed_mount:
 	if (sbi->s_proc) {
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
+	if (sbi->s_pfcache_root.mnt)
+		ext4_relink_pfcache(sb, NULL, true);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
@@ -3996,7 +4286,11 @@ int ext4_force_commit(struct super_block
 	journal_t *journal;
 	int ret = 0;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & MS_RDONLY) {
+		smp_rmb();
+		if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			return -EROFS;
 		return 0;
+	}
 
 	journal = EXT4_SB(sb)->s_journal;
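/*
 * Illustration, not part of the patch: the smp_rmb() above pairs with
 * the smp_wmb() added to ext4_handle_error()/ext4_abort(): the writer
 * publishes EXT4_MF_FS_ABORTED before setting MS_RDONLY, so a reader
 * that observes MS_RDONLY and then issues smp_rmb() also observes the
 * abort flag.  Self-contained sketch, using GCC fences as stand-ins
 * for the kernel barriers and plain ints for the sb fields.
 */
#include <stdio.h>

#define smp_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)
#define smp_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)

static int mount_flags, s_flags;

static void abort_fs(void)		/* ext4_abort() side */
{
	mount_flags |= 1;		/* EXT4_MF_FS_ABORTED */
	smp_wmb();			/* publish before MS_RDONLY */
	s_flags |= 2;			/* MS_RDONLY */
}

static int force_commit(void)		/* ext4_force_commit() side */
{
	if (s_flags & 2) {
		smp_rmb();		/* order the two reads */
		return (mount_flags & 1) ? -30 /* -EROFS */ : 0;
	}
	return 0;
}

int main(void)
{
	abort_fs();
	printf("force_commit() -> %d\n", force_commit());
	return 0;
}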
@@ -4097,6 +4391,7 @@ static int ext4_remount(struct super_blo
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
+	unsigned long balloon_ino = -1;
 
 	lock_kernel();
 
@@ -4104,6 +4399,7 @@ static int ext4_remount(struct super_blo
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
+	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
 	old_opts.s_resuid = sbi->s_resuid;
 	old_opts.s_resgid = sbi->s_resgid;
 	old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -4120,7 +4416,7 @@ static int ext4_remount(struct super_blo
 	/*
 	 * Allow the "check" option to be passed as a remount option.
 	 */
-	if (!parse_options(data, sb, NULL, &journal_ioprio,
+	if (!parse_options(data, sb, NULL, &journal_ioprio, &balloon_ino,
 			   &n_blocks_count, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
@@ -4256,14 +4552,20 @@ static int ext4_remount(struct super_blo
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
+
+	if (balloon_ino != -1)
+		ext4_load_balloon(sb, balloon_ino);
+
 	unlock_kernel();
 	if (enable_quota)
 		vfs_dq_quota_on_remount(sb);
+	pramcache_load_page_cache(sb);
 	return 0;
 
 restore_opts:
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.s_mount_opt;
+	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -4351,6 +4653,20 @@ static int ext4_statfs(struct dentry *de
 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 
+	if (sbi->s_balloon_ino) {
+		struct ext4_inode_info *ei;
+		blkcnt_t balloon_blocks;
+
+		balloon_blocks = sbi->s_balloon_ino->i_blocks;
+		ei = EXT4_I(sbi->s_balloon_ino);
+		spin_lock(&ei->i_block_reservation_lock);
+		balloon_blocks += ei->i_reserved_data_blocks;
+		spin_unlock(&ei->i_block_reservation_lock);
+
+		BUG_ON(sbi->s_balloon_ino->i_blkbits < 9);
+		buf->f_blocks -= balloon_blocks >> (sbi->s_balloon_ino->i_blkbits - 9);
+	}
+
 	return 0;
 }
 
@@ -4519,6 +4835,15 @@ static int ext4_quota_on(struct super_bl
 	return err;
 }
 
+static int ext4_quota_off(struct super_block *sb, int type, int remount)
+{
+	/* Force all delayed allocation blocks to be allocated */
+	if (EXT4_SB(sb) && test_opt(sb, DELALLOC))
+		sync_filesystem(sb);
+
+	return vfs_quota_off(sb, type, remount);
+}
+
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
  * acquiring the locks... As quota files are never truncated and quota code
  * itself serializes the operations (and noone else should touch the files)
@@ -4527,6 +4852,13 @@ static ssize_t ext4_quota_read(struct su
 			       size_t len, loff_t off)
 {
 	struct inode *inode = sb_dqopt(sb)->files[type];
+
+	return ext4_quota_read_ino(sb, inode, data, len, off);
+}
+
+static ssize_t ext4_quota_read_ino(struct super_block *sb, struct inode *inode,
+		char *data, size_t len, loff_t off)
+{
 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
@@ -4561,10 +4893,9 @@ static ssize_t ext4_quota_read(struct su
 
 /* Write to quotafile (we know the transaction is already started and has
  * enough credits) */
-static ssize_t ext4_quota_write(struct super_block *sb, int type,
-				const char *data, size_t len, loff_t off)
+static ssize_t ext4_quota_write_ino(struct super_block *sb, struct inode *inode,
+		const char *data, size_t len, loff_t off)
 {
-	struct inode *inode = sb_dqopt(sb)->files[type];
 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
@@ -4621,20 +4952,69 @@ out:
 	return len;
 }
 
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+
+	return ext4_quota_write_ino(sb, inode, data, len, off);
+}
+
+static ssize_t ext4_quota_write_ino_nojournal(struct super_block *sb, struct inode *inode,
+		const char *data, size_t len, loff_t off)
+{
+	int ret, err;
+	handle_t *handle;
+
+	handle = ext4_journal_start(sb->s_root->d_inode, 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	ret = ext4_quota_write_ino(sb, inode, data, len, off);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
 #endif
 
 static int ext4_get_sb(struct file_system_type *fs_type, int flags,
 		       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
+	int err = get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+	if (!err) {
+		pramcache_load_bdev_cache(mnt->mnt_sb);
+		pramcache_load_page_cache(mnt->mnt_sb);
+	}
+	return err;
+}
+
+static void ext4_kill_sb(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+	if (sbi && sbi->s_balloon_ino)
+		iput(sbi->s_balloon_ino);
+
+	if (sbi && sbi->s_pfcache_root.mnt)
+		ext4_relink_pfcache(sb, NULL, false);
+
+	if (sbi && (sbi->s_mount_opt2 & EXT4_MOUNT2_PRAMCACHE))
+		pramcache_save_page_cache(sb,
+			sbi->s_mount_opt2 & EXT4_MOUNT2_PRAMCACHE_NOSYNC);
+
+	kill_block_super(sb);
 }
 
 static struct file_system_type ext4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4",
 	.get_sb		= ext4_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_NEW_FREEZE | FS_HANDLE_QUOTA,
+	.kill_sb	= ext4_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_NEW_FREEZE |
+			  FS_HANDLE_QUOTA | FS_VIRTUALIZED |
+			  FS_HAS_MMAP_PREP,
 };
 
 static int __init ext4_init_feat_adverts(void)
@@ -4680,6 +5060,9 @@ static int __init init_ext4_fs(void)
 	for (i = 0; i < WQ_HASH_SZ; i++)
 		init_waitqueue_head(&aio_wq[i]);
 
+	for (i = 0; i < WQ_HASH_SZ; i++)
+		init_waitqueue_head(&ioend_wq[i]);
+
 	err = init_ext4_system_zone();
 	if (err)
 		return err;
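
The statfs hunk above subtracts the balloon inode from f_blocks. Since i_blocks is kept in 512-byte sectors while f_blocks counts filesystem blocks of 1 << i_blkbits bytes, the shift by (i_blkbits - 9) converts sectors to blocks; the BUG_ON() guards the i_blkbits >= 9 invariant. A minimal standalone sketch of the arithmetic (names are illustrative, not from the patch):

#include <assert.h>
#include <stdio.h>

/* i_blocks is in 512-byte sectors; a filesystem block is 1 << blkbits. */
static unsigned long long sectors_to_fs_blocks(unsigned long long sectors,
					       unsigned int blkbits)
{
	assert(blkbits >= 9);	/* same invariant as the BUG_ON() above */
	return sectors >> (blkbits - 9);
}

int main(void)
{
	/* A balloon inode of 8192 sectors on a 4K-block (blkbits = 12)
	 * filesystem hides 1024 blocks from statfs. */
	printf("%llu\n", sectors_to_fs_blocks(8192, 12));
	return 0;
}
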
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/symlink.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/symlink.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/symlink.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/symlink.c	2015-01-21 12:02:53.279959416 +0300
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symli
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symli
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ext4/xattr.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/xattr.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ext4/xattr.c	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ext4/xattr.c	2015-01-21 12:02:51.926995331 +0300
@@ -111,6 +111,7 @@ static struct xattr_handler *ext4_xattr_
 
 struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
+	&ext4_xattr_trusted_csum_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	&ext4_xattr_acl_access_handler,
@@ -787,6 +788,10 @@ inserted:
 				error = -EDQUOT;
 				if (vfs_dq_alloc_block(inode, 1))
 					goto cleanup;
+				if (check_bd_full(inode, 1)) {
+					error = -ENOSPC;
+					goto cleanup_dquot;
+				}
 				error = ext4_journal_get_write_access(handle,
 								      new_bh);
 				if (error)
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fat/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fat/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fat/inode.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fat/inode.c	2015-01-21 12:02:52.019992862 +0300
@@ -738,8 +738,10 @@ fat_encode_fh(struct dentry *de, __u32 *
 	struct inode *inode =  de->d_inode;
 	u32 ipos_h, ipos_m, ipos_l;
 
-	if (len < 5)
+	if (len < 5) {
+		*lenp = 5;
 		return 255; /* no room */
+	}
 
 	ipos_h = MSDOS_I(inode)->i_pos >> 8;
 	ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
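
The fat_encode_fh change above follows the exportfs convention for undersized buffers: report the required length through *lenp and return 255 ("no room"). A minimal sketch of that contract with a hypothetical two-word handle (the type constant and layout are illustrative, not FAT's):

#define FID_TYPE_EXAMPLE	1	/* made-up handle type */

/* Encode a two-word handle, or report the needed length (in
 * 32-bit words) and return 255 when the buffer is too small. */
static int example_encode_fh(unsigned int ino, unsigned int gen,
			     unsigned int *fh, int *lenp)
{
	if (*lenp < 2) {
		*lenp = 2;		/* minimum required */
		return 255;		/* no room */
	}
	fh[0] = ino;
	fh[1] = gen;
	*lenp = 2;
	return FID_TYPE_EXAMPLE;
}
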
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fcntl.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fcntl.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fcntl.c	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fcntl.c	2015-01-21 12:02:57.963835087 +0300
@@ -126,6 +126,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldf
 	}
 	return sys_dup3(oldfd, newfd, 0);
 }
+EXPORT_SYMBOL(sys_dup2);
 
 SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
@@ -143,12 +144,49 @@ SYSCALL_DEFINE1(dup, unsigned int, filde
 }
 
 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
+void generic_set_file_flags_unlocked(struct file *filp, unsigned int arg)
+{
+	filp->f_flags = (arg & SETFL_MASK) |
+			(filp->f_flags & ~SETFL_MASK);
+}
+EXPORT_SYMBOL(generic_set_file_flags_unlocked);
+
+int generic_set_file_flags(struct file *filp, unsigned int arg)
+{
+	spin_lock(&filp->f_lock);
+	generic_set_file_flags_unlocked(filp, arg);
+	spin_unlock(&filp->f_lock);
+	return 0;
+}
+EXPORT_SYMBOL(generic_set_file_flags);
+
+int may_use_odirect(void)
+{
+	int may;
+
+	if (ve_is_super(get_exec_env()))
+		return 1;
+
+	may = capable(CAP_SYS_RAWIO);
+	if (!may) {
+		may = get_exec_env()->odirect_enable;
+		if (may == 2)
+			may = get_ve0()->odirect_enable;
+	}
+
+	return may;
+}
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
 	struct inode * inode = filp->f_path.dentry->d_inode;
 	int error = 0;
 
+	if (!may_use_odirect())
+		arg &= ~O_DIRECT;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		arg &= ~O_SYNC;
+
 	/*
 	 * O_APPEND cannot be cleared if the file is marked as append-only
 	 * and the file is open for write.
@@ -172,10 +211,6 @@ static int setfl(int fd, struct file * f
 				return -EINVAL;
 	}
 
-	if (filp->f_op && filp->f_op->check_flags)
-		error = filp->f_op->check_flags(arg);
-	if (error)
-		return error;
 
 	/*
 	 * ->fasync() is responsible for setting the FASYNC bit.
@@ -188,10 +223,11 @@ static int setfl(int fd, struct file * f
 		if (error > 0)
 			error = 0;
 	}
-	spin_lock(&filp->f_lock);
-	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
-	spin_unlock(&filp->f_lock);
 
+	if (filp->f_op && filp->f_op->set_flags)
+		error = filp->f_op->set_flags(filp, arg);
+	else
+		error = generic_set_file_flags(filp, arg);
  out:
 	return error;
 }
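
The setfl() rework above turns flag installation into a file operation: a filesystem that needs to veto or translate F_SETFL supplies ->set_flags, and everyone else falls through to generic_set_file_flags(). A sketch of a driver using the hook (the ->set_flags member exists only with this patch; example_set_flags is hypothetical):

/* Reject O_DIRECT on a device that cannot honour it, otherwise
 * behave exactly like the generic helper. */
static int example_set_flags(struct file *filp, unsigned int arg)
{
	if (arg & O_DIRECT)
		return -EINVAL;
	return generic_set_file_flags(filp, arg);
}

static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.set_flags	= example_set_flags,
	/* .read, .write, .open ... as usual */
};

Moving the update behind an op keeps the filp->f_lock locking in one place while still letting virtualized filesystems filter the flags.
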
@@ -742,7 +778,7 @@ EXPORT_SYMBOL(kill_fasync);
 static int __init fasync_init(void)
 {
 	fasync_cache = kmem_cache_create("fasync_cache",
-		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+		sizeof(struct fasync_struct), 0, SLAB_PANIC|SLAB_UBC, NULL);
 	return 0;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fhandle.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fhandle.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fhandle.c	2015-01-21 12:02:52.019992862 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fhandle.c	2015-01-21 12:02:52.974967511 +0300
@@ -0,0 +1,316 @@
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/exportfs.h>
+#include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static long do_sys_name_to_handle(struct path *path,
+				  struct file_handle __user *ufh,
+				  int __user *mnt_id)
+{
+	long retval;
+	struct file_handle f_handle;
+	int handle_dwords, handle_bytes;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * We need to make sure that the file system
+	 * supports decoding of the file handle.
+	 */
+	if (!path->mnt->mnt_sb->s_export_op ||
+	    !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+		return -EFAULT;
+
+	if (f_handle.handle_bytes > MAX_HANDLE_SZ)
+		return -EINVAL;
+
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	/* convert handle size to a multiple of sizeof(u32) */
+	handle_dwords = f_handle.handle_bytes >> 2;
+
+	/* we ask for a non-connected handle */
+	retval = exportfs_encode_fh(path->dentry,
+				    (struct fid *)handle->f_handle,
+				    &handle_dwords,  0);
+	handle->handle_type = retval;
+	/* convert handle size to bytes */
+	handle_bytes = handle_dwords * sizeof(u32);
+	handle->handle_bytes = handle_bytes;
+	if ((handle->handle_bytes > f_handle.handle_bytes) ||
+	    (retval == 255) || (retval == -ENOSPC)) {
+		/*
+		 * Per the old exportfs_encode_fh documentation we could
+		 * get -ENOSPC to indicate overflow, but filesystems
+		 * always returned 255, so handle both values.
+		 */
+		/*
+		 * Set the handle size to zero so we copy only the
+		 * non-variable part of the file_handle.
+		 */
+		handle_bytes = 0;
+		retval = -EOVERFLOW;
+	} else
+		retval = 0;
+	/* copy the mount id */
+	if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
+	    copy_to_user(ufh, handle,
+			 sizeof(struct file_handle) + handle_bytes))
+		retval = -EFAULT;
+	kfree(handle);
+	return retval;
+}
+
+/**
+ * sys_name_to_handle_at: convert name to handle
+ * @dfd: directory relative to which name is interpreted if not absolute
+ * @name: name that should be converted to handle.
+ * @handle: resulting file handle
+ * @mnt_id: mount id of the file system containing the file
+ * @flag: flag value to indicate whether to follow symlinks or not
+ *
+ * @handle->handle_bytes indicates the space available to store the
+ * variable part of the file handle in bytes. If there is not
+ * enough space, the field is updated to return the minimum
+ * value required.
+ */
+SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
+		struct file_handle __user *, handle, int __user *, mnt_id,
+		int, flag)
+{
+	struct path path;
+	int lookup_flags;
+	int err;
+
+	if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	err = user_path_at(dfd, name, lookup_flags, &path);
+	if (!err) {
+		err = do_sys_name_to_handle(&path, handle, mnt_id);
+		path_put(&path);
+	}
+	return err;
+}
+
+static struct vfsmount *get_vfsmount_from_fd(int fd)
+{
+	struct path path;
+
+	if (fd == AT_FDCWD) {
+		struct fs_struct *fs = current->fs;
+		spin_lock(&fs->lock);
+		path = fs->pwd;
+		mntget(path.mnt);
+		spin_unlock(&fs->lock);
+	} else {
+		int fput_needed;
+		struct file *file = fget_light(fd, &fput_needed);
+		if (!file)
+			return ERR_PTR(-EBADF);
+		path = file->f_path;
+		mntget(path.mnt);
+		fput_light(file, fput_needed);
+	}
+	return path.mnt;
+}
+
+static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
+{
+	return 1;
+}
+
+static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
+			     struct path *path)
+{
+	int retval = 0;
+	int handle_dwords;
+
+	path->mnt = get_vfsmount_from_fd(mountdirfd);
+	if (IS_ERR(path->mnt)) {
+		retval = PTR_ERR(path->mnt);
+		goto out_err;
+	}
+	/* change the handle size to multiple of sizeof(u32) */
+	handle_dwords = handle->handle_bytes >> 2;
+	path->dentry = exportfs_decode_fh(path->mnt,
+					  (struct fid *)handle->f_handle,
+					  handle_dwords, handle->handle_type,
+					  vfs_dentry_acceptable, NULL);
+	if (IS_ERR(path->dentry)) {
+		retval = PTR_ERR(path->dentry);
+		goto out_mnt;
+	}
+	return 0;
+out_mnt:
+	mntput(path->mnt);
+out_err:
+	return retval;
+}
+
+static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
+		   struct path *path)
+{
+	int retval = 0;
+	struct file_handle f_handle;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * With a handle we don't look at the execute bit on the
+	 * directory. Ideally we would like CAP_DAC_SEARCH, but
+	 * we don't have that.
+	 */
+	if (!capable(CAP_DAC_READ_SEARCH)) {
+		retval = -EPERM;
+		goto out_err;
+	}
+#ifdef CONFIG_VE
+	/* Don't allow opening files by handle inside VE. */
+	if (!ve_is_super(get_exec_env())) {
+		retval = -EPERM;
+		goto out_err;
+	}
+#endif
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
+		retval = -EFAULT;
+		goto out_err;
+	}
+	if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+	    (f_handle.handle_bytes == 0)) {
+		retval = -EINVAL;
+		goto out_err;
+	}
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle) {
+		retval = -ENOMEM;
+		goto out_err;
+	}
+	/* copy the full handle */
+	if (copy_from_user(handle, ufh,
+			   sizeof(struct file_handle) +
+			   f_handle.handle_bytes)) {
+		retval = -EFAULT;
+		goto out_handle;
+	}
+
+	retval = do_handle_to_path(mountdirfd, handle, path);
+
+out_handle:
+	kfree(handle);
+out_err:
+	return retval;
+}
+
+long do_handle_open(int mountdirfd,
+		    struct file_handle __user *ufh, int open_flag)
+{
+	long retval = 0;
+	struct path path;
+	struct file *file;
+	int fd;
+
+	retval = handle_to_path(mountdirfd, ufh, &path);
+	if (retval)
+		return retval;
+
+	fd = get_unused_fd_flags(open_flag);
+	if (fd < 0) {
+		path_put(&path);
+		return fd;
+	}
+	retval = may_open(&path, MAY_READ, open_flag);
+	if (retval) {
+		put_unused_fd(fd);
+		path_put(&path);
+		return retval;
+	}
+
+	file = dentry_open(path.dentry, path.mnt, open_flag, current_cred());
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		retval = PTR_ERR(file);
+	} else {
+		retval = fd;
+		fsnotify_open(file->f_path.dentry);
+		fd_install(fd, file);
+	}
+	return retval;
+}
+
+/**
+ * sys_open_by_handle_at: Open the file handle
+ * @mountdirfd: directory file descriptor
+ * @handle: file handle to be opened
+ * @flags: open flags.
+ *
+ * @mountdirfd indicates the directory file descriptor
+ * of the mount point. The file handle is decoded relative
+ * to the vfsmount pointed to by @mountdirfd. The @flags
+ * value is the same as the open(2) flags.
+ */
+SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+		struct file_handle __user *, handle,
+		int, flags)
+{
+	long ret;
+
+	/* only readonly fhandles are backported for now */
+	if (flags != O_RDONLY)
+		return -EINVAL;
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	ret = do_handle_open(mountdirfd, handle, flags);
+	return ret;
+}
+
+/* size in bytes, including file_handle header */
+int vfs_inode_fhandle(struct inode *inode, struct file_handle *handle, int size)
+{
+	struct fid *fid = (struct fid *)handle->f_handle;
+	int len = sizeof(struct file_handle) + 2 * sizeof(u32);
+
+	if (!inode->i_sb->s_export_op || inode->i_sb->s_export_op->encode_fh)
+		return -ENOTSUPP;
+
+	if (size < len)
+		return -ENOBUFS;
+
+	handle->handle_bytes = 2 * sizeof(u32);
+	handle->handle_type = FILEID_INO32_GEN;
+	fid->i32.ino = inode->i_ino;
+	fid->i32.gen = inode->i_generation;
+
+	return len;
+}
+EXPORT_SYMBOL(vfs_inode_fhandle);
+
+struct dentry *vfs_fhandle_to_dentry(struct super_block *sb,
+				     struct file_handle *handle)
+{
+	int handle_dwords = handle->handle_bytes >> 2;
+	struct dentry *dentry;
+
+	dentry = sb->s_export_op->fh_to_dentry(sb,
+			(struct fid *)handle->f_handle,
+			handle_dwords, handle->handle_type);
+	if (!dentry)
+		return ERR_PTR(-ESTALE);
+	return dentry;
+}
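
fs/fhandle.c above is a backport of the name_to_handle_at(2)/open_by_handle_at(2) pair. A small userspace demo of the retry-on-EOVERFLOW protocol described in the comments; it needs CAP_DAC_READ_SEARCH (and, on this kernel, the host VE and O_RDONLY), with error handling trimmed:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct file_handle *fh = malloc(sizeof(*fh));
	int mount_id, fd;

	if (argc < 2)
		return 1;

	/* First call with handle_bytes == 0: fails with EOVERFLOW
	 * and writes the required size back into handle_bytes. */
	fh->handle_bytes = 0;
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) < 0 &&
	    errno == EOVERFLOW) {
		unsigned int need = fh->handle_bytes;

		fh = realloc(fh, sizeof(*fh) + need);
		fh->handle_bytes = need;
	}
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) < 0) {
		perror("name_to_handle_at");
		return 1;
	}

	/* Reopen by handle; the first fd just has to live on the
	 * same mount (AT_FDCWD works if the cwd does). */
	fd = open_by_handle_at(AT_FDCWD, fh, O_RDONLY);
	if (fd < 0)
		perror("open_by_handle_at");
	else
		close(fd);
	free(fh);
	return 0;
}
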
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/file.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/file.c
--- linux-2.6.32-504.3.3.el6.orig/fs/file.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/file.c	2015-01-21 12:02:57.963835087 +0300
@@ -21,6 +21,8 @@
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
 
+#include <bc/kmem.h>
+
 struct fdtable_defer {
 	spinlock_t lock;
 	struct work_struct wq;
@@ -28,6 +30,7 @@ struct fdtable_defer {
 };
 
 int sysctl_nr_open __read_mostly = 1024*1024;
+EXPORT_SYMBOL_GPL(sysctl_nr_open);
 int sysctl_nr_open_min = BITS_PER_LONG;
 int sysctl_nr_open_max = 1024 * 1024; /* raised later */
 
@@ -42,9 +45,9 @@ static DEFINE_PER_CPU(struct fdtable_def
 static inline void * alloc_fdmem(unsigned int size)
 {
 	if (size <= PAGE_SIZE)
-		return kmalloc(size, GFP_KERNEL);
+		return kmalloc(size, GFP_KERNEL_UBC);
 	else
-		return vmalloc(size);
+		return ub_vmalloc(size);
 }
 
 static inline void free_fdarr(struct fdtable *fdt)
@@ -163,7 +166,7 @@ static struct fdtable * alloc_fdtable(un
 	if (unlikely(nr > sysctl_nr_open))
 		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 
-	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_UBC);
 	if (!fdt)
 		goto out;
 	fdt->max_fds = nr;
@@ -198,7 +201,7 @@ out:
  * Return <0 error code on error; 1 on successful completion.
  * The files->file_lock should be held on entry, and will be held on exit.
  */
-static int expand_fdtable(struct files_struct *files, int nr)
+int expand_fdtable(struct files_struct *files, int nr)
 	__releases(files->file_lock)
 	__acquires(files->file_lock)
 {
@@ -238,6 +241,7 @@ static int expand_fdtable(struct files_s
 	}
 	return 1;
 }
+EXPORT_SYMBOL(expand_fdtable);
 
 /*
  * Expand files.
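
alloc_fdmem() above sizes the allocator to the request: slab for anything that fits in a page, (UBC-charged) vmalloc beyond that. The generic shape of the pattern, including the matching free path that dispatches on the address, as a sketch (table_alloc/table_free are illustrative names):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Small tables come from the slab, large ones from vmalloc. */
static void *table_alloc(size_t size)
{
	if (size <= PAGE_SIZE)
		return kmalloc(size, GFP_KERNEL);
	return vmalloc(size);
}

/* is_vmalloc_addr() lets the free path match the alloc path
 * without remembering the size. */
static void table_free(void *ptr)
{
	if (is_vmalloc_addr(ptr))
		vfree(ptr);
	else
		kfree(ptr);
}
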
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/file_table.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/file_table.c
--- linux-2.6.32-504.3.3.el6.orig/fs/file_table.c	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/file_table.c	2015-01-21 12:02:50.193041363 +0300
@@ -20,11 +20,19 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/lglock.h>
 #include <linux/percpu_counter.h>
+#include <linux/percpu.h>
 #include <linux/ima.h>
+#include <linux/ve.h>
+#include <linux/nsproxy.h>
 
 #include <asm/atomic.h>
 
+#include <bc/beancounter.h>
+#include <bc/kmem.h>
+#include <bc/misc.h>
+
 #include "internal.h"
 
 /* sysctl tunables... */
@@ -32,11 +40,11 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-/* public. Not pretty! */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+DECLARE_LGLOCK(files_lglock);
+DEFINE_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
-static struct kmem_cache *filp_cachep __read_mostly;
+struct kmem_cache *filp_cachep __read_mostly;
 
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
@@ -50,8 +58,10 @@ static inline void file_free_rcu(struct 
 
 static inline void file_free(struct file *f)
 {
-	percpu_counter_dec(&nr_files);
 	file_check_state(f);
+	if (f->f_ub == get_ub0())
+		percpu_counter_dec(&nr_files);
+	ub_file_uncharge(f);
 	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
@@ -105,11 +115,14 @@ struct file *get_empty_filp(void)
 	const struct cred *cred = current_cred();
 	static long old_max;
 	struct file * f;
+	int acct;
 
+	acct = (get_exec_ub() == get_ub0());
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+	if (acct && get_nr_files() >= files_stat.max_files &&
+			!capable(CAP_SYS_ADMIN)) {
 		/*
 		 * percpu_counters are inaccurate.  Do an expensive check before
 		 * we go and fail.
@@ -122,7 +135,11 @@ struct file *get_empty_filp(void)
 	if (f == NULL)
 		goto fail;
 
-	percpu_counter_inc(&nr_files);
+	if (ub_file_charge(f))
+		goto fail_ch;
+	if (acct)
+		percpu_counter_inc(&nr_files);
+
 	if (security_file_alloc(f))
 		goto fail_sec;
 
@@ -147,7 +164,12 @@ fail_sec:
 	file_free(f);
 fail:
 	return NULL;
+
+fail_ch:
+	kmem_cache_free(filp_cachep, f);
+	return NULL;
 }
+EXPORT_SYMBOL_GPL(get_empty_filp);
 
 /**
  * alloc_file - allocate and initialize a 'struct file'
@@ -257,7 +279,7 @@ void __fput(struct file *file)
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
 	put_pid(file->f_owner.pid);
-	file_kill(file);
+	file_sb_list_del(file);
 	if (file->f_mode & FMODE_WRITE)
 		drop_file_write_access(file);
 	file->f_path.dentry = NULL;
@@ -303,7 +325,10 @@ struct file *fget_light(unsigned int fd,
 	*fput_needed = 0;
 	if (likely((atomic_read(&files->count) == 1))) {
 		file = fcheck_files(files, fd);
+		if (unlikely(file && file->f_heavy))
+			goto slow;
 	} else {
+slow:
 		rcu_read_lock();
 		file = fcheck_files(files, fd);
 		if (file) {
@@ -319,41 +344,107 @@ struct file *fget_light(unsigned int fd,
 	return file;
 }
 
-
 void put_filp(struct file *file)
 {
 	if (atomic_long_dec_and_test(&file->f_count)) {
 		security_file_free(file);
-		file_kill(file);
+		file_sb_list_del(file);
 		file_free(file);
 	}
 }
 
-void file_move(struct file *file, struct list_head *list)
+static inline int file_list_cpu(struct file *file)
 {
-	if (!list)
-		return;
-	file_list_lock();
-	list_move(&file->f_u.fu_list, list);
-	file_list_unlock();
+#ifdef CONFIG_SMP
+	return file->f_sb_list_cpu;
+#else
+	return smp_processor_id();
+#endif
 }
 
-void file_kill(struct file *file)
+/* helper for file_sb_list_add to reduce ifdefs */
+static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
+{
+	struct list_head *list;
+#ifdef CONFIG_SMP
+	int cpu;
+	cpu = smp_processor_id();
+	file->f_sb_list_cpu = cpu;
+	list = per_cpu_ptr(sb->s_files, cpu);
+#else
+	list = &sb->s_files;
+#endif
+	list_add(&file->f_u.fu_list, list);
+}
+
+/**
+ * file_sb_list_add - add a file to the sb's file list
+ * @file: file to add
+ * @sb: sb to add it to
+ *
+ * Use this function to associate a file with the superblock of the inode it
+ * refers to.
+ */
+void file_sb_list_add(struct file *file, struct super_block *sb)
+{
+	lg_local_lock(files_lglock);
+	__file_sb_list_add(file, sb);
+	lg_local_unlock(files_lglock);
+}
+
+/**
+ * file_sb_list_del - remove a file from the sb's file list
+ * @file: file to remove
+ *
+ * Use this function to remove a file from the file list of
+ * the superblock it was added to.
+ */
+void file_sb_list_del(struct file *file)
 {
 	if (!list_empty(&file->f_u.fu_list)) {
-		file_list_lock();
+		lg_local_lock_cpu(files_lglock, file_list_cpu(file));
 		list_del_init(&file->f_u.fu_list);
-		file_list_unlock();
+		lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
 	}
 }
 
+#ifdef CONFIG_SMP
+
+/*
+ * These macros iterate all files on all CPUs for a given superblock.
+ * files_lglock must be held globally.
+ */
+#define do_file_list_for_each_entry(__sb, __file)		\
+{								\
+	int i;							\
+	for_each_possible_cpu(i) {				\
+		struct list_head *list;				\
+		list = per_cpu_ptr((__sb)->s_files, i);		\
+		list_for_each_entry((__file), list, f_u.fu_list)
+
+#define while_file_list_for_each_entry				\
+	}							\
+}
+
+#else
+
+#define do_file_list_for_each_entry(__sb, __file)		\
+{								\
+	struct list_head *list;					\
+	list = &(__sb)->s_files;				\
+	list_for_each_entry((__file), list, f_u.fu_list)
+
+#define while_file_list_for_each_entry				\
+}
+
+#endif
+
 int fs_may_remount_ro(struct super_block *sb)
 {
 	struct file *file;
-
 	/* Check that no files are currently opened for writing. */
-	file_list_lock();
-	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
+	lg_global_lock(files_lglock);
+	do_file_list_for_each_entry(sb, file) {
 		struct inode *inode = file->f_path.dentry->d_inode;
 
 		/* File with pending delete? */
@@ -363,11 +454,11 @@ int fs_may_remount_ro(struct super_block
 		/* Writeable file? */
 		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
 			goto too_bad;
-	}
-	file_list_unlock();
+	} while_file_list_for_each_entry;
+	lg_global_unlock(files_lglock);
 	return 1; /* Tis' cool bro. */
 too_bad:
-	file_list_unlock();
+	lg_global_unlock(files_lglock);
 	return 0;
 }
 
@@ -383,8 +474,8 @@ void mark_files_ro(struct super_block *s
 	struct file *f;
 
 retry:
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+	lg_global_lock(files_lglock);
+	do_file_list_for_each_entry(sb, f) {
 		struct vfsmount *mnt;
 		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
 		       continue;
@@ -399,17 +490,56 @@ retry:
 			continue;
 		file_release_write(f);
 		mnt = mntget(f->f_path.mnt);
-		file_list_unlock();
-		/*
-		 * This can sleep, so we can't hold
-		 * the file_list_lock() spinlock.
-		 */
+		/* This can sleep, so we can't hold the spinlock. */
+		lg_global_unlock(files_lglock);
 		mnt_drop_write(mnt);
 		mntput(mnt);
 		goto retry;
+	} while_file_list_for_each_entry;
+	lg_global_unlock(files_lglock);
+}
+
+struct file *get_task_file(pid_t pid, int fd)
+{
+	int err;
+	struct task_struct *tsk;
+	struct files_struct *fs;
+	struct file *file = NULL;
+
+	err = -ESRCH;
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid_ns(pid, get_exec_env()->ve_ns->pid_ns);
+	if (tsk == NULL) {
+		read_unlock(&tasklist_lock);
+		goto out;
 	}
-	file_list_unlock();
+
+	get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+
+	err = -EINVAL;
+	fs = get_files_struct(tsk);
+	if (fs == NULL)
+		goto out_put;
+
+	rcu_read_lock();
+	err = -EBADF;
+	file = fcheck_files(fs, fd);
+	if (file == NULL)
+		goto out_unlock;
+
+	err = 0;
+	get_file(file);
+
+out_unlock:
+	rcu_read_unlock();
+	put_files_struct(fs);
+out_put:
+	put_task_struct(tsk);
+out:
+	return err ? ERR_PTR(err) : file;
 }
+EXPORT_SYMBOL(get_task_file);
 
 void __init files_init(unsigned long mempages)
 { 
@@ -426,5 +556,6 @@ void __init files_init(unsigned long mem
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
+	lg_lock_init(files_lglock);
 	percpu_counter_init(&nr_files, 0);
 } 
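
The s_files conversion above distributes one list head per CPU: a file remembers the CPU it was added on (f_sb_list_cpu), so removal takes only that CPU's lock, while fs_may_remount_ro() and mark_files_ro() take every slot via lg_global_lock() to walk the whole superblock. The same shape with plain per-CPU spinlocked lists (a sketch; the patch itself uses the lglock primitives):

#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct node {
	struct list_head link;
	int cpu;			/* slot we were added under */
};

struct slot {
	spinlock_t lock;
	struct list_head head;
};
static DEFINE_PER_CPU(struct slot, slots);

static void node_add(struct node *n)
{
	struct slot *s;

	n->cpu = get_cpu();		/* remember the home slot */
	s = &per_cpu(slots, n->cpu);
	spin_lock(&s->lock);
	list_add(&n->link, &s->head);
	spin_unlock(&s->lock);
	put_cpu();
}

static void node_del(struct node *n)
{
	struct slot *s = &per_cpu(slots, n->cpu);

	spin_lock(&s->lock);		/* only the home slot's lock */
	list_del_init(&n->link);
	spin_unlock(&s->lock);
}
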
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/filesystems.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/filesystems.c
--- linux-2.6.32-504.3.3.el6.orig/fs/filesystems.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/filesystems.c	2015-01-21 12:02:58.722814940 +0300
@@ -14,6 +14,9 @@
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/sched.h>	/* for 'current' */
+#include <linux/mount.h>
+#include <linux/ve.h>
 #include <asm/uaccess.h>
 
 /*
@@ -23,8 +26,8 @@
  *	During the unload module must call unregister_filesystem().
  *	We can access the fields of list element if:
  *		1) spinlock is held or
- *		2) we hold the reference to the module.
- *	The latter can be guaranteed by call of try_module_get(); if it
+ *		2) we hold the reference to the element.
+ *	The latter can be guaranteed by call of try_filesystem(); if it
  *	returned 0 we must skip the element, otherwise we got the reference.
  *	Once the reference is obtained we can drop the spinlock.
  */
@@ -32,24 +35,47 @@
 static struct file_system_type *file_systems;
 static DEFINE_RWLOCK(file_systems_lock);
 
+int try_get_filesystem(struct file_system_type *fs)
+{
+	if (try_module_get(fs->owner)) {
+		(void)get_ve(fs->owner_env);
+		return 1;
+	}
+	return 0;
+}
+
 /* WARNING: This can be used only if we _already_ own a reference */
 void get_filesystem(struct file_system_type *fs)
 {
+	(void)get_ve(fs->owner_env);
 	__module_get(fs->owner);
 }
 
 void put_filesystem(struct file_system_type *fs)
 {
 	module_put(fs->owner);
+	put_ve(fs->owner_env);
+}
+EXPORT_SYMBOL(put_filesystem);
+
+static inline int check_ve_fstype(struct file_system_type *p,
+		struct ve_struct *env)
+{
+	return ((p->fs_flags & FS_VIRTUALIZED) ||
+			ve_accessible_strict(p->owner_env, env));
 }
 
-static struct file_system_type **find_filesystem(const char *name, unsigned len)
+static struct file_system_type **find_filesystem(const char *name, unsigned len,
+		struct ve_struct *env)
 {
 	struct file_system_type **p;
-	for (p=&file_systems; *p; p=&(*p)->next)
+	for (p=&file_systems; *p; p=&(*p)->next) {
+		if (!check_ve_fstype(*p, env))
+			continue;
 		if (strlen((*p)->name) == len &&
 		    strncmp((*p)->name, name, len) == 0)
 			break;
+	}
 	return p;
 }
 
@@ -75,8 +101,12 @@ int register_filesystem(struct file_syst
 	if (fs->next)
 		return -EBUSY;
 	INIT_LIST_HEAD(&fs->fs_supers);
+	if (fs->owner_env == NULL)
+		fs->owner_env = get_ve0();
+	if (fs->proto == NULL)
+		fs->proto = fs;
 	write_lock(&file_systems_lock);
-	p = find_filesystem(fs->name, strlen(fs->name));
+	p = find_filesystem(fs->name, strlen(fs->name), fs->owner_env);
 	if (*p)
 		res = -EBUSY;
 	else
@@ -120,6 +150,82 @@ int unregister_filesystem(struct file_sy
 
 EXPORT_SYMBOL(unregister_filesystem);
 
+#ifdef CONFIG_VE
+int register_ve_fs_type_data_flags(struct ve_struct *ve, struct file_system_type *template,
+				   struct file_system_type **p_fs_type, struct vfsmount **p_mnt,
+				   void *data, int flags)
+{
+	struct vfsmount *mnt;
+	struct file_system_type *local_fs_type;
+	int ret;
+
+	local_fs_type = kzalloc(sizeof(*local_fs_type) + sizeof(void *),
+					GFP_KERNEL);
+	if (local_fs_type == NULL)
+		return -ENOMEM;
+
+	local_fs_type->name = template->name;
+	local_fs_type->fs_flags = template->fs_flags;
+	local_fs_type->get_sb = template->get_sb;
+	local_fs_type->kill_sb = template->kill_sb;
+	local_fs_type->owner = template->owner;
+	local_fs_type->owner_env = ve;
+	local_fs_type->proto = template;
+
+	get_filesystem(local_fs_type);	/* get_ve() inside */
+
+	ret = register_filesystem(local_fs_type);
+	if (ret)
+		goto reg_err;
+
+	if (p_mnt == NULL)
+		goto done;
+
+	mnt = vfs_kern_mount(local_fs_type, flags, local_fs_type->name, data);
+	if (IS_ERR(mnt))
+		goto mnt_err;
+
+	*p_mnt = mnt;
+done:
+	*p_fs_type = local_fs_type;
+	return 0;
+
+mnt_err:
+	ret = PTR_ERR(mnt);
+	unregister_filesystem(local_fs_type); /* does not put */
+
+reg_err:
+	put_filesystem(local_fs_type);
+	kfree(local_fs_type);
+	printk(KERN_DEBUG
+	       "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret);
+	return ret;
+}
+EXPORT_SYMBOL(register_ve_fs_type_data_flags);
+
+int register_ve_fs_type_data(struct ve_struct *ve, struct file_system_type *template,
+		struct file_system_type **p_fs_type, struct vfsmount **p_mnt, void *data)
+{
+	return register_ve_fs_type_data_flags(ve, template, p_fs_type, p_mnt, data, 0);
+}
+EXPORT_SYMBOL(register_ve_fs_type_data);
+
+void unregister_ve_fs_type(struct file_system_type *local_fs_type,
+		struct vfsmount *local_fs_mount)
+{
+	if (local_fs_mount == NULL && local_fs_type == NULL)
+		return;
+
+	unregister_filesystem(local_fs_type);
+	umount_ve_fs_type(local_fs_type, -1);
+	if (local_fs_mount)
+		kern_umount(local_fs_mount); /* alias to mntput, drop our ref */
+	put_filesystem(local_fs_type);
+}
+
+EXPORT_SYMBOL(unregister_ve_fs_type);
+#endif
+
 static int fs_index(const char __user * __name)
 {
 	struct file_system_type * tmp;
@@ -133,11 +239,14 @@ static int fs_index(const char __user * 
 
 	err = -EINVAL;
 	read_lock(&file_systems_lock);
-	for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
+	for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) {
+		if (!check_ve_fstype(tmp, get_exec_env()))
+			continue;
 		if (strcmp(tmp->name, name->name) == 0) {
 			err = index;
 			break;
 		}
+		index++;
 	}
 	read_unlock(&file_systems_lock);
 	putname(name);
@@ -150,9 +259,15 @@ static int fs_name(unsigned int index, c
 	int len, res;
 
 	read_lock(&file_systems_lock);
-	for (tmp = file_systems; tmp; tmp = tmp->next, index--)
-		if (index <= 0 && try_module_get(tmp->owner))
-			break;
+	for (tmp = file_systems; tmp; tmp = tmp->next) {
+		if (!check_ve_fstype(tmp, get_exec_env()))
+			continue;
+		if (!index) {
+			if (try_get_filesystem(tmp))
+				break;
+		} else
+			index--;
+	}
 	read_unlock(&file_systems_lock);
 	if (!tmp)
 		return -EINVAL;
@@ -170,8 +285,9 @@ static int fs_maxindex(void)
 	int index;
 
 	read_lock(&file_systems_lock);
-	for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
-		;
+	for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next)
+		if (check_ve_fstype(tmp, get_exec_env()))
+			index++;
 	read_unlock(&file_systems_lock);
 	return index;
 }
@@ -207,9 +323,10 @@ int __init get_filesystem_list(char *buf
 	read_lock(&file_systems_lock);
 	tmp = file_systems;
 	while (tmp && len < PAGE_SIZE - 80) {
-		len += sprintf(buf+len, "%s\t%s\n",
-			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
-			tmp->name);
+		if (check_ve_fstype(tmp, get_exec_env()))
+			len += sprintf(buf+len, "%s\t%s\n",
+				(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+				tmp->name);
 		tmp = tmp->next;
 	}
 	read_unlock(&file_systems_lock);
@@ -224,9 +341,12 @@ static int filesystems_proc_show(struct 
 	read_lock(&file_systems_lock);
 	tmp = file_systems;
 	while (tmp) {
+		if (!check_ve_fstype(tmp, get_exec_env()))
+			goto next; /* skip in VE */
 		seq_printf(m, "%s\t%s\n",
 			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
 			tmp->name);
+next:
 		tmp = tmp->next;
 	}
 	read_unlock(&file_systems_lock);
@@ -247,7 +367,7 @@ static const struct file_operations file
 
 static int __init proc_filesystems_init(void)
 {
-	proc_create("filesystems", 0, NULL, &filesystems_proc_fops);
+	proc_create("filesystems", 0, &glob_proc_root, &filesystems_proc_fops);
 	return 0;
 }
 module_init(proc_filesystems_init);
@@ -258,8 +378,8 @@ static struct file_system_type *__get_fs
 	struct file_system_type *fs;
 
 	read_lock(&file_systems_lock);
-	fs = *(find_filesystem(name, len));
-	if (fs && !try_module_get(fs->owner))
+	fs = *(find_filesystem(name, len, get_exec_env()));
+	if (fs && !try_get_filesystem(fs))
 		fs = NULL;
 	read_unlock(&file_systems_lock);
 	return fs;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fs-writeback.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fs-writeback.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fs-writeback.c	2014-12-12 23:29:30.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fs-writeback.c	2015-01-21 12:02:52.554978660 +0300
@@ -28,8 +28,7 @@
 #include <trace/events/kmem.h>
 #include <linux/tracepoint.h>
 #include "internal.h"
-
-#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
+#include <bc/io_acct.h>
 
 /*
  * We don't actually have pdflush, but this one is exported though /proc...
@@ -42,10 +41,12 @@ int nr_pdflush_threads;
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
+	struct user_beancounter *ub;
 	enum writeback_sync_modes sync_mode;
 	int for_kupdate:1;
 	int range_cyclic:1;
 	int for_background:1;
+	int for_sync:1;
 	struct list_head list;		/* pending work list */
 	struct completion *done;	/* set if the caller waits */
 };
@@ -69,6 +70,17 @@ int writeback_in_progress(struct backing
 {
 	return test_bit(BDI_writeback_running, &bdi->state);
 }
+EXPORT_SYMBOL(writeback_in_progress);
+
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (strcmp(sb->s_type->name, "bdev") == 0)
+		return inode->i_mapping->backing_dev_info;
+
+	return sb->s_bdi;
+}
 
 static void bdi_queue_work(struct backing_dev_info *bdi,
 		struct wb_writeback_work *work)
@@ -95,8 +107,8 @@ static void bdi_queue_work(struct backin
 }
 
 static void
-__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		bool range_cyclic, bool for_background)
+__bdi_start_writeback(struct backing_dev_info *bdi, struct user_beancounter *ub,
+		long nr_pages, bool range_cyclic, bool for_background)
 {
 	struct wb_writeback_work *work;
 
@@ -117,6 +129,7 @@ __bdi_start_writeback(struct backing_dev
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
 	work->for_background = for_background;
+	work->ub = ub;
 
 	bdi_queue_work(bdi, work);
 }
@@ -134,7 +147,7 @@ __bdi_start_writeback(struct backing_dev
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-	__bdi_start_writeback(bdi, nr_pages, true, false);
+	__bdi_start_writeback(bdi, NULL, nr_pages, true, false);
 }
 
 /**
@@ -146,9 +159,10 @@ void bdi_start_writeback(struct backing_
  *   started when this function returns, we make no guarentees on
  *   completion. Caller need not hold sb s_umount semaphore.
  */
-void bdi_start_background_writeback(struct backing_dev_info *bdi)
+void bdi_start_background_writeback(struct backing_dev_info *bdi,
+		struct user_beancounter *ub)
 {
-	__bdi_start_writeback(bdi, LONG_MAX, true, true);
+	__bdi_start_writeback(bdi, ub, LONG_MAX, true, true);
 }
 
 /*
@@ -304,7 +318,7 @@ static void inode_wait_for_writeback(str
  * Called under inode_lock.
  */
 static int
-writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
 	unsigned dirty;
@@ -337,11 +351,9 @@ writeback_single_inode(struct inode *ino
 
 	BUG_ON(inode->i_state & I_SYNC);
 
-	/* Set I_SYNC, reset I_DIRTY */
-	dirty = inode->i_state & I_DIRTY;
+	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
-	inode->i_state &= ~I_DIRTY;
-
+	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
@@ -357,6 +369,15 @@ writeback_single_inode(struct inode *ino
 			ret = err;
 	}
 
+	/*
+	 * Some filesystems may redirty the inode during the writeback
+	 * due to delalloc, clear dirty metadata flags right before
+	 * write_inode()
+	 */
+	spin_lock(&inode_lock);
+	dirty = inode->i_state & I_DIRTY;
+	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	spin_unlock(&inode_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		int err = write_inode(inode, wbc);
@@ -412,6 +433,23 @@ writeback_single_inode(struct inode *ino
 	return ret;
 }
 
+static int
+writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct user_beancounter *ub = inode->i_mapping->dirtied_ub;
+	int ret;
+
+	if (likely(get_exec_ub() == ub || !ub))
+		return __writeback_single_inode(inode, wbc);
+
+	ub = get_beancounter_rcu(ub) ? set_exec_ub(ub) : NULL;
+	ret = __writeback_single_inode(inode, wbc);
+	if (ub)
+		put_beancounter(set_exec_ub(ub));
+
+	return ret;
+}
+
 /*
  * For background writeback the caller does not have the sb pinned
  * before calling writeback. So make sure that we do pin it, so it doesn't
@@ -429,7 +467,7 @@ static bool pin_sb_for_writeback(struct 
 	spin_unlock(&sb_lock);
 
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root)
+		if (sb->s_root && sb->s_frozen <= SB_FREEZE_WRITE)
 			return true;
 		up_read(&sb->s_umount);
 	}
@@ -475,6 +513,14 @@ static int writeback_sb_inodes(struct su
 			return 0;
 		}
 
+		/* Filter ub inodes if bdi dirty limit isn't exceeded */
+		if (wbc->wb_ub && !wb->bdi->dirty_exceeded &&
+		    (inode->i_state & I_DIRTY) == I_DIRTY_PAGES &&
+		    ub_should_skip_writeback(wbc->wb_ub, inode)) {
+			requeue_io(inode);
+			continue;
+		}
+
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
@@ -561,10 +607,13 @@ static void __writeback_inodes_sb(struct
  */
 #define MAX_WRITEBACK_PAGES     1024
 
-static inline bool over_bground_thresh(void)
+static inline bool over_bground_thresh(struct backing_dev_info *bdi)
 {
 	unsigned long background_thresh, dirty_thresh;
 
+	if (!bdi_cap_account_writeback(bdi) && bdi->dirty_exceeded)
+		return 1;
+
 	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
 	return (global_page_state(NR_FILE_DIRTY) +
@@ -594,7 +643,9 @@ static long wb_writeback(struct bdi_writ
 		.older_than_this	= NULL,
 		.for_kupdate		= work->for_kupdate,
 		.for_background		= work->for_background,
+		.for_sync		= work->for_sync,
 		.range_cyclic		= work->range_cyclic,
+		.wb_ub			= work->ub,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -623,9 +674,11 @@ static long wb_writeback(struct bdi_writ
 
 		/*
 		 * For background writeout, stop when we are below the
-		 * background dirty threshold
+		 * background dirty threshold. For filtered background
+		 * writeback we write all inodes dirtied before us,
+		 * because we cannot dereference this ub pointer.
 		 */
-		if (work->for_background && !over_bground_thresh())
+		if (work->for_background && !work->ub && !over_bground_thresh(wb->bdi))
 			break;
 
 		if (work->for_kupdate) {
@@ -809,9 +862,15 @@ int bdi_writeback_task(struct bdi_writeb
 				break;
 		}
 
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!list_empty(&wb->bdi->work_list) || kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			continue;
+		}
+
 		if (dirty_writeback_interval) {
 			wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-			schedule_timeout_interruptible(wait_jiffies);
+			schedule_timeout(wait_jiffies);
 		} else
 			schedule();
 
@@ -827,7 +886,7 @@ int bdi_writeback_task(struct bdi_writeb
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
  */
-void wakeup_flusher_threads(long nr_pages)
+void wakeup_flusher_threads(struct user_beancounter *ub, long nr_pages)
 {
 	struct backing_dev_info *bdi;
 
@@ -840,7 +899,7 @@ void wakeup_flusher_threads(long nr_page
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, false);
+		__bdi_start_writeback(bdi, ub, nr_pages, false, false);
 	}
 	rcu_read_unlock();
 }
@@ -986,7 +1045,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on inode_sync_wait.
  */
-static void wait_sb_inodes(struct super_block *sb)
+static void wait_sb_inodes(struct super_block *sb, struct user_beancounter *ub)
 {
 	struct inode *inode, *old_inode = NULL;
 
@@ -1013,6 +1072,9 @@ static void wait_sb_inodes(struct super_
 		mapping = inode->i_mapping;
 		if (mapping->nrpages == 0)
 			continue;
+		if (ub && (mapping->dirtied_ub != ub) &&
+		    (inode->i_state & I_DIRTY) == I_DIRTY_PAGES)
+			continue;
 		__iget(inode);
 		spin_unlock(&inode_lock);
 		/*
@@ -1045,20 +1107,35 @@ static void wait_sb_inodes(struct super_
  * on how many (if any) will be written, and this function does not wait
  * for IO completion of submitted IO.
  */
-void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
+void writeback_inodes_sb_nr_ub(struct super_block *sb, unsigned long nr, struct user_beancounter *ub)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
+		.ub		= ub,
 		.sync_mode	= WB_SYNC_NONE,
 		.done		= &done,
 		.nr_pages	= nr,
 	};
 
+	if (sb->s_bdi == &noop_backing_dev_info)
+		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 }
+
+void writeback_inodes_sb_ub(struct super_block *sb, struct user_beancounter *ub)
+{
+	writeback_inodes_sb_nr_ub(sb, global_page_state(NR_FILE_DIRTY) +
+			      global_page_state(NR_UNSTABLE_NFS) +
+			      (inodes_stat.nr_inodes - inodes_stat.nr_unused), ub);
+}
+
+void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
+{
+	writeback_inodes_sb_nr_ub(sb, nr, NULL);
+}
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
 
 /**
@@ -1071,9 +1148,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	return writeback_inodes_sb_nr(sb, global_page_state(NR_FILE_DIRTY) +
-			      global_page_state(NR_UNSTABLE_NFS) +
-			      (inodes_stat.nr_inodes - inodes_stat.nr_unused));
+	writeback_inodes_sb_ub(sb, NULL);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1124,23 +1199,33 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_
  * This function writes and waits on any dirty inode belonging to this
  * super_block. The number of pages synced is returned.
  */
-void sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb_ub(struct super_block *sb, struct user_beancounter *ub)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
+		.ub		= ub,
 		.sync_mode	= WB_SYNC_ALL,
+		.for_sync	= 1,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
 		.done		= &done,
 	};
 
+	/* Nothing to do? */
+	if (sb->s_bdi == &noop_backing_dev_info)
+		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 
-	wait_sb_inodes(sb);
+	wait_sb_inodes(sb, ub);
+}
+
+void sync_inodes_sb(struct super_block *sb)
+{
+	sync_inodes_sb_ub(sb, NULL);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
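
writeback_inodes_sb_nr_ub() and sync_inodes_sb_ub() above share one handoff idiom: an on-stack work item carrying an on-stack completion, queued to the flusher and then waited on, so the item needs no allocation and cannot outlive the caller. Stripped to its bones (a sketch; queue_to_flusher() stands in for bdi_queue_work()):

#include <linux/completion.h>
#include <linux/list.h>

struct flush_work {
	struct list_head list;
	struct completion *done;	/* set if the caller waits */
};

static void queue_to_flusher(struct flush_work *work);	/* stand-in */

/* Caller: the stack item is safe because we don't return
 * until the flusher has signalled ->done. */
static void flush_and_wait(void)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct flush_work work = {
		.done	= &done,
	};

	queue_to_flusher(&work);
	wait_for_completion(&done);
}

/* Flusher side, once the item is fully processed: */
static void finish_work(struct flush_work *work)
{
	if (work->done)
		complete(work->done);
}
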
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fs_struct.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fs_struct.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fs_struct.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fs_struct.c	2015-01-21 12:02:57.963835087 +0300
@@ -4,6 +4,7 @@
 #include <linux/path.h>
 #include <linux/slab.h>
 #include <linux/fs_struct.h>
+#include <linux/pid_namespace.h>
 
 /*
  * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -13,14 +14,15 @@ void set_fs_root(struct fs_struct *fs, s
 {
 	struct path old_root;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_root = fs->root;
 	fs->root = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	if (old_root.dentry)
 		path_put(&old_root);
 }
+EXPORT_SYMBOL(set_fs_root);
 
 /*
  * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
@@ -30,15 +32,16 @@ void set_fs_pwd(struct fs_struct *fs, st
 {
 	struct path old_pwd;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 
 	if (old_pwd.dentry)
 		path_put(&old_pwd);
 }
+EXPORT_SYMBOL_GPL(set_fs_pwd);
 
 void chroot_fs_refs(struct path *old_root, struct path *new_root)
 {
@@ -47,11 +50,11 @@ void chroot_fs_refs(struct path *old_roo
 	int count = 0;
 
 	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
+	do_each_thread_ve(g, p) {
 		task_lock(p);
 		fs = p->fs;
 		if (fs) {
-			write_lock(&fs->lock);
+			spin_lock(&fs->lock);
 			if (fs->root.dentry == old_root->dentry
 			    && fs->root.mnt == old_root->mnt) {
 				path_get(new_root);
@@ -64,10 +67,10 @@ void chroot_fs_refs(struct path *old_roo
 				fs->pwd = *new_root;
 				count++;
 			}
-			write_unlock(&fs->lock);
+			spin_unlock(&fs->lock);
 		}
 		task_unlock(p);
-	} while_each_thread(g, p);
+	} while_each_thread_ve(g, p);
 	read_unlock(&tasklist_lock);
 	while (count--)
 		path_put(old_root);
@@ -79,6 +82,7 @@ void free_fs_struct(struct fs_struct *fs
 	path_put(&fs->pwd);
 	kmem_cache_free(fs_cachep, fs);
 }
+EXPORT_SYMBOL(free_fs_struct);
 
 void exit_fs(struct task_struct *tsk)
 {
@@ -87,15 +91,16 @@ void exit_fs(struct task_struct *tsk)
 	if (fs) {
 		int kill;
 		task_lock(tsk);
-		write_lock(&fs->lock);
+		spin_lock(&fs->lock);
 		tsk->fs = NULL;
 		kill = !--fs->users;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 		task_unlock(tsk);
 		if (kill)
 			free_fs_struct(fs);
 	}
 }
+EXPORT_SYMBOL(exit_fs);
 
 struct fs_struct *copy_fs_struct(struct fs_struct *old)
 {
@@ -104,14 +109,9 @@ struct fs_struct *copy_fs_struct(struct 
 	if (fs) {
 		fs->users = 1;
 		fs->in_exec = 0;
-		rwlock_init(&fs->lock);
+		spin_lock_init(&fs->lock);
 		fs->umask = old->umask;
-		read_lock(&old->lock);
-		fs->root = old->root;
-		path_get(&old->root);
-		fs->pwd = old->pwd;
-		path_get(&old->pwd);
-		read_unlock(&old->lock);
+		get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
 	}
 	return fs;
 }
@@ -126,10 +126,10 @@ int unshare_fs_struct(void)
 		return -ENOMEM;
 
 	task_lock(current);
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	kill = !--fs->users;
 	current->fs = new_fs;
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	task_unlock(current);
 
 	if (kill)
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
-	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
+	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
 	.umask		= 0022,
 };
 
@@ -158,20 +158,22 @@ void daemonize_fs_struct(void)
 
 	if (fs) {
 		int kill;
+		struct fs_struct *ve_fs = get_exec_env_init()->fs;
 
 		task_lock(current);
 
-		write_lock(&init_fs.lock);
-		init_fs.users++;
-		write_unlock(&init_fs.lock);
+		spin_lock(&ve_fs->lock);
+		ve_fs->users++;
+		spin_unlock(&ve_fs->lock);
 
-		write_lock(&fs->lock);
-		current->fs = &init_fs;
+		spin_lock(&fs->lock);
+		current->fs = ve_fs;
 		kill = !--fs->users;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 
 		task_unlock(current);
 		if (kill)
 			free_fs_struct(fs);
 	}
 }
+EXPORT_SYMBOL_GPL(daemonize_fs_struct);
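
copy_fs_struct() above now snapshots both paths with get_fs_root_and_pwd() instead of open-coding the two path_get() calls. The helper's shape under the new spinlock, as a sketch consistent with the locking rules in this file:

/* Take references to both paths in one critical section so the
 * pair cannot be torn by a concurrent chroot/chdir. */
static void get_root_and_pwd(struct fs_struct *fs,
			     struct path *root, struct path *pwd)
{
	spin_lock(&fs->lock);
	*root = fs->root;
	path_get(root);
	*pwd = fs->pwd;
	path_get(pwd);
	spin_unlock(&fs->lock);
}
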
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fscache/cache.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fscache/cache.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fscache/cache.c	2014-12-12 23:29:29.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fscache/cache.c	2015-01-21 12:02:41.365275729 +0300
@@ -220,8 +220,6 @@ int fscache_add_cache(struct fscache_cac
 {
 	struct fscache_cache_tag *tag;
 
-	mark_tech_preview(NULL, THIS_MODULE);
-
 	BUG_ON(!cache->ops);
 	BUG_ON(!ifsdef);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fuse/control.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/control.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fuse/control.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/control.c	2015-01-21 12:02:51.782999154 +0300
@@ -10,6 +10,9 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/ve_proto.h>
+#include <linux/seq_file.h>
 
 #define FUSE_CTL_SUPER_MAGIC 0x65735543
 
@@ -17,7 +20,11 @@
  * This is non-NULL when the single instance of the control filesystem
  * exists.  Protected by fuse_mutex
  */
+#ifdef CONFIG_VE
+#define fuse_control_sb	(get_exec_env()->_fuse_control_sb)
+#else
 static struct super_block *fuse_control_sb;
+#endif
 
 static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
 {
@@ -198,6 +205,280 @@ static const struct file_operations fuse
 	.write = fuse_conn_congestion_threshold_write,
 };
 
+static ssize_t fuse_reconnect_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	unsigned long t;
+	char tmp[32];
+	unsigned limit = (1 << 16) - 1;
+	int err;
+	struct fuse_conn *fc;
+
+	if (*ppos || count >= sizeof(tmp) - 1)
+		return -EINVAL;
+
+	if (copy_from_user(tmp, buf, count))
+		return -EFAULT;
+
+	tmp[count] = '\0';
+
+	err = strict_strtoul(tmp, 0, &t);
+	if (err)
+		return err;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return -ESTALE;
+	err = fuse_reconnect_fd(t, fc);
+	fuse_conn_put(fc);
+
+	if (err)
+		return err;
+
+	return count;
+}
+
+static const struct file_operations fuse_reconnect_ops = {
+	.open = nonseekable_open,
+	.write = fuse_reconnect_write,
+};
+
+struct fuse_conn_priv {
+	struct fuse_conn *conn;
+	struct list_head *req_list;
+};
+
+enum {
+	FUSE_PENDING_REQ = 1,
+	FUSE_PROCESSING_REQ,
+	FUSE_IO_REQ,
+};
+
+static void *fuse_req_start(struct seq_file *m, loff_t *p)
+{
+	struct fuse_conn_priv *fcp = m->private;
+
+	spin_lock(&fcp->conn->lock);
+	return seq_list_start(fcp->req_list, *p);
+}
+
+static void *fuse_req_next(struct seq_file *m, void *v, loff_t *p)
+{
+	struct fuse_conn_priv *fcp = m->private;
+	return seq_list_next(v, fcp->req_list, p);
+}
+
+static void fuse_req_stop(struct seq_file *m, void *v)
+{
+	struct fuse_conn_priv *fcp = m->private;
+	spin_unlock(&fcp->conn->lock);
+}
+
+static int fuse_req_show(struct seq_file *f, void *v)
+{
+	struct fuse_req *req;
+
+	req = list_entry((struct list_head *)v, struct fuse_req, list);
+	seq_printf(f, "state: %-2d flags: %c%c%c%c%c%c%c "
+			"in: op %-4d uniq 0x%016Lx node 0x%016Lx "
+			"out: err %-6d uniq 0x%016Lx\n",
+			req->state,
+			req->isreply ? 'r' : '-',
+			req->force ? 'f' : '-',
+			req->aborted ? 'a' : '-',
+			req->background ? 'b' : '-',
+			req->interrupted ? 'i' : '-',
+			req->locked ? 'l' : '-',
+			req->waiting ? 'w': '-',
+			req->in.h.opcode,
+			req->in.h.unique,
+			req->in.h.nodeid,
+			req->out.h.error,
+			req->out.h.unique);
+
+	return 0;
+}
+
+static const struct seq_operations fuse_conn_req_ops = {
+	.start = fuse_req_start,
+	.next = fuse_req_next,
+	.stop = fuse_req_stop,
+	.show = fuse_req_show,
+};
+
+static int fuse_conn_seq_open(struct file *filp, int list_id)
+{
+	struct fuse_conn *conn;
+	struct fuse_conn_priv *fcp;
+
+	conn = fuse_ctl_file_conn_get(filp);
+	if (!conn)
+		return -ESTALE;
+
+	fcp = __seq_open_private(filp, &fuse_conn_req_ops,
+			sizeof(struct fuse_conn_priv));
+	if (fcp == NULL) {
+		fuse_conn_put(conn);
+		return -ENOMEM;
+	}
+
+	fcp->conn = conn;
+	switch (list_id) {
+	case FUSE_PROCESSING_REQ:
+		fcp->req_list = &conn->processing;
+		break;
+	case FUSE_PENDING_REQ:
+		fcp->req_list = &conn->pending;
+		break;
+	case FUSE_IO_REQ:
+		fcp->req_list = &conn->io;
+		break;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static int fuse_conn_release(struct inode *inode, struct file *filp)
+{
+	struct fuse_conn_priv *fcp = ((struct seq_file *)filp->private_data)->private;
+
+	if (fcp)
+		fuse_conn_put(fcp->conn);
+
+	return seq_release_private(inode, filp);
+}
+
+static int fuse_conn_pending_open(struct inode *inode, struct file *filp)
+{
+	return fuse_conn_seq_open(filp, FUSE_PENDING_REQ);
+}
+
+static const struct file_operations fuse_conn_pending_req = {
+	.open = fuse_conn_pending_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_conn_processing_open(struct inode *inode, struct file *filp)
+{
+	return fuse_conn_seq_open(filp, FUSE_PROCESSING_REQ);
+}
+
+static const struct file_operations fuse_conn_processing_req = {
+	.open = fuse_conn_processing_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_conn_io_open(struct inode *inode, struct file *filp)
+{
+	return fuse_conn_seq_open(filp, FUSE_IO_REQ);
+}
+
+static const struct file_operations fuse_conn_io_req = {
+	.open = fuse_conn_io_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_files_show(struct seq_file *f, void *v)
+{
+	struct fuse_file *ff;
+
+	ff = list_entry(v, struct fuse_file, fl);
+	seq_printf(f, "kh 0x%016Lx fh 0x%016Lx node 0x%016Lx flags 0x%08x name ",
+			ff->kh, ff->fh, ff->nodeid, ff->open_flags);
+	if (ff->ff_dentry)
+		seq_dentry(f, ff->ff_dentry, "");
+	else
+		seq_putc(f, '-');
+	seq_putc(f, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations fuse_conn_files_seq_ops = {
+	.start = fuse_req_start,
+	.next = fuse_req_next,
+	.stop = fuse_req_stop,
+	.show = fuse_files_show,
+};
+
+static int fuse_conn_files_open(struct inode *inode, struct file *filp)
+{
+	struct fuse_conn *conn;
+	struct fuse_conn_priv *fcp;
+
+	conn = fuse_ctl_file_conn_get(filp);
+	if (!conn)
+		return -ESTALE;
+
+	fcp = __seq_open_private(filp, &fuse_conn_files_seq_ops,
+			sizeof(struct fuse_conn_priv));
+	if (fcp == NULL) {
+		fuse_conn_put(conn);
+		return -ENOMEM;
+	}
+
+	fcp->conn = conn;
+	fcp->req_list = &conn->conn_files;
+	return 0;
+}
+
+static const struct file_operations fuse_conn_files_ops = {
+	.open = fuse_conn_files_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_conn_show(struct seq_file *sf, void *v)
+{
+	struct fuse_conn *fc = sf->private;
+	seq_printf(sf, "Connected: %d\n", fc->connected);
+	seq_printf(sf, "Initialized: %d\n", fc->initialized);
+	seq_printf(sf, "Blocked: %d\n", fc->blocked);
+	seq_printf(sf, "WQ active: %d\n", waitqueue_active(&fc->waitq));
+	seq_printf(sf, "Blocked_wq active: %d\n", waitqueue_active(&fc->blocked_waitq));
+	seq_printf(sf, "num_background: %d\n", fc->num_background);
+	seq_printf(sf, "num_waiting: %d\n", atomic_read(&fc->num_waiting));
+	return 0;
+}
+
+static int fuse_conn_info_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	struct fuse_conn *conn;
+
+	conn = fuse_ctl_file_conn_get(filp);
+	if (!conn)
+		return -ESTALE;
+
+	ret = single_open(filp, fuse_conn_show, conn);
+	if (ret)
+		fuse_conn_put(conn);
+
+	return ret;
+}
+
+static int fuse_conn_info_release(struct inode *inode, struct file *filp)
+{
+	struct fuse_conn *conn = ((struct seq_file *)filp->private_data)->private;
+	fuse_conn_put(conn);
+	return single_release(inode, filp);
+}
+
+static const struct file_operations fuse_conn_info_ops = {
+	.open = fuse_conn_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_info_release,
+};
+
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
 					  const char *name,
@@ -261,7 +541,25 @@ int fuse_ctl_add_conn(struct fuse_conn *
 				 1, NULL, &fuse_conn_max_background_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
 				 S_IFREG | 0600, 1, NULL,
-				 &fuse_conn_congestion_threshold_ops))
+				 &fuse_conn_congestion_threshold_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "pending_req",
+		    		S_IFREG | 0600, 1, NULL,
+				&fuse_conn_pending_req) ||
+	    !fuse_ctl_add_dentry(parent, fc, "processing_req",
+		    		S_IFREG | 0600, 1, NULL,
+				&fuse_conn_processing_req) ||
+	    !fuse_ctl_add_dentry(parent, fc, "io_req",
+		    		S_IFREG | 0600, 1, NULL,
+				&fuse_conn_io_req) ||
+	    !fuse_ctl_add_dentry(parent, fc, "open_files",
+		    		S_IFREG | 0600, 1, NULL,
+				&fuse_conn_files_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "conn_info",
+			    	S_IFREG | 0600, 1, NULL,
+				&fuse_conn_info_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "reconnect",
+			    	S_IFREG | 0600, 1, NULL,
+				&fuse_reconnect_ops)
+	    )
 		goto err;
 
 	return 0;
@@ -345,12 +644,55 @@ static struct file_system_type fuse_ctl_
 	.kill_sb	= fuse_ctl_kill_sb,
 };
 
+#ifdef CONFIG_VE
+static int fuse_ctl_start(void *data)
+{
+	struct ve_struct *ve;
+
+	ve = (struct ve_struct *)data;
+	if (ve->fuse_ctl_fs_type != NULL)
+		return -EBUSY;
+
+	return register_ve_fs_type(ve, &fuse_ctl_fs_type,
+			&ve->fuse_ctl_fs_type, NULL);
+}
+
+static void fuse_ctl_stop(void *data)
+{
+	struct ve_struct *ve;
+
+	ve = (struct ve_struct *)data;
+	if (ve->fuse_ctl_fs_type == NULL)
+		return;
+
+	unregister_ve_fs_type(ve->fuse_ctl_fs_type, NULL);
+	/* fuse_ctl_fs_type is freed in real_put_ve -> free_ve_filesystems */
+}
+
+static struct ve_hook fuse_ctl_ve_hook = {
+	.init		= fuse_ctl_start,
+	.fini		= fuse_ctl_stop,
+	.owner		= THIS_MODULE,
+	.priority	= HOOK_PRIO_FS,
+};
+#endif
+
 int __init fuse_ctl_init(void)
 {
-	return register_filesystem(&fuse_ctl_fs_type);
+	int err;
+
+	err = register_filesystem(&fuse_ctl_fs_type);
+#ifdef CONFIG_VE
+	if (err == 0)
+		ve_hook_register(VE_SS_CHAIN, &fuse_ctl_ve_hook);
+#endif
+	return err;
 }
 
 void fuse_ctl_cleanup(void)
 {
+#ifdef CONFIG_VE
+	ve_hook_unregister(&fuse_ctl_ve_hook);
+#endif
 	unregister_filesystem(&fuse_ctl_fs_type);
 }
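
The hunk above wires six new per-connection entries into the fusectl filesystem: pending_req, processing_req and io_req dump the three request queues through the shared seq_file machinery, open_files walks conn_files, conn_info is a single_open() summary, and reconnect re-attaches a daemon (its fuse_reconnect_ops handler is outside this excerpt). A minimal user-space sketch of reading them, assuming fusectl is mounted at its conventional mount point /sys/fs/fuse/connections and that argv[1] is the connection id (the st_dev of the FUSE mount):

/* Sketch: dump the new per-connection fusectl files for one connection. */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *names[] = { "conn_info", "pending_req",
				"processing_req", "io_req", "open_files" };
	char path[256], buf[4096];
	size_t n;
	int i;

	if (argc < 2)
		return 1;

	for (i = 0; i < 5; i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/fs/fuse/connections/%s/%s", argv[1], names[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* entry absent on this kernel */
		printf("==> %s <==\n", names[i]);
		while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
			fwrite(buf, 1, n, stdout);
		fclose(f);
	}
	return 0;
}
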
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fuse/cuse.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/cuse.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fuse/cuse.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/cuse.c	2015-01-21 12:02:51.656002526 +0300
@@ -93,7 +93,7 @@ static ssize_t cuse_read(struct file *fi
 	struct iovec iov = { .iov_base = buf, .iov_len = count };
 	struct fuse_io_priv io = { .async = 0, .file = file };
 
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
 }
 
 static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -107,7 +107,8 @@ static ssize_t cuse_write(struct file *f
 	 * No locking or generic_write_checks(), the server is
 	 * responsible for locking and sanity checks.
 	 */
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
+	return fuse_direct_io(&io, &iov, 1, count, &pos,
+			      FUSE_DIO_WRITE | FUSE_DIO_CUSE);
 }
 
 static int cuse_open(struct inode *inode, struct file *file)
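
Both CUSE paths now pass a flags bitmask instead of the old write boolean: FUSE_DIO_CUSE tells fuse_direct_io() (see the file.c hunk below) to skip fuse_wait_on_writeback(), since a character device has no page cache to synchronize against. A sketch of the flag definitions this implies; the real ones live in fuse_i.h, which is not part of this excerpt, so the values are assumptions:

/* Assumed fuse_i.h definitions implied by the call sites above. */
#define FUSE_DIO_WRITE	(1 << 0)	/* this request writes data */
#define FUSE_DIO_CUSE	(1 << 1)	/* CUSE request: no page-cache
					 * writeback to wait on */
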
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fuse/dev.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/dev.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fuse/dev.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/dev.c	2015-01-21 12:02:51.757999816 +0300
@@ -19,6 +19,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/swap.h>
 #include <linux/splice.h>
+#include <linux/bio.h>
 
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
 
@@ -126,7 +127,7 @@ static void fuse_req_init_context(struct
 {
 	req->in.h.uid = current_fsuid();
 	req->in.h.gid = current_fsgid();
-	req->in.h.pid = current->pid;
+	req->in.h.pid = task_pid_vnr(current);
 }
 
 static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
@@ -479,7 +480,8 @@ __acquires(&fc->lock)
 	}
 }
 
-static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req,
+				struct fuse_file *ff)
 {
 	BUG_ON(req->background);
 	spin_lock(&fc->lock);
@@ -487,6 +489,8 @@ static void __fuse_request_send(struct f
 		req->out.h.error = -ENOTCONN;
 	else if (fc->conn_error)
 		req->out.h.error = -ECONNREFUSED;
+	else if (ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
+		req->out.h.error = -EIO;
 	else {
 		queue_request(fc, req);
 		/* acquire extra reference, since request is still needed
@@ -498,10 +502,16 @@ static void __fuse_request_send(struct f
 	spin_unlock(&fc->lock);
 }
 
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_check_and_send(struct fuse_conn *fc, struct fuse_req *req,
+				 struct fuse_file *ff)
 {
 	req->isreply = 1;
-	__fuse_request_send(fc, req);
+	__fuse_request_send(fc, req, ff);
+}
+
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+	fuse_request_check_and_send(fc, req, NULL);
 }
 EXPORT_SYMBOL_GPL(fuse_request_send);
 
@@ -524,7 +534,13 @@ static void fuse_request_send_nowait_loc
 static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
 	spin_lock(&fc->lock);
-	if (fc->connected) {
+	if (req->page_cache && req->ff &&
+	    test_bit(FUSE_S_FAIL_IMMEDIATELY, &req->ff->ff_state)) {
+		BUG_ON(req->in.h.opcode != FUSE_READ);
+		req->out.h.error = -EIO;
+		req->background = 0;
+		request_end(fc, req);
+	} else if (fc->connected) {
 		fuse_request_send_nowait_locked(fc, req);
 		spin_unlock(&fc->lock);
 	} else {
@@ -568,7 +584,7 @@ void fuse_force_forget(struct file *file
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->isreply = 0;
-	__fuse_request_send(fc, req);
+	__fuse_request_send(fc, req, NULL);
 	/* ignore errors */
 	fuse_put_request(fc, req);
 }
@@ -897,7 +913,8 @@ static int fuse_ref_page(struct fuse_cop
  * done atomically
  */
 static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
-			  unsigned offset, unsigned count, int zeroing)
+			  unsigned offset, unsigned count, int zeroing,
+			  bool move_disabled)
 {
 	int err;
 	struct page *page = *pagep;
@@ -913,6 +930,9 @@ static int fuse_copy_page(struct fuse_co
 		} else if (!cs->len) {
 			if (cs->move_pages && page &&
 			    offset == 0 && count == PAGE_SIZE) {
+				/* read-ahead doesn't use bvec */
+				BUG_ON(move_disabled);
+
 				err = fuse_try_move_page(cs, pagep);
 				if (err <= 0)
 					return err;
@@ -947,7 +967,7 @@ static int fuse_copy_pages(struct fuse_c
 		unsigned offset = req->page_descs[i].offset;
 		unsigned count = min(nbytes, req->page_descs[i].length);
 		err = fuse_copy_page(cs, &req->pages[i], offset, count,
-				     zeroing);
+				     zeroing, false);
 		if (err)
 			return err;
 
@@ -956,6 +976,24 @@ static int fuse_copy_pages(struct fuse_c
 	return 0;
 }
 
+static int fuse_copy_bvec(struct fuse_copy_state *cs, unsigned nbytes,
+			   int zeroing)
+{
+	unsigned i;
+	struct fuse_req *req = cs->req;
+
+	for (i = 0; i < req->num_bvecs && (nbytes || zeroing); i++) {
+		struct bio_vec *bvec = &req->bvec[i];
+
+		int err = fuse_copy_page(cs, &bvec->bv_page,
+				bvec->bv_offset, bvec->bv_len, zeroing, true);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 /* Copy a single argument in the request to/from userspace buffer */
 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 {
@@ -972,7 +1010,7 @@ static int fuse_copy_one(struct fuse_cop
 
 /* Copy request arguments to/from userspace buffer */
 static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
-			  unsigned argpages, struct fuse_arg *args,
+			  unsigned argpages, unsigned argbvec, struct fuse_arg *args,
 			  int zeroing)
 {
 	int err = 0;
@@ -982,6 +1020,8 @@ static int fuse_copy_args(struct fuse_co
 		struct fuse_arg *arg = &args[i];
 		if (i == numargs - 1 && argpages)
 			err = fuse_copy_pages(cs, arg->size, zeroing);
+		else if (i == numargs - 1 && argbvec)
+			err = fuse_copy_bvec(cs, arg->size, zeroing);
 		else
 			err = fuse_copy_one(cs, arg->value, arg->size);
 	}
@@ -1167,7 +1207,7 @@ static ssize_t fuse_dev_do_read(struct f
 	cs->req = req;
 	err = fuse_copy_one(cs, &in->h, sizeof(in->h));
 	if (!err)
-		err = fuse_copy_args(cs, in->numargs, in->argpages,
+		err = fuse_copy_args(cs, in->numargs, in->argpages, in->argbvec,
 				     (struct fuse_arg *) in->args, 0);
 	fuse_copy_finish(cs);
 	spin_lock(&fc->lock);
@@ -1408,6 +1448,36 @@ err:
 	return err;
 }
 
+static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_files_out outarg;
+	int err = -EINVAL;
+
+	if (size != sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (!fc->sb)
+		goto err_unlock;
+
+	err = fuse_invalidate_files(fc, outarg.ino);
+
+err_unlock:
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -1421,6 +1491,9 @@ static int fuse_notify(struct fuse_conn 
 	case FUSE_NOTIFY_INVAL_ENTRY:
 		return fuse_notify_inval_entry(fc, size, cs);
 
+	case FUSE_NOTIFY_INVAL_FILES:
+		return fuse_notify_inval_files(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
@@ -1458,8 +1531,8 @@ static int copy_out_args(struct fuse_cop
 			return -EINVAL;
 		lastarg->size -= diffsize;
 	}
-	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
-			      out->page_zeroing);
+	return fuse_copy_args(cs, out->numargs, out->argpages, out->argbvec,
+			out->args, out->page_zeroing);
 }
 
 /*
@@ -1690,6 +1763,22 @@ __acquires(&fc->lock)
 	}
 }
 
+static void requeue_requests(struct fuse_conn *fc)
+{
+	BUG_ON(!list_empty(&fc->io));
+	while (!list_empty(&fc->processing)) {
+		struct fuse_req *req;
+
+		/* take requests from the processing queue tail ... */
+		req = list_entry(fc->processing.prev, struct fuse_req, list);
+		BUG_ON(req->locked);
+		BUG_ON(req->state != FUSE_REQ_SENT);
+		/* ... and queue them back to the pending head */
+		req->state = FUSE_REQ_PENDING;
+		list_move(&req->list, &fc->pending);
+	}
+}
+
 /*
  * Abort requests under I/O
  *
@@ -1731,8 +1820,8 @@ static void end_queued_requests(struct f
 {
 	fc->max_background = UINT_MAX;
 	flush_bg_queue(fc);
-	end_requests(fc, &fc->pending);
 	end_requests(fc, &fc->processing);
+	end_requests(fc, &fc->pending);
 	while (forget_pending(fc))
 		kfree(dequeue_forget(fc));
 }
@@ -1778,11 +1867,16 @@ int fuse_dev_release(struct inode *inode
 	struct fuse_conn *fc = fuse_get_conn(file);
 	if (fc) {
 		spin_lock(&fc->lock);
-		fc->connected = 0;
-		fc->blocked = 0;
-		fc->initialized = 1;
-		end_queued_requests(fc);
-		wake_up_all(&fc->blocked_waitq);
+		if (!(fc->flags & FUSE_CAN_RECONNECT)) {
+			fc->connected = 0;
+			fc->blocked = 0;
+			fc->initialized = 1;
+			end_queued_requests(fc);
+			wake_up_all(&fc->blocked_waitq);
+		} else {
+			fc->connected = 2;
+			requeue_requests(fc);
+		}
 		spin_unlock(&fc->lock);
 		fuse_conn_put(fc);
 	}
@@ -1791,6 +1885,40 @@ int fuse_dev_release(struct inode *inode
 }
 EXPORT_SYMBOL_GPL(fuse_dev_release);
 
+int fuse_reconnect_fd(int fd, struct fuse_conn *fc)
+{
+	int err;
+	struct file *f;
+
+	err = -EBADF;
+	f = fget(fd);
+	if (!f)
+		goto out;
+
+	err = -EINVAL;
+	if (f->f_op != &fuse_dev_operations)
+		goto out_fput;
+
+	mutex_lock(&fuse_mutex);
+	err = -EBUSY;
+	if (fc->connected != 2)
+		goto out_unlock;
+
+	if (f->private_data)
+		goto out_unlock;
+
+	f->private_data = fuse_conn_get(fc);
+	fc->connected = 1;
+	err = 0;
+
+out_unlock:
+	mutex_unlock(&fuse_mutex);
+out_fput:
+	fput(f);
+out:
+	return err;
+}
+
 static int fuse_dev_fasync(int fd, struct file *file, int on)
 {
 	struct fuse_conn *fc = fuse_get_conn(file);
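
The dev.c changes above implement daemon reconnect: when FUSE_CAN_RECONNECT is set, fuse_dev_release() parks the connection in the intermediate state fc->connected == 2, and requeue_requests() moves already-sent requests from the processing queue back to the pending queue instead of aborting them; fuse_reconnect_fd() then lets a new daemon adopt the connection with a fresh /dev/fuse descriptor. A plausible user-space sequence, assuming the reconnect fusectl file (added in the control.c hunk above, handler not shown in this excerpt) accepts the descriptor number as text:

/* Sketch of a daemon re-attaching to an orphaned FUSE connection.
 * The write format of the "reconnect" control file is an assumption;
 * its fuse_reconnect_ops handler is not part of this excerpt. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

static int fuse_reattach(const char *conn_dir)
{
	char path[256], num[16];
	int devfd, ctlfd, len;

	devfd = open("/dev/fuse", O_RDWR);	/* new kernel channel */
	if (devfd < 0)
		return -1;

	snprintf(path, sizeof(path), "%s/reconnect", conn_dir);
	ctlfd = open(path, O_WRONLY);
	if (ctlfd < 0) {
		close(devfd);
		return -1;
	}

	/* the kernel side ends up in fuse_reconnect_fd(devfd, fc) */
	len = snprintf(num, sizeof(num), "%d", devfd);
	if (write(ctlfd, num, len) != len) {
		close(ctlfd);
		close(devfd);
		return -1;
	}
	close(ctlfd);
	return devfd;	/* serve requests from this fd as usual */
}
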
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fuse/dir.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/dir.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fuse/dir.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/dir.c	2015-01-21 12:02:51.634003110 +0300
@@ -186,8 +186,7 @@ static int fuse_dentry_revalidate(struct
 
 	if (inode && is_bad_inode(inode))
 		goto invalid;
-	else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
-		 (nd->flags & LOOKUP_REVAL)) {
+	else if (1) {
 		int err;
 		struct fuse_entry_out outarg;
 		struct fuse_req *req;
@@ -332,7 +331,7 @@ int fuse_lookup_name(struct super_block 
 
 	*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
 			   &outarg->attr, entry_attr_timeout(outarg),
-			   attr_version);
+			   attr_version, 0);
 	err = -ENOMEM;
 	if (!*inode) {
 		fuse_queue_forget(fc, forget, outarg->nodeid, 1);
@@ -431,6 +430,9 @@ static int fuse_create_open(struct inode
 	if (fc->no_create)
 		return -ENOSYS;
 
+	if ((flags & O_DIRECT) && !(fc->flags & FUSE_ODIRECT))
+		return -EINVAL;
+
 	forget = fuse_alloc_forget();
 	if (!forget)
 		return -ENOMEM;
@@ -487,7 +489,7 @@ static int fuse_create_open(struct inode
 	ff->nodeid = outentry.nodeid;
 	ff->open_flags = outopen.open_flags;
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
-			  &outentry.attr, entry_attr_timeout(&outentry), 0);
+			  &outentry.attr, entry_attr_timeout(&outentry), 0, 1);
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
 		fuse_sync_release(ff, flags);
@@ -500,6 +502,10 @@ static int fuse_create_open(struct inode
 	fuse_invalidate_attr(dir);
 	file = lookup_instantiate_filp(nd, entry, generic_file_open);
 	if (IS_ERR(file)) {
+		if (fc->flags & FUSE_WBCACHE) {
+			struct fuse_inode *fi = get_fuse_inode(inode);
+			atomic_dec(&fi->num_openers);
+		}
 		fuse_sync_release(ff, flags);
 		return PTR_ERR(file);
 	}
@@ -556,7 +562,7 @@ static int create_new_entry(struct fuse_
 		goto out_put_forget_req;
 
 	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
-			  &outarg.attr, entry_attr_timeout(&outarg), 0);
+			  &outarg.attr, entry_attr_timeout(&outarg), 0, 0);
 	if (!inode) {
 		fuse_queue_forget(fc, forget, outarg.nodeid, 1);
 		return -ENOMEM;
@@ -831,13 +837,13 @@ static void fuse_fillattr(struct inode *
 	stat->mtime.tv_nsec = attr->mtimensec;
 	stat->ctime.tv_sec = attr->ctime;
 	stat->ctime.tv_nsec = attr->ctimensec;
-	stat->size = attr->size;
+	stat->size = inode->i_size;
 	stat->blocks = attr->blocks;
 	stat->blksize = (1 << inode->i_blkbits);
 }
 
 static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
-			   struct file *file)
+			   struct file *file, int get_size_from_attr)
 {
 	int err;
 	struct fuse_getattr_in inarg;
@@ -883,13 +889,32 @@ static int fuse_do_getattr(struct inode 
 			fuse_change_attributes(inode, &outarg.attr,
 					       attr_timeout(&outarg),
 					       attr_version);
-			if (stat)
+			if (get_size_from_attr)
+				stat->size = outarg.attr.size;
+			else if (stat) {
+				struct fuse_inode *fi = get_fuse_inode(inode);
 				fuse_fillattr(inode, &outarg.attr, stat);
+				if (!atomic_read(&fi->num_openers))
+					stat->size = outarg.attr.size;
+			}
 		}
 	}
 	return err;
 }
 
+int fuse_getattr_size(struct inode *inode, struct file *file, u64 *size)
+{
+	struct kstat stat;
+	int err;
+
+	err = fuse_do_getattr(inode, &stat, file, 1);
+	if (err)
+		return err;
+
+	*size = stat.size;
+	return 0;
+}
+
 int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 			   struct file *file, bool *refreshed)
 {
@@ -899,7 +924,7 @@ int fuse_update_attributes(struct inode 
 
 	if (time_before64(fi->i_time, get_jiffies_64())) {
 		r = true;
-		err = fuse_do_getattr(inode, stat, file);
+		err = fuse_do_getattr(inode, stat, file, 0);
 	} else {
 		r = false;
 		err = 0;
@@ -1055,7 +1080,7 @@ static int fuse_permission(struct inode 
 		   attributes.  This is also needed, because the root
 		   node will at first have no permissions */
 		if (err == -EACCES && !refreshed) {
-			err = fuse_do_getattr(inode, NULL, NULL);
+			err = fuse_do_getattr(inode, NULL, NULL, 0);
 			if (!err)
 				err = generic_permission(inode, mask, NULL);
 		}
@@ -1071,7 +1096,7 @@ static int fuse_permission(struct inode 
 			if (refreshed)
 				return -EACCES;
 
-			err = fuse_do_getattr(inode, NULL, NULL);
+			err = fuse_do_getattr(inode, NULL, NULL, 0);
 			if (!err && !(inode->i_mode & S_IXUGO))
 				return -EACCES;
 		}
@@ -1191,7 +1216,7 @@ static int fuse_direntplus_link(struct f
 	dentry->d_op = &fuse_dentry_operations;
 
 	inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
-			  &o->attr, entry_attr_timeout(o), attr_version);
+			  &o->attr, entry_attr_timeout(o), attr_version, 0);
 	if (!inode)
 		goto out;
 
@@ -1486,6 +1511,7 @@ int fuse_do_setattr(struct inode *inode,
 	struct fuse_setattr_in inarg;
 	struct fuse_attr_out outarg;
 	bool is_truncate = false;
+	int wb = fc->flags & FUSE_WBCACHE;
 	loff_t oldsize;
 	int err;
 
@@ -1557,7 +1583,8 @@ int fuse_do_setattr(struct inode *inode,
 	fuse_change_attributes_common(inode, &outarg.attr,
 				      attr_timeout(&outarg));
 	oldsize = inode->i_size;
-	i_size_write(inode, outarg.attr.size);
+	if (!wb || is_truncate || !S_ISREG(inode->i_mode))
+		i_size_write(inode, outarg.attr.size);
 
 	if (is_truncate) {
 		/* NOTE: this may release/reacquire fc->lock */
@@ -1569,7 +1596,8 @@ int fuse_do_setattr(struct inode *inode,
 	 * Only call invalidate_inode_pages2() after removing
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
-	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+	if ((is_truncate || !wb) &&
+	    S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
 		truncate_pagecache(inode, oldsize, outarg.attr.size);
 		invalidate_inode_pages2(inode->i_mapping);
 	}
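
The running theme of these dir.c hunks: with writeback cache (FUSE_WBCACHE) the kernel's i_size is authoritative while the file is open, because dirty pages may not have reached the server yet, so the server-reported size is trusted only when fi->num_openers is zero or when the caller explicitly asks for the server's view. A condensed sketch of that size-selection rule (a paraphrase of fuse_do_getattr() above, not a verbatim copy):

/* Which size wins after this patch (kernel-context sketch). */
static u64 fuse_pick_size(struct inode *inode, struct fuse_attr *attr,
			  int get_size_from_attr)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	if (get_size_from_attr)
		return attr->size;	/* caller wants the server's view */
	if (!atomic_read(&fi->num_openers))
		return attr->size;	/* no openers: server size is fresh */
	return i_size_read(inode);	/* openers: local i_size rules */
}
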
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fuse/file.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/file.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fuse/file.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/file.c	2015-01-21 12:02:58.173829511 +0300
@@ -15,8 +15,21 @@
 #include <linux/module.h>
 #include <linux/swap.h>
 #include <linux/falloc.h>
+#include <linux/bio.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/virtinfo.h>
 
 static const struct file_operations fuse_direct_io_file_operations;
+static void fuse_sync_writes(struct inode *inode);
+
+static void fuse_account_request(struct fuse_conn *fc, size_t count)
+{
+	struct user_beancounter *ub = get_exec_ub();
+
+	ub_percpu_inc(ub, fuse_requests);
+	ub_percpu_add(ub, fuse_bytes, count);
+	virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_FUSE_REQ, NULL);
+}
 
 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 			  int opcode, struct fuse_open_out *outargp)
@@ -56,6 +69,8 @@ struct fuse_file *fuse_file_alloc(struct
 	if (unlikely(!ff))
 		return NULL;
 
+	ff->ff_state = 0;
+
 	ff->fc = fc;
 	ff->reserved_req = fuse_request_alloc(0);
 	if (unlikely(!ff->reserved_req)) {
@@ -64,19 +79,30 @@ struct fuse_file *fuse_file_alloc(struct
 	}
 
 	INIT_LIST_HEAD(&ff->write_entry);
+	INIT_LIST_HEAD(&ff->rw_entry);
 	atomic_set(&ff->count, 0);
 	RB_CLEAR_NODE(&ff->polled_node);
 	init_waitqueue_head(&ff->poll_wait);
 
 	spin_lock(&fc->lock);
 	ff->kh = ++fc->khctr;
+	ff->ff_dentry = NULL;
+	list_add_tail(&ff->fl, &fc->conn_files);
 	spin_unlock(&fc->lock);
 
 	return ff;
 }
 
+static void fuse_file_list_del(struct fuse_file *ff)
+{
+	spin_lock(&ff->fc->lock);
+	list_del_init(&ff->fl);
+	spin_unlock(&ff->fc->lock);
+}
+
 void fuse_file_free(struct fuse_file *ff)
 {
+	fuse_file_list_del(ff);
 	fuse_request_free(ff->reserved_req);
 	kfree(ff);
 }
@@ -126,19 +152,32 @@ static void fuse_file_put(struct fuse_fi
 		struct fuse_req *req = ff->reserved_req;
 
 		if (sync) {
+			/* Must force. Otherwise the request could be interrupted,
+			 * while the file association in user space would remain.
+			 */
+			req->force = 1;
 			req->background = 0;
 			fuse_request_send(ff->fc, req);
+			fuse_file_list_del(ff);
 			path_put(&req->misc.release.path);
 			fuse_put_request(ff->fc, req);
 		} else {
+			fuse_file_list_del(ff);
 			req->end = fuse_release_end;
 			req->background = 1;
 			fuse_request_send_background(ff->fc, req);
 		}
+
 		kfree(ff);
 	}
 }
 
+static void __fuse_file_put(struct fuse_file *ff)
+{
+	if (atomic_dec_and_test(&ff->count))
+		BUG();
+}
+
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir)
 {
@@ -169,11 +208,37 @@ int fuse_do_open(struct fuse_conn *fc, u
 }
 EXPORT_SYMBOL_GPL(fuse_do_open);
 
-void fuse_finish_open(struct inode *inode, struct file *file)
+static void fuse_link_file(struct file *file, bool write)
 {
-	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file->f_dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_file *ff = file->private_data;
+
+	struct list_head *entry = write ? &ff->write_entry : &ff->rw_entry;
+	struct list_head *list  = write ? &fi->write_files : &fi->rw_files;
+
+	spin_lock(&fc->lock);
+	if (list_empty(entry))
+		list_add(entry, list);
+	spin_unlock(&fc->lock);
+}
+
+static void fuse_link_write_file(struct file *file)
+{
+	fuse_link_file(file, true);
+}
+
+static void fuse_link_rw_file(struct file *file)
+{
+	fuse_link_file(file, false);
+}
+
+void fuse_finish_open(struct inode *inode, struct file *file)
+{
+	struct fuse_file *ff = file->private_data;
+
+	ff->ff_dentry = file->f_dentry;
 
 	if (ff->open_flags & FOPEN_DIRECT_IO)
 		file->f_op = &fuse_direct_io_file_operations;
@@ -182,11 +247,11 @@ void fuse_finish_open(struct inode *inod
 	if (ff->open_flags & FOPEN_NONSEEKABLE)
 		nonseekable_open(inode, file);
 
- 	/* file might be required for fallocate */
-	spin_lock(&fc->lock);
+ 	/* file might be required for fallocate or writeback cache */
 	if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
-		list_add(&ff->write_entry, &fi->write_files);
-	spin_unlock(&fc->lock);
+		fuse_link_write_file(file);
+
+	fuse_link_rw_file(file);
 }
 
 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -194,6 +259,9 @@ int fuse_open_common(struct inode *inode
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
 
+	if ((file->f_flags & O_DIRECT) && !(fc->flags & FUSE_ODIRECT))
+		return -EINVAL;
+
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
@@ -202,6 +270,30 @@ int fuse_open_common(struct inode *inode
 	if (err)
 		return err;
 
+	if ((fc->flags & FUSE_WBCACHE) && !isdir) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		u64 size;
+
+		mutex_lock(&inode->i_mutex);
+		atomic_inc(&fi->num_openers);
+
+		if (atomic_read(&fi->num_openers) == 1) {
+			err = fuse_getattr_size(inode, file, &size);
+			if (err) {
+				atomic_dec(&fi->num_openers);
+				mutex_unlock(&inode->i_mutex);
+				fuse_release_common(file, FUSE_RELEASE);
+				return err;
+			}
+
+			spin_lock(&fc->lock);
+			i_size_write(inode, size);
+			spin_unlock(&fc->lock);
+		}
+
+		mutex_unlock(&inode->i_mutex);
+	}
+
 	fuse_finish_open(inode, file);
 
 	return 0;
@@ -215,6 +307,7 @@ static void fuse_prepare_release(struct 
 
 	spin_lock(&fc->lock);
 	list_del(&ff->write_entry);
+	list_del(&ff->rw_entry);
 	if (!RB_EMPTY_NODE(&ff->polled_node))
 		rb_erase(&ff->polled_node, &fc->polled_files);
 	spin_unlock(&fc->lock);
@@ -247,6 +340,13 @@ void fuse_release_common(struct file *fi
 	req->misc.release.path = file->f_path;
 
 	/*
+	 * No more in-flight asynchronous READ or WRITE requests if
+	 * fuse file release is synchronous
+	 */
+	if (ff->fc->close_wait)
+		BUG_ON(atomic_read(&ff->count) != 1);
+
+	/*
 	 * Normally this will send the RELEASE request, however if
 	 * some asynchronous READ or WRITE requests are outstanding,
 	 * the sending will be delayed.
@@ -255,7 +355,8 @@ void fuse_release_common(struct file *fi
 	 * synchronous RELEASE is allowed (and desirable) in this case
 	 * because the server can be trusted not to screw up.
 	 */
-	fuse_file_put(ff, ff->fc->destroy_req != NULL);
+	fuse_file_put(ff, ff->fc->destroy_req != NULL ||
+			  ff->fc->close_wait);
 }
 
 static int fuse_open(struct inode *inode, struct file *file)
@@ -265,6 +366,53 @@ static int fuse_open(struct inode *inode
 
 static int fuse_release(struct inode *inode, struct file *file)
 {
+	struct fuse_file *ff = file->private_data;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	if (ff->fc->flags & FUSE_WBCACHE) {
+		if (file->f_mode & FMODE_WRITE) {
+			filemap_write_and_wait(file->f_mapping);
+
+			/* Must remove the file from the write list. Otherwise this file
+			 * may get more writeback from other files rerouted via write_files.
+			 */
+			spin_lock(&ff->fc->lock);
+			list_del_init(&ff->write_entry);
+			spin_unlock(&ff->fc->lock);
+
+			/* A writeback from another fuse file might come after
+			 * filemap_write_and_wait() above
+			 */
+			if (!ff->fc->close_wait)
+				filemap_write_and_wait(file->f_mapping);
+		} else
+			BUG_ON(!list_empty(&ff->write_entry));
+
+		/* This can livelock: the inode can be open via another file,
+		 * and that file can generate continuous writeback. Taking
+		 * i_mutex around this might help.
+		 *
+		 * For now we wait on ff->count instead. This is safe: we wait
+		 * only for writeback (and readahead) already enqueued on this
+		 * file, and it will not get new requests: it is closing.
+		 */
+		if (!ff->fc->close_wait)
+			wait_event(fi->page_waitq, list_empty_careful(&fi->writepages));
+		else
+			wait_event(fi->page_waitq, atomic_read(&ff->count) == 1);
+
+		/* Wait for threads that just released ff to leave their
+		 * critical sections. Taking the spinlock is the first thing
+		 * fuse_release_common does, so this is strictly redundant,
+		 * but it is still worth emphasizing here that we need it.
+		 */
+		spin_unlock_wait(&ff->fc->lock);
+
+		/* since now we can trust userspace attr.size */
+		atomic_dec(&fi->num_openers);
+	} else if (ff->fc->close_wait)
+		wait_event(fi->page_waitq, atomic_read(&ff->count) == 1);
+
 	fuse_release_common(file, FUSE_RELEASE);
 
 	/* return value is ignored by VFS */
@@ -274,6 +422,7 @@ static int fuse_release(struct inode *in
 void fuse_sync_release(struct fuse_file *ff, int flags)
 {
 	WARN_ON(atomic_read(&ff->count) > 1);
+	fuse_file_list_del(ff);
 	fuse_prepare_release(ff, flags, FUSE_RELEASE);
 	ff->reserved_req->force = 1;
 	ff->reserved_req->background = 0;
@@ -324,7 +473,31 @@ static bool fuse_page_is_writeback(struc
 
 		BUG_ON(req->inode != inode);
 		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
-		if (curr_index == index) {
+		if (curr_index <= index &&
+		    index < curr_index + req->num_pages) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&fc->lock);
+
+	return found;
+}
+
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, pgoff_t idx_to)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
+	bool found = false;
+
+	spin_lock(&fc->lock);
+	list_for_each_entry(req, &fi->writepages, writepages_entry) {
+		pgoff_t curr_index;
+
+		BUG_ON(req->inode != inode);
+		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		if (!(idx_from >= curr_index + req->num_pages || idx_to < curr_index)) {
 			found = true;
 			break;
 		}
@@ -348,6 +521,32 @@ static int fuse_wait_on_page_writeback(s
 	return 0;
 }
 
+/*
+ * Can be woken up by FUSE_NOTIFY_INVAL_FILES
+ */
+static int fuse_wait_on_page_writeback_or_invalidate(struct inode *inode,
+						     struct file *file,
+						     pgoff_t index)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_file *ff = file->private_data;
+
+	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index) ||
+		   test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state));
+	return 0;
+}
+
+static void fuse_wait_on_writeback(struct inode *inode, pgoff_t start, size_t bytes)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	pgoff_t idx_from, idx_to;
+
+	idx_from = start >> PAGE_CACHE_SHIFT;
+	idx_to = (start + bytes - 1) >> PAGE_CACHE_SHIFT;
+
+	wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, idx_from, idx_to));
+}
+
 static int fuse_flush(struct file *file, fl_owner_t id)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
@@ -360,6 +559,19 @@ static int fuse_flush(struct file *file,
 	if (is_bad_inode(inode))
 		return -EIO;
 
+	if (!(file->f_mode & FMODE_WRITE))
+		return 0;
+
+	if (fc->flags & FUSE_WBCACHE) {
+		err = filemap_write_and_wait(file->f_mapping);
+		if (err)
+			return err;
+
+		mutex_lock(&inode->i_mutex);
+		fuse_sync_writes(inode);
+		mutex_unlock(&inode->i_mutex);
+	}
+
 	if (fc->no_flush)
 		return 0;
 
@@ -473,6 +685,9 @@ void fuse_read_fill(struct fuse_req *req
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
+
+	if (opcode == FUSE_READ)
+		req->inode = file->f_dentry->d_inode;
 }
 
 static void fuse_release_user_pages(struct fuse_req *req, int write)
@@ -538,6 +753,13 @@ static void fuse_aio_complete(struct fus
 			}
 		}
 
+		if (res < 0)
+			printk("fuse_aio_complete(io=%p, err=%d, pos=%ld"
+			       "): io->err=%d io->bytes=%ld io->size=%ld "
+			       "is_sync=%d res=%ld ki_opcode=%d ki_pos=%llu\n",
+			       io, err, pos, io->err, io->bytes,
+			       io->size, is_sync_kiocb(io->iocb), res,
+			       io->iocb->ki_opcode, io->iocb->ki_pos);
 		aio_complete(io->iocb, res, 0);
 		kfree(io);
 	}
@@ -548,7 +770,8 @@ static void fuse_aio_complete_req(struct
 	struct fuse_io_priv *io = req->io;
 	ssize_t pos = -1;
 
-	fuse_release_user_pages(req, !io->write);
+	if (!req->bvec)
+		fuse_release_user_pages(req, !io->write);
 
 	if (io->write) {
 		if (req->misc.write.in.size != req->misc.write.out.size)
@@ -560,6 +783,15 @@ static void fuse_aio_complete_req(struct
 				req->out.args[0].size;
 	}
 
+	if (req->out.h.error)
+		printk("fuse_aio_complete_req: request (rw=%s fh=0x%llx "
+		       "pos=%lld size=%d) completed with err=%d\n",
+		       !io->write ? "READ"                   : "WRITE",
+		       !io->write ? req->misc.read.in.fh     : req->misc.write.in.fh,
+		       !io->write ? req->misc.read.in.offset : req->misc.write.in.offset,
+		       !io->write ? req->misc.read.in.size   : req->misc.write.in.size,
+		       req->out.h.error);
+
 	fuse_aio_complete(io, req->out.h.error, pos);
 }
 
@@ -588,6 +820,7 @@ static size_t fuse_send_read(struct fuse
 	struct fuse_conn *fc = ff->fc;
 
 	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	fuse_account_request(fc, count);
 	if (owner != NULL) {
 		struct fuse_read_in *inarg = &req->misc.read.in;
 
@@ -598,7 +831,7 @@ static size_t fuse_send_read(struct fuse
 	if (io->async)
 		return fuse_async_req_send(fc, req, count, io);
 
-	fuse_request_send(fc, req);
+	fuse_request_check_and_send(fc, req, ff);
 	return req->out.args[0].size;
 }
 
@@ -617,6 +850,37 @@ static void fuse_read_update_size(struct
 	spin_unlock(&fc->lock);
 }
 
+static void fuse_readpages_short(struct fuse_req *req, u64 attr_ver)
+{
+	int i;
+	size_t num_read = req->out.args[0].size;
+	struct inode *inode = req->pages[0]->mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (fc->flags & FUSE_WBCACHE) {
+		/*
+		 * A hole in a file. Some data after the hole are in page cache.
+		 */
+		size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+
+		for (i = num_read >> PAGE_CACHE_SHIFT; i < req->num_pages; i++) {
+			struct page *page = req->pages[i];
+			void *mapaddr = kmap_atomic(page, KM_USER0);
+
+			memset(mapaddr + off, 0, PAGE_CACHE_SIZE - off);
+
+			kunmap_atomic(mapaddr, KM_USER0);
+			off = 0;
+		}
+	} else {
+		/*
+		 * Short read means EOF.  If file size is larger, truncate it
+		 */
+		loff_t pos = page_offset(req->pages[0]) + num_read;
+		fuse_read_update_size(inode, pos, attr_ver);
+	}
+}
+
 static int fuse_readpage(struct file *file, struct page *page)
 {
 	struct fuse_io_priv io = { .async = 0, .file = file };
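
fuse_readpages_short() is the semantic pivot for short reads: without writeback cache a short read still means EOF and shrinks the cached size, but under FUSE_WBCACHE the kernel's i_size is authoritative, so the unfilled tail is treated as a hole and zeroed instead. A runnable user-space model of the zeroing arithmetic, assuming 4 KB pages:

/* Worked example of the hole-zeroing loop above (user-space model). */
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long num_read = 5000;	/* short read over a 3-page request */
	int num_pages = 3, i;
	unsigned long off = num_read & (PAGE_CACHE_SIZE - 1);

	for (i = num_read >> PAGE_CACHE_SHIFT; i < num_pages; i++) {
		printf("page %d: zero bytes [%lu..%lu)\n",
		       i, off, PAGE_CACHE_SIZE);
		off = 0;	/* pages after the first hole page are fully zeroed */
	}
	return 0;
}
/* Output:
 * page 1: zero bytes [904..4096)
 * page 2: zero bytes [0..4096)
 */
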
@@ -628,17 +892,20 @@ static int fuse_readpage(struct file *fi
 	size_t count = PAGE_CACHE_SIZE;
 	u64 attr_ver;
 	int err;
+	bool killed = false;
 
 	err = -EIO;
 	if (is_bad_inode(inode))
 		goto out;
 
 	/*
-	 * Page writeback can extend beyond the liftime of the
+	 * Page writeback can extend beyond the lifetime of the
 	 * page-cache page, so make sure we read a properly synced
 	 * page.
+	 *
+	 * But we can't wait if FUSE_NOTIFY_INVAL_FILES is in progress.
 	 */
-	fuse_wait_on_page_writeback(inode, page->index);
+	fuse_wait_on_page_writeback_or_invalidate(inode, file, page->index);
 
 	req = fuse_get_req(fc, 1);
 	err = PTR_ERR(req);
@@ -652,51 +919,57 @@ static int fuse_readpage(struct file *fi
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_descs[0].length = count;
+	req->page_cache = 1;
 	num_read = fuse_send_read(req, &io, pos, count, NULL);
-	err = req->out.h.error;
-	fuse_put_request(fc, req);
+	killed = req->killed;
+	err = killed ? -EIO : req->out.h.error;
 
 	if (!err) {
-		/*
-		 * Short read means EOF.  If file size is larger, truncate it
-		 */
 		if (num_read < count)
-			fuse_read_update_size(inode, pos + num_read, attr_ver);
+			fuse_readpages_short(req, attr_ver);
 
 		SetPageUptodate(page);
 	}
 
+	fuse_put_request(fc, req);
+
 	fuse_invalidate_attr(inode); /* atime changed */
  out:
-	unlock_page(page);
+	if (!killed)
+		unlock_page(page);
 	return err;
 }
 
+void fuse_release_ff(struct inode *inode, struct fuse_file *ff)
+{
+	if (ff) {
+		if (ff->fc->close_wait) {
+			spin_lock(&ff->fc->lock);
+			__fuse_file_put(ff);
+			wake_up(&get_fuse_inode(inode)->page_waitq);
+			spin_unlock(&ff->fc->lock);
+		} else {
+			fuse_file_put(ff, false);
+		}
+	}
+}
+
 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	int i;
 	size_t count = req->misc.read.in.size;
 	size_t num_read = req->out.args[0].size;
-	struct address_space *mapping = NULL;
+	struct inode *inode = req->inode;
 
-	for (i = 0; mapping == NULL && i < req->num_pages; i++)
-		mapping = req->pages[i]->mapping;
+	/* fused might have processed the request before the lease was lost */
+	if (req->killed && !req->out.h.error)
+		req->out.h.error = -EIO;
 
-	if (mapping) {
-		struct inode *inode = mapping->host;
+	if (req->killed)
+		goto killed;
 
-		/*
-		 * Short read means EOF. If file size is larger, truncate it
-		 */
-		if (!req->out.h.error && num_read < count) {
-			loff_t pos;
-
-			pos = page_offset(req->pages[0]) + num_read;
-			fuse_read_update_size(inode, pos,
-					      req->misc.read.attr_ver);
-		}
-		fuse_invalidate_attr(inode); /* atime changed */
-	}
+	if (!req->out.h.error && num_read < count)
+		fuse_readpages_short(req, req->misc.read.attr_ver);
 
 	for (i = 0; i < req->num_pages; i++) {
 		struct page *page = req->pages[i];
@@ -707,8 +980,12 @@ static void fuse_readpages_end(struct fu
 		unlock_page(page);
 		page_cache_release(page);
 	}
+
+killed:
+	fuse_invalidate_attr(inode); /* atime changed */
+
 	if (req->ff)
-		fuse_file_put(req->ff, false);
+		fuse_release_ff(inode, req->ff);
 }
 
 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -721,7 +998,9 @@ static void fuse_send_readpages(struct f
 	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
 	req->out.page_replace = 1;
+	req->page_cache = 1;
 	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	fuse_account_request(fc, count);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
 	if (fc->async_read) {
 		req->ff = fuse_file_get(ff);
@@ -736,7 +1015,10 @@ static void fuse_send_readpages(struct f
 
 struct fuse_fill_data {
 	struct fuse_req *req;
-	struct file *file;
+	union {
+		struct file *file;
+		struct fuse_file *ff;
+	};
 	struct inode *inode;
 	unsigned nr_pages;
 };
@@ -746,9 +1028,11 @@ static int fuse_readpages_fill(void *_da
 	struct fuse_fill_data *data = _data;
 	struct fuse_req *req = data->req;
 	struct inode *inode = data->inode;
+	struct file *file = data->file;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	fuse_wait_on_page_writeback(inode, page->index);
+	/* we can't wait if FUSE_NOTIFY_INVAL_FILES is in progress */
+	fuse_wait_on_page_writeback_or_invalidate(inode, file, page->index);
 
 	if (req->num_pages &&
 	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
@@ -870,6 +1154,7 @@ static size_t fuse_send_write(struct fus
 	struct fuse_write_in *inarg = &req->misc.write.in;
 
 	fuse_write_fill(req, ff, pos, count);
+	fuse_account_request(fc, count);
 	inarg->flags = file->f_flags;
 	if (owner != NULL) {
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
@@ -883,6 +1168,112 @@ static size_t fuse_send_write(struct fus
 	return req->misc.write.out.size;
 }
 
+static inline bool fuse_file_fail_immediately(struct file *file)
+{
+	struct fuse_file *ff = file->private_data;
+
+	return test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+}
+
+/*
+ * Determine the number of bytes of data the page contains
+ */
+static inline unsigned fuse_page_length(struct page *page)
+{
+	loff_t i_size = i_size_read(page->mapping->host);
+
+	if (i_size > 0) {
+		pgoff_t page_index = page->index;
+		pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+		if (page_index < end_index)
+			return PAGE_CACHE_SIZE;
+		if (page_index == end_index)
+			return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
+	}
+	return 0;
+}
+
+static int fuse_prepare_write(struct fuse_conn *fc, struct file *file,
+		struct page *page, loff_t pos, unsigned len)
+{
+	struct fuse_io_priv io = { .async = 0, .file = file };
+	struct fuse_req *req;
+	unsigned num_read = 0;
+	unsigned page_len;
+	int err;
+
+	if (fuse_file_fail_immediately(file)) {
+		unlock_page(page);
+		page_cache_release(page);
+		return -EIO;
+	}
+
+	if (PageUptodate(page) || (len == PAGE_CACHE_SIZE))
+		return 0;
+
+	page_len = fuse_page_length(page);
+	if (!page_len) {
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		return 0;
+	}
+
+	/*
+	 * Page writeback can extend beyond the lifetime of the
+	 * page-cache page, so make sure we read a properly synced
+	 * page.
+	 */
+	fuse_wait_on_page_writeback(page->mapping->host, page->index);
+
+	req = fuse_get_req(fc, 1);
+	err = PTR_ERR(req);
+	if (IS_ERR(req))
+		goto out;
+
+	/*
+	 * FIXME
+	 * we pick up the whole page from userspace, but only the two ranges
+	 * [0 .. pos] and [pos + len .. PAGE_CACHE_SIZE] would be enough
+	 *
+	 * NB: when implementing the above, do not forget to handle
+	 * copied != len in fuse_write_end() properly!
+	 */
+
+	req->out.page_zeroing = 1;
+	req->out.argpages = 1;
+	req->num_pages = 1;
+	req->pages[0] = page;
+	req->page_descs[0].offset = 0;
+	req->page_descs[0].length = PAGE_SIZE;
+	num_read = fuse_send_read(req, &io, page_offset(page), page_len, NULL);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+out:
+	if (err) {
+		unlock_page(page);
+		page_cache_release(page);
+	} else if (num_read != PAGE_CACHE_SIZE) {
+		zero_user_segment(page, num_read, PAGE_CACHE_SIZE);
+	}
+
+	return err;
+}
+
+static int fuse_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode);
+
+	BUG_ON(!(fc->flags & FUSE_WBCACHE));
+
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
+	if (!*pagep)
+		return -ENOMEM;
+
+	return fuse_prepare_write(fc, file, *pagep, pos, len);
+}
+
 static void fuse_write_update_size(struct inode *inode, loff_t pos)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
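
fuse_prepare_write() above is the read-modify-write step behind the new ->write_begin(): no read if the page is already uptodate or fully overwritten, plain zero-fill if the page lies entirely beyond i_size, otherwise a synchronous FUSE_READ with the short-read tail zeroed. A compact user-space model of that decision, assuming 4 KB pages:

/* Model of the fuse_prepare_write() case analysis (4 KB pages assumed;
 * a paraphrase of the logic above, not kernel code). */
enum prep_action { PREP_NONE, PREP_ZERO, PREP_READ };

static enum prep_action prepare_write_action(int uptodate, unsigned len,
					     unsigned page_len)
{
	if (uptodate || len == 4096)
		return PREP_NONE;	/* the write covers what matters */
	if (page_len == 0)
		return PREP_ZERO;	/* page entirely past i_size */
	return PREP_READ;		/* read page in, zero short tail */
}
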
@@ -895,6 +1286,34 @@ static void fuse_write_update_size(struc
 	spin_unlock(&fc->lock);
 }
 
+static int fuse_commit_write(struct file *file, struct page *page,
+			       unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	fuse_write_update_size(inode, pos);
+	set_page_dirty(page);
+	return 0;
+}
+
+static int fuse_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+	fuse_commit_write(file, page, from, from + copied);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
 				    struct inode *inode, loff_t pos,
 				    size_t count)
@@ -1077,6 +1496,9 @@ static ssize_t fuse_file_aio_write(struc
 	struct iov_iter i;
 	loff_t endbyte = 0;
 
+	if (get_fuse_conn(file->f_dentry->d_inode)->flags & FUSE_WBCACHE)
+		return generic_file_aio_write(iocb, iov, nr_segs, pos);
+
 	WARN_ON(iocb->ki_pos != pos);
 
 	ocount = 0;
@@ -1159,7 +1581,12 @@ static inline void fuse_page_descs_lengt
 
 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
 {
-	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+	struct iovec *iov;
+
+	BUG_ON(!iov_iter_has_iovec(ii));
+	iov = (struct iovec *)ii->data;
+
+	return (unsigned long)iov->iov_base + ii->iov_offset;
 }
 
 static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
@@ -1250,8 +1677,10 @@ static inline int fuse_iter_npages(const
 
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write)
+		       int flags)
 {
+	int write = flags & FUSE_DIO_WRITE;
+	int cuse = flags & FUSE_DIO_CUSE;
 	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
@@ -1261,6 +1690,8 @@ ssize_t fuse_direct_io(struct fuse_io_pr
 	struct fuse_req *req;
 	struct iov_iter ii;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	iov_iter_init(&ii, iov, nr_segs, count, 0);
 
 	if (io->async)
@@ -1280,10 +1711,16 @@ ssize_t fuse_direct_io(struct fuse_io_pr
 			break;
 		}
 
-		if (write)
+		if (!cuse)
+			fuse_wait_on_writeback(file->f_mapping->host, pos, nbytes);
+
+		if (write) {
 			nres = fuse_send_write(req, io, pos, nbytes, owner);
-		else
+			task_io_account_write(nbytes);
+		} else {
 			nres = fuse_send_read(req, io, pos, nbytes, owner);
+			task_io_account_read(nbytes);
+		}
 
 		if (!io->async)
 			fuse_release_user_pages(req, !write);
@@ -1358,7 +1795,8 @@ static ssize_t __fuse_direct_write(struc
 
 	res = generic_write_checks(file, ppos, &count, 0);
 	if (!res)
-		res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1);
+		res = fuse_direct_io(io, iov, nr_segs, count, ppos,
+				     FUSE_DIO_WRITE);
 
 	fuse_invalidate_attr(inode);
 
@@ -1388,8 +1826,13 @@ static ssize_t fuse_direct_write(struct 
 
 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
-	__free_page(req->pages[0]);
-	fuse_file_put(req->ff, false);
+	int i;
+
+	for (i = 0; i < req->num_pages; i++)
+		__free_page(req->pages[i]);
+
+	if (!(fc->flags & FUSE_WBCACHE) && !fc->close_wait)
+		fuse_file_put(req->ff, false);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1397,10 +1840,15 @@ static void fuse_writepage_finish(struct
 	struct inode *inode = req->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+	int i;
 
 	list_del(&req->writepages_entry);
-	dec_bdi_stat(bdi, BDI_WRITEBACK);
-	dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
+	if ((fc->flags & FUSE_WBCACHE) || fc->close_wait)
+		__fuse_file_put(req->ff);
+	for (i = 0; i < req->num_pages; i++) {
+		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+	}
 	bdi_writeout_inc(bdi);
 	wake_up(&fi->page_waitq);
 }
@@ -1413,14 +1861,15 @@ __acquires(&fc->lock)
 	struct fuse_inode *fi = get_fuse_inode(req->inode);
 	loff_t size = i_size_read(req->inode);
 	struct fuse_write_in *inarg = &req->misc.write.in;
+	__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
 
 	if (!fc->connected)
 		goto out_free;
 
-	if (inarg->offset + PAGE_CACHE_SIZE <= size) {
-		inarg->size = PAGE_CACHE_SIZE;
+	if (inarg->offset + data_size <= size) {
+		inarg->size = data_size;
 	} else if (inarg->offset < size) {
-		inarg->size = size & (PAGE_CACHE_SIZE - 1);
+		inarg->size = size - inarg->offset;
 	} else {
 		/* Got truncated off completely */
 		goto out_free;
@@ -1473,7 +1922,21 @@ static void fuse_writepage_end(struct fu
 	fuse_writepage_free(fc, req);
 }
 
-static int fuse_writepage_locked(struct page *page)
+static struct fuse_file *fuse_write_file(struct fuse_conn *fc, struct fuse_inode *fi)
+{
+	struct fuse_file *ff = NULL;
+
+	spin_lock(&fc->lock);
+	if (!list_empty(&fi->write_files)) {
+		ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
+		fuse_file_get(ff);
+	}
+	spin_unlock(&fc->lock);
+
+	return ff;
+}
+
+static int fuse_writepage_locked(struct page *page, struct writeback_control *wbc)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
@@ -1483,7 +1946,16 @@ static int fuse_writepage_locked(struct 
 	struct fuse_file *ff;
 	struct page *tmp_page;
 
-	set_page_writeback(page);
+	while (fuse_page_is_writeback(inode, page->index)) {
+		if (wbc->sync_mode != WB_SYNC_ALL) {
+			redirty_page_for_writepage(wbc, page);
+			return 0;
+		}
+		fuse_wait_on_page_writeback(inode, page->index);
+	}
+
+	if (test_set_page_writeback(page))
+		BUG();
 
 	req = fuse_request_alloc_nofs(1);
 	if (!req)
@@ -1494,13 +1966,13 @@ static int fuse_writepage_locked(struct 
 	if (!tmp_page)
 		goto err_free;
 
-	spin_lock(&fc->lock);
-	BUG_ON(list_empty(&fi->write_files));
-	ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
-	req->ff = fuse_file_get(ff);
-	spin_unlock(&fc->lock);
+	ff = fuse_write_file(fc, fi);
+	if (!ff)
+		goto err_nofile;
 
+	req->ff = ff;
 	fuse_write_fill(req, ff, page_offset(page), 0);
+	fuse_account_request(fc, PAGE_CACHE_SIZE);
 
 	copy_highpage(tmp_page, page);
 	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
@@ -1525,6 +1997,9 @@ static int fuse_writepage_locked(struct 
 
 	return 0;
 
+err_nofile:
+	printk("FUSE: page dirtied on dead file\n");
+	__free_page(tmp_page);
 err_free:
 	fuse_request_free(req);
 err:
@@ -1536,18 +2011,289 @@ static int fuse_writepage(struct page *p
 {
 	int err;
 
-	err = fuse_writepage_locked(page);
+	err = fuse_writepage_locked(page, wbc);
 	unlock_page(page);
 
 	return err;
 }
 
+static void fuse_end_writeback(int npages, struct page **orig_pages)
+{
+	int i;
+
+	for (i = 0; i < npages; i++)
+		end_page_writeback(orig_pages[i]);
+}
+
+static int fuse_send_writepages(struct fuse_fill_data *data)
+{
+	int i, all_ok = 1;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	loff_t off = -1;
+	int npages = req->num_pages;
+	struct page *orig_pages[npages];
+
+	/* we can acquire ff here because we do have locked pages here! */
+	if (!data->ff)
+		data->ff = fuse_write_file(fc, fi);
+
+	if (!data->ff) {
+		printk("FUSE: pages dirtied on dead file\n");
+		fuse_end_writeback(npages, req->pages);
+		return -EIO;
+	}
+
+	if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &data->ff->ff_state)) {
+		for (i = 0; i < npages; i++) {
+			struct page *page = req->pages[i];
+			req->pages[i] = NULL;
+			SetPageError(page);
+			end_page_writeback(page);
+		}
+		fuse_release_ff(inode, data->ff);
+		data->ff = NULL;
+		fuse_put_request(fc, req);
+		return 0;
+	}
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = req->pages[i];
+		struct address_space *mapping = page->mapping;
+		struct page *tmp_page;
+
+		tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (tmp_page) {
+			copy_highpage(tmp_page, page);
+			inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+			inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+		} else
+			all_ok = 0;
+		orig_pages[i] = page;
+		req->pages[i] = tmp_page;
+		if (i == 0)
+			off = page_offset(page);
+	}
+
+	if (!all_ok) {
+		/* Undo everything and release the temporary pages. We could do
+		 * this in the main loop, but there is no point in complicating
+		 * it for a case which almost never happens.
+		 */
+		for (i = 0; i < npages; i++) {
+			struct page * page = orig_pages[i];
+			struct page *tmp_page = req->pages[i];
+			if (tmp_page) {
+				dec_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+				dec_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+				__free_page(tmp_page);
+				req->pages[i] = NULL;
+			}
+		}
+		fuse_end_writeback(npages, orig_pages);
+		fuse_release_ff(inode, data->ff);
+		data->ff = NULL;
+		return -ENOMEM;
+	}
+
+	req->ff = fuse_file_get(data->ff);
+	fuse_write_fill(req, data->ff, off, 0);
+	fuse_account_request(fc, npages << PAGE_CACHE_SHIFT);
+
+	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+	req->in.argpages = 1;
+	req->background = 1; /* writeback always goes to bg_queue */
+	fuse_page_descs_length_init(req, 0, req->num_pages);
+	req->page_descs[0].offset = 0;
+	req->end = fuse_writepage_end;
+	req->inode = data->inode;
+
+	spin_lock(&fc->lock);
+	list_add(&req->writepages_entry, &fi->writepages);
+	list_add_tail(&req->list, &fi->queued_writes);
+	fuse_flush_writepages(data->inode);
+	spin_unlock(&fc->lock);
+
+	fuse_end_writeback(npages, orig_pages);
+
+	fuse_release_ff(inode, data->ff);
+	data->ff = NULL;
+	return 0;
+}
+
+/*
+ * Returns true if and only if fuse connection is blocked and there is
+ * no file invalidation in progress.
+ */
+static inline bool fuse_blocked_for_wb(struct inode *inode)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool blocked = true;
+
+	if (!fc->blocked)
+		return false;
+
+	spin_lock(&fc->lock);
+	if (!list_empty(&fi->write_files)) {
+		struct fuse_file *ff = list_entry(fi->write_files.next,
+						  struct fuse_file, write_entry);
+		if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
+			blocked = false;
+	}
+	spin_unlock(&fc->lock);
+
+	return blocked;
+}
+
+static int fuse_writepages_fill(struct page *page,
+		struct writeback_control *wbc, void *_data)
+{
+	struct fuse_fill_data *data = _data;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int check_for_blocked = 0;
+
+	while (fuse_page_is_writeback(inode, page->index)) {
+		if (wbc->sync_mode != WB_SYNC_ALL) {
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
+		fuse_wait_on_page_writeback(inode, page->index);
+	}
+
+	if (req->num_pages &&
+	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+		int err;
+
+		if (wbc->nonblocking && fc->blocked) {
+			BUG_ON(wbc->sync_mode == WB_SYNC_ALL);
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
+
+		err = fuse_send_writepages(data);
+		if (err) {
+			unlock_page(page);
+			return err;
+		}
+
+		data->req = req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+		if (req == NULL) {
+			unlock_page(page);
+			return -ENOMEM;
+		}
+
+		check_for_blocked = 1;
+	}
+
+	req->pages[req->num_pages] = page;
+	req->num_pages++;
+
+	if (test_set_page_writeback(page))
+		BUG();
+
+	unlock_page(page);
+
+	if (!wbc->nonblocking && check_for_blocked)
+		wait_event(fc->blocked_waitq, !fuse_blocked_for_wb(inode));
+
+	return 0;
+}
+
+static int fuse_dummy_writepage(struct page *page,
+				struct writeback_control *wbc,
+				void *data)
+{
+	unlock_page(page);
+	return 0;
+}
+
+static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_fill_data data;
+	int err;
+
+	if (!(fc->flags & FUSE_WBCACHE))
+		return generic_writepages(mapping, wbc);
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	if (wbc->nonblocking) {
+		if (fc->blocked)
+			return 0;
+	}
+
+	/*
+	 * We use fuse_blocked_for_wb() instead of just fc->blocked to avoid
+	 * deadlock when we are called from fuse_invalidate_files() in case
+	 * of single-threaded fused.
+	 */
+	if (wbc->sync_mode != WB_SYNC_NONE)
+		wait_event(fc->blocked_waitq, !fuse_blocked_for_wb(inode));
+
+	/* More than an optimization: write the pages back to /dev/null;
+	 * fused would drop our FUSE_WRITE requests anyway, but it would be
+	 * blocked while sending NOTIFY_INVAL_FILES until we return!
+	 *
+	 * NB: We can't wait until fuse_send_writepages() because
+	 * fuse_writepages_fill() could deadlock on
+	 * fuse_page_is_writeback().
+	 */
+	data.ff = fuse_write_file(fc, get_fuse_inode(inode));
+	if (data.ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &data.ff->ff_state)) {
+		err = write_cache_pages(mapping, wbc, fuse_dummy_writepage,
+					mapping);
+		fuse_release_ff(inode, data.ff);
+		data.ff = NULL;
+		goto out_put;
+	}
+	if (data.ff) {
+		fuse_release_ff(inode, data.ff);
+		data.ff = NULL;
+	}
+
+	data.inode = inode;
+	data.req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+	err = -ENOMEM;
+	if (!data.req)
+		goto out_put;
+
+	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
+	if (data.req) {
+		if (!err && data.req->num_pages) {
+			err = fuse_send_writepages(&data);
+			if (err)
+				fuse_put_request(fc, data.req);
+		} else
+			fuse_put_request(fc, data.req);
+	}
+out_put:
+	BUG_ON(data.ff);
+out:
+	return err;
+}
+
 static int fuse_launder_page(struct page *page)
 {
 	int err = 0;
 	if (clear_page_dirty_for_io(page)) {
 		struct inode *inode = page->mapping->host;
-		err = fuse_writepage_locked(page);
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+		};
+		err = fuse_writepage_locked(page, &wbc);
 		if (!err)
 			fuse_wait_on_page_writeback(inode, page->index);
 	}
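
fuse_writepages_fill() batches contiguous dirty pages into a single FUSE_WRITE request: the request in progress is flushed as soon as it is full, would exceed fc->max_write, or the next page is not adjacent to the last one collected. The flush predicate, pulled out for readability (a paraphrase of the condition above, not new logic):

/* Paraphrase of the "flush current request?" test in
 * fuse_writepages_fill() above. */
static int fuse_wp_must_flush(struct fuse_req *req, struct page *page,
			      unsigned max_write)
{
	return req->num_pages &&
	       (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
		(req->num_pages + 1) * PAGE_CACHE_SIZE > max_write ||
		req->pages[req->num_pages - 1]->index + 1 != page->index);
}
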
@@ -1560,7 +2306,11 @@ static int fuse_launder_page(struct page
  */
 static void fuse_vma_close(struct vm_area_struct *vma)
 {
-	filemap_write_and_wait(vma->vm_file->f_mapping);
+	struct file *file = vma->vm_file;
+	struct fuse_file *ff = file->private_data;
+
+	if (!(ff->fc->flags & FUSE_WBCACHE))
+		filemap_write_and_wait(file->f_mapping);
 }
 
 /*
@@ -1587,6 +2337,9 @@ static int fuse_page_mkwrite(struct vm_a
 	 */
 	struct inode *inode = vma->vm_file->f_mapping->host;
 
+	if (fuse_file_fail_immediately(vma->vm_file))
+		return -EIO;
+
 	fuse_wait_on_page_writeback(inode, page->index);
 	return 0;
 }
@@ -1599,20 +2352,13 @@ static const struct vm_operations_struct
 
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
-		struct inode *inode = file->f_dentry->d_inode;
-		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		struct fuse_file *ff = file->private_data;
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
 		/*
 		 * file may be written through mmap, so chain it onto the
 		 * inodes's write_file list
 		 */
-		spin_lock(&fc->lock);
-		if (list_empty(&ff->write_entry))
-			list_add(&ff->write_entry, &fi->write_files);
-		spin_unlock(&fc->lock);
-	}
+		fuse_link_write_file(file);
+
 	file_accessed(file);
 	vma->vm_ops = &fuse_file_vm_ops;
 	return 0;
@@ -1859,8 +2605,9 @@ static int fuse_ioctl_copy_user(struct p
 		kaddr = map = kmap(page);
 
 		while (todo) {
-			char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
-			size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+			struct iovec *iiov = (struct iovec *)ii.data;
+			char __user *uaddr = iiov->iov_base + ii.iov_offset;
+			size_t iov_len = iiov->iov_len - ii.iov_offset;
 			size_t copy = min(todo, iov_len);
 			size_t left;
 
@@ -2249,10 +2996,109 @@ int fuse_notify_poll_wakeup(struct fuse_
 	return 0;
 }
 
+static struct fuse_io_priv *fuse_io_priv_create(struct kiocb *iocb,
+		loff_t off, int rw, bool async)
+{
+	struct fuse_io_priv *io;
+
+	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
+	if (!io)
+		return NULL;
+
+	spin_lock_init(&io->lock);
+	io->reqs = 1;
+	io->bytes = -1;
+	io->size = 0;
+	io->offset = off;
+	io->write = (rw == WRITE);
+	io->err = 0;
+	io->file = iocb->ki_filp;
+	io->async = async;
+	io->iocb = iocb;
+
+	return io;
+}
+
+static ssize_t fuse_direct_IO_bvec(int rw, struct kiocb *iocb,
+		struct bio_vec *bvec, loff_t offset, unsigned long bvec_len)
+{
+	struct fuse_io_priv *io;
+	struct fuse_req *req;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	size_t nmax = (rw == WRITE ? fc->max_write : fc->max_read);
+	size_t filled, nres;
+	loff_t pos = iocb->ki_pos;
+	int i;
+
+	if (nmax > FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT)
+		nmax = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
+	io = fuse_io_priv_create(iocb, pos, rw, true);
+	if (!io)
+		return -ENOMEM;
+
+	req = NULL;
+	filled = 0;
+	i = 0;
+
+	while (1) {
+		if (!req) {
+			req = fuse_get_req_for_background(fc, 0);
+			if (IS_ERR(req))
+				break;
+
+			if (rw == WRITE)
+				req->in.argbvec = 1;
+			else
+				req->out.argbvec = 1;
+
+			filled = 0;
+			req->bvec = bvec;
+		}
+
+		if (filled + bvec->bv_len <= nmax) {
+			filled += bvec->bv_len;
+			req->num_bvecs++;
+			bvec++;
+			i++;
+
+			if (i < bvec_len)
+				continue;
+		}
+
+		BUG_ON(!filled);
+
+		if (rw == WRITE)
+			nres = fuse_send_write(req, io, pos,
+					filled, NULL);
+		else
+			nres = fuse_send_read(req, io, pos,
+					filled, NULL);
+
+		BUG_ON(nres != filled);
+		fuse_put_request(fc, req);
+
+		if (i == bvec_len)
+			break;
+
+		pos += filled;
+		req = NULL;
+		filled = 0;
+	}
+
+	fuse_aio_complete(io, !IS_ERR(req) ? 0 : PTR_ERR(req), -1);
+	return -EIOCBQUEUED;
+}
+
 static void fuse_do_truncate(struct file *file)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct iattr attr;
+	int err;
 
 	attr.ia_valid = ATTR_SIZE;
 	attr.ia_size = i_size_read(inode);
@@ -2260,7 +3106,10 @@ static void fuse_do_truncate(struct file
 	attr.ia_file = file;
 	attr.ia_valid |= ATTR_FILE;
 
-	fuse_do_setattr(inode, &attr, file);
+	err = fuse_do_setattr(inode, &attr, file);
+	if (err)
+		printk("failed to truncate to %lld with error %d\n",
+		       i_size_read(inode), err);
 }
 
 static inline loff_t fuse_round_up(loff_t off)
@@ -2275,7 +3124,7 @@ fuse_direct_IO(int rw, struct kiocb *ioc
 	ssize_t ret = 0;
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
-	bool async_dio = ff->fc->async_dio;
+	bool async_dio = ff->fc->async_dio || (ff->fc->flags & FUSE_WBCACHE);
 	loff_t pos = 0;
 	struct inode *inode;
 	loff_t i_size;
@@ -2288,28 +3137,25 @@ fuse_direct_IO(int rw, struct kiocb *ioc
 
 	/* optimization for short read */
 	if (async_dio && rw != WRITE && offset + count > i_size) {
+		loff_t new_count;
+
 		if (offset >= i_size)
 			return 0;
-		count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+
+		new_count = i_size - offset;
+		if (!(ff->fc->flags & FUSE_WBCACHE))
+			new_count = fuse_round_up(new_count);
+
+		count = min_t(loff_t, count, new_count);
 	}
 
-	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
-	if (!io)
-		return -ENOMEM;
-	spin_lock_init(&io->lock);
-	io->reqs = 1;
-	io->bytes = -1;
-	io->size = 0;
-	io->offset = offset;
-	io->write = (rw == WRITE);
-	io->err = 0;
-	io->file = file;
 	/*
 	 * By default, we want to optimize all I/Os with async request
 	 * submission to the client filesystem if supported.
 	 */
-	io->async = async_dio;
-	io->iocb = iocb;
+	io = fuse_io_priv_create(iocb, offset, rw, async_dio);
+	if (!io)
+		return -ENOMEM;
 
 	/*
 	 * We cannot asynchronously extend the size of a file. We have no method
@@ -2325,6 +3171,14 @@ fuse_direct_IO(int rw, struct kiocb *ioc
 		ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
 
 	if (io->async) {
+		if (ret != count) {
+			struct fuse_file *ff = file->private_data;
+			printk("fuse_direct_IO: failed to %s %ld bytes "
+			       "(offset=%llu ret=%ld i_size=%llu ino=%lu "
+			       "fh=%llu\n", rw == WRITE ? "write" : "read",
+			       count, offset, ret, i_size, inode->i_ino,
+			       ff->fh);
+		}
 		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
 
 		/* we have a non-extending, async request, so return */
@@ -2346,6 +3200,32 @@ fuse_direct_IO(int rw, struct kiocb *ioc
 	return ret;
 }
 
+static ssize_t fuse_direct_IO_page(int rw, struct kiocb *iocb,
+	struct page *page, loff_t offset)
+{
+	struct iovec iov;
+	mm_segment_t oldfs;
+	ssize_t ret;
+
+	iov.iov_base = kmap(page);
+	iov.iov_len = PAGE_SIZE;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	ret = fuse_direct_IO(rw, iocb, &iov, offset, 1);
+	if (ret != -EIOCBQUEUED && ret != PAGE_SIZE)
+		printk("fuse_direct_IO_page: io failed with err=%ld "
+		       "(rw=%s fh=0x%llx pos=%lld)\n",
+		       ret, rw == WRITE ? "WRITE" : "READ",
+		       ((struct fuse_file *)iocb->ki_filp->private_data)->fh,
+		       offset);
+
+	set_fs(oldfs);
+	kunmap(page);
+	return ret;
+}
+
 long fuse_file_fallocate(struct inode *inode, struct fuse_file *ff, int mode,
 			 loff_t offset, loff_t length)
 {
@@ -2440,6 +3320,8 @@ static const struct file_operations fuse
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
 };
 
 static const struct file_operations fuse_direct_io_file_operations = {
@@ -2462,11 +3344,16 @@ static const struct file_operations fuse
 static const struct address_space_operations fuse_file_aops  = {
 	.readpage	= fuse_readpage,
 	.writepage	= fuse_writepage,
+	.writepages	= fuse_writepages,
 	.launder_page	= fuse_launder_page,
+	.write_begin	= fuse_write_begin,
+	.write_end	= fuse_write_end,
 	.readpages	= fuse_readpages,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.bmap		= fuse_bmap,
 	.direct_IO	= fuse_direct_IO,
+	.direct_IO_bvec	= fuse_direct_IO_bvec,
+	.direct_IO_page	= fuse_direct_IO_page,
 };
 
 void fuse_init_file_inode(struct inode *inode)
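fuse_writepages_fill() above relies on the standard page-writeback bit
protocol: a page is marked under writeback with test_set_page_writeback()
before it is unlocked, and the bit is cleared with end_page_writeback()
when the FUSE_WRITE request completes. A minimal sketch of that protocol
(not part of this patch; set_page_writeback() and end_page_writeback() are
the stock kernel primitives, and the actual queueing of the request is
elided):

	static int sketch_writepage(struct page *page,
				    struct writeback_control *wbc)
	{
		/* mark the page under writeback before dropping its lock,
		 * exactly as fuse_writepages_fill() does above */
		set_page_writeback(page);
		unlock_page(page);

		/* ... queue an asynchronous write of the page here ... */

		/* normally called from the I/O completion path */
		end_page_writeback(page);
		return 0;
	}
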
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fuse/fuse_i.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/fuse_i.h
--- linux-2.6.32-504.3.3.el6.orig/fs/fuse/fuse_i.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/fuse_i.h	2015-01-21 12:02:51.758999790 +0300
@@ -33,7 +33,7 @@
 #define FUSE_NAME_MAX 1024
 
 /** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 5
+#define FUSE_CTL_NUM_DENTRIES 11
 
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
@@ -44,11 +44,30 @@
     doing the mount will be allowed to access the filesystem */
 #define FUSE_ALLOW_OTHER         (1 << 1)
 
+/* This means that userspace can reconnect to the mountpoint */
+#define FUSE_CAN_RECONNECT	(1 << 2)
+
+/* Enable write-back cache */
+#define FUSE_WBCACHE		(1 << 3)
+
+/* Enable direct access */
+#define FUSE_ODIRECT		(1 << 4)
+
+/* Enable synchronous umount */
+#define FUSE_UMOUNT_WAIT	(1 << 5)
+
+/* Disable synchronous close */
+#define FUSE_DISABLE_CLOSE_WAIT	(1 << 6)
+
 /** Number of page pointers embedded in fuse_req */
 #define FUSE_REQ_INLINE_PAGES 1
 
 /** List of active connections */
+#ifdef CONFIG_VE
+#define fuse_conn_list (get_exec_env()->_fuse_conn_list)
+#else
 extern struct list_head fuse_conn_list;
+#endif
 
 /** Global mutex protecting fuse_conn_list and the control filesystem */
 extern struct mutex fuse_mutex;
@@ -95,6 +114,9 @@ struct fuse_inode {
 	/** Files usable in writepage.  Protected by fc->lock */
 	struct list_head write_files;
 
+	/** List of all opened files.  Protected by fc->lock */
+	struct list_head rw_files;
+
 	/** Writepages pending on truncate or fsync */
 	struct list_head queued_writes;
 
@@ -108,6 +130,9 @@ struct fuse_inode {
 	/** List of writepage requests (pending or sent) */
 	struct list_head writepages;
 
+	/** Mostly to detect the very first open */
+	atomic_t num_openers;
+
 	/** Miscellaneous bits describing inode state */
 	unsigned long state;
 };
@@ -148,11 +173,25 @@ struct fuse_file {
 	/** Entry on inode's write_files list */
 	struct list_head write_entry;
 
+	/** Entry on inode's rw_files list */
+	struct list_head rw_entry;
+
 	/** RB node to be linked on fuse_conn->polled_files */
 	struct rb_node polled_node;
 
 	/** Wait queue head for poll */
 	wait_queue_head_t poll_wait;
+
+	struct list_head fl;
+	struct dentry *ff_dentry;
+
+	unsigned long ff_state;
+};
+
+/** FUSE file states (ff_state) */
+enum {
+	/** Any fops on given ff should fail immediately */
+	FUSE_S_FAIL_IMMEDIATELY,
 };
 
 /** One input argument of a request */
@@ -168,6 +207,8 @@ struct fuse_in {
 
 	/** True if the data for the last argument is in req->pages */
 	unsigned argpages:1;
+	/** True if the data for the last argument is in req->bvecs */
+	unsigned argbvec:1;
 
 	/** Number of arguments */
 	unsigned numargs;
@@ -198,6 +239,8 @@ struct fuse_out {
 
 	/** Last argument is a list of pages to copy data to */
 	unsigned argpages:1;
+	/** Last argument is a list of bvecs to copy data to */
+	unsigned argbvec:1;
 
 	/** Zero partially or not copied pages */
 	unsigned page_zeroing:1;
@@ -286,6 +329,12 @@ struct fuse_req {
 	/** Request is counted as "waiting" */
 	unsigned waiting:1;
 
+	/** Request contains pages from page-cache */
+	unsigned page_cache:1;
+
+	/** Request was killed -- pages were released */
+	unsigned killed:1;
+
 	/** State of the request */
 	enum fuse_req_state state;
 
@@ -322,8 +371,9 @@ struct fuse_req {
 		struct fuse_lk_in lk_in;
 	} misc;
 
-	/** page vector */
+	/** page vector / bvecs */
 	struct page **pages;
+	struct bio_vec *bvec;
 
 	/** page-descriptor vector */
 	struct fuse_page_desc *page_descs;
@@ -337,8 +387,11 @@ struct fuse_req {
 	/** inline page-descriptor vector */
 	struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
 
-	/** number of pages in vector */
-	unsigned num_pages;
+	/** number of pages/bvecs in vector */
+	union {
+		unsigned num_pages;
+		unsigned num_bvecs;
+	};
 
 	/** File used in the request (or NULL) */
 	struct fuse_file *ff;
@@ -529,6 +582,9 @@ struct fuse_conn {
 	/** Use enhanced/automatic page cache invalidation. */
 	unsigned auto_inval_data:1;
 
+	/** Wait for response from daemon on close */
+	unsigned close_wait:1;
+
 	/** Does the filesystem support readdirplus? */
 	unsigned do_readdirplus:1;
 
@@ -582,6 +638,8 @@ struct fuse_conn {
 
 	/** Read/write semaphore to hold when accessing sb. */
 	struct rw_semaphore killsb;
+
+	struct list_head conn_files;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -619,7 +677,7 @@ int fuse_inode_eq(struct inode *inode, v
  */
 struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 			int generation, struct fuse_attr *attr,
-			u64 attr_valid, u64 attr_version);
+			u64 attr_valid, u64 attr_version, int creat);
 
 int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 		     struct fuse_entry_out *outarg, struct inode **inode);
@@ -759,6 +817,12 @@ void fuse_put_request(struct fuse_conn *
 void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
+ * Send a request (synchronous) if not FUSE_S_FAIL_IMMEDIATELY
+ */
+void fuse_request_check_and_send(struct fuse_conn *fc, struct fuse_req *req,
+				 struct fuse_file *ff);
+
+/**
  * Send a request in the background
  */
 void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
@@ -768,6 +832,7 @@ void fuse_request_send_background_locked
 
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
+int fuse_reconnect_fd(int fd, struct fuse_conn *fc);
 
 /**
  * Invalidate inode attributes
@@ -823,6 +888,8 @@ u64 fuse_lock_owner_id(struct fuse_conn 
 int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 			   struct file *file, bool *refreshed);
 
+int fuse_getattr_size(struct inode *inode, struct file *file, u64 *size);
+
 void fuse_flush_writepages(struct inode *inode);
 
 void fuse_set_nowrite(struct inode *inode);
@@ -843,11 +910,28 @@ int fuse_reverse_inval_inode(struct supe
 int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 			     struct qstr *name);
 
+/**
+ * File-system tells the kernel to invalidate all fuse-files (and cache)
+ * for the given node id.
+ */
+int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid);
+
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE passes fuse_direct_io() a file whose f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE  (1 << 1)
+
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write);
+		       int flags);
 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		   unsigned int flags);
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
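fuse_writepages() above tests the new FUSE_S_FAIL_IMMEDIATELY bit directly,
while fuse_page_mkwrite() goes through a helper whose definition lives
elsewhere in the patch. A plausible reconstruction of that helper
(hypothetical, shown only to illustrate how the ff_state bit declared above
is consumed):

	static inline int fuse_file_fail_immediately(struct file *file)
	{
		struct fuse_file *ff = file->private_data;

		return test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
	}

fuse_invalidate_files() sets the bit for every file on the inode's rw_files
list, so later operations on those open files fail fast instead of queueing
requests to a daemon that is about to drop them.
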
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/fuse/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/fuse/inode.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/fuse/inode.c	2015-01-21 12:02:52.020992836 +0300
@@ -20,15 +20,21 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/exportfs.h>
+#include <linux/ve_proto.h>
+#include <linux/sysctl.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
 MODULE_LICENSE("GPL");
 
 static struct kmem_cache *fuse_inode_cachep;
+#ifndef CONFIG_VE
 struct list_head fuse_conn_list;
+#endif
 DEFINE_MUTEX(fuse_mutex);
 
+static int fuse_ve_odirect;
+
 static int set_global_limit(const char *val, struct kernel_param *kp);
 
 unsigned max_user_bgreq;
@@ -94,6 +100,7 @@ static struct inode *fuse_alloc_inode(st
 	fi->orig_ino = 0;
 	fi->state = 0;
 	INIT_LIST_HEAD(&fi->write_files);
+	INIT_LIST_HEAD(&fi->rw_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
 	INIT_LIST_HEAD(&fi->writepages);
 	init_waitqueue_head(&fi->page_waitq);
@@ -110,6 +117,7 @@ static void fuse_destroy_inode(struct in
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	BUG_ON(!list_empty(&fi->write_files));
+	BUG_ON(!list_empty(&fi->rw_files));
 	BUG_ON(!list_empty(&fi->queued_writes));
 	kfree(fi->forget);
 	kmem_cache_free(fuse_inode_cachep, inode);
@@ -189,6 +197,7 @@ void fuse_change_attributes(struct inode
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	int wb = fc->flags & FUSE_WBCACHE;
 	loff_t oldsize;
 	struct timespec old_mtime;
 
@@ -203,10 +212,11 @@ void fuse_change_attributes(struct inode
 	fuse_change_attributes_common(inode, attr, attr_valid);
 
 	oldsize = inode->i_size;
-	i_size_write(inode, attr->size);
+	if (!wb || !S_ISREG(inode->i_mode))
+		i_size_write(inode, attr->size);
 	spin_unlock(&fc->lock);
 
-	if (S_ISREG(inode->i_mode)) {
+	if (!wb && S_ISREG(inode->i_mode)) {
 		bool inval = false;
 
 		if (oldsize != attr->size) {
@@ -231,8 +241,12 @@ void fuse_change_attributes(struct inode
 	}
 }
 
-static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
+			    int num_openers)
 {
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	atomic_set(&fi->num_openers, num_openers);
+
 	inode->i_mode = attr->mode & S_IFMT;
 	inode->i_size = attr->size;
 	if (S_ISREG(inode->i_mode)) {
@@ -269,7 +283,7 @@ static int fuse_inode_set(struct inode *
 
 struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 			int generation, struct fuse_attr *attr,
-			u64 attr_valid, u64 attr_version)
+			u64 attr_valid, u64 attr_version, int creat)
 {
 	struct inode *inode;
 	struct fuse_inode *fi;
@@ -284,7 +298,8 @@ struct inode *fuse_iget(struct super_blo
 		inode->i_flags |= S_NOATIME|S_NOCMTIME;
 		inode->i_generation = generation;
 		inode->i_data.backing_dev_info = &fc->bdi;
-		fuse_init_inode(inode, attr);
+		fuse_init_inode(inode, attr,
+				(fc->flags & FUSE_WBCACHE) ? creat : 0);
 		unlock_new_inode(inode);
 	} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
 		/* Inode has changed type, any I/O on the old should fail */
@@ -327,6 +342,77 @@ int fuse_reverse_inval_inode(struct supe
 	return 0;
 }
 
+static void fuse_kill_requests(struct fuse_conn *fc, struct inode *inode,
+			       struct list_head *req_list)
+{
+	struct fuse_req *req;
+
+	list_for_each_entry(req, req_list, list)
+		if (req->inode == inode && req->page_cache && !req->killed) {
+			int i;
+
+			BUG_ON(req->in.h.opcode != FUSE_READ);
+			req->killed = 1;
+
+			for (i = 0; i < req->num_pages; i++) {
+				struct page *page = req->pages[i];
+				SetPageError(page);
+				unlock_page(page);
+				req->pages[i] = NULL;
+			}
+
+			req->num_pages = 0;
+		}
+}
+
+int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
+{
+	struct super_block *sb = fc->sb;
+	struct inode *inode;
+	struct fuse_inode *fi;
+	struct fuse_file *ff;
+	int err;
+
+	if (!fc->async_read) {
+		printk(KERN_ERR "Turn async_read ON to use "
+				"FUSE_NOTIFY_INVAL_FILES!\n");
+		return -EOPNOTSUPP;
+	}
+
+	inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+	if (!inode)
+		return -ENOENT;
+
+	fi = get_fuse_inode(inode);
+	spin_lock(&fc->lock);
+	list_for_each_entry(ff, &fi->rw_files, rw_entry) {
+		set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+	}
+	spin_unlock(&fc->lock);
+
+	/* let them see FUSE_S_FAIL_IMMEDIATELY */
+	wake_up_all(&fc->blocked_waitq);
+
+	err = filemap_write_and_wait(inode->i_mapping);
+	if (!err || err == -EIO) { /* AS_EIO might trigger -EIO */
+		spin_lock(&fc->lock);
+		fuse_kill_requests(fc, inode, &fc->processing);
+		fuse_kill_requests(fc, inode, &fc->pending);
+		fuse_kill_requests(fc, inode, &fc->bg_queue);
+		fuse_kill_requests(fc, inode, &fc->io);
+		wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
+		spin_unlock(&fc->lock);
+
+		err = invalidate_inode_pages2(inode->i_mapping);
+	}
+
+	if (!err)
+		fuse_invalidate_attr(inode);
+
+	iput(inode);
+	return err;
+}
+
 static void fuse_umount_begin(struct super_block *sb)
 {
 	fuse_abort_conn(get_fuse_conn_super(sb));
@@ -434,8 +520,13 @@ enum {
 	OPT_GROUP_ID,
 	OPT_DEFAULT_PERMISSIONS,
 	OPT_ALLOW_OTHER,
+	OPT_CAN_RECONNECT,
 	OPT_MAX_READ,
 	OPT_BLKSIZE,
+	OPT_WBCACHE,
+	OPT_ODIRECT,
+	OPT_UMOUNT_WAIT,
+	OPT_DISABLE_CLOSE_WAIT,
 	OPT_ERR
 };
 
@@ -446,8 +537,13 @@ static const match_table_t tokens = {
 	{OPT_GROUP_ID,			"group_id=%u"},
 	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
 	{OPT_ALLOW_OTHER,		"allow_other"},
+	{OPT_CAN_RECONNECT,		"can_reconnect"},
 	{OPT_MAX_READ,			"max_read=%u"},
 	{OPT_BLKSIZE,			"blksize=%u"},
+	{OPT_WBCACHE,			"writeback_enable"},
+	{OPT_ODIRECT,			"direct_enable"},
+	{OPT_UMOUNT_WAIT,		"umount_wait"},
+	{OPT_DISABLE_CLOSE_WAIT,	"disable_close_wait"},
 	{OPT_ERR,			NULL}
 };
 
@@ -505,6 +601,10 @@ static int parse_fuse_opt(char *opt, str
 			d->flags |= FUSE_ALLOW_OTHER;
 			break;
 
+		case OPT_CAN_RECONNECT:
+			d->flags |= FUSE_CAN_RECONNECT;
+			break;
+
 		case OPT_MAX_READ:
 			if (match_int(&args[0], &value))
 				return 0;
@@ -517,6 +617,28 @@ static int parse_fuse_opt(char *opt, str
 			d->blksize = value;
 			break;
 
+		case OPT_WBCACHE:
+			if (!ve_is_super(get_exec_env()) && !fuse_ve_odirect)
+				return -EPERM;
+			d->flags |= FUSE_WBCACHE;
+			break;
+
+		case OPT_ODIRECT:
+			if (!ve_is_super(get_exec_env()) && !fuse_ve_odirect)
+				return -EPERM;
+			d->flags |= FUSE_ODIRECT;
+			break;
+
+		case OPT_UMOUNT_WAIT:
+			if (!ve_is_super(get_exec_env()) && !fuse_ve_odirect)
+				return -EPERM;
+			d->flags |= FUSE_UMOUNT_WAIT;
+			break;
+
+		case OPT_DISABLE_CLOSE_WAIT:
+			d->flags |= FUSE_DISABLE_CLOSE_WAIT;
+			break;
+
 		default:
 			return 0;
 		}
@@ -539,6 +661,16 @@ static int fuse_show_options(struct seq_
 		seq_puts(m, ",default_permissions");
 	if (fc->flags & FUSE_ALLOW_OTHER)
 		seq_puts(m, ",allow_other");
+	if (fc->flags & FUSE_CAN_RECONNECT)
+		seq_puts(m, ",can_reconnect");
+	if (fc->flags & FUSE_WBCACHE)
+		seq_puts(m, ",writeback_enable");
+	if (fc->flags & FUSE_ODIRECT)
+		seq_puts(m, ",direct_enable");
+	if (fc->flags & FUSE_UMOUNT_WAIT)
+		seq_puts(m, ",umount_wait");
+	if (fc->flags & FUSE_DISABLE_CLOSE_WAIT)
+		seq_puts(m, ",disable_close_wait");
 	if (fc->max_read != ~0)
 		seq_printf(m, ",max_read=%u", fc->max_read);
 	if (mnt->mnt_sb->s_bdev &&
@@ -563,6 +695,7 @@ void fuse_conn_init(struct fuse_conn *fc
 	INIT_LIST_HEAD(&fc->interrupts);
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
+	INIT_LIST_HEAD(&fc->conn_files);
 	fc->forget_list_tail = &fc->forget_list_head;
 	atomic_set(&fc->num_waiting, 0);
 	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
@@ -603,7 +736,7 @@ static struct inode *fuse_get_root_inode
 	attr.mode = mode;
 	attr.ino = FUSE_ROOT_ID;
 	attr.nlink = 1;
-	return fuse_iget(sb, 1, 0, &attr, 0, 0);
+	return fuse_iget(sb, 1, 0, &attr, 0, 0, 0);
 }
 
 struct fuse_inode_handle {
@@ -671,8 +804,10 @@ static int fuse_encode_fh(struct dentry 
 	u64 nodeid;
 	u32 generation;
 
-	if (*max_len < len)
+	if (*max_len < len) {
+		*max_len = len;
 		return  255;
+	}
 
 	nodeid = get_fuse_inode(inode)->nodeid;
 	generation = inode->i_generation;
@@ -936,10 +1071,10 @@ static int fuse_bdi_init(struct fuse_con
 		return err;
 
 	/*
-	 * For a single fuse filesystem use max 1% of dirty +
+	 * For a single fuse filesystem use max 20% of dirty +
 	 * writeback threshold.
 	 *
-	 * This gives about 1M of write buffer for memory maps on a
+	 * This gives about 20M of write buffer for memory maps on a
 	 * machine with 1G and 10% dirty_ratio, which should be more
 	 * than enough.
 	 *
@@ -947,7 +1082,13 @@ static int fuse_bdi_init(struct fuse_con
 	 *
 	 *    /sys/class/bdi/<bdi>/max_ratio
 	 */
-	bdi_set_max_ratio(&fc->bdi, 1);
+	bdi_set_max_ratio(&fc->bdi, 20);
+
+	/*
+	 * These values take precedence over max_ratio
+	 */
+	bdi_set_max_dirty(&fc->bdi, (256 * 1024 * 1024) / PAGE_SIZE);
+	bdi_set_min_dirty(&fc->bdi, (64 * 1024 * 1024) / PAGE_SIZE);
 
 	return 0;
 }
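
The arithmetic behind the comment above, for the quoted example machine:

	1 GiB RAM x 10% dirty_ratio ~= 100 MiB global dirty threshold
	100 MiB x 20% bdi max_ratio ~= 20 MiB per-mount write buffer

(the old 1% setting allowed only about 1 MiB). The bdi_set_max_dirty() and
bdi_set_min_dirty() calls then clamp the per-bdi dirty limit into the
64-256 MiB page range on larger machines, overriding what max_ratio alone
would allow.
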
@@ -1038,7 +1179,7 @@ static int fuse_fill_super(struct super_
 		goto err_put_root;
 	init_req->background = 1;
 
-	if (is_bdev) {
+	if (is_bdev || (fc->flags & FUSE_UMOUNT_WAIT)) {
 		fc->destroy_req = fuse_request_alloc(0);
 		if (!fc->destroy_req)
 			goto err_free_init_req;
@@ -1088,7 +1229,21 @@ static int fuse_get_sb(struct file_syste
 		       int flags, const char *dev_name,
 		       void *raw_data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
+	int error;
+
+	error = get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
+
+	/* Hack to recognize the pcs fuse service and force synchronous close
+	 * for it. This seems to be the only place where we have a variable
+	 * (dev_name) that is not constrained by the fuse API and is already
+	 * defined.
+	 */
+	if (!error && mnt->mnt_devname && strncmp(mnt->mnt_devname, "pstorage://", 11) == 0) {
+		struct fuse_conn *fc = mnt->mnt_sb->s_fs_info;
+
+		if (!(fc->flags & FUSE_DISABLE_CLOSE_WAIT))
+			fc->close_wait = 1;
+	}
+	return error;
 }
 
 static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1239,6 +1394,58 @@ static void fuse_sysfs_cleanup(void)
 	kobject_put(fuse_kobj);
 }
 
+#ifdef CONFIG_VE
+static int fuse_start(void *data)
+{
+	struct ve_struct *ve;
+
+	ve = (struct ve_struct *)data;
+	if (ve->fuse_fs_type != NULL)
+		return -EBUSY;
+
+	INIT_LIST_HEAD(&ve->_fuse_conn_list);
+	return register_ve_fs_type(ve, &fuse_fs_type, &ve->fuse_fs_type, NULL);
+}
+
+static void fuse_stop(void *data)
+{
+	struct ve_struct *ve;
+
+	ve = (struct ve_struct *)data;
+	if (ve->fuse_fs_type == NULL)
+		return;
+
+	unregister_ve_fs_type(ve->fuse_fs_type, NULL);
+	/* fuse_fs_type is freed in real_put_ve -> free_ve_filesystems */
+}
+
+static struct ve_hook fuse_ve_hook = {
+	.init		= fuse_start,
+	.fini		= fuse_stop,
+	.owner		= THIS_MODULE,
+	.priority	= HOOK_PRIO_FS,
+};
+#endif
+
+static ctl_table fuse_table[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "fuse-ve-odirect",
+		.data		= &fuse_ve_odirect,
+		.maxlen		= sizeof(fuse_ve_odirect),
+		.mode		= 0600,
+		.proc_handler	= &proc_dointvec,
+	},
+	{}
+};
+
+static struct ctl_path fuse_path[] = {
+	{ .procname = "fs", .ctl_name = CTL_FS, },
+	{},
+};
+
+static struct ctl_table_header *fuse_sysctl_header;
+
 static int __init fuse_init(void)
 {
 	int res;
@@ -1263,9 +1470,14 @@ static int __init fuse_init(void)
 	if (res)
 		goto err_sysfs_cleanup;
 
+#ifdef CONFIG_VE
+	ve_hook_register(VE_SS_CHAIN, &fuse_ve_hook);
+#endif
 	sanitize_global_limit(&max_user_bgreq);
 	sanitize_global_limit(&max_user_congthresh);
 
+	fuse_sysctl_header = register_sysctl_paths(fuse_path, fuse_table);
+
 	return 0;
 
  err_sysfs_cleanup:
@@ -1282,10 +1494,14 @@ static void __exit fuse_exit(void)
 {
 	printk(KERN_DEBUG "fuse exit\n");
 
+#ifdef CONFIG_VE
+	ve_hook_unregister(&fuse_ve_hook);
+#endif
 	fuse_ctl_cleanup();
 	fuse_sysfs_cleanup();
 	fuse_fs_cleanup();
 	fuse_dev_cleanup();
+	unregister_sysctl_table(fuse_sysctl_header);
 }
 
 module_init(fuse_init);
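
OPT_WBCACHE, OPT_ODIRECT and OPT_UMOUNT_WAIT above are gated by
ve_is_super()/fuse_ve_odirect, so by default only the host may request
them. A userspace sketch of mounting with the write-back cache enabled
(the fd=, rootmode=, user_id= and group_id= keys are the standard fuse
mount data; the paths and mode used here are illustrative only):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		char opts[128];
		int fd = open("/dev/fuse", O_RDWR);

		if (fd < 0)
			return 1;
		snprintf(opts, sizeof(opts),
			 "fd=%d,rootmode=40000,user_id=0,group_id=0,"
			 "writeback_enable", fd);
		/* parse_fuse_opt() maps "writeback_enable" to FUSE_WBCACHE */
		if (mount("myfs", "/mnt/fuse", "fuse", 0, opts)) {
			perror("mount");
			return 1;
		}
		return 0;
	}

Inside a container the same mount fails with -EPERM unless the
fs.fuse-ve-odirect sysctl registered above is set.
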
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/bmap.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/bmap.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/bmap.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/bmap.c	2015-01-21 12:02:53.453954797 +0300
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
+#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
@@ -135,21 +136,21 @@ int gfs2_unstuff_dinode(struct gfs2_inod
 		   and write it out to disk */
 
 		unsigned int n = 1;
-		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL, 1);
 		if (error)
 			goto out_brelse;
 		if (isdir) {
 			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
 			if (error)
-				goto out_brelse;
+				goto out_brelse2;
 			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 					      dibh, sizeof(struct gfs2_dinode));
 			brelse(bh);
 		} else {
 			error = gfs2_unstuffer_page(ip, dibh, block, page);
 			if (error)
-				goto out_brelse;
+				goto out_brelse2;
 		}
 	}
 
@@ -161,13 +162,16 @@ int gfs2_unstuff_dinode(struct gfs2_inod
 
 	if (i_size_read(&ip->i_inode)) {
 		*(__be64 *)(di + 1) = cpu_to_be64(block);
-		gfs2_add_inode_blocks(&ip->i_inode, 1);
+		vfs_dq_claim_block(&ip->i_inode, 1);
 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	}
 
 	ip->i_height = 1;
 	di->di_height = cpu_to_be16(1);
 
+out_brelse2:
+	if (error && i_size_read(&ip->i_inode))
+		vfs_dq_release_reservation_block(&ip->i_inode, 1);
 out_brelse:
 	brelse(dibh);
 out:
@@ -483,9 +487,13 @@ static int gfs2_bmap_alloc(struct inode 
 	do {
 		int error;
 		n = blks - alloced;
-		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
-		if (error)
+		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL, 1);
+		if (error) {
+			if (alloced != 0)
+				vfs_dq_release_reservation_block(&ip->i_inode,
+								 alloced);
 			return error;
+		}
 		alloced += n;
 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
 			gfs2_trans_add_unrevoke(sdp, bn, n);
@@ -556,7 +564,7 @@ static int gfs2_bmap_alloc(struct inode 
 	} while ((state != ALLOC_DATA) || !dblock);
 
 	ip->i_height = height;
-	gfs2_add_inode_blocks(&ip->i_inode, alloced);
+	vfs_dq_claim_block(&ip->i_inode, alloced);
 	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
 	map_bh(bh_map, inode->i_sb, dblock);
 	bh_map->b_size = dblks << inode->i_blkbits;
@@ -826,7 +834,7 @@ static int do_strip(struct gfs2_inode *i
 		}
 
 		*p = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
+		vfs_dq_free_block(&ip->i_inode, 1);
 	}
 	if (bstart) {
 		__gfs2_free_blocks(ip, bstart, blen, metadata);
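The pattern running through these gfs2 changes is reserve/claim/release:
quota space is reserved before the allocator runs, the blocks actually
obtained are claimed, and any surplus reservation is released. A condensed
sketch of the protocol (vfs_dq_* are the RHEL6-era quota wrappers used by
this patch; do_allocate() is a hypothetical stand-in for the allocator):

	static int sketch_alloc(struct inode *inode, unsigned int want)
	{
		unsigned int got;

		if (vfs_dq_reserve_block(inode, want))
			return -EDQUOT;

		got = do_allocate(inode, want);	/* hypothetical */
		if (!got) {
			vfs_dq_release_reservation_block(inode, want);
			return -ENOSPC;
		}

		vfs_dq_claim_block(inode, got);	/* reserved -> used */
		if (got < want)
			vfs_dq_release_reservation_block(inode, want - got);
		return 0;
	}

gfs2_alloc_blocks() below implements exactly this partial-release step when
the allocator returns fewer blocks than were reserved.
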
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/dir.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/dir.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/dir.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/dir.c	2015-01-21 12:02:53.456954716 +0300
@@ -55,6 +55,7 @@
 
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/sort.h>
 #include <linux/gfs2_ondisk.h>
@@ -873,23 +874,26 @@ got_dent:
 	return dent;
 }
 
-static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
+static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth,
+				  int *error_p)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned int n = 1;
 	u64 bn;
-	int error;
 	struct buffer_head *bh;
 	struct gfs2_leaf *leaf;
 	struct gfs2_dirent *dent;
 	struct qstr name = { .name = "", .len = 0, .hash = 0 };
 
-	error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
-	if (error)
+	*error_p = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL, 1);
+	if (*error_p)
 		return NULL;
 	bh = gfs2_meta_new(ip->i_gl, bn);
-	if (!bh)
+	if (!bh) {
+		vfs_dq_release_reservation_block(inode, 1);
+		*error_p = -ENOSPC;
 		return NULL;
+	}
 
 	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
 	gfs2_trans_add_meta(ip->i_gl, bh);
@@ -933,9 +937,11 @@ static int dir_make_exhash(struct inode 
 
 	/*  Turn over a new leaf  */
 
-	leaf = new_leaf(inode, &bh, 0);
-	if (!leaf)
-		return -ENOSPC;
+	leaf = new_leaf(inode, &bh, 0, &error);
+	if (!leaf) {
+		brelse(dibh);
+		return error;
+	}
 	bn = bh->b_blocknr;
 
 	gfs2_assert(sdp, dip->i_entries < (1 << 16));
@@ -955,11 +961,13 @@ static int dir_make_exhash(struct inode 
 	dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
 				gfs2_dirent_last, &args, NULL);
 	if (!dent) {
+		vfs_dq_release_reservation_block(inode, 1);
 		brelse(bh);
 		brelse(dibh);
 		return -EIO;
 	}
 	if (IS_ERR(dent)) {
+		vfs_dq_release_reservation_block(inode, 1);
 		brelse(bh);
 		brelse(dibh);
 		return PTR_ERR(dent);
@@ -986,7 +994,7 @@ static int dir_make_exhash(struct inode 
 		*lp = cpu_to_be64(bn);
 
 	i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
-	gfs2_add_inode_blocks(&dip->i_inode, 1);
+	vfs_dq_claim_block(&dip->i_inode, 1);
 	dip->i_diskflags |= GFS2_DIF_EXHASH;
 
 	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
@@ -1039,10 +1047,11 @@ static int dir_split_leaf(struct inode *
 
 	gfs2_trans_add_meta(dip->i_gl, obh);
 
-	nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
+	nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1,
+			 &error);
 	if (!nleaf) {
 		brelse(obh);
-		return -ENOSPC;
+		return error;
 	}
 	bn = nbh->b_blocknr;
 
@@ -1126,9 +1135,11 @@ static int dir_split_leaf(struct inode *
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
 		gfs2_trans_add_meta(dip->i_gl, dibh);
-		gfs2_add_inode_blocks(&dip->i_inode, 1);
+		vfs_dq_claim_block(&dip->i_inode, 1);
 		gfs2_dinode_out(dip, dibh->b_data);
 		brelse(dibh);
+	} else {
+		vfs_dq_release_reservation_block(&dip->i_inode, 1);
 	}
 
 	brelse(obh);
@@ -1140,6 +1151,7 @@ fail_lpfree:
 	kfree(lp);
 
 fail_brelse:
+	vfs_dq_release_reservation_block(&dip->i_inode, 1);
 	brelse(obh);
 	brelse(nbh);
 	return error;
@@ -1685,20 +1697,22 @@ static int dir_new_leaf(struct inode *in
 
 	gfs2_trans_add_meta(ip->i_gl, obh);
 
-	leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
+	leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth), &error);
 	if (!leaf) {
 		brelse(obh);
-		return -ENOSPC;
+		return error;
 	}
 	oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
 	brelse(bh);
 	brelse(obh);
 
 	error = gfs2_meta_inode_buffer(ip, &bh);
-	if (error)
+	if (error) {
+		vfs_dq_release_reservation_block(&ip->i_inode, 1);
 		return error;
+	}
 	gfs2_trans_add_meta(ip->i_gl, bh);
-	gfs2_add_inode_blocks(&ip->i_inode, 1);
+	vfs_dq_claim_block(&ip->i_inode, 1);
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
 	return 0;
@@ -1959,7 +1973,7 @@ static int leaf_dealloc(struct gfs2_inod
 		brelse(bh);
 
 		gfs2_free_meta(dip, blk, 1);
-		gfs2_add_inode_blocks(&dip->i_inode, -1);
+		vfs_dq_free_block(&dip->i_inode, 1);
 	}
 
 	error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/export.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/export.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/export.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/export.c	2015-01-21 12:02:52.020992836 +0300
@@ -37,9 +37,13 @@ static int gfs2_encode_fh(struct dentry 
 	struct super_block *sb = inode->i_sb;
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	if (*len < GFS2_SMALL_FH_SIZE ||
-	    (connectable && *len < GFS2_LARGE_FH_SIZE))
+	if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
+		*len = GFS2_LARGE_FH_SIZE;
 		return 255;
+	} else if (*len < GFS2_SMALL_FH_SIZE) {
+		*len = GFS2_SMALL_FH_SIZE;
+		return 255;
+	}
 
 	fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
 	fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
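
Both encode_fh changes in this patch (gfs2 here and fuse above) follow the
same convention: on a too-small buffer, report the required length back
through the length pointer and still return 255, the exportfs code for
"buffer too small" in this kernel. A caller-side sketch under that
assumption (the helper name is hypothetical):

	static int sketch_get_fh(struct dentry *de, __u32 *fh, int *len,
				 int connectable)
	{
		int type = de->d_sb->s_export_op->encode_fh(de, fh, len,
							    connectable);

		if (type == 255)	/* *len now holds the needed size */
			return -EOVERFLOW;
		return type;
	}
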
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/file.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/file.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/file.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/file.c	2015-01-21 12:02:42.775238293 +0300
@@ -364,6 +364,13 @@ static int gfs2_page_mkwrite(struct vm_a
 	loff_t size;
 	int ret;
 
+	if (vma->vm_file->f_op->get_host) {
+		struct file *file = vma->vm_file->f_op->get_host(vma->vm_file);
+		inode = file->f_path.dentry->d_inode;
+		ip = GFS2_I(inode);
+		sdp = GFS2_SB(inode);
+	}
+
 	sb_start_pagefault(inode->i_sb);
 
 	/* Update file times before taking page lock */
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/glops.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/glops.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/glops.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/glops.c	2015-01-21 12:02:53.428955462 +0300
@@ -305,7 +305,7 @@ static int inode_go_lock(struct gfs2_hol
 		return 0;
 
 	if (test_bit(GIF_INVALID, &ip->i_flags)) {
-		error = gfs2_inode_refresh(ip);
+		error = gfs2_inode_refresh(ip, 0);
 		if (error)
 			return error;
 	}
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/incore.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/incore.h
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/incore.h	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/incore.h	2015-01-21 12:02:53.421955646 +0300
@@ -341,6 +341,7 @@ struct gfs2_inode {
 	u32 i_diskflags;
 	u8 i_height;
 	u8 i_depth;
+	qsize_t i_reserved_quota;
 };
 
 /*
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/inode.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/inode.c	2015-01-21 12:02:53.462954559 +0300
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
+#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/posix_acl.h>
 #include <linux/sort.h>
@@ -156,6 +157,7 @@ struct inode *gfs2_inode_lookup(struct s
 
 	if (inode->i_state & I_NEW) {
 		struct gfs2_sbd *sdp = GFS2_SB(inode);
+		ip->i_reserved_quota = 0;
 		ip->i_no_formal_ino = no_formal_ino;
 
 		error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
@@ -178,7 +180,7 @@ struct inode *gfs2_inode_lookup(struct s
 
  		if (type == DT_UNKNOWN) {
 			/* Inode glock must be locked already */
-			error = gfs2_inode_refresh(GFS2_I(inode));
+			error = gfs2_inode_refresh(GFS2_I(inode), 0);
 			if (error)
 				goto fail_refresh;
 		} else {
@@ -274,13 +276,16 @@ static void gfs2_set_nlink(struct inode 
 	}
 }
 
-static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
+static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf, int createi)
 {
 	const struct gfs2_dinode *str = buf;
 	struct timespec atime;
-	u16 height, depth;
+	u16 height = be16_to_cpu(str->di_height);
+	u16 depth  = be16_to_cpu(str->di_depth);
 
-	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
+	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr) ||
+		     height > GFS2_MAX_META_HEIGHT ||
+		     depth > GFS2_DIR_MAX_DEPTH))
 		goto corrupt;
 	ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
 	ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
@@ -297,7 +302,12 @@ static int gfs2_dinode_in(struct gfs2_in
 	ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
 	gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
 	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
-	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
+
+	if (!createi)
+		gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
+	else
+		vfs_dq_claim_block(&ip->i_inode, be64_to_cpu(str->di_blocks));
+
 	atime.tv_sec = be64_to_cpu(str->di_atime);
 	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
 	if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
@@ -312,14 +322,7 @@ static int gfs2_dinode_in(struct gfs2_in
 
 	ip->i_diskflags = be32_to_cpu(str->di_flags);
 	gfs2_set_inode_flags(&ip->i_inode);
-	height = be16_to_cpu(str->di_height);
-	if (unlikely(height > GFS2_MAX_META_HEIGHT))
-		goto corrupt;
 	ip->i_height = (u8)height;
-
-	depth = be16_to_cpu(str->di_depth);
-	if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
-		goto corrupt;
 	ip->i_depth = (u8)depth;
 	ip->i_entries = be32_to_cpu(str->di_entries);
 
@@ -341,7 +344,7 @@ corrupt:
  * Returns: errno
  */
 
-int gfs2_inode_refresh(struct gfs2_inode *ip)
+int gfs2_inode_refresh(struct gfs2_inode *ip, int create)
 {
 	struct buffer_head *dibh;
 	int error;
@@ -350,7 +353,7 @@ int gfs2_inode_refresh(struct gfs2_inode
 	if (error)
 		return error;
 
-	error = gfs2_dinode_in(ip, dibh->b_data);
+	error = gfs2_dinode_in(ip, dibh->b_data, create);
 	brelse(dibh);
 	clear_bit(GIF_INVALID, &ip->i_flags);
 
@@ -515,7 +518,7 @@ static int alloc_dinode(struct gfs2_inod
 	if (error)
 		goto out_ipreserv;
 
-	error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation);
+	error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation, 1);
 	ip->i_no_formal_ino = ip->i_generation;
 	ip->i_inode.i_ino = ip->i_no_addr;
 	ip->i_goal = ip->i_no_addr;
@@ -669,6 +672,7 @@ static int link_dinode(struct gfs2_inode
 	return 0;
 
 fail_end_trans:
+	gfs2_unlink_di(&ip->i_inode);
 	gfs2_trans_end(sdp);
 fail_ipreserv:
 	gfs2_inplace_release(dip);
@@ -816,24 +820,37 @@ int gfs2_create_inode(struct inode *dir,
 
 	ip->i_iopen_gh.gh_gl->gl_object = ip;
 	gfs2_glock_put(io_gl);
+
+	if (vfs_dq_alloc_inode(inode)) {
+		error = -EDQUOT;
+		goto fail_dq;
+	}
+	if (vfs_dq_reserve_block(inode, 1)) {
+		vfs_dq_free_inode(inode);
+		error = -EDQUOT;
+		goto fail_dq;
+	}
+
 	gfs2_set_iop(inode);
 	insert_inode_hash(inode);
 
-	error = gfs2_inode_refresh(ip);
-	if (error)
+	error = gfs2_inode_refresh(ip, 1);
+	if (error) {
+		vfs_dq_release_reservation_block(inode, 1);
 		goto fail_gunlock3;
+	}
 
 	error = gfs2_acl_create(dip, inode);
 	if (error)
-		goto fail_gunlock3;
+		goto fail_gunlock4;
 
 	error = gfs2_security_init(dip, ip);
 	if (error)
-		goto fail_gunlock3;
+		goto fail_gunlock4;
 
 	error = link_dinode(dip, name, ip, arq);
 	if (error)
-		goto fail_gunlock3;
+		goto fail_gunlock4;
 
 	if (bh)
 		brelse(bh);
@@ -846,12 +863,20 @@ int gfs2_create_inode(struct inode *dir,
 	d_instantiate(dentry, inode);
 	return 0;
 
+fail_gunlock4:
+	vfs_dq_free_block(inode, 1);
 fail_gunlock3:
 	gfs2_glock_dq_uninit(ghs + 1);
 	if (ip->i_gl)
 		gfs2_glock_put(ip->i_gl);
+	vfs_dq_free_inode(inode);
 	goto fail_gunlock;
 
+fail_dq:
+	ip->i_iopen_gh.gh_gl->gl_object = NULL;
+	gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+	goto fail_gunlock2;
+
 fail_gunlock2:
 	gfs2_glock_dq_uninit(ghs + 1);
 fail_free_inode:
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/inode.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/inode.h
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/inode.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/inode.h	2015-01-21 12:02:53.428955462 +0300
@@ -124,7 +124,7 @@ extern struct inode *gfs2_lookup_by_inum
 					 unsigned int blktype);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock);
 
-extern int gfs2_inode_refresh(struct gfs2_inode *ip);
+extern int gfs2_inode_refresh(struct gfs2_inode *ip, int create);
 
 extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 				  int is_root);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/ops_fstype.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/ops_fstype.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/ops_fstype.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/ops_fstype.c	2015-01-21 12:02:53.421955646 +0300
@@ -1202,6 +1202,7 @@ static int fill_super(struct super_block
 	sb->s_export_op = &gfs2_export_ops;
 	sb->s_xattr = gfs2_xattr_handlers;
 	sb->s_qcop = &gfs2_quotactl_ops;
+	sb->dq_op  = &gfs2_quota_operations;
 	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
 	sb->s_time_gran = 1;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/ops_inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/ops_inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/ops_inode.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/ops_inode.c	2015-01-21 12:02:53.436955248 +0300
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
+#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
 #include <linux/mm.h>
@@ -338,6 +339,8 @@ static int gfs2_unlink(struct inode *dir
 	struct gfs2_rgrpd *rgd;
 	int error;
 
+	vfs_dq_init(dentry->d_inode);
+
 	error = gfs2_rindex_update(sdp);
 	if (error)
 		return error;
@@ -533,6 +538,7 @@ static int gfs2_rename(struct inode *odi
 	int error;
 
 	if (ndentry->d_inode) {
+		vfs_dq_init(ndentry->d_inode);
 		nip = GFS2_I(ndentry->d_inode);
 		if (ip == nip)
 			return 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/quota.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/quota.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/quota.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/quota.c	2015-01-21 12:02:53.422955619 +0300
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
+#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/sort.h>
 #include <linux/fs.h>
@@ -1641,3 +1642,30 @@ const struct quotactl_ops gfs2_quotactl_
 	.get_xquota	= gfs2_xquota_get,
 	.set_xquota	= gfs2_xquota_set,
 };
+
+static qsize_t *gfs2_get_reserved_space(struct inode *inode)
+{
+	return &GFS2_I(inode)->i_reserved_quota;
+}
+
+const struct dquot_operations gfs2_quota_operations = {
+	.initialize	= dquot_initialize,
+	.drop		= dquot_drop,
+	.alloc_space	= dquot_alloc_space,
+	.reserve_space	= dquot_reserve_space,
+	.claim_space	= dquot_claim_space,
+	.release_rsv	= dquot_release_reserved_space,
+	.alloc_inode	= dquot_alloc_inode,
+	.free_space	= dquot_free_space,
+	.free_inode	= dquot_free_inode,
+	.transfer	= dquot_transfer,
+	.write_dquot	= dquot_commit,
+	.acquire_dquot	= dquot_acquire,
+	.release_dquot	= dquot_release,
+	.mark_dirty	= dquot_mark_dquot_dirty,
+	.write_info	= dquot_commit_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
+
+	.get_reserved_space = gfs2_get_reserved_space,
+};
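
Everything in gfs2_quota_operations except get_reserved_space is the stock
dquot_* implementation, so the table mostly re-exports the generic quota
engine. The one gfs2-specific hook hands the generic code a pointer to the
i_reserved_quota counter added to struct gfs2_inode above; a sketch of how
the generic side is expected to consume it (modelled on fs/quota/dquot.c of
this era, simplified):

	static qsize_t *inode_reserved_space(struct inode *inode)
	{
		/* the filesystem must supply the hook when it uses
		 * reserve/claim accounting */
		WARN_ON(!inode->i_sb->dq_op->get_reserved_space);
		return inode->i_sb->dq_op->get_reserved_space(inode);
	}
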
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/quota.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/quota.h
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/quota.h	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/quota.h	2015-01-21 12:02:53.422955619 +0300
@@ -53,5 +53,6 @@ static inline int gfs2_quota_lock_check(
 
 extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
 extern const struct quotactl_ops gfs2_quotactl_ops;
+extern const struct dquot_operations gfs2_quota_operations;
 
 #endif /* __QUOTA_DOT_H__ */
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/rgrp.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/rgrp.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/rgrp.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/rgrp.c	2015-01-21 12:02:53.443955062 +0300
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
+#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
@@ -2013,12 +2014,13 @@ out:
  * @nblocks: requested number of blocks/extent length (value/result)
  * @dinode: 1 if we're allocating a dinode block, else 0
  * @generation: the generation number of the inode
+ * @do_reserve: reserve linux disk quota blocks
  *
  * Returns: 0 or error
  */
 
 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
-		      bool dinode, u64 *generation)
+		      bool dinode, u64 *generation, int do_reserve)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
@@ -2027,6 +2029,12 @@ int gfs2_alloc_blocks(struct gfs2_inode 
 	u64 goal;
 	u64 block; /* block, within the file system scope */
 	int error;
+	int quota_initial_reserve = *nblocks;
+
+	if (do_reserve) {
+		if (vfs_dq_reserve_block(&ip->i_inode, *nblocks))
+			return -EDQUOT;
+	}
 
 	if (gfs2_rs_active(ip->i_res))
 		goal = gfs2_rbm_to_block(&ip->i_res->rs_rbm);
@@ -2044,6 +2052,10 @@ int gfs2_alloc_blocks(struct gfs2_inode 
 				      NULL);
 	}
 
+	if (do_reserve && *nblocks != quota_initial_reserve)
+		vfs_dq_release_reservation_block(&ip->i_inode,
+					quota_initial_reserve - *nblocks);
+
 	/* Since all blocks are reserved in advance, this shouldn't happen */
 	if (error) {
 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d, fail_pt=%d\n",
@@ -2104,6 +2116,8 @@ int gfs2_alloc_blocks(struct gfs2_inode 
 	return 0;
 
 rgrp_error:
+	if (do_reserve)
+		vfs_dq_release_reservation_block(&ip->i_inode, *nblocks);
 	gfs2_rgrp_error(rbm.rgd);
 	return -EIO;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/rgrp.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/rgrp.h
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/rgrp.h	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/rgrp.h	2015-01-21 12:02:53.443955062 +0300
@@ -44,7 +44,7 @@ extern int gfs2_inplace_reserve(struct g
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
 extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
-			     bool dinode, u64 *generation);
+			     bool dinode, u64 *generation, int do_reserve);
 
 extern int gfs2_rs_alloc(struct gfs2_inode *ip);
 extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/super.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/super.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/super.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/super.c	2015-01-21 12:02:53.448954930 +0300
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
+#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/statfs.h>
 #include <linux/seq_file.h>
@@ -1506,7 +1507,7 @@ static void gfs2_delete_inode(struct ino
 	}
 
 	if (test_bit(GIF_INVALID, &ip->i_flags)) {
-		error = gfs2_inode_refresh(ip);
+		error = gfs2_inode_refresh(ip, 0);
 		if (error)
 			goto out_truncate;
 	}
@@ -1579,6 +1580,12 @@ out_truncate:
 		goto out_unlock;
 	/* Needs to be done before glock release & also in a transaction */
 	truncate_inode_pages(&inode->i_data, 0);
+
+	vfs_dq_init(inode);
+	vfs_dq_free_block(inode, 1);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
+
 	gfs2_trans_end(sdp);
 
 out_unlock:
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/gfs2/xattr.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/xattr.c
--- linux-2.6.32-504.3.3.el6.orig/fs/gfs2/xattr.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/gfs2/xattr.c	2015-01-21 12:02:53.463954532 +0300
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
+#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/xattr.h>
 #include <linux/gfs2_ondisk.h>
@@ -288,7 +289,7 @@ static int ea_dealloc_unstuffed(struct g
 		}
 
 		*dataptrs = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
+		vfs_dq_free_block(&ip->i_inode, 1);
 	}
 	if (bstart)
 		gfs2_free_meta(ip, bstart, blen);
@@ -612,7 +613,7 @@ static int ea_alloc_blk(struct gfs2_inod
 	u64 block;
 	int error;
 
-	error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+	error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL, 1);
 	if (error)
 		return error;
 	gfs2_trans_add_unrevoke(sdp, block, 1);
@@ -627,7 +628,7 @@ static int ea_alloc_blk(struct gfs2_inod
 	ea->ea_flags = GFS2_EAFLAG_LAST;
 	ea->ea_num_ptrs = 0;
 
-	gfs2_add_inode_blocks(&ip->i_inode, 1);
+	vfs_dq_claim_block(&ip->i_inode, 1);
 
 	return 0;
 }
@@ -650,6 +651,12 @@ static int ea_write(struct gfs2_inode *i
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	int error;
 
+	if (GFS2_EAREQ_SIZE_STUFFED(er) > sdp->sd_jbsize &&
+	    vfs_dq_reserve_block(&ip->i_inode,
+				 DIV_ROUND_UP(er->er_data_len,
+					      sdp->sd_jbsize)))
+		return -EDQUOT;
+
 	ea->ea_data_len = cpu_to_be32(er->er_data_len);
 	ea->ea_name_len = er->er_name_len;
 	ea->ea_type = er->er_type;
@@ -674,15 +681,18 @@ static int ea_write(struct gfs2_inode *i
 			int mh_size = sizeof(struct gfs2_meta_header);
 			unsigned int n = 1;
 
-			error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
-			if (error)
+			error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL, 0);
+			if (error) {
+				vfs_dq_release_reservation_block(&ip->i_inode,
+							ea->ea_num_ptrs - x);
 				return error;
+			}
 			gfs2_trans_add_unrevoke(sdp, block, 1);
 			bh = gfs2_meta_new(ip->i_gl, block);
 			gfs2_trans_add_meta(ip->i_gl, bh);
 			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
 
-			gfs2_add_inode_blocks(&ip->i_inode, 1);
+			vfs_dq_claim_block(&ip->i_inode, 1);
 
 			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
 							   data_len;
@@ -714,6 +724,7 @@ static int ea_alloc_skeleton(struct gfs2
 	struct gfs2_alloc_parms ap = { .target = blks };
 	struct buffer_head *dibh;
 	int error;
+	int error2 = 0;
 
 	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
 	if (error)
@@ -733,8 +744,13 @@ static int ea_alloc_skeleton(struct gfs2
 	if (error)
 		goto out_ipres;
 
-	error = skeleton_call(ip, er, private);
-	if (error)
+	/*
+	 * skeleton_call below might allocate a few disk blocks,
+	 * then fail with -EDQUOT. It wouldn't be nice to bail out
+	 * without flushing metadata info to disk in such a case.
+	 */
+	error2 = error = skeleton_call(ip, er, private);
+	if (error && (error != -EDQUOT))
 		goto out_end_trans;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -751,6 +767,8 @@ out_ipres:
 	gfs2_inplace_release(ip);
 out_gunlock_q:
 	gfs2_quota_unlock(ip);
+	if (error2)
+		return error2;
 	return error;
 }
 
@@ -990,7 +1008,7 @@ static int ea_set_block(struct gfs2_inod
 	} else {
 		u64 blk;
 		unsigned int n = 1;
-		error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL);
+		error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL, 1);
 		if (error)
 			return error;
 		gfs2_trans_add_unrevoke(sdp, blk, 1);
@@ -1003,7 +1021,7 @@ static int ea_set_block(struct gfs2_inod
 		*eablk = cpu_to_be64(ip->i_eattr);
 		ip->i_eattr = blk;
 		ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
-		gfs2_add_inode_blocks(&ip->i_inode, 1);
+		vfs_dq_claim_block(&ip->i_inode, 1);
 
 		eablk++;
 	}
@@ -1404,7 +1422,7 @@ static int ea_dealloc_indirect(struct gf
 		}
 
 		*eablk = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
+		vfs_dq_free_block(&ip->i_inode, 1);
 	}
 	if (bstart)
 		gfs2_free_meta(ip, bstart, blen);
@@ -1459,7 +1477,7 @@ static int ea_dealloc_block(struct gfs2_
 	gfs2_free_meta(ip, ip->i_eattr, 1);
 
 	ip->i_eattr = 0;
-	gfs2_add_inode_blocks(&ip->i_inode, -1);
+	vfs_dq_free_block(&ip->i_inode, 1);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
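
The ea_alloc_skeleton() change above is worth calling out: a quota failure
from skeleton_call is recorded in error2, but the transaction is still
ended and metadata still flushed before the error is reported. A condensed
sketch of that "defer the soft error" shape (do_work() and flush_metadata()
are hypothetical stand-ins):

	static int sketch_deferred_error(struct inode *inode)
	{
		int error, error2;

		error2 = error = do_work(inode);	/* may hit -EDQUOT */
		if (error && error != -EDQUOT)
			return error;		/* hard failure: bail out */

		error = flush_metadata(inode);	/* always flush on -EDQUOT */
		return error2 ? error2 : error;	/* soft error wins */
	}
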
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/hugetlbfs/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/hugetlbfs/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/hugetlbfs/inode.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/hugetlbfs/inode.c	2015-01-21 12:02:41.772264922 +0300
@@ -919,7 +919,8 @@ static int can_do_hugetlb_shm(void)
 	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
+struct file *hugetlb_file_setup(const char *name, size_t size,
+				vm_flags_t acctflag,
 				struct user_struct **user, int creat_flags)
 {
 	int error = -ENOMEM;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/inode.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/inode.c	2015-01-21 12:02:58.265827069 +0300
@@ -26,6 +26,8 @@
 #include <linux/mount.h>
 #include <linux/async.h>
 #include <linux/posix_acl.h>
+#include <linux/nsproxy.h>
+#include <linux/mnt_namespace.h>
 #include "internal.h"
 
 /*
@@ -86,6 +88,7 @@ static struct hlist_head *inode_hashtabl
  * the i_state of an inode while it is in use..
  */
 DEFINE_SPINLOCK(inode_lock);
+EXPORT_SYMBOL(inode_lock);
 
 /*
  * iprune_sem provides exclusion between the kswapd or try_to_free_pages
@@ -106,7 +109,7 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct kmem_cache *inode_cachep __read_mostly;
+struct kmem_cache *inode_cachep __read_mostly;
 
 static void wake_up_inode(struct inode *inode)
 {
@@ -125,19 +128,22 @@ static void wake_up_inode(struct inode *
  * These are initializations that need to be done on every inode
  * allocation as the fields are not initialised by slab allocation.
  */
+
+static struct address_space_operations vfs_empty_aops;
+const struct inode_operations vfs_empty_iops;
+static const struct file_operations vfs_empty_fops;
+EXPORT_SYMBOL(vfs_empty_iops);
+
 int inode_init_always(struct super_block *sb, struct inode *inode)
 {
-	static const struct address_space_operations empty_aops;
-	static const struct inode_operations empty_iops;
-	static const struct file_operations empty_fops;
 	struct address_space *const mapping = &inode->i_data;
 
 	inode->i_sb = sb;
 	inode->i_blkbits = sb->s_blocksize_bits;
 	inode->i_flags = 0;
 	atomic_set(&inode->i_count, 1);
-	inode->i_op = &empty_iops;
-	inode->i_fop = &empty_fops;
+	inode->i_op = &vfs_empty_iops;
+	inode->i_fop = &vfs_empty_fops;
 	inode->i_nlink = 1;
 	inode->i_uid = 0;
 	inode->i_gid = 0;
@@ -158,21 +164,22 @@ int inode_init_always(struct super_block
 	if (security_inode_alloc(inode))
 		goto out;
 	spin_lock_init(&inode->i_lock);
-	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
+	lockdep_set_class(&inode->i_lock, &sb->s_type->proto->i_lock_key);
 
 	mutex_init(&inode->i_mutex);
-	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+	lockdep_set_class(&inode->i_mutex, &sb->s_type->proto->i_mutex_key);
 
 	init_rwsem(&inode->i_alloc_sem);
-	lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
+	lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->proto->i_alloc_sem_key);
 
-	mapping->a_ops = &empty_aops;
+	mapping->a_ops = &vfs_empty_aops;
 	mapping->host = inode;
 	mapping->flags = 0;
 	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
 	mapping->assoc_mapping = NULL;
 	mapping->backing_dev_info = &default_backing_dev_info;
 	mapping->writeback_index = 0;
+	mapping->dirtied_ub = NULL;
 
 	/*
 	 * If the block_device provides a backing_dev_info for client
@@ -227,6 +234,7 @@ static struct inode *alloc_inode(struct 
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	BUG_ON(inode->i_data.dirtied_ub);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 #ifdef CONFIG_FS_POSIX_ACL
@@ -265,6 +273,7 @@ void inode_init_once(struct inode *inode
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
+	INIT_LIST_HEAD(&inode->i_data.i_peer_list);
 	i_size_ordered_init(inode);
 #ifdef CONFIG_INOTIFY
 	INIT_LIST_HEAD(&inode->inotify_watches);
@@ -297,6 +306,7 @@ void __iget(struct inode *inode)
 		list_move(&inode->i_list, &inode_in_use);
 	inodes_stat.nr_unused--;
 }
+EXPORT_SYMBOL(__iget);
 
 /*
  * get additional reference to inode; caller must already hold one.
@@ -377,14 +387,77 @@ static void dispose_list(struct list_hea
 	spin_unlock(&inode_lock);
 }
 
+static void show_header(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	printk("VFS: Busy inodes after unmount. "
+			"sb = %p, fs type = %s, sb count = %d, "
+			"sb->s_root = %s\n", sb,
+			(sb->s_type != NULL) ? sb->s_type->name : "",
+			sb->s_count,
+			(sb->s_root != NULL) ?
+			(char *)sb->s_root->d_name.name : "");
+}
+
+static void show_inode(struct inode *inode)
+{
+	struct dentry *d;
+	struct vfsmount *mnt;
+	int i;
+
+	printk("inode = %p, inode->i_count = %d, "
+			"inode->i_nlink = %d, "
+			"inode->i_mode = %d, "
+			"inode->i_state = %ld, "
+			"inode->i_flags = %d, "
+			"inode->i_devices.next = %p, "
+			"inode->i_devices.prev = %p, "
+			"inode->i_ino = %ld\n",
+			inode,
+			atomic_read(&inode->i_count),
+			inode->i_nlink,
+			inode->i_mode,
+			inode->i_state,
+			inode->i_flags,
+			inode->i_devices.next,
+			inode->i_devices.prev,
+			inode->i_ino);
+	printk("inode dump: ");
+	for (i = 0; i < sizeof(*inode); i++)
+		printk("%2.2x ", *((u_char *)inode + i));
+	printk("\n");
+	list_for_each_entry(d, &inode->i_dentry, d_alias) {
+		printk("  d_alias %s d_count=%d d_flags=%x\n",
+			d->d_name.name, atomic_read(&d->d_count), d->d_flags);
+		for (i = 0; i < sizeof(*d); i++)
+			printk("%2.2x ", *((u_char *)d + i));
+		printk("\n");
+	}
+
+	spin_lock(&vfsmount_lock);
+	list_for_each_entry(mnt, &get_task_mnt_ns(current)->list, mnt_list) {
+		if (mnt->mnt_sb != inode->i_sb)
+			continue;
+		printk("mnt=%p count=%d flags=%x exp_mask=%x\n",
+				mnt, atomic_read(&mnt->mnt_count),
+				mnt->mnt_flags,
+				mnt->mnt_expiry_mark);
+		for (i = 0; i < sizeof(*mnt); i++)
+			printk("%2.2x ", *((u_char *)mnt + i));
+		printk("\n");
+	}
+	spin_unlock(&vfsmount_lock);
+}
+
 /*
  * Invalidate all inodes for a device.
  */
 static int invalidate_list(struct list_head *head, struct list_head *dispose,
-			   bool kill_dirty)
+			   bool kill_dirty, int check)
 {
 	struct list_head *next;
-	int busy = 0, count = 0;
+	int busy = 0, count = 0, once = 1;
 
 	next = head->next;
 	for (;;) {
@@ -418,6 +491,14 @@ static int invalidate_list(struct list_h
 			continue;
 		}
 		busy = 1;
+
+		if (check) {
+			if (once) {
+				once = 0;
+				show_header(inode);
+			}
+			show_inode(inode);
+		}
 	}
 	/* only unused inodes may be cached with i_count zero */
 	inodes_stat.nr_unused -= count;
@@ -435,7 +516,7 @@ static int invalidate_list(struct list_h
  *	If @kill_dirty is set, discard dirty inodes too, otherwise treat
  *	them as busy.
  */
-int invalidate_inodes(struct super_block *sb, bool kill_dirty)
+int invalidate_inodes_check(struct super_block *sb, bool kill_dirty, int check)
 {
 	int busy;
 	LIST_HEAD(throw_away);
@@ -444,7 +525,7 @@ int invalidate_inodes(struct super_block
 	spin_lock(&inode_lock);
 	inotify_unmount_inodes(&sb->s_inodes);
 	fsnotify_unmount_inodes(sb);
-	busy = invalidate_list(&sb->s_inodes, &throw_away, kill_dirty);
+	busy = invalidate_list(&sb->s_inodes, &throw_away, kill_dirty, check);
 	spin_unlock(&inode_lock);
 
 	dispose_list(&throw_away);
@@ -452,7 +533,7 @@ int invalidate_inodes(struct super_block
 
 	return busy;
 }
-EXPORT_SYMBOL(invalidate_inodes);
+EXPORT_SYMBOL(invalidate_inodes_check);
 
 static int can_unuse(struct inode *inode)
 {
@@ -543,6 +624,7 @@ static void prune_icache(int nr_to_scan)
  */
 static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
+	KSTAT_PERF_ENTER(shrink_icache)
 	if (nr) {
 		/*
 		 * Nasty deadlock avoidance.  We may hold various FS locks,
@@ -553,6 +635,7 @@ static int shrink_icache_memory(struct s
 			return -1;
 		prune_icache(nr);
 	}
+	KSTAT_PERF_LEAVE(shrink_icache)
 	return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
@@ -660,6 +743,31 @@ void inode_add_to_lists(struct super_blo
 EXPORT_SYMBOL_GPL(inode_add_to_lists);
 
 /**
+ *	new_inode_pseudo 	- obtain an inode
+ *	@sb: superblock
+ *
+ *	Allocates a new inode for the given superblock.
+ *	The inode will not be chained into the superblock's s_inodes list.
+ *	This means:
+ *	- the fs can't be unmounted
+ *	- quotas, fsnotify and writeback can't work on it
+ */
+struct inode *new_inode_pseudo(struct super_block *sb)
+{
+	struct inode *inode = alloc_inode(sb);
+
+	if (inode) {
+		spin_lock(&inode->i_lock);
+		inodes_stat.nr_inodes++;
+		inode->i_state = 0;
+		spin_unlock(&inode->i_lock);
+		INIT_LIST_HEAD(&inode->i_list);
+		INIT_LIST_HEAD(&inode->i_sb_list);
+	}
+	return inode;
+}
+
+/**
  *	new_inode 	- obtain an inode
  *	@sb: superblock
  *
@@ -703,14 +811,14 @@ void unlock_new_inode(struct inode *inod
 
 		/* Set new key only if filesystem hasn't already changed it */
 		if (!lockdep_match_class(&inode->i_mutex,
-		    &type->i_mutex_key)) {
+		    &type->proto->i_mutex_key)) {
 			/*
 			 * ensure nobody is actually holding i_mutex
 			 */
 			mutex_destroy(&inode->i_mutex);
 			mutex_init(&inode->i_mutex);
 			lockdep_set_class(&inode->i_mutex,
-					  &type->i_mutex_dir_key);
+					  &type->proto->i_mutex_dir_key);
 		}
 	}
 #endif
@@ -1273,7 +1381,7 @@ int generic_detach_inode(struct inode *i
 		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
 			list_move(&inode->i_list, &inode_unused);
 		inodes_stat.nr_unused++;
-		if (sb->s_flags & MS_ACTIVE) {
+		if (sb->s_flags & MS_ACTIVE && !(inode->i_flags & S_NOUNUSE)) {
 			spin_unlock(&inode_lock);
 			return 0;
 		}
@@ -1383,6 +1491,8 @@ sector_t bmap(struct inode *inode, secto
 }
 EXPORT_SYMBOL(bmap);
 
+unsigned __read_mostly relatime_interval = 24*60*60; /* one day */
+
 /*
  * With relative atime, only update atime if the previous atime is
  * earlier than either the ctime or mtime or if at least a day has
@@ -1406,10 +1516,10 @@ static int relatime_need_update(struct v
 		return 1;
 
 	/*
-	 * Is the previous atime value older than a day? If yes,
-	 * update atime:
+	 * Is the previous atime value older than the update interval?
+	 * If yes, update atime:
 	 */
-	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= relatime_interval)
 		return 1;
 	/*
 	 * Good, we can skip the atime update:
@@ -1479,10 +1589,19 @@ EXPORT_SYMBOL(touch_atime);
 
 void file_update_time(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	struct timespec now;
 	enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
 
+	/*
+	 * Most files maintain the invariant
+	 * (f_path.dentry->d_inode == f_mapping->host),
+	 * but stacked filesystems usually store the real inode in
+	 * f_mapping->host, while blkdev stores it in f_path.dentry->d_inode.
+	 */
+	if (S_ISBLK(inode->i_mode))
+		inode = file->f_path.dentry->d_inode;
+
 	/* First try to exhaust all avenues to not sync */
 	if (IS_NOCMTIME(inode))
 		return;
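
The relatime_interval knob exported above generalizes the previously hard-coded 24-hour window. A condensed sketch of the resulting decision, mirroring relatime_need_update() (which additionally takes the vfsmount; timespec_compare() as in mainline):

	static int relatime_should_update(struct inode *inode, struct timespec now)
	{
		/* always update if mtime or ctime moved past the cached atime */
		if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
			return 1;
		if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
			return 1;
		/* otherwise only when atime is older than the tunable interval */
		if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= relatime_interval)
			return 1;
		return 0;
	}
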
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/internal.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/internal.h
--- linux-2.6.32-504.3.3.el6.orig/fs/internal.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/internal.h	2015-01-21 12:02:52.020992836 +0300
@@ -89,6 +89,8 @@ extern void chroot_fs_refs(struct path *
 /*
  * file_table.c
  */
+extern void file_sb_list_add(struct file *f, struct super_block *sb);
+extern void file_sb_list_del(struct file *f);
 extern void mark_files_ro(struct super_block *);
 extern struct file *get_empty_filp(void);
 
@@ -97,6 +99,9 @@ extern struct file *get_empty_filp(void)
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
 
+extern long do_handle_open(int mountdirfd,
+			   struct file_handle __user *ufh, int open_flag);
+
 /*
  * open.c
  */
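
do_handle_open() declared above is the kernel half of the file-handle API; a hedged userspace sketch of the matching usage, assuming the syscall wrappers name_to_handle_at()/open_by_handle_at() and MAX_HANDLE_SZ as in later mainline kernels:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>

	/* Reopen a file via an opaque handle: resolve it once, then reopen
	 * later through a descriptor on the containing mount. */
	static int reopen_by_handle(int mount_fd, const char *path)
	{
		struct file_handle *fh;
		int mount_id, fd;

		fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
		if (!fh)
			return -1;
		fh->handle_bytes = MAX_HANDLE_SZ;
		if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) < 0) {
			free(fh);
			return -1;
		}
		fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
		free(fh);
		return fd;
	}
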
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ioctl.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ioctl.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ioctl.c	2014-12-12 23:29:03.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ioctl.c	2015-01-21 12:02:41.672267577 +0300
@@ -613,9 +613,11 @@ int do_vfs_ioctl(struct file *filp, unsi
 
 	case FIGETBSZ:
 	{
-		struct inode *inode = filp->f_path.dentry->d_inode;
+		struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
 		int __user *p = (int __user *)arg;
-		return put_user(inode->i_sb->s_blocksize, p);
+		if (sb->s_blocksize == 1ul << sb->s_blocksize_bits)
+			return put_user(sb->s_blocksize, p);
+		/* fall through */
 	}
 
 	default:
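
With the hunk above, FIGETBSZ only reports s_blocksize when it is the power of two implied by s_blocksize_bits; otherwise the request falls through to the default handler (typically failing with ENOTTY). A minimal userspace probe:

	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* FIGETBSZ */

	static int fs_blocksize(int fd)
	{
		int bsz;

		if (ioctl(fd, FIGETBSZ, &bsz) < 0)
			return -1;	/* e.g. blocksize not a power of two */
		return bsz;
	}
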
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ioprio.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ioprio.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ioprio.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ioprio.c	2015-01-21 12:02:47.189121109 +0300
@@ -26,6 +26,7 @@
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/pid_namespace.h>
+#include <bc/beancounter.h>
 
 int set_task_ioprio(struct task_struct *task, int ioprio)
 {
@@ -81,6 +82,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, 
 	struct pid *pgrp;
 	int ret;
 
+	if (!ve_is_super(get_exec_env())) {
+		if (which == IOPRIO_WHO_UBC)
+			return -EPERM;
+
+		switch (class) {
+			case IOPRIO_CLASS_RT:
+				if (!capable(CAP_VE_ADMIN))
+					return -EPERM;
+				class = IOPRIO_CLASS_BE;
+				data = 0;
+				break;
+			case IOPRIO_CLASS_IDLE:
+				class = IOPRIO_CLASS_BE;
+				data = IOPRIO_BE_NR - 1;
+				break;
+		}
+		ioprio = IOPRIO_PRIO_VALUE(class, data);
+	}
+
 	switch (class) {
 		case IOPRIO_CLASS_RT:
 			if (!capable(CAP_SYS_ADMIN))
@@ -137,17 +157,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, 
 			if (!user)
 				break;
 
-			do_each_thread(g, p) {
+			do_each_thread_all(g, p) {
 				if (__task_cred(p)->uid != who)
 					continue;
 				ret = set_task_ioprio(p, ioprio);
 				if (ret)
 					goto free_uid;
-			} while_each_thread(g, p);
+			} while_each_thread_all(g, p);
 free_uid:
 			if (who)
 				free_uid(user);
 			break;
+		case IOPRIO_WHO_UBC:
+			if (class != IOPRIO_CLASS_BE) {
+				ret = -ERANGE;
+				break;
+			}
+
+			ret = ub_set_ioprio(who, data);
+			break;
 		default:
 			ret = -EINVAL;
 	}
@@ -230,7 +258,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, 
 			if (!user)
 				break;
 
-			do_each_thread(g, p) {
+			do_each_thread_ve(g, p) {
 				if (__task_cred(p)->uid != user->uid)
 					continue;
 				tmpio = get_task_ioprio(p);
@@ -240,7 +268,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, 
 					ret = tmpio;
 				else
 					ret = ioprio_best(ret, tmpio);
-			} while_each_thread(g, p);
+			} while_each_thread_ve(g, p);
 
 			if (who)
 				free_uid(user);
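
Inside a container, the ioprio_set hunk above clamps RT and IDLE requests into the best-effort class. A sketch of that clamping using the standard ioprio encoding (IOPRIO_CLASS_SHIFT is 13 and IOPRIO_BE_NR is 8 in mainline headers):

	#define IOPRIO_CLASS_SHIFT	13
	#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
	enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };
	#define IOPRIO_BE_NR	8

	static int clamp_container_ioprio(int class, int data)
	{
		if (class == IOPRIO_CLASS_RT)		/* RT -> strongest BE level */
			return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
		if (class == IOPRIO_CLASS_IDLE)		/* IDLE -> weakest BE level */
			return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NR - 1);
		return IOPRIO_PRIO_VALUE(class, data);
	}
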
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/isofs/export.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/isofs/export.c
--- linux-2.6.32-504.3.3.el6.orig/fs/isofs/export.c	2014-12-12 23:28:51.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/isofs/export.c	2015-01-21 12:02:52.020992836 +0300
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *de
 	 * offset of the inode and the upper 16 bits of fh32[1] to
 	 * hold the offset of the parent.
 	 */
-
-	if (len < 3 || (connectable && len < 5))
+	if (connectable && (len < 5)) {
+		*max_len = 5;
+		return 255;
+	} else if (len < 3) {
+		*max_len = 3;
 		return 255;
+	}
 
 	len = 3;
 	fh32[0] = ei->i_iget5_block;
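
The isofs hunk above adopts the exportfs convention for undersized handle buffers: report the required length through *max_len and return 255. A generic sketch of that contract; the handle type codes are filesystem-specific:

	static int encode_fh_sketch(__u32 *fh, int *max_len, int connectable)
	{
		int need = connectable ? 5 : 3;		/* in 32-bit words */

		if (*max_len < need) {
			*max_len = need;		/* tell the caller how much */
			return 255;			/* "buffer too small" */
		}
		/* ... fill fh[0..need-1] here ... */
		*max_len = need;
		return connectable ? 2 : 1;		/* fs-specific handle types */
	}
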
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/jbd/journal.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd/journal.c
--- linux-2.6.32-504.3.3.el6.orig/fs/jbd/journal.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd/journal.c	2015-01-21 12:02:52.561978475 +0300
@@ -443,8 +443,7 @@ int __log_start_commit(journal_t *journa
 	 * currently running transaction (if it exists).  Otherwise,
 	 * the target tid must be an old one.
 	 */
-	if (journal->j_commit_request != target &&
-	    journal->j_running_transaction &&
+	if (journal->j_running_transaction &&
 	    journal->j_running_transaction->t_tid == target) {
 		/*
 		 * We want a new commit: OK, mark the request and wakeup the
@@ -581,12 +580,11 @@ int log_wait_commit(journal_t *journal, 
 		spin_lock(&journal->j_state_lock);
 	}
 out_unlock:
-	spin_unlock(&journal->j_state_lock);
-
 	if (unlikely(is_journal_aborted(journal))) {
 		printk(KERN_EMERG "journal commit I/O error\n");
 		err = -EIO;
 	}
+	spin_unlock(&journal->j_state_lock);
 	return err;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/jbd/transaction.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd/transaction.c
--- linux-2.6.32-504.3.3.el6.orig/fs/jbd/transaction.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd/transaction.c	2015-01-21 12:02:43.340223293 +0300
@@ -26,6 +26,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
+#include <linux/virtinfo.h>
 
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -97,6 +98,8 @@ static int start_this_handle(journal_t *
 		goto out;
 	}
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_JOURNAL, NULL);
+
 alloc_transaction:
 	if (!journal->j_running_transaction) {
 		new_transaction = kzalloc(sizeof(*new_transaction),
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/jbd2/journal.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd2/journal.c
--- linux-2.6.32-504.3.3.el6.orig/fs/jbd2/journal.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd2/journal.c	2015-01-21 12:02:52.558978556 +0300
@@ -487,7 +487,6 @@ int __jbd2_log_start_commit(journal_t *j
 			  journal->j_commit_request,
 			  journal->j_commit_sequence);
 		wake_up(&journal->j_wait_commit);
-		return 1;
 	} else if (!tid_geq(journal->j_commit_request, target))
 		/* This should never happen, but if it does, preserve
 		   the evidence before kjournald goes into a loop and
@@ -496,7 +495,8 @@ int __jbd2_log_start_commit(journal_t *j
 		     journal->j_commit_request, journal->j_commit_sequence,
 		     target, journal->j_running_transaction ? 
 		     journal->j_running_transaction->t_tid : 0);
-	return 0;
+
+	return tid_gt(target, journal->j_commit_sequence);
 }
 
 int jbd2_log_start_commit(journal_t *journal, tid_t tid)
@@ -642,12 +642,11 @@ int jbd2_log_wait_commit(journal_t *jour
 				!tid_gt(tid, journal->j_commit_sequence));
 		spin_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&journal->j_state_lock);
-
 	if (unlikely(is_journal_aborted(journal))) {
 		printk(KERN_EMERG "journal commit I/O error\n");
 		err = -EIO;
 	}
+	spin_unlock(&journal->j_state_lock);
 	return err;
 }
 
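The reworked return value above relies on the wrap-safe transaction-id comparison helpers; for reference, a sketch matching their mainline definitions in include/linux/jbd2.h:

	typedef unsigned int tid_t;

	static inline int tid_gt(tid_t x, tid_t y)
	{
		int difference = (x - y);	/* signed view of the distance */
		return difference > 0;		/* correct across 32-bit wraparound */
	}

	static inline int tid_geq(tid_t x, tid_t y)
	{
		int difference = (x - y);
		return difference >= 0;
	}
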
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/jbd2/recovery.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd2/recovery.c
--- linux-2.6.32-504.3.3.el6.orig/fs/jbd2/recovery.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd2/recovery.c	2015-01-21 12:02:42.281251409 +0300
@@ -20,6 +20,7 @@
 #include <linux/fs.h>
 #include <linux/jbd2.h>
 #include <linux/errno.h>
+#include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/crc32.h>
 #endif
@@ -36,6 +37,9 @@ struct recovery_info
 	int		nr_replays;
 	int		nr_revokes;
 	int		nr_revoke_hits;
+
+	unsigned int		last_log_block;
+	struct buffer_head	*last_commit_bh;
 };
 
 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
@@ -211,6 +215,71 @@ do {									\
 		var -= ((journal)->j_last - (journal)->j_first);	\
 } while (0)
 
+/*
+ * The 'Raid amnesia' effect protection: https://jira.sw.ru/browse/PSBM-15484
+ *
+ * Some block devices can return different data on reads from the same block
+ * after a power failure (for example, a mirrored RAID that is out of sync,
+ * with resync in progress). In that case the following situation is possible:
+ *
+ * A power failure happens after the commit block was issued for
+ * transaction 'D'; on the next boot the first disk has the commit block,
+ * but the second one does not.
+ * mirror1: journal={Ac-Bc-Cc-Dc }
+ * mirror2: journal={Ac-Bc-Cc-D  }
+ * Now assume we read from mirror1 and find that 'D' has a valid commit
+ * block, so journal_replay will replay that transaction; but a second
+ * power failure may happen before journal_reset(), so the next
+ * journal_replay() may read from mirror2 and find that 'C' is the last
+ * valid transaction. This results in corruption, because we have already
+ * replayed transaction 'D'.
+ * To avoid this ambiguity we perform a 'stabilize write':
+ * 1) Read and rewrite the latest commit block.
+ * 2) Invalidate the next block, in order to guarantee that the journal
+ *    head becomes stable.
+ * Yes, the 'stabilize write' approach is ugly, but it is the only way to
+ * run a filesystem on block devices with the 'raid amnesia' effect.
+ */
+static int stabilize_journal_head(journal_t *journal, struct recovery_info *info)
+{
+	struct buffer_head *bh[2] = {NULL, NULL};
+	int err, err2, i;
+
+	if (!info->last_commit_bh)
+		return 0;
+
+	bh[0] = info->last_commit_bh;
+	info->last_commit_bh = NULL;
+
+	err = jread(&bh[1], journal, info->last_log_block);
+	if (err)
+		goto out;
+
+	for (i = 0; i < 2; i++) {
+		lock_buffer(bh[i]);
+		/* Explicitly invalidate block beyond last commit block */
+		if (i == 1)
+			memset(bh[i]->b_data, 0, journal->j_blocksize);
+
+		BUFFER_TRACE(bh[i], "marking dirty");
+		set_buffer_uptodate(bh[i]);
+		mark_buffer_dirty(bh[i]);
+		BUFFER_TRACE(bh[i], "marking uptodate");
+		unlock_buffer(bh[i]);
+	}
+	err = sync_blockdev(journal->j_dev);
+	/* Make sure data is on permanent storage */
+	if (journal->j_flags & JBD2_BARRIER) {
+		err2 = blkdev_issue_flush(journal->j_dev, NULL);
+		if (!err)
+			err = err2;
+	}
+out:
+	brelse(bh[0]);
+	brelse(bh[1]);
+	return err;
+}
+
 /**
  * jbd2_journal_recover - recovers a on-disk journal
  * @journal: the journal to recover
@@ -248,6 +317,8 @@ int jbd2_journal_recover(journal_t *jour
 
 	err = do_one_pass(journal, &info, PASS_SCAN);
 	if (!err)
+		err = stabilize_journal_head(journal, &info);
+	if (!err)
 		err = do_one_pass(journal, &info, PASS_REVOKE);
 	if (!err)
 		err = do_one_pass(journal, &info, PASS_REPLAY);
@@ -266,6 +337,13 @@ int jbd2_journal_recover(journal_t *jour
 	err2 = sync_blockdev(journal->j_fs_dev);
 	if (!err)
 		err = err2;
+	/* Make sure data is on permanent storage */
+	if (journal->j_flags & JBD2_BARRIER) {
+		err2 = blkdev_issue_flush(journal->j_fs_dev, NULL);
+		if (!err)
+			err = err2;
+	}
+
 
 	return err;
 }
@@ -294,6 +372,7 @@ int jbd2_journal_skip_recovery(journal_t
 	sb = journal->j_superblock;
 
 	err = do_one_pass(journal, &info, PASS_SCAN);
+	brelse(info.last_commit_bh);
 
 	if (err) {
 		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
@@ -357,6 +436,7 @@ static int do_one_pass(journal_t *journa
 {
 	unsigned int		first_commit_ID, next_commit_ID;
 	unsigned long		next_log_block;
+	unsigned long		last_commit_block;
 	int			err, success = 0;
 	journal_superblock_t *	sb;
 	journal_header_t *	tmp;
@@ -380,6 +460,7 @@ static int do_one_pass(journal_t *journa
 	sb = journal->j_superblock;
 	next_commit_ID = be32_to_cpu(sb->s_sequence);
 	next_log_block = be32_to_cpu(sb->s_start);
+	last_commit_block = 0;
 
 	first_commit_ID = next_commit_ID;
 	if (pass == PASS_SCAN)
@@ -654,7 +735,9 @@ static int do_one_pass(journal_t *journa
 				}
 				crc32_sum = ~0;
 			}
-			brelse(bh);
+			brelse(info->last_commit_bh);
+			info->last_commit_bh = bh;
+			info->last_log_block = next_log_block;
 			next_commit_ID++;
 			continue;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/jbd2/transaction.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd2/transaction.c
--- linux-2.6.32-504.3.3.el6.orig/fs/jbd2/transaction.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/jbd2/transaction.c	2015-01-21 12:02:43.341223267 +0300
@@ -26,6 +26,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
+#include <linux/virtinfo.h>
 
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -100,6 +101,8 @@ static int start_this_handle(journal_t *
 		goto out;
 	}
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_JOURNAL, NULL);
+
 alloc_transaction:
 	if (!journal->j_running_transaction) {
 		new_transaction = kzalloc(sizeof(*new_transaction),
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/jfs/namei.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/jfs/namei.c
--- linux-2.6.32-504.3.3.el6.orig/fs/jfs/namei.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/jfs/namei.c	2015-01-21 12:02:52.020992836 +0300
@@ -802,9 +802,6 @@ static int jfs_link(struct dentry *old_d
 	if (ip->i_nlink == JFS_LINK_MAX)
 		return -EMLINK;
 
-	if (ip->i_nlink == 0)
-		return -ENOENT;
-
 	tid = txBegin(ip->i_sb, 0);
 
 	mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/libfs.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/libfs.c
--- linux-2.6.32-504.3.3.el6.orig/fs/libfs.c	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/libfs.c	2015-01-21 12:02:42.204253452 +0300
@@ -543,8 +543,10 @@ int simple_fill_super(struct super_block
 		if (!dentry)
 			goto out;
 		inode = new_inode(s);
-		if (!inode)
+		if (!inode) {
+			dput(dentry);
 			goto out;
+		}
 		inode->i_mode = S_IFREG | files->mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_fop = files->ops;
@@ -555,6 +557,7 @@ int simple_fill_super(struct super_block
 	return 0;
 out:
 	d_genocide(root);
+	shrink_dcache_parent(root);
 	dput(root);
 	return -ENOMEM;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/lockd/clntlock.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/clntlock.c
--- linux-2.6.32-504.3.3.el6.orig/fs/lockd/clntlock.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/clntlock.c	2015-01-21 12:02:47.584110623 +0300
@@ -78,8 +78,12 @@ EXPORT_SYMBOL_GPL(nlmclnt_init);
  */
 void nlmclnt_done(struct nlm_host *host)
 {
+	struct ve_struct *old_ve;
+
 	nlm_release_host(host);
+	old_ve = set_exec_env(host->owner_env);
 	lockd_down();
+	(void)set_exec_env(old_ve);
 }
 EXPORT_SYMBOL_GPL(nlmclnt_done);
 
@@ -214,9 +218,11 @@ reclaimer(void *ptr)
 	struct nlm_wait	  *block;
 	struct file_lock *fl, *next;
 	u32 nsmstate;
+	struct ve_struct *old_ve;
 
 	allow_signal(SIGKILL);
 
+	old_ve = set_exec_env(host->owner_env);
 	down_write(&host->h_rwsem);
 
 	/* This one ensures that our parent doesn't terminate while the
@@ -273,5 +279,6 @@ restart:
 	nlm_release_host(host);
 	lockd_down();
 	unlock_kernel();
+	set_exec_env(old_ve);
 	return 0;
 }
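
nlmclnt_done() and reclaimer() above both bracket their work with the same container-context dance; a distilled sketch of the pattern (set_exec_env() returns the previous environment, as used throughout this patch):

	static void run_in_owner_ve(struct nlm_host *host)
	{
		/* switch to the container that owns the host ... */
		struct ve_struct *old_ve = set_exec_env(host->owner_env);

		/* ... do work that must observe that container's state ... */

		/* ... and always restore the caller's context */
		(void)set_exec_env(old_ve);
	}
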
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/lockd/clntproc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/clntproc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/lockd/clntproc.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/clntproc.c	2015-01-21 12:02:51.094017444 +0300
@@ -17,6 +17,9 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
+#include <linux/nfs_mount.h>
+
+#include "ve.h"
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 #define NLMCLNT_GRACE_WAIT	(5*HZ)
@@ -27,7 +30,7 @@ static int	nlmclnt_test(struct nlm_rqst 
 static int	nlmclnt_lock(struct nlm_rqst *, struct file_lock *);
 static int	nlmclnt_unlock(struct nlm_rqst *, struct file_lock *);
 static int	nlm_stat_to_errno(__be32 stat);
-static void	nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host);
+static void	nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host, long pid);
 static int	nlmclnt_cancel(struct nlm_host *, int , struct file_lock *);
 
 static const struct rpc_call_ops nlmclnt_unlock_ops;
@@ -92,7 +95,7 @@ static struct nlm_lockowner *__nlm_find_
 	return NULL;
 }
 
-static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner)
+static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner, long pid)
 {
 	struct nlm_lockowner *res, *new = NULL;
 
@@ -107,7 +110,7 @@ static struct nlm_lockowner *nlm_find_lo
 			res = new;
 			atomic_set(&new->count, 1);
 			new->owner = owner;
-			new->pid = __nlm_alloc_pid(host);
+			new->pid = (pid < 0) ? __nlm_alloc_pid(host) : (uint32_t)pid;
 			new->host = nlm_get_host(host);
 			list_add(&new->list, &host->h_lockowners);
 			new = NULL;
@@ -118,6 +121,38 @@ static struct nlm_lockowner *nlm_find_lo
 	return res;
 }
 
+int nlmclnt_set_lockowner(struct inode *inode, struct file_lock *fl, int svid)
+{
+	struct nlm_host *host;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const char *hostname = clp->cl_hostname;
+	const struct sockaddr *address = (struct sockaddr *)&clp->cl_addr;
+	size_t addrlen = clp->cl_addrlen;
+	unsigned short protocol = (clp->cl_proto == XPRT_TRANSPORT_UDP) 
+								? IPPROTO_UDP 
+								: IPPROTO_TCP;
+	u32 nfs_version = clp->rpc_ops->version;
+	int noresvport = server->flags & NFS_MOUNT_NORESVPORT ?	1 : 0;
+	u32 nlm_version = (nfs_version == 2) ? 1 : 4;
+
+	if (nfs_version > 3)
+		return 0;
+	if (server->flags & NFS_MOUNT_NONLM)
+		return 0;
+
+	host = nlmclnt_lookup_host(address, addrlen, protocol,
+				   nlm_version, hostname, noresvport);
+	if (host == NULL)
+		return -ENOLCK;
+
+	nlmclnt_locks_init_private(fl, host, svid);
+	nlm_release_host(host);
+
+	return 0;
+}
+EXPORT_SYMBOL(nlmclnt_set_lockowner);
+
 /*
  * Initialize arguments for TEST/LOCK/UNLOCK/CANCEL calls
  */
@@ -155,13 +190,16 @@ int nlmclnt_proc(struct nlm_host *host, 
 {
 	struct nlm_rqst		*call;
 	int			status;
+	struct ve_struct *ve;
 
 	nlm_get_host(host);
 	call = nlm_alloc_call(host);
 	if (call == NULL)
 		return -ENOMEM;
 
-	nlmclnt_locks_init_private(fl, host);
+	ve = set_exec_env(host->owner_env);
+
+	nlmclnt_locks_init_private(fl, host, -1);
 	/* Set up the argument struct */
 	nlmclnt_setlockargs(call, fl);
 
@@ -182,6 +220,7 @@ int nlmclnt_proc(struct nlm_host *host, 
 	unlock_kernel();
 
 	dprintk("lockd: clnt proc returns %d\n", status);
+	(void)set_exec_env(ve);
 	return status;
 }
 EXPORT_SYMBOL_GPL(nlmclnt_proc);
@@ -458,16 +497,23 @@ static void nlmclnt_locks_release_privat
 	nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
 }
 
+static int nlm_get_lockid(struct file_lock *fl)
+{
+	return fl->fl_u.nfs_fl.owner->pid;
+}
+
 static const struct file_lock_operations nlmclnt_lock_ops = {
 	.fl_copy_lock = nlmclnt_locks_copy_lock,
 	.fl_release_private = nlmclnt_locks_release_private,
+	.fl_owner_id = nlm_get_lockid,
 };
 
-static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
+static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host,
+				       long pid)
 {
 	BUG_ON(fl->fl_ops != NULL);
 	fl->fl_u.nfs_fl.state = 0;
-	fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner);
+	fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner, pid);
 	INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
 	fl->fl_ops = &nlmclnt_lock_ops;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/lockd/grace.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/grace.c
--- linux-2.6.32-504.3.3.el6.orig/fs/lockd/grace.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/grace.c	2015-01-21 12:02:47.659108631 +0300
@@ -4,8 +4,10 @@
 
 #include <linux/module.h>
 #include <linux/lockd/bind.h>
+#include <linux/sched.h>
+
+#include "ve.h"
 
-static LIST_HEAD(grace_list);
 static DEFINE_SPINLOCK(grace_lock);
 
 /**
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/lockd/host.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/host.c
--- linux-2.6.32-504.3.3.el6.orig/fs/lockd/host.c	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/host.c	2015-01-21 12:02:47.659108631 +0300
@@ -19,6 +19,8 @@
 
 #include <net/ipv6.h>
 
+#include "ve.h"
+
 #define NLMDBG_FACILITY		NLMDBG_HOSTCACHE
 #define NLM_HOST_NRHASH		32
 #define NLM_HOST_REBIND		(60 * HZ)
@@ -30,7 +32,7 @@ static unsigned long		next_gc;
 static int			nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
-static void			nlm_gc_hosts(void);
+static int			nlm_gc_hosts(struct ve_struct *ve);
 
 struct nlm_lookup_host_info {
 	const int		server;		/* search for server|client */
@@ -96,11 +98,13 @@ static struct nlm_host *nlm_lookup_host(
 	struct hlist_node *pos;
 	struct nlm_host	*host;
 	struct nsm_handle *nsm = NULL;
+	struct ve_struct *ve;
 
+	ve = get_exec_env();
 	mutex_lock(&nlm_host_mutex);
 
 	if (time_after_eq(jiffies, next_gc))
-		nlm_gc_hosts();
+		nlm_gc_hosts(ve);
 
 	/* We may keep several nlm_host objects for a peer, because each
 	 * nlm_host is identified by
@@ -109,10 +113,13 @@ static struct nlm_host *nlm_lookup_host(
 	 * different NLM rpc_clients into one single nlm_host object.
 	 * This would allow us to have one nlm_host per address.
 	 */
+
 	chain = &nlm_hosts[nlm_hash_address(ni->sap)];
 	hlist_for_each_entry(host, pos, chain, h_hash) {
 		if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
 			continue;
+		if (!ve_accessible_strict(host->owner_env, ve))
+			continue;
 
 		/* See if we have an NSM handle for this client */
 		if (!nsm)
@@ -187,6 +194,7 @@ static struct nlm_host *nlm_lookup_host(
 	spin_lock_init(&host->h_lock);
 	INIT_LIST_HEAD(&host->h_granted);
 	INIT_LIST_HEAD(&host->h_reclaim);
+	host->owner_env    = ve;
 
 	nrhosts++;
 
@@ -491,6 +499,11 @@ nlm_shutdown_hosts(void)
 	struct hlist_head *chain;
 	struct hlist_node *pos;
 	struct nlm_host	*host;
+	int nr_hosts_local;
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	nr_hosts_local = 0;
 
 	dprintk("lockd: shutting down host module\n");
 	mutex_lock(&nlm_host_mutex);
@@ -499,24 +512,29 @@ nlm_shutdown_hosts(void)
 	dprintk("lockd: nuking all hosts...\n");
 	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
 		hlist_for_each_entry(host, pos, chain, h_hash) {
+			if (!ve_accessible_strict(host->owner_env, ve))
+				continue;
 			host->h_expires = jiffies - 1;
 			if (host->h_rpcclnt) {
 				rpc_shutdown_client(host->h_rpcclnt);
 				host->h_rpcclnt = NULL;
 			}
+			nr_hosts_local++;
 		}
 	}
 
 	/* Then, perform a garbage collection pass */
-	nlm_gc_hosts();
+	nr_hosts_local -= nlm_gc_hosts(ve);
 	mutex_unlock(&nlm_host_mutex);
 
 	/* complain if any hosts are left */
-	if (nrhosts) {
+	if (nr_hosts_local) {
 		printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
-		dprintk("lockd: %d hosts left:\n", nrhosts);
+		dprintk("lockd: %d hosts left:\n", nr_hosts_local);
 		for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
 			hlist_for_each_entry(host, pos, chain, h_hash) {
+				if (!ve_accessible_strict(host->owner_env, ve))
+					continue;
 				dprintk("       %s (cnt %d use %d exp %ld)\n",
 					host->h_name, atomic_read(&host->h_count),
 					host->h_inuse, host->h_expires);
@@ -530,17 +548,23 @@ nlm_shutdown_hosts(void)
  * This GC combines reference counting for async operations with
  * mark & sweep for resources held by remote clients.
  */
-static void
-nlm_gc_hosts(void)
+static int
+nlm_gc_hosts(struct ve_struct *ve)
 {
 	struct hlist_head *chain;
 	struct hlist_node *pos, *next;
 	struct nlm_host	*host;
+	int freed;
+
+	freed = 0;
 
 	dprintk("lockd: host garbage collection\n");
 	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash)
+		hlist_for_each_entry(host, pos, chain, h_hash) {
+			if (!ve_accessible_strict(host->owner_env, ve))
+				continue;
 			host->h_inuse = 0;
+		}
 	}
 
 	/* Mark all hosts that hold locks, blocks or shares */
@@ -549,7 +573,8 @@ nlm_gc_hosts(void)
 	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
 		hlist_for_each_entry_safe(host, pos, next, chain, h_hash) {
 			if (atomic_read(&host->h_count) || host->h_inuse
-			 || time_before(jiffies, host->h_expires)) {
+			 || time_before(jiffies, host->h_expires)
+			 || !ve_accessible_strict(host->owner_env, ve)) {
 				dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n",
 					host->h_name, atomic_read(&host->h_count),
 					host->h_inuse, host->h_expires);
@@ -560,8 +585,10 @@ nlm_gc_hosts(void)
 
 			nlm_destroy_host(host);
 			nrhosts--;
+			freed++;
 		}
 	}
 
 	next_gc = jiffies + NLM_HOST_COLLECT;
+	return freed;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/lockd/svc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/svc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/lockd/svc.c	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/svc.c	2015-01-21 12:02:51.094017444 +0300
@@ -27,6 +27,7 @@
 #include <linux/mutex.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/ve_proto.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
@@ -35,7 +36,8 @@
 #include <linux/sunrpc/svcsock.h>
 #include <net/ip.h>
 #include <linux/lockd/lockd.h>
-#include <linux/nfs.h>
+
+#include "ve.h"
 
 #define NLMDBG_FACILITY		NLMDBG_SVC
 #define LOCKD_BUFSIZE		(1024 + NLMSVC_XDRSIZE)
@@ -47,25 +49,29 @@ struct nlmsvc_binding *		nlmsvc_ops;
 EXPORT_SYMBOL_GPL(nlmsvc_ops);
 
 static DEFINE_MUTEX(nlmsvc_mutex);
-static unsigned int		nlmsvc_users;
-static struct task_struct	*nlmsvc_task;
-static struct svc_rqst		*nlmsvc_rqst;
-unsigned long			nlmsvc_timeout;
 
 /*
  * These can be set at insmod time (useful for NFS as root filesystem),
  * and also changed through the sysctl interface.  -- Jamie Lokier, Aug 2003
  */
-static unsigned long		nlm_grace_period;
 static unsigned long		nlm_timeout = LOCKD_DFLT_TIMEO;
 static int			nlm_udpport, nlm_tcpport;
 
+#ifndef CONFIG_VE
+static unsigned int		_nlmsvc_users;
+static struct task_struct	*_nlmsvc_task;
+static struct svc_rqst		*_nlmsvc_rqst;
+static unsigned long		_nlmsvc_grace_period;
+unsigned long			_nlmsvc_timeout;
+#endif
+
 /* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
 static unsigned int		nlm_max_connections = 1024;
 
 /*
  * Constants needed for the sysctl interface.
  */
+static unsigned long		nlm_grace_period;
 static const unsigned long	nlm_grace_period_min = 0;
 static const unsigned long	nlm_grace_period_max = 240;
 static const unsigned long	nlm_timeout_min = 3;
@@ -85,15 +91,16 @@ static unsigned long get_lockd_grace_per
 		return nlm_timeout * 5 * HZ;
 }
 
-static struct lock_manager lockd_manager = {
-};
-
-static void grace_ender(struct work_struct *not_used)
+void grace_ender(struct work_struct *grace)
 {
-	locks_end_grace(&lockd_manager);
-}
+	struct delayed_work *dwork = container_of(grace, struct delayed_work,
+						  work);
+	struct ve_nlm_data *nlm = container_of(dwork, struct ve_nlm_data,
+					       _grace_period_end);
 
-static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
+	locks_end_grace(&nlm->_lockd_manager);
+}
+EXPORT_SYMBOL_GPL(grace_ender);
 
 static void set_grace_period(void)
 {
@@ -176,8 +183,9 @@ lockd(void *vrqstp)
 		}
 		if (err < 0) {
 			if (err != preverr) {
-				printk(KERN_WARNING "%s: unexpected error "
-					"from svc_recv (%d)\n", __func__, err);
+				printk(KERN_WARNING "%s: ct%d unexpected error "
+					"from svc_recv (%d)\n", __func__,
+					get_exec_env()->veid, err);
 				preverr = err;
 			}
 			schedule_timeout_interruptible(HZ);
@@ -207,8 +215,8 @@ static int create_lockd_listener(struct 
 
 	xprt = svc_find_xprt(serv, name, family, 0);
 	if (xprt == NULL)
-		return svc_create_xprt(serv, name, &init_net, family, port,
-						SVC_SOCK_DEFAULTS);
+		return svc_create_xprt(serv, name, current->nsproxy->net_ns,
+					family, port, SVC_SOCK_DEFAULTS);
 	svc_xprt_put(xprt);
 	return 0;
 }
@@ -278,12 +286,14 @@ int lockd_up(void)
 	 */
 	if (nlmsvc_users)
 		printk(KERN_WARNING
-			"lockd_up: no pid, %d users??\n", nlmsvc_users);
+			"lockd_up: ct%d no pid, %d users??\n",
+			get_exec_env()->veid, nlmsvc_users);
 
 	error = -ENOMEM;
 	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
 	if (!serv) {
-		printk(KERN_WARNING "lockd_up: create service failed\n");
+		printk(KERN_WARNING "lockd_up: ct%d create service failed\n",
+				get_exec_env()->veid);
 		goto out;
 	}
 
@@ -299,22 +309,23 @@ int lockd_up(void)
 		error = PTR_ERR(nlmsvc_rqst);
 		nlmsvc_rqst = NULL;
 		printk(KERN_WARNING
-			"lockd_up: svc_rqst allocation failed, error=%d\n",
-			error);
+			"lockd_up: ct%d svc_rqst allocation failed, error=%d\n",
+			get_exec_env()->veid, error);
 		goto destroy_and_out;
 	}
 
 	svc_sock_update_bufs(serv);
 	serv->sv_maxconn = nlm_max_connections;
 
-	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+	nlmsvc_task = kthread_run_ve(get_exec_env(), lockd, nlmsvc_rqst, serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
 		error = PTR_ERR(nlmsvc_task);
 		svc_exit_thread(nlmsvc_rqst);
 		nlmsvc_task = NULL;
 		nlmsvc_rqst = NULL;
 		printk(KERN_WARNING
-			"lockd_up: kthread_run failed, error=%d\n", error);
+			"lockd_up: ct%d kthread_run failed, error=%d\n",
+			get_exec_env()->veid, error);
 		goto destroy_and_out;
 	}
 
@@ -343,14 +354,15 @@ lockd_down(void)
 		if (--nlmsvc_users)
 			goto out;
 	} else {
-		printk(KERN_ERR "lockd_down: no users! task=%p\n",
-			nlmsvc_task);
-		BUG();
+		printk(KERN_ERR "lockd_down: ct%d no users! task=%p\n",
+			get_exec_env()->veid, nlmsvc_task);
+		goto out;
 	}
 
 	if (!nlmsvc_task) {
-		printk(KERN_ERR "lockd_down: no lockd running.\n");
-		BUG();
+		printk(KERN_ERR "lockd_down: ct%d no lockd running.\n",
+				get_exec_env()->veid);
+		goto out;
 	}
 	kthread_stop(nlmsvc_task);
 	svc_exit_thread(nlmsvc_rqst);
@@ -495,7 +507,6 @@ static int lockd_authenticate(struct svc
 	return SVC_DENIED;
 }
 
-
 param_set_min_max(port, int, simple_strtol, 0, 65535)
 param_set_min_max(grace_period, unsigned long, simple_strtoul,
 		  nlm_grace_period_min, nlm_grace_period_max)
@@ -520,19 +531,66 @@ module_param(nlm_max_connections, uint, 
 /*
  * Initialising and terminating the module.
  */
+#ifdef CONFIG_VE
+static void ve_nlm_init(struct ve_nlm_data *nlm_data)
+{
+	INIT_DELAYED_WORK(&nlm_data->_grace_period_end, grace_ender);
+	INIT_LIST_HEAD(&nlm_data->_grace_list);
+
+	get_exec_env()->nlm_data = nlm_data;
+}
+
+static int ve_lockd_init(void *data)
+{
+	struct ve_nlm_data *nlm_data;
+
+	nlm_data = kzalloc(sizeof(struct ve_nlm_data), GFP_KERNEL);
+	if (nlm_data == NULL)
+		return -ENOMEM;
+	ve_nlm_init(nlm_data);
+	return 0;
+}
+
+static void ve_lockd_fini(void *data)
+{
+	struct ve_struct *ve = data;
+
+	if (!ve->nlm_data)
+		return;
+
+	kfree(ve->nlm_data);
+}
+
+static struct ve_hook lockd_ss_hook = {
+	.init	  = ve_lockd_init,
+	.fini     = ve_lockd_fini,
+	.owner	  = THIS_MODULE,
+	.priority = HOOK_PRIO_NET_POST,
+};
+
+static struct ve_nlm_data ve0_nlm_data;
+#endif
 
 static int __init init_nlm(void)
 {
 #ifdef CONFIG_SYSCTL
 	nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root);
-	return nlm_sysctl_table ? 0 : -ENOMEM;
-#else
-	return 0;
+	if (nlm_sysctl_table == NULL)
+		return -ENOMEM;
 #endif
+#ifdef CONFIG_VE
+	ve_nlm_init(&ve0_nlm_data);
+
+	ve_hook_register(VE_SS_CHAIN, &lockd_ss_hook);
+#endif
+	return 0;
 }
 
 static void __exit exit_nlm(void)
 {
+#ifdef CONFIG_VE
+	ve_hook_unregister(&lockd_ss_hook);
+#endif
 	/* FIXME: delete all NLM clients */
 	nlm_shutdown_hosts();
 #ifdef CONFIG_SYSCTL
@@ -586,3 +644,4 @@ static struct svc_program	nlmsvc_program
 	.pg_stats		= &nlmsvc_stats,	/* stats table */
 	.pg_authenticate = &lockd_authenticate	/* export authentication */
 };
+
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/lockd/svcsubs.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/svcsubs.c
--- linux-2.6.32-504.3.3.el6.orig/fs/lockd/svcsubs.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/svcsubs.c	2015-01-21 12:02:46.925128116 +0300
@@ -334,6 +334,9 @@ nlmsvc_is_client(void *data, struct nlm_
 {
 	struct nlm_host *host = data;
 
+	if (!ve_accessible_strict(host->owner_env, get_exec_env()))
+		return 0;
+
 	if (host->h_server) {
 		/* we are destroying locks even though the client
 		 * hasn't asked us too, so don't unmonitor the
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/lockd/ve.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/ve.h
--- linux-2.6.32-504.3.3.el6.orig/fs/lockd/ve.h	2015-01-21 12:02:47.660108605 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/lockd/ve.h	2015-01-21 12:02:51.095017417 +0300
@@ -0,0 +1,44 @@
+/*
+ * fs/lockd/ve.h
+ *
+ * VE context for LockD
+ */
+
+#ifndef __VE_LOCKD_H__
+#define __VE_LOCKD_H__
+
+struct ve_nlm_data {
+	unsigned int		_nlmsvc_users;
+	struct task_struct*	_nlmsvc_task;
+	unsigned long		_nlmsvc_timeout;
+	struct svc_rqst*	_nlmsvc_rqst;
+
+	struct delayed_work	_grace_period_end;
+	struct list_head	_grace_list;
+	struct lock_manager	_lockd_manager;
+};
+
+#ifdef CONFIG_VE
+
+#include <linux/ve.h>
+
+#define NLM_CTX_FIELD(arg)	(get_exec_env()->nlm_data->_##arg)
+
+#else
+
+#define NLM_CTX_FIELD(arg)	_##arg
+
+#endif
+
+#define nlmsvc_grace_period	NLM_CTX_FIELD(nlmsvc_grace_period)
+#define nlmsvc_timeout		NLM_CTX_FIELD(nlmsvc_timeout)
+#define nlmsvc_users		NLM_CTX_FIELD(nlmsvc_users)
+#define nlmsvc_task		NLM_CTX_FIELD(nlmsvc_task)
+#define nlmsvc_rqst		NLM_CTX_FIELD(nlmsvc_rqst)
+
+#define grace_period_end	NLM_CTX_FIELD(grace_period_end)
+#define grace_list		NLM_CTX_FIELD(grace_list)
+#define lockd_manager		NLM_CTX_FIELD(lockd_manager)
+
+#endif /* __VE_LOCKD_H__ */
+
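
The macros above let the existing lockd code compile unchanged while its state becomes per-container; a small illustration of the expansion:

	#define NLM_CTX_FIELD(arg)	(get_exec_env()->nlm_data->_##arg)
	#define nlmsvc_users		NLM_CTX_FIELD(nlmsvc_users)

	static void example_up(void)
	{
		/* reads as before, but now touches the calling container's
		 * copy: (get_exec_env()->nlm_data->_nlmsvc_users)++ */
		nlmsvc_users++;
	}
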
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/locks.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/locks.c
--- linux-2.6.32-504.3.3.el6.orig/fs/locks.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/locks.c	2015-01-21 12:02:57.964835060 +0300
@@ -127,9 +127,12 @@
 #include <linux/time.h>
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
+#include <linux/fs_struct.h>
 
 #include <asm/uaccess.h>
 
+#include <bc/misc.h>
+
 #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
 #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
 #define IS_LEASE(fl)	(fl->fl_flags & FL_LEASE)
@@ -146,10 +149,27 @@ static LIST_HEAD(blocked_list);
 static struct kmem_cache *filelock_cache __read_mostly;
 
 /* Allocate an empty lock structure. */
-static struct file_lock *locks_alloc_lock(void)
+struct file_lock *locks_alloc_lock(int charge)
 {
-	return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
+	struct file_lock *fl;
+
+	fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL);
+#ifdef CONFIG_BEANCOUNTERS
+	if (fl == NULL)
+		goto out;
+	fl->fl_charged = 0;
+	if (!charge)
+		goto out;
+	if (!ub_flock_charge(fl, 1))
+		goto out;
+
+	kmem_cache_free(filelock_cache, fl);
+	fl = NULL;
+out:
+#endif
+	return fl;
 }
+EXPORT_SYMBOL(locks_alloc_lock);
 
 void locks_release_private(struct file_lock *fl)
 {
@@ -168,15 +188,17 @@ void locks_release_private(struct file_l
 EXPORT_SYMBOL_GPL(locks_release_private);
 
 /* Free a lock which is not in use. */
-static void locks_free_lock(struct file_lock *fl)
+void locks_free_lock(struct file_lock *fl)
 {
 	BUG_ON(waitqueue_active(&fl->fl_wait));
 	BUG_ON(!list_empty(&fl->fl_block));
 	BUG_ON(!list_empty(&fl->fl_link));
 
+	ub_flock_uncharge(fl);
 	locks_release_private(fl);
 	kmem_cache_free(filelock_cache, fl);
 }
+EXPORT_SYMBOL(locks_free_lock);
 
 void locks_init_lock(struct file_lock *fl)
 {
@@ -277,7 +299,7 @@ static int flock_make_lock(struct file *
 	if (type < 0)
 		return type;
 	
-	fl = locks_alloc_lock();
+	fl = locks_alloc_lock(type != F_UNLCK);
 	if (fl == NULL)
 		return -ENOMEM;
 
@@ -464,7 +486,7 @@ static int lease_init(struct file *filp,
 /* Allocate a file_lock initialised to this type of lease */
 static struct file_lock *lease_alloc(struct file *filp, int type)
 {
-	struct file_lock *fl = locks_alloc_lock();
+	struct file_lock *fl = locks_alloc_lock(1);
 	int error = -ENOMEM;
 
 	if (fl == NULL)
@@ -735,8 +757,13 @@ static int flock_lock_file(struct file *
 		goto find_conflict;
 
 	if (request->fl_type != F_UNLCK) {
+		/*
+		 * A non-F_UNLCK request must already be charged in
+		 * flock_make_lock(). Strictly it is new_fl that must be
+		 * charged, not the request, but we try to fail earlier.
+		 */
 		error = -ENOMEM;
-		new_fl = locks_alloc_lock();
+		new_fl = locks_alloc_lock(0);
 		if (new_fl == NULL)
 			goto out;
 		error = 0;
@@ -788,6 +815,10 @@ find_conflict:
 	}
 	if (request->fl_flags & FL_ACCESS)
 		goto out;
+
+	set_flock_charged(new_fl);
+	unset_flock_charged(request);
+
 	locks_copy_lock(new_fl, request);
 	locks_insert_lock(before, new_fl);
 	new_fl = NULL;
@@ -819,8 +850,11 @@ static int __posix_lock_file(struct inod
 	if (!(request->fl_flags & FL_ACCESS) &&
 	    (request->fl_type != F_UNLCK ||
 	     request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
-		new_fl = locks_alloc_lock();
-		new_fl2 = locks_alloc_lock();
+		if (request->fl_type != F_UNLCK)
+			new_fl = locks_alloc_lock(1);
+		else
+			new_fl = NULL;
+		new_fl2 = locks_alloc_lock(0);
 	}
 
 	lock_kernel();
@@ -954,7 +988,7 @@ static int __posix_lock_file(struct inod
 	 * bail out.
 	 */
 	error = -ENOLCK; /* "no luck" */
-	if (right && left == right && !new_fl2)
+	if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2))
 		goto out;
 
 	error = 0;
@@ -965,23 +999,32 @@ static int __posix_lock_file(struct inod
 			goto out;
 		}
 
-		if (!new_fl) {
-			error = -ENOLCK;
+		error = -ENOLCK;
+		if (!new_fl)
+			goto out;
+		if (right && (left == right) && ub_flock_charge(new_fl, 1))
 			goto out;
-		}
 		locks_copy_lock(new_fl, request);
 		locks_insert_lock(before, new_fl);
 		new_fl = NULL;
+		error = 0;
 	}
 	if (right) {
 		if (left == right) {
 			/* The new lock breaks the old one in two pieces,
 			 * so we have to use the second new lock.
 			 */
+			error = -ENOLCK;
+			if (added && ub_flock_charge(new_fl2,
+						request->fl_type != F_UNLCK))
+				goto out;
+			/* FIXME move all fl_charged manipulations in ub code */
+			set_flock_charged(new_fl2);
 			left = new_fl2;
 			new_fl2 = NULL;
 			locks_copy_lock(left, right);
 			locks_insert_lock(before, left);
+			error = 0;
 		}
 		right->fl_start = request->fl_end + 1;
 		locks_wake_up_blocks(right);
@@ -1285,8 +1328,6 @@ void lease_get_mtime(struct inode *inode
 	struct file_lock *flock = inode->i_flock;
 	if (flock && IS_LEASE(flock) && (flock->fl_type & F_WRLCK))
 		*time = current_fs_time(inode->i_sb);
-	else
-		*time = inode->i_mtime;
 }
 
 EXPORT_SYMBOL(lease_get_mtime);
@@ -1367,7 +1408,7 @@ int generic_setlease(struct file *filp, 
 
 	if (arg != F_UNLCK) {
 		error = -ENOMEM;
-		new_fl = locks_alloc_lock();
+		new_fl = locks_alloc_lock(1);
 		if (new_fl == NULL)
 			goto out;
 
@@ -1611,6 +1652,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd,
  out:
 	return error;
 }
+EXPORT_SYMBOL(sys_flock);
 
 /**
  * vfs_test_lock - test file byte range lock
@@ -1771,7 +1813,7 @@ static int do_lock_file_wait(struct file
 int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 		struct flock __user *l)
 {
-	struct file_lock *file_lock = locks_alloc_lock();
+	struct file_lock *file_lock = locks_alloc_lock(0);
 	struct flock flock;
 	struct inode *inode;
 	struct file *f;
@@ -1889,7 +1931,7 @@ out:
 int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 		struct flock64 __user *l)
 {
-	struct file_lock *file_lock = locks_alloc_lock();
+	struct file_lock *file_lock = locks_alloc_lock(0);
 	struct flock64 flock;
 	struct inode *inode;
 	struct file *f;
@@ -2172,6 +2214,8 @@ static int locks_show(struct seq_file *f
 	struct file_lock *fl, *bfl;
 
 	fl = list_entry(v, struct file_lock, fl_link);
+	if (d_root_check(&fl->fl_file->f_path))
+		return 0;
 
 	lock_get_status(f, fl, *((loff_t *)f->private), "");
 
@@ -2223,7 +2267,7 @@ static const struct file_operations proc
 
 static int __init proc_locks_init(void)
 {
-	proc_create("locks", 0, NULL, &proc_locks_operations);
+	proc_create("locks", 0, &glob_proc_root, &proc_locks_operations);
 	return 0;
 }
 module_init(proc_locks_init);
@@ -2310,7 +2354,7 @@ EXPORT_SYMBOL(lock_may_write);
 static int __init filelock_init(void)
 {
 	filelock_cache = kmem_cache_create("file_lock_cache",
-			sizeof(struct file_lock), 0, SLAB_PANIC,
+			sizeof(struct file_lock), 0, SLAB_PANIC|SLAB_UBC,
 			init_once);
 	return 0;
 }
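
The locks.c changes above thread UBC accounting through lock allocation: only lock-setting requests are charged (F_UNLCK merely removes state), and locks_free_lock() uncharges via the fl_charged marker. A condensed sketch of the lifecycle, with the beancounter semantics assumed from bc/misc.h:

	static int set_lock_sketch(struct file *filp, int type)
	{
		/* charge at allocation time for anything that can add a lock */
		struct file_lock *fl = locks_alloc_lock(type != F_UNLCK);

		if (!fl)
			return -ENOMEM;		/* allocation or UBC charge failed */

		/* ... install or merge the lock ... */

		locks_free_lock(fl);		/* uncharges iff fl_charged is set */
		return 0;
	}
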
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/namei.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/namei.c
--- linux-2.6.32-504.3.3.el6.orig/fs/namei.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/namei.c	2015-01-21 12:02:57.964835060 +0300
@@ -144,8 +144,8 @@ static int do_getname(const char __user 
 
 #define EMBEDDED_NAME_MAX      (PATH_MAX - sizeof(struct filename))
 
-struct filename *
-getname(const char __user * filename)
+static struct filename *
+getname_flags(const char __user * filename, int flags)
 {
 	int len;
 	struct filename *result, *err;
@@ -156,6 +156,7 @@ getname(const char __user * filename)
 	if (result)
 		return result;
 
+	/*ub_dentry_checkup();*/
 	result = __getname();
 	if (unlikely(!result))
 		return ERR_PTR(-ENOMEM);
@@ -168,8 +169,10 @@ getname(const char __user * filename)
 recopy:
 	len = do_getname(filename, kname, max);
 	if (len < 0) {
-		err = ERR_PTR(len);
-		goto error;
+		if (len != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+			err = ERR_PTR(len);
+			goto error;
+		}
 	}
 
 	/*
@@ -209,6 +212,12 @@ error:
 	final_putname(result);
 	return err;
 }
+
+struct filename *
+getname(const char __user * filename)
+{
+	return getname_flags(filename, 0);
+}
 EXPORT_SYMBOL(getname);
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -403,6 +412,7 @@ int deny_write_access(struct file * file
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(deny_write_access);
 
 /**
  * path_get - get a reference to a path
@@ -441,6 +451,7 @@ void release_open_intent(struct nameidat
 	else
 		fput(nd->intent.open.file);
 }
+EXPORT_SYMBOL(release_open_intent);
 
 static inline struct dentry *
 do_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -570,13 +581,8 @@ static __always_inline int link_path_wal
 
 static __always_inline void set_root(struct nameidata *nd)
 {
-	if (!nd->root.mnt) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->root = fs->root;
-		path_get(&nd->root);
-		read_unlock(&fs->lock);
-	}
+	if (!nd->root.mnt)
+		get_fs_root(current->fs, &nd->root);
 }
 
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
@@ -970,6 +976,12 @@ static __always_inline void follow_dotdo
 		    nd->path.mnt == nd->root.mnt) {
 			break;
 		}
+#ifdef CONFIG_VE
+		if (nd->path.dentry == get_exec_env()->root_path.dentry &&
+		    nd->path.mnt == get_exec_env()->root_path.mnt) {
+			break;
+		}
+#endif
 		spin_lock(&dcache_lock);
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -991,7 +1003,8 @@ static __always_inline void follow_dotdo
 		mntput(nd->path.mnt);
 		nd->path.mnt = parent;
 	}
-	follow_mount(&nd->path);
+	if (!(nd->flags & LOOKUP_DIVE))
+		follow_mount(&nd->path);
 }
 
 /*
@@ -1015,8 +1028,14 @@ found:
 	if (dentry->d_op && dentry->d_op->d_revalidate)
 		goto need_revalidate;
 done:
+	if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) {
+		dput(dentry);
+		return -ENOENT;
+	}
 	path->mnt = mnt;
 	path->dentry = dentry;
+	if (nd->flags & LOOKUP_DIVE)
+		return 0;
 	/*
 	 * Make sure follow_automount() knows about the trailing
 	 * "/" but only for the real last path component.
@@ -1099,6 +1118,7 @@ fail:
 static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
 {
 	return inode && unlikely(inode->i_op->follow_link) &&
+		!(lookup_flags & LOOKUP_STRICT) &&
 		((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
 }
 
@@ -1117,6 +1137,7 @@ static int __link_path_walk(struct filen
 	int err;
 	unsigned int lookup_flags = nd->flags;
 	const char *name = filename->name;
+	int real_components = 0;
 	
 	while (*name=='/')
 		name++;
@@ -1185,6 +1206,7 @@ static int __link_path_walk(struct filen
 				break;
 		}
 		/* This does the actual lookups.. */
+		real_components++;
 		err = do_lookup(nd, &this, &next);
 		if (err)
 			break;
@@ -1195,6 +1217,9 @@ static int __link_path_walk(struct filen
 			goto out_dput;
 
 		if (inode->i_op->follow_link) {
+			err = -ENOENT;
+			if (lookup_flags & LOOKUP_STRICT)
+				goto out_dput;
 			err = do_follow_link(&next, nd);
 			if (err)
 				goto return_err;
@@ -1271,7 +1296,7 @@ return_reval:
 		 * We bypassed the ordinary revalidation routines.
 		 * We may need to check the cached dentry for staleness.
 		 */
-		if (nd->path.dentry && nd->path.dentry->d_sb) {
+		if (!real_components && nd->path.dentry && nd->path.dentry->d_sb) {
 			int fs_flags = nd->path.dentry->d_sb->s_type->fs_flags;
 
 			if (fs_flags & FS_REVAL_DOT) {
@@ -1283,6 +1308,13 @@ return_reval:
 						break;
 				} else {
 					if (!nd->path.dentry->d_op->d_revalidate(nd->path.dentry, nd))
+						/*
+						 * This lookup is for `/' or `.' or `..'.
+						 * The filesystem unhashed the dentry itself
+						 * inside d_revalidate (otherwise, d_invalidate
+						 * wouldn't succeed).  As a special courtesy to
+						 * NFS we return an error.   2003/02/19  SAW
+						 */
 						break;
 				}
 			}
@@ -1301,11 +1333,12 @@ return_err:
 	return err;
 }
 
-static int path_walk(struct filename *name, struct nameidata *nd)
+int path_walk(struct filename *name, struct nameidata *nd)
 {
 	current->total_link_count = 0;
 	return link_path_walk(name, nd);
 }
+EXPORT_SYMBOL(path_walk);
 
 static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
@@ -1323,11 +1356,7 @@ static int path_init(int dfd, const char
 		nd->path = nd->root;
 		path_get(&nd->root);
 	} else if (dfd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->path = fs->pwd;
-		path_get(&fs->pwd);
-		read_unlock(&fs->lock);
+		get_fs_pwd(current->fs, &nd->path);
 	} else {
 		struct dentry *dentry;
 
@@ -1338,6 +1367,9 @@ static int path_init(int dfd, const char
 
 		dentry = file->f_path.dentry;
 
+		if ((flags & LOOKUP_EMPTY) && *name == '\0')
+			goto skip_checks;
+
 		retval = -ENOTDIR;
 		if (!S_ISDIR(dentry->d_inode->i_mode))
 			goto fput_fail;
@@ -1351,7 +1383,7 @@ static int path_init(int dfd, const char
 			if (retval)
 				goto fput_fail;
 		}
-
+skip_checks:
 		nd->path = file->f_path;
 		path_get(&file->f_path);
 
@@ -1587,7 +1619,7 @@ int user_path_at(int dfd, const char __u
 		 struct path *path)
 {
 	struct nameidata nd;
-	struct filename *tmp = getname(name);
+	struct filename *tmp = getname_flags(name, flags);
 	int err = PTR_ERR(tmp);
 	if (!IS_ERR(tmp)) {
 
@@ -1881,7 +1913,8 @@ static int may_delete(struct inode *dir,
 	if (IS_APPEND(dir))
 		return -EPERM;
 	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
-	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+	    IS_IMMUTABLE(victim->d_inode) ||
+	    (IS_SWAPFILE(victim->d_inode) && victim->d_inode->i_nlink == 1))
 		return -EPERM;
 	if (isdir) {
 		if (!S_ISDIR(victim->d_inode->i_mode))
@@ -1918,7 +1951,7 @@ static inline int may_create(struct inod
 /* 
  * O_DIRECTORY translates into forcing a directory lookup.
  */
-static inline int lookup_flags(unsigned int f)
+inline int lookup_flags(unsigned int f)
 {
 	unsigned long retval = LOOKUP_FOLLOW;
 
@@ -1930,6 +1963,7 @@ static inline int lookup_flags(unsigned 
 
 	return retval;
 }
+EXPORT_SYMBOL(lookup_flags);
 
 /*
  * p1 and p2 should be directories on the same fs.
@@ -2113,12 +2147,13 @@ out_unlock:
  * later).
  *
 */
-static inline int open_to_namei_flags(int flag)
+int open_to_namei_flags(int flag)
 {
 	if ((flag+1) & O_ACCMODE)
 		flag++;
 	return flag;
 }
+EXPORT_SYMBOL(open_to_namei_flags);
 
 static int open_will_truncate(int flag, struct inode *inode)
 {
@@ -2618,6 +2653,7 @@ SYSCALL_DEFINE3(mknod, const char __user
 {
 	return sys_mknodat(AT_FDCWD, filename, mode, dev);
 }
+EXPORT_SYMBOL(sys_mknod);
 
 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
@@ -2691,6 +2727,7 @@ SYSCALL_DEFINE2(mkdir, const char __user
 {
 	return sys_mkdirat(AT_FDCWD, pathname, mode);
 }
+EXPORT_SYMBOL(sys_mkdir);
 
 /*
  * We try to drop the dentry early: we should have
@@ -2809,6 +2846,7 @@ SYSCALL_DEFINE1(rmdir, const char __user
 {
 	return do_rmdir(AT_FDCWD, pathname);
 }
+EXPORT_SYMBOL(sys_rmdir);
 
 int vfs_unlink(struct inode *dir, struct dentry *dentry)
 {
@@ -2921,6 +2959,7 @@ SYSCALL_DEFINE1(unlink, const char __use
 {
 	return do_unlinkat(AT_FDCWD, pathname);
 }
+EXPORT_SYMBOL(sys_unlink);
 
 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 {
@@ -3000,6 +3039,7 @@ SYSCALL_DEFINE2(symlink, const char __us
 {
 	return sys_symlinkat(oldname, AT_FDCWD, newname);
 }
+EXPORT_SYMBOL(sys_symlink);
 
 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
 {
@@ -3228,6 +3268,9 @@ int vfs_rename(struct inode *old_dir, st
 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
 	const char *old_name;
 
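+	/*
+	 * Virtuozzo quota hook: if the quota code refuses the rename
+	 * (presumably one that would cross a quota-tree boundary), fail
+	 * with EXDEV so userspace falls back to copy and unlink.
+	 */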
+	if (vfs_dq_rename(old_dentry->d_inode, old_dir, new_dir))
+		return -EXDEV;
+
 	if (old_dentry->d_inode == new_dentry->d_inode)
  		return 0;
  
@@ -3375,6 +3418,7 @@ SYSCALL_DEFINE2(rename, const char __use
 {
 	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
 }
+EXPORT_SYMBOL(sys_rename);
 
 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/namespace.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/namespace.c
--- linux-2.6.32-504.3.3.el6.orig/fs/namespace.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/namespace.c	2015-01-21 12:02:58.115831051 +0300
@@ -29,6 +29,8 @@
 #include <linux/log2.h>
 #include <linux/idr.h>
 #include <linux/fs_struct.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/ve_proto.h>
 #include <linux/proc_fs.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -40,6 +42,7 @@
 
 /* spinlock for vfsmount related operations, inplace of dcache_lock */
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
+EXPORT_SYMBOL(vfsmount_lock);
 
 static int event;
 static DEFINE_IDA(mnt_id_ida);
@@ -49,12 +52,46 @@ static int mnt_group_start = 1;
 
 static struct list_head *mount_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
-static struct rw_semaphore namespace_sem;
+struct rw_semaphore namespace_sem;
+EXPORT_SYMBOL(namespace_sem);
+
+unsigned int sysctl_ve_mount_nr = 4096;
 
 /* /sys/fs */
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
 
+static LIST_HEAD(mounts_readers);
+static DEFINE_SPINLOCK(mounts_lock);
+
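+/*
+ * /proc/mounts readers keep a persistent cursor into the namespace's
+ * mount list (see m_start/m_next below).  umount_tree() calls
+ * advance_mounts_readers() before deleting a list entry, so no reader
+ * is ever left pointing at a dead node.
+ */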
+void register_mounts_reader(struct proc_mounts *p)
+{
+	spin_lock(&mounts_lock);
+	list_add(&p->reader, &mounts_readers);
+	spin_unlock(&mounts_lock);
+}
+
+void unregister_mounts_reader(struct proc_mounts *p)
+{
+	spin_lock(&mounts_lock);
+	list_del(&p->reader);
+	spin_unlock(&mounts_lock);
+}
+
+static void advance_mounts_readers(struct list_head *iter)
+{
+	struct proc_mounts *p;
+
+	spin_lock(&mounts_lock);
+	list_for_each_entry(p, &mounts_readers, reader) {
+		if (p->iter == iter) {
+			p->iter = p->iter->next;
+			p->iter_advanced = 1;
+		}
+	}
+	spin_unlock(&mounts_lock);
+}
+
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -126,9 +163,22 @@ void mnt_release_group_id(struct vfsmoun
 	mnt->mnt_group_id = 0;
 }
 
+/*
+ * Operations on a large number of mount points can take a long time.
+ * These operations hold the global namespace_sem lock, so they can
+ * affect other containers.
+ */
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
-	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+	struct vfsmount *mnt;
+	struct ve_struct *ve = get_exec_env();
+
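+	/*
+	 * Per-container mount limit: the counter is bumped first and
+	 * rolled back on every failure path below.  The host (where
+	 * ve_is_super() is true) is not limited.
+	 */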
+	if (atomic_add_return(1, &ve->mnt_nr) > sysctl_ve_mount_nr &&
+							!ve_is_super(ve))
+		goto out_mnt_nr_dec;
+
+	mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
 		int err;
 
@@ -137,11 +187,12 @@ struct vfsmount *alloc_vfsmnt(const char
 			goto out_free_cache;
 
 		if (name) {
-			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+			mnt->mnt_devname = kstrdup(name, GFP_KERNEL_UBC);
 			if (!mnt->mnt_devname)
 				goto out_free_id;
 		}
 
+		mnt->owner = VEID(get_exec_env());
 		atomic_set(&mnt->mnt_count, 1);
 		INIT_LIST_HEAD(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
@@ -158,7 +209,9 @@ struct vfsmount *alloc_vfsmnt(const char
 #else
 		mnt->mnt_writers = 0;
 #endif
-	}
+	} else
+		goto out_mnt_nr_dec;
+
 	return mnt;
 
 #ifdef CONFIG_SMP
@@ -169,6 +222,8 @@ out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
 	kmem_cache_free(mnt_cache, mnt);
+out_mnt_nr_dec:
+	atomic_dec(&ve->mnt_nr);
 	return NULL;
 }
 
@@ -302,7 +357,7 @@ int mnt_want_write(struct vfsmount *m)
 		sb_end_write(m->mnt_sb);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(mnt_want_write);
+EXPORT_SYMBOL(mnt_want_write);
 
 /**
  * mnt_clone_write - get write access to a mount
@@ -393,7 +448,7 @@ void mnt_drop_write(struct vfsmount *mnt
 	__mnt_drop_write(mnt);
 	sb_end_write(mnt->mnt_sb);
 }
-EXPORT_SYMBOL_GPL(mnt_drop_write);
+EXPORT_SYMBOL(mnt_drop_write);
 
 void __mnt_drop_write_file(struct file *file)
 {
@@ -466,6 +521,13 @@ EXPORT_SYMBOL(simple_set_mnt);
 
 void free_vfsmnt(struct vfsmount *mnt)
 {
+	struct ve_struct *ve = get_ve_by_id(mnt->owner);
+
+	if (ve) {
+		atomic_dec(&ve->mnt_nr);
+		put_ve(ve);
+	}
+
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
 #ifdef CONFIG_SMP
@@ -584,7 +646,7 @@ static void commit_tree(struct vfsmount 
 	touch_mnt_namespace(n);
 }
 
-static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
+struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
 {
 	struct list_head *next = p->mnt_mounts.next;
 	if (next == &p->mnt_mounts) {
@@ -599,6 +661,7 @@ static struct vfsmount *next_mnt(struct 
 	}
 	return list_entry(next, struct vfsmount, mnt_child);
 }
+EXPORT_SYMBOL(next_mnt);
 
 static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
 {
@@ -663,6 +726,18 @@ static struct vfsmount *clone_mnt(struct
 	return NULL;
 }
 
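+/*
+ * In-kernel bind-mount helper: clone @old at @root and convert
+ * clone_mnt()'s NULL return into the ERR_PTR(-ENOMEM) callers expect.
+ */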
+struct vfsmount *vfs_bind_mount(struct vfsmount *old, struct dentry *root)
+{
+	struct vfsmount *mnt;
+
+	mnt = clone_mnt(old, root, 0);
+	if (!mnt)
+		return ERR_PTR(-ENOMEM);
+
+	return mnt;
+}
+EXPORT_SYMBOL_GPL(vfs_bind_mount);
+
 static inline void __mntput(struct vfsmount *mnt)
 {
 	struct super_block *sb = mnt->mnt_sb;
@@ -696,6 +771,7 @@ repeat:
 		spin_unlock(&vfsmount_lock);
 		acct_auto_close_mnt(mnt);
 		security_sb_umount_close(mnt);
+		fsnotify_unmount_mnt(mnt);
 		goto repeat;
 	}
 }
@@ -789,14 +865,39 @@ static void *m_start(struct seq_file *m,
 	struct proc_mounts *p = m->private;
 
 	down_read(&namespace_sem);
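+	/*
+	 * Resume from the persistent cursor instead of rescanning the
+	 * list from the head on each read chunk (as seq_list_start()
+	 * did); with many thousands of mounts that rescan is quadratic.
+	 * The cursor may have been moved under us by
+	 * advance_mounts_readers() while an entry was unmounted.
+	 */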
-	return seq_list_start(&p->ns->list, *pos);
+	if (p->iter_advanced) {
+		p->iter_advanced = 0;
+		if (p->iter_pos < *pos)
+			p->iter_pos++;
+	}
+
+	if (!p->iter || (p->iter_pos > *pos && p->iter == &p->ns->list)) {
+		p->iter = p->ns->list.next;
+		p->iter_pos = 0;
+	}
+
+	while (p->iter_pos < *pos && p->iter != &p->ns->list) {
+		p->iter = p->iter->next;
+		p->iter_pos++;
+	}
+
+	while (p->iter_pos > *pos && p->iter != p->ns->list.next) {
+		p->iter = p->iter->prev;
+		p->iter_pos--;
+	}
+
+	p->iter_pos = *pos;
+	return p->iter != &p->ns->list ? p->iter : NULL;
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct proc_mounts *p = m->private;
 
-	return seq_list_next(v, &p->ns->list, pos);
+	p->iter = p->iter->next;
+	p->iter_pos++;
+	*pos = p->iter_pos;
+	return p->iter != &p->ns->list ? p->iter : NULL;
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -849,22 +950,60 @@ static void show_mnt_opts(struct seq_fil
 
 static void show_type(struct seq_file *m, struct super_block *sb)
 {
-	mangle(m, sb->s_type->name);
-	if (sb->s_subtype && sb->s_subtype[0]) {
-		seq_putc(m, '.');
-		mangle(m, sb->s_subtype);
+	if (!sb->s_op->show_type) {
+		mangle(m, sb->s_type->name);
+		if (sb->s_subtype && sb->s_subtype[0]) {
+			seq_putc(m, '.');
+			mangle(m, sb->s_subtype);
+		}
+	} else
+		sb->s_op->show_type(m, sb);
+}
+
+static int prepare_mnt_root_mangle(struct path *path,
+		char **path_buf, char **ret_path)
+{
+	/* skip FS_NOMOUNT mounts (rootfs) */
+	if (path->mnt->mnt_sb->s_flags & MS_NOUSER)
+		return -EACCES;
+
+	*path_buf = (char *)__get_free_page(GFP_KERNEL);
+	if (!*path_buf)
+		return -ENOMEM;
+
+	*ret_path = d_path(path, *path_buf, PAGE_SIZE);
+	if (IS_ERR(*ret_path)) {
+		free_page((unsigned long)*path_buf);
+		/*
+		 * The file position is still incremented in this case,
+		 * i.e. the count of "invisible" vfsmounts leaks into it.
+		 */
+		return -EACCES;
 	}
+	return 0;
 }
 
 static int show_vfsmnt(struct seq_file *m, void *v)
 {
 	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
-	int err = 0;
+	int err;
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+	char *path_buf, *path;
 
-	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+	err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path);
+	if (err < 0)
+		return (err == -EACCES ? 0 : err);
+
+	if (ve_is_super(get_exec_env()) ||
+	    !(mnt->mnt_sb->s_type->fs_flags & FS_MANGLE_PROC))
+		mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+	else {
+		seq_puts(m, "/dev/");
+		mangle(m, mnt->mnt_sb->s_type->name);
+	}
 	seq_putc(m, ' ');
-	seq_path(m, &mnt_path, " \t\n\\");
+	mangle(m, path);
+	free_page((unsigned long) path_buf);
 	seq_putc(m, ' ');
 	show_type(m, mnt->mnt_sb);
 	seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
@@ -899,8 +1038,9 @@ static int show_mountinfo(struct seq_fil
 		   MAJOR(sb->s_dev), MINOR(sb->s_dev));
 	seq_dentry(m, mnt->mnt_root, " \t\n\\");
 	seq_putc(m, ' ');
-	seq_path_root(m, &mnt_path, &root, " \t\n\\");
-	if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
+	err = seq_path_root(m, &mnt_path, &root, " \t\n\\");
+	if (root.mnt != p->root.mnt || root.dentry != p->root.dentry ||
+							err == -EINVAL) {
 		/*
 		 * Mountpoint is outside root, discard that one.  Ugly,
 		 * but less so than trying to do that in iterator in a
@@ -951,18 +1091,27 @@ static int show_vfsstat(struct seq_file 
 {
 	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
-	int err = 0;
+	char *path_buf, *path;
+	int err;
+
+	err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path);
+	if (err < 0)
+		return (err == -EACCES ? 0 : err);
 
 	/* device */
 	if (mnt->mnt_devname) {
 		seq_puts(m, "device ");
-		mangle(m, mnt->mnt_devname);
+		if (ve_is_super(get_exec_env()))
+			mangle(m, mnt->mnt_devname);
+		else
+			mangle(m, mnt->mnt_sb->s_type->name);
 	} else
 		seq_puts(m, "no device");
 
 	/* mount point */
 	seq_puts(m, " mounted on ");
-	seq_path(m, &mnt_path, " \t\n\\");
+	mangle(m, path);
+	free_page((unsigned long)path_buf);
 	seq_putc(m, ' ');
 
 	/* file system type */
@@ -1066,16 +1215,18 @@ void release_mounts(struct list_head *he
 
 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
+	LIST_HEAD(tmp_list);
 	struct vfsmount *p;
 
 	for (p = mnt; p; p = next_mnt(p, mnt))
-		list_move(&p->mnt_hash, kill);
+		list_move(&p->mnt_hash, &tmp_list);
 
 	if (propagate)
-		propagate_umount(kill);
+		propagate_umount(&tmp_list);
 
-	list_for_each_entry(p, kill, mnt_hash) {
+	list_for_each_entry(p, &tmp_list, mnt_hash) {
 		list_del_init(&p->mnt_expire);
+		advance_mounts_readers(&p->mnt_list);
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
@@ -1086,6 +1237,7 @@ void umount_tree(struct vfsmount *mnt, i
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
 	}
+	list_splice(&tmp_list, kill);
 }
 
 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1174,6 +1326,36 @@ static int do_umount(struct vfsmount *mn
 	return retval;
 }
 
+#ifdef CONFIG_VE
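+/*
+ * Unmount every mount of @local_fs_type in the current mount
+ * namespace, optionally restricted to mounts owned by a single
+ * container (a negative veid matches any owner).
+ */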
+void umount_ve_fs_type(struct file_system_type *local_fs_type, int veid)
+{
+	struct vfsmount *mnt;
+	LIST_HEAD(kill);
+	LIST_HEAD(umount_list);
+
+	down_write(&namespace_sem);
+	spin_lock(&vfsmount_lock);
+	list_for_each_entry(mnt, &current->nsproxy->mnt_ns->list, mnt_list) {
+		if (mnt->mnt_sb->s_type != local_fs_type)
+			continue;
+		if (veid >= 0 && mnt->owner != veid)
+			continue;
+		list_move(&mnt->mnt_hash, &kill);
+	}
+
+	while (!list_empty(&kill)) {
+		LIST_HEAD(kill2);
+		mnt = list_entry(kill.next, struct vfsmount, mnt_hash);
+		umount_tree(mnt, 1, &kill2);
+		list_splice(&kill2, &umount_list);
+	}
+	spin_unlock(&vfsmount_lock);
+	up_write(&namespace_sem);
+	release_mounts(&umount_list);
+}
+EXPORT_SYMBOL(umount_ve_fs_type);
+#endif
+
 /*
  * Now umount can handle mount points as well as block devices.
  * This is important for filesystems which use unnamed block devices.
@@ -1194,7 +1376,7 @@ SYSCALL_DEFINE2(umount, char __user *, n
 	if (!(flags & UMOUNT_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
 
-	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
+	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
 	if (retval)
 		goto out;
 	retval = -EINVAL;
@@ -1204,7 +1386,7 @@ SYSCALL_DEFINE2(umount, char __user *, n
 		goto dput_and_out;
 
 	retval = -EPERM;
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_VE_SYS_ADMIN) && !capable(CAP_SYS_ADMIN))
 		goto dput_and_out;
 
 	retval = do_umount(path.mnt, flags);
@@ -1230,7 +1412,7 @@ SYSCALL_DEFINE1(oldumount, char __user *
 
 static int mount_is_safe(struct path *path)
 {
-	if (capable(CAP_SYS_ADMIN))
+	if (capable(CAP_VE_SYS_ADMIN) || capable(CAP_SYS_ADMIN))
 		return 0;
 	return -EPERM;
 #ifdef notyet
@@ -1475,6 +1657,37 @@ static int attach_recursive_mnt(struct v
 	return err;
 }
 
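+/*
+ * Replace dst_mnt with src_mnt at the same mountpoint: dst_mnt's tree
+ * is detached and queued for release, then src_mnt is attached in its
+ * place (or, if it has no parent yet, simply made the mountpoint's
+ * new child).
+ */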
+void replace_mount(struct vfsmount *src_mnt, struct vfsmount *dst_mnt)
+{
+	struct nameidata src_nd, dst_nd;
+	LIST_HEAD(umount_list);
+
+	down_write(&namespace_sem);
+	spin_lock(&vfsmount_lock);
+
+	detach_mnt(dst_mnt, &dst_nd.path);
+	umount_tree(dst_mnt, 0, &umount_list);
+
+	if (src_mnt->mnt_parent != src_mnt) {
+		detach_mnt(src_mnt, &src_nd.path);
+		attach_mnt(src_mnt, &dst_nd.path);
+	} else {
+		memset(&src_nd, 0, sizeof(src_nd));
+		mnt_set_mountpoint(dst_nd.path.mnt, dst_nd.path.dentry, src_mnt);
+		commit_tree(src_mnt);
+	}
+
+	spin_unlock(&vfsmount_lock);
+	up_write(&namespace_sem);
+
+	path_put(&src_nd.path);
+	path_put(&dst_nd.path);
+	release_mounts(&umount_list);
+
+	return;
+}
+EXPORT_SYMBOL_GPL(replace_mount);
+
 static int graft_tree(struct vfsmount *mnt, struct path *path)
 {
 	int err;
@@ -1514,12 +1727,15 @@ static int do_change_type(struct path *p
 	int type = flag & ~MS_REC;
 	int err = 0;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
 		return -EPERM;
 
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
+	if (!ve_accessible_veid(path->mnt->owner, get_exec_env()->veid))
+		return -EPERM;
+
 	down_write(&namespace_sem);
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
@@ -1541,7 +1757,7 @@ static int do_change_type(struct path *p
  * do loopback mount.
  */
 static int do_loopback(struct path *path, char *old_name,
-				int recurse)
+				int recurse, int mnt_flags)
 {
 	struct path old_path;
 	struct vfsmount *mnt = NULL;
@@ -1556,7 +1772,7 @@ static int do_loopback(struct path *path
 
 	err = -EINVAL;
 	if (mnt_ns_loop(&old_path))
-		goto out;
+		goto out_path;
 
 	down_write(&namespace_sem);
 	err = -EINVAL;
@@ -1575,6 +1791,7 @@ static int do_loopback(struct path *path
 	if (!mnt)
 		goto out;
 
+	mnt->mnt_flags |= mnt_flags;
 	err = graft_tree(mnt, path);
 	if (err) {
 		LIST_HEAD(umount_list);
@@ -1586,6 +1803,7 @@ static int do_loopback(struct path *path
 
 out:
 	up_write(&namespace_sem);
+out_path:
 	path_put(&old_path);
 	return err;
 }
@@ -1607,18 +1825,169 @@ static int change_mount_flags(struct vfs
 	return error;
 }
 
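+/*
+ * A container may freely remount the mounts it owns.  Of the
+ * host-owned mounts it may only touch its own root, and there only
+ * the flag bits whitelisted by MS_VE_RMT_MASK and MNT_VE_RMT_MASK.
+ */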
+static inline int ve_remount_allowed(struct vfsmount *mnt, int flags, int mnt_flags)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct super_block *sb = mnt->mnt_sb;
+
+	if (ve_accessible_veid(mnt->owner, ve->veid))
+		return 0;
+
+	if (mnt != ve->root_path.mnt)
+		return -EPERM;
+	if ((sb->s_flags ^ flags) & MS_RMT_MASK & ~MS_VE_RMT_MASK)
+		return -EPERM;
+	if ((mnt->mnt_flags ^ mnt_flags) & ~MNT_VE_RMT_MASK)
+		return -EPERM;
+
+	return 0;
+}
+
+#ifdef CONFIG_VE
+/*
+ * Return the first occurrence of needle in haystack, where haystack is
+ * a sep-separated list of tokens; NULL if no token matches exactly.
+ */
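+/*
+ * A match must start at a token boundary and stop at a separator or
+ * at the end of the string: with sep == ',', needle "atime" does not
+ * match haystack "barrier=1,noatime", while needle "noatime" does.
+ */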
+static char *strstr_separated(char *haystack, char *needle, char sep)
+{
+	int needle_len = strlen(needle);
+
+	while (haystack) {
+		if (!strncmp(haystack, needle, needle_len) &&
+		    (haystack[needle_len] == 0 || /* end-of-line or */
+		     haystack[needle_len] == sep)) /* separator */
+			return haystack;
+
+		haystack = strchr(haystack, sep);
+		if (haystack)
+			haystack++;
+	}
+
+	return NULL;
+}
+
+static int ve_devmnt_check(char *options, char *allowed)
+{
+	char *p;
+
+	if (!options || !*options)
+		return 0;
+
+	if (!allowed)
+		return -EPERM;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		if (!strstr_separated(allowed, p, ','))
+			return -EPERM;
+	}
+
+	return 0;
+}
+
+static int ve_devmnt_insert(char *options, char *hidden)
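+/*
+ * Prepend the container's hidden options in place: "opts" becomes
+ * "hidden,opts".  The buffer is the single options page passed down
+ * from do_mount(), hence the PAGE_SIZE bound; -EAGAIN asks the caller
+ * to allocate a page when none was passed in.
+ */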
+{
+	int options_len;
+	int hidden_len;
+
+	if (!hidden)
+		return 0;
+
+	if (!options)
+		return -EAGAIN;
+
+	options_len = strlen(options);
+	hidden_len = strlen(hidden);
+	
+	if (hidden_len + options_len + 2 > PAGE_SIZE)
+		return -EPERM;
+
+	memmove(options + hidden_len + 1, options, options_len);
+	memcpy(options, hidden, hidden_len);
+
+	options[hidden_len] = ',';
+	options[hidden_len + options_len + 1] = 0;
+		
+	return 0;
+}
+
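+/*
+ * Validate and rewrite mount options for a block device mounted from
+ * inside a container: options missing from the device's allowed list
+ * are rejected, and the configured hidden options are prepended (on
+ * mount, not on remount).  A data page is allocated and the pass
+ * retried when ve_devmnt_insert() reports -EAGAIN.
+ */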
+int ve_devmnt_process(struct ve_struct *ve, dev_t dev, void **data_pp, int remount)
+{
+	void *data = *data_pp;
+	struct ve_devmnt *devmnt;
+	int err;
+again:
+	err = 1;
+	mutex_lock(&ve->devmnt_mutex);
+	list_for_each_entry(devmnt, &ve->devmnt_list, link) {
+		if (devmnt->dev == dev) {
+			err = ve_devmnt_check(data, devmnt->allowed_options);
+
+			if (!err && !remount)
+				err = ve_devmnt_insert(data, devmnt->hidden_options);
+
+			break;
+		}
+	}
+	mutex_unlock(&ve->devmnt_mutex);
+
+	switch (err) {
+	case -EAGAIN:
+		if (!(data = (void *)__get_free_page(GFP_KERNEL)))
+			return -ENOMEM;
+		*(char *)data = 0; /* the string must be zero-terminated */
+		goto again;
+	case 1:
+		if (*data_pp) {
+			ve_printk(VE_LOG_BOTH, KERN_WARNING "VE%u: no allowed "
+				  "mount options found for device %u:%u\n",
+				  ve->veid, MAJOR(dev), MINOR(dev));
+			err = -EPERM;
+		} else
+			err = 0;
+		break;
+	case 0:
+		*data_pp = data;
+		break;
+	}
+
+	if (data && data != *data_pp)
+		free_page((unsigned long)data);
+
+	return err;
+}
+#endif
+
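+/*
+ * Remount wrapper: when a block-device filesystem is remounted from
+ * inside a container, run the new options through the devmnt filter
+ * above before do_remount_sb() sees them.
+ */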
+static int do_check_and_remount_sb(struct super_block *sb, int flags, void *data)
+{
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+
+	if (sb->s_bdev && data && !ve_is_super(ve)) {
+		int err;
+
+		err = ve_devmnt_process(ve, sb->s_bdev->bd_dev, &data, 1);
+		if (err)
+			return err;
+		
+	}
+#endif
+	return do_remount_sb(sb, flags, data, 0);
+}
+
 /*
  * change filesystem flags. dir should be a physical root of filesystem.
  * If you've mounted a non-root directory somewhere and want to do remount
  * on it - tough luck.
  */
-static int do_remount(struct path *path, int flags, int mnt_flags,
-		      void *data)
+int do_remount(struct path *path, int flags, int mnt_flags, void *data)
 {
 	int err;
 	struct super_block *sb = path->mnt->mnt_sb;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_VE_SYS_ADMIN) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	if (!check_mnt(path->mnt))
@@ -1627,6 +1996,10 @@ static int do_remount(struct path *path,
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
+	err = ve_remount_allowed(path->mnt, flags, mnt_flags);
+	if (err)
+		return err;
+
 	err = security_sb_remount(sb, data);
 	if (err)
 		return err;
@@ -1635,7 +2008,7 @@ static int do_remount(struct path *path,
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
 	else
-		err = do_remount_sb(sb, flags, data, 0);
+		err = do_check_and_remount_sb(sb, flags, data);
 	if (!err)
 		path->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
@@ -1648,6 +2021,7 @@ static int do_remount(struct path *path,
 	}
 	return err;
 }
+EXPORT_SYMBOL(do_remount);
 
 static inline int tree_contains_unbindable(struct vfsmount *mnt)
 {
@@ -1664,7 +2038,7 @@ static int do_move_mount(struct path *pa
 	struct path old_path, parent_path;
 	struct vfsmount *p;
 	int err = 0;
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_VE_SYS_ADMIN) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (!old_name || !*old_name)
 		return -EINVAL;
@@ -1672,6 +2046,10 @@ static int do_move_mount(struct path *pa
 	if (err)
 		return err;
 
+	err = -EPERM;
+	if (!ve_accessible_veid(old_path.mnt->owner, get_exec_env()->veid))
+		goto out_nosem;
+
 	down_write(&namespace_sem);
 	err = __follow_down(path, true);
 	if (err < 0)
@@ -1730,6 +2108,7 @@ out:
 	up_write(&namespace_sem);
 	if (!err)
 		path_put(&parent_path);
+out_nosem:
 	path_put(&old_path);
 	return err;
 }
@@ -1750,7 +2129,7 @@ static int do_new_mount(struct path *pat
 		return -EINVAL;
 
 	/* we need capabilities... */
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_VE_SYS_ADMIN) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	lock_kernel();
@@ -1809,7 +2188,7 @@ static int do_add_mount_unlocked(struct 
 		goto unlock;
 
 	err = -EINVAL;
-	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
+	if (!(mnt_flags & (MNT_SHRINKABLE | MNT_CPT)) && !check_mnt(path->mnt))
 		goto unlock;
 
         /* and in any case, we want non-NULL ->mnt_ns */
@@ -1818,7 +2197,8 @@ static int do_add_mount_unlocked(struct 
 
 	/* Refuse the same filesystem on the same mount point */
 	err = -EBUSY;
-	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
+	if (!(mnt_flags & MNT_CPT) &&
+	    path->mnt->mnt_sb == newmnt->mnt_sb &&
 	    path->mnt->mnt_root == path->dentry)
 		goto unlock;
 
@@ -2122,7 +2502,7 @@ long do_mount(char *dev_name, const char
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
-		   MS_STRICTATIME);
+		   MS_STRICTATIME | MS_CPTMOUNT);
 
 	/* ... and get the mountpoint */
 	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
@@ -2138,7 +2518,7 @@ long do_mount(char *dev_name, const char
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
 	else if (flags & MS_BIND)
-		retval = do_loopback(&path, dev_name, flags & MS_REC);
+		retval = do_loopback(&path, dev_name, flags & MS_REC, mnt_flags);
 	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
 		retval = do_change_type(&path, flags);
 	else if (flags & MS_MOVE)
@@ -2197,22 +2577,48 @@ static struct mnt_namespace *dup_mnt_ns(
 {
 	struct mnt_namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
-	struct vfsmount *p, *q;
+	struct vfsmount *p, *q, *old_root, *new_root;
 
 	new_ns = alloc_mnt_ns();
 	if (IS_ERR(new_ns))
 		return new_ns;
 
 	down_write(&namespace_sem);
+
+	/* For a VE, clone only its own vfsmount subtree */
+	old_root = get_exec_env()->root_path.mnt;
+	if (old_root != mnt_ns->root && old_root->mnt_ns == mnt_ns) {
+		/* clone rootfs vfsmount */
+		new_ns->root = clone_mnt(mnt_ns->root,
+				mnt_ns->root->mnt_root, 0);
+		if (!new_ns->root) {
+			up_write(&namespace_sem);
+			kfree(new_ns);
+			return ERR_PTR(-ENOMEM);
+		}
+	} else
+		old_root = mnt_ns->root;
+
 	/* First pass: copy the tree topology */
-	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
+	new_root = copy_tree(old_root, old_root->mnt_root,
 					CL_COPY_ALL | CL_EXPIRE);
-	if (!new_ns->root) {
+	if (!new_root) {
 		up_write(&namespace_sem);
-		free_mnt_ns(mnt_ns);
+		mntput(new_ns->root);
+		free_mnt_ns(new_ns);
 		return ERR_PTR(-ENOMEM);
 	}
 	spin_lock(&vfsmount_lock);
+	if (new_ns->root) {
+		struct path root_path = {
+			.mnt = new_ns->root,
+			.dentry = new_ns->root->mnt_root,
+		};
+		new_ns->root->mnt_ns = new_ns;
+		list_add(&new_ns->root->mnt_list, &new_root->mnt_list);
+		attach_mnt(new_root, &root_path);
+	} else
+		new_ns->root = new_root;
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
 	spin_unlock(&vfsmount_lock);
 
@@ -2221,8 +2627,8 @@ static struct mnt_namespace *dup_mnt_ns(
 	 * as belonging to new namespace.  We have already acquired a private
 	 * fs_struct, so tsk->fs->lock is not needed.
 	 */
-	p = mnt_ns->root;
-	q = new_ns->root;
+	p = old_root;
+	q = new_root;
 	while (p) {
 		q->mnt_ns = new_ns;
 		if (fs) {
@@ -2235,8 +2641,8 @@ static struct mnt_namespace *dup_mnt_ns(
 				fs->pwd.mnt = mntget(q);
 			}
 		}
-		p = next_mnt(p, mnt_ns->root);
-		q = next_mnt(q, new_ns->root);
+		p = next_mnt(p, old_root);
+		q = next_mnt(q, new_root);
 	}
 	up_write(&namespace_sem);
 
@@ -2323,6 +2729,7 @@ out_dir:
 out_type:
 	return ret;
 }
+EXPORT_SYMBOL(sys_mount);
 
 /*
  * pivot_root Semantics:
@@ -2354,9 +2761,10 @@ SYSCALL_DEFINE2(pivot_root, const char _
 {
 	struct vfsmount *tmp;
 	struct path new, old, parent_path, root_parent, root;
+	struct ve_struct *ve = get_exec_env();
 	int error;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
 		return -EPERM;
 
 	error = user_path_dir(new_root, &new);
@@ -2371,15 +2779,19 @@ SYSCALL_DEFINE2(pivot_root, const char _
 		goto out1;
 
 	error = security_sb_pivotroot(&old, &new);
-	if (error) {
-		path_put(&old);
-		goto out1;
-	}
+	if (error)
+		goto out1_5;
+
+	error = -EPERM;
+	if (!ve_accessible_veid(old.mnt->owner, ve->veid) ||
+	    !ve_accessible_veid(new.mnt->owner, ve->veid))
+		goto out1_5;
+
+	get_fs_root(current->fs, &root);
+	if (ve->root_path.mnt == root.mnt &&
+	    ve->root_path.dentry == root.dentry)
+		goto out1_6;
 
-	read_lock(&current->fs->lock);
-	root = current->fs->root;
-	path_get(&current->fs->root);
-	read_unlock(&current->fs->lock);
 	down_write(&namespace_sem);
 	mutex_lock(&old.dentry->d_inode->i_mutex);
 	error = -EINVAL;
@@ -2440,7 +2852,9 @@ SYSCALL_DEFINE2(pivot_root, const char _
 out2:
 	mutex_unlock(&old.dentry->d_inode->i_mutex);
 	up_write(&namespace_sem);
+out1_6:
 	path_put(&root);
+out1_5:
 	path_put(&old);
 out1:
 	path_put(&new);
@@ -2482,7 +2896,7 @@ void __init mnt_init(void)
 	init_rwsem(&namespace_sem);
 
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
-			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, NULL);
 
 	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/Kconfig	2014-12-12 23:29:14.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/Kconfig	2015-01-21 12:02:53.563951877 +0300
@@ -121,3 +121,12 @@ config NFS_USE_KERNEL_DNS
 	depends on NFS_V4 && !NFS_USE_LEGACY_DNS
 	select DNS_RESOLVER
 	default y
+
+config NFS_QUOTA
+	bool "Virtuozzo NFS local quota support"
+	depends on NFS_FS
+	select VZ_QUOTA
+	help
+	  This option enables local NFS quota accounting.
+
+	  If unsure, say Y.
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/Makefile
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/Makefile	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/Makefile	2015-01-21 12:02:53.563951877 +0300
@@ -18,6 +18,8 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4x
 nfs-$(CONFIG_NFS_V4_1)	+= pnfs.o pnfs_dev.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+nfs-$(CONFIG_NFS_QUOTA) += quota.o
+nfs-$(CONFIG_VE) += ve.o
 
 obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
 nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/callback.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/callback.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/callback.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/callback.c	2015-01-21 12:02:47.684107967 +0300
@@ -18,6 +18,7 @@
 #include <linux/kthread.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/bc_xprt.h>
+#include <linux/nsproxy.h>
 
 #include <net/inet_sock.h>
 
@@ -27,20 +28,14 @@
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 
-struct nfs_callback_data {
-	unsigned int users;
-	struct svc_serv *serv;
-	struct svc_rqst *rqst;
-	struct task_struct *task;
-};
+#ifndef CONFIG_VE
+static struct nfs_callback_data _nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
+static DEFINE_MUTEX(_nfs_callback_mutex);
+#endif
 
-static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
-static DEFINE_MUTEX(nfs_callback_mutex);
 static struct svc_program nfs4_callback_program;
 
 unsigned int nfs_callback_set_tcpport;
-unsigned short nfs_callback_tcpport;
-unsigned short nfs_callback_tcpport6;
 #define NFS_CALLBACK_MAXPORTNR (65535U)
 
 static int param_set_portnr(const char *val, struct kernel_param *kp)
@@ -108,7 +103,7 @@ nfs4_callback_up(struct svc_serv *serv)
 {
 	int ret;
 
-	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
+	ret = svc_create_xprt(serv, "tcp", current->nsproxy->net_ns, PF_INET,
 				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret <= 0)
 		goto out_err;
@@ -116,7 +111,7 @@ nfs4_callback_up(struct svc_serv *serv)
 	dprintk("NFS: Callback listener port = %u (af %u)\n",
 			nfs_callback_tcpport, PF_INET);
 
-	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
+	ret = svc_create_xprt(serv, "tcp", current->nsproxy->net_ns, PF_INET6,
 				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret > 0) {
 		nfs_callback_tcpport6 = ret;
@@ -185,7 +180,7 @@ nfs41_callback_up(struct svc_serv *serv,
 	 * fore channel connection.
 	 * Returns the input port (0) and sets the svc_serv bc_xprt on success
 	 */
-	ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
+	ret = svc_create_xprt(serv, "tcp-bc", current->nsproxy->net_ns, PF_INET, 0,
 			      SVC_SOCK_ANONYMOUS);
 	if (ret < 0) {
 		rqstp = ERR_PTR(ret);
@@ -252,7 +247,6 @@ int nfs_callback_up(u32 minorversion, st
 	struct svc_rqst *rqstp;
 	int (*callback_svc)(void *vrqstp);
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
-	char svc_name[12];
 	int ret = 0;
 	int minorversion_setup;
 
@@ -282,10 +276,11 @@ int nfs_callback_up(u32 minorversion, st
 
 	svc_sock_update_bufs(serv);
 
-	sprintf(svc_name, "nfsv4.%u-svc", minorversion);
 	cb_info->serv = serv;
 	cb_info->rqst = rqstp;
-	cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name);
+	cb_info->task = kthread_run_ve(xprt->owner_env, callback_svc, 
+					cb_info->rqst, "nfsv4.%u-svc/%d", 
+					minorversion, xprt->owner_env->veid);
 	if (IS_ERR(cb_info->task)) {
 		ret = PTR_ERR(cb_info->task);
 		svc_exit_thread(cb_info->rqst);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/callback.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/callback.h
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/callback.h	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/callback.h	2015-01-21 12:02:46.926128090 +0300
@@ -207,7 +207,15 @@ extern int nfs4_set_callback_sessionid(s
 #define NFS41_BC_MAX_CALLBACKS 1
 
 extern unsigned int nfs_callback_set_tcpport;
+
+#ifndef CONFIG_VE
 extern unsigned short nfs_callback_tcpport;
 extern unsigned short nfs_callback_tcpport6;
+#else
+extern struct ve_nfs4_cb_data ve0_nfs4_cb_data;
+
+extern int ve_nfs4_cb_init(struct ve_struct *ve);
+extern void ve_nfs4_cb_fini(struct ve_struct *ve);
+#endif
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/client.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/client.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/client.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/client.c	2015-01-21 12:02:53.528952807 +0300
@@ -52,7 +52,7 @@
 
 #define NFSDBG_FACILITY		NFSDBG_CLIENT
 
-static DEFINE_SPINLOCK(nfs_client_lock);
+DEFINE_SPINLOCK(nfs_client_lock);
 static LIST_HEAD(nfs_client_list);
 static LIST_HEAD(nfs_volume_list);
 static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
@@ -156,6 +156,7 @@ static struct nfs_client *nfs_alloc_clie
 
 	atomic_set(&clp->cl_count, 1);
 	clp->cl_cons_state = NFS_CS_INITING;
+	clp->owner_env = get_exec_env();
 
 	memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
 	clp->cl_addrlen = cl_init->addrlen;
@@ -189,6 +190,7 @@ static struct nfs_client *nfs_alloc_clie
 	if (!IS_ERR(cred))
 		clp->cl_machine_cred = cred;
 	nfs_fscache_get_client_cookie(clp);
+	ve_nfs_data_get();
 
 	return clp;
 
@@ -218,8 +220,12 @@ static void nfs4_shutdown_session(struct
  */
 static void nfs4_destroy_callback(struct nfs_client *clp)
 {
+	struct ve_struct *ve;
+
+	ve = set_exec_env(clp->owner_env);
 	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
 		nfs_callback_down(clp->cl_mvops->minor_version);
+	(void)set_exec_env(ve);
 }
 
 static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -294,6 +300,7 @@ static void nfs_free_client(struct nfs_c
 	if (clp->cl_machine_cred != NULL)
 		put_rpccred(clp->cl_machine_cred);
 
+	ve_nfs_data_put(clp->owner_env);
 	kfree(clp->cl_hostname);
 	kfree(clp);
 
@@ -460,13 +467,18 @@ static struct nfs_client *nfs_match_clie
 {
 	struct nfs_client *clp;
 	const struct sockaddr *sap = data->addr;
+	struct ve_struct *ve;
 
+	ve = get_exec_env();
 	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
 	        const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
 		/* Don't match clients that failed to initialise properly */
 		if (clp->cl_cons_state < 0)
 			continue;
 
+		if (!ve_accessible_strict(clp->owner_env, ve))
+				continue;
+
 		/* Different NFS versions cannot share the same nfs_client */
 		if (clp->rpc_ops != data->rpc_ops)
 			continue;
@@ -592,7 +604,7 @@ int nfs4_check_client_ready(struct nfs_c
 /*
  * Initialise the timeout values for a connection
  */
-static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 				    unsigned int timeo, unsigned int retrans)
 {
 	to->to_initval = timeo * HZ / 10;
@@ -779,7 +791,7 @@ static int nfs_init_server_rpcclient(str
 		}
 	}
 	server->client->cl_softrtry = 0;
-	if (server->flags & NFS_MOUNT_SOFT)
+	if (server->flags & (NFS_MOUNT_SOFT | NFS_MOUNT_RESTORE))
 		server->client->cl_softrtry = 1;
 
 	return 0;
@@ -1085,6 +1097,8 @@ static struct nfs_server *nfs_alloc_serv
 
 	pnfs_init_server(server);
 
+	nfs_dq_init_prealloc_list(server);
+
 	return server;
 }
 
@@ -1197,11 +1211,15 @@ struct nfs_client *
 nfs4_find_client_no_ident(const struct sockaddr *addr)
 {
 	struct nfs_client *clp;
+	struct ve_struct *ve = get_exec_env();
 
 	spin_lock(&nfs_client_lock);
 	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
 		if (nfs4_cb_match_client(addr, clp, 0) == false)
 			continue;
+		if (!ve_accessible_strict(clp->owner_env, ve))
+			continue;
+
 		atomic_inc(&clp->cl_count);
 		spin_unlock(&nfs_client_lock);
 		return clp;
@@ -1241,6 +1259,7 @@ nfs4_find_client_sessionid(const struct 
 			   struct nfs4_sessionid *sid)
 {
 	struct nfs_client *clp;
+	struct ve_struct *ve = get_exec_env();
 
 	spin_lock(&nfs_client_lock);
 	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
@@ -1255,6 +1274,9 @@ nfs4_find_client_sessionid(const struct 
 		    sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
 			continue;
 
+		if (!ve_accessible_strict(clp->owner_env, ve))
+			continue;
+
 		atomic_inc(&clp->cl_count);
 		spin_unlock(&nfs_client_lock);
 		return clp;
@@ -1709,6 +1731,7 @@ struct nfs_server *nfs_clone_server(stru
 	struct nfs_server *server;
 	struct nfs_fattr *fattr_fsinfo;
 	int error;
+	struct ve_struct *ve;
 
 	dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
 		(unsigned long long) fattr->fsid.major,
@@ -1718,6 +1741,8 @@ struct nfs_server *nfs_clone_server(stru
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
+	ve = set_exec_env(source->nfs_client->owner_env);
+
 	error = -ENOMEM;
 	fattr_fsinfo = nfs_alloc_fattr();
 	if (fattr_fsinfo == NULL)
@@ -1760,12 +1785,14 @@ struct nfs_server *nfs_clone_server(stru
 
 	nfs_free_fattr(fattr_fsinfo);
 	dprintk("<-- nfs_clone_server() = %p\n", server);
+	(void)set_exec_env(ve);
 	return server;
 
 out_free_server:
 	nfs_free_fattr(fattr_fsinfo);
 	nfs_free_server(server);
 	dprintk("<-- nfs_clone_server() = error %d\n", error);
+	(void)set_exec_env(ve);
 	return ERR_PTR(error);
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/dir.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/dir.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/dir.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/dir.c	2015-01-21 12:02:53.495953682 +0300
@@ -542,7 +542,7 @@ void nfs_prime_dcache(struct dentry *par
 		return;
 
 	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
-	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, NULL);
 	if (IS_ERR(inode))
 		goto out;
 
@@ -1142,8 +1142,21 @@ static int nfs_lookup_revalidate(struct 
 	inode = dentry->d_inode;
 
 	if (!inode) {
-		if (nfs_neg_need_reval(dir, dentry, nd))
-			goto out_bad;
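+		/*
+		 * Rather than dropping every negative dentry that needs
+		 * revalidation, redo the LOOKUP: the dentry stays valid
+		 * as long as the server still answers ENOENT.
+		 */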
+		if (nfs_neg_need_reval(dir, dentry, nd)) {
+			unsigned long verifier = nfs_save_change_attribute(dir);
+
+			error = -ENOMEM;
+			fhandle = nfs_alloc_fhandle();
+			fattr = nfs_alloc_fattr();
+			if (fhandle == NULL || fattr == NULL)
+				goto out_error;
+			error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+			if (error != -ENOENT)
+				goto out_bad;
+			nfs_set_verifier(dentry, verifier);
+			nfs_free_fattr(fattr);
+			nfs_free_fhandle(fhandle);
+		}
 		goto out_valid_noent;
 	}
 
@@ -1378,7 +1391,7 @@ static struct dentry *nfs_lookup(struct 
 		res = ERR_PTR(error);
 		goto out_unblock_sillyrename;
 	}
-	inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, NULL);
 	res = (struct dentry *)inode;
 	if (IS_ERR(res))
 		goto out_unblock_sillyrename;
@@ -1707,7 +1720,7 @@ out_err:
  * Code common to create, mkdir, and mknod.
  */
 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
-				struct nfs_fattr *fattr)
+				struct nfs_fattr *fattr, struct inode *dummy)
 {
 	struct dentry *parent = dget_parent(dentry);
 	struct inode *dir = parent->d_inode;
@@ -1731,7 +1744,7 @@ int nfs_instantiate(struct dentry *dentr
 		if (error < 0)
 			goto out_error;
 	}
-	inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, dummy);
 	error = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_error;
@@ -1840,6 +1853,8 @@ static int nfs_rmdir(struct inode *dir, 
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
 
 	if (dentry->d_inode) {
+		nfs_dq_init(dentry->d_inode);
+
 		nfs_wait_on_sillyrename(dentry);
 		error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
 		/* Ensure the VFS deletes this inode */
@@ -1904,6 +1919,8 @@ static int nfs_unlink(struct inode *dir,
 	dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
 		dir->i_ino, dentry->d_name.name);
 
+	nfs_dq_init(dentry->d_inode);
+
 	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	if (atomic_read(&dentry->d_count) > 1) {
@@ -2056,6 +2073,9 @@ static int nfs_rename(struct inode *old_
 		 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
 		 atomic_read(&new_dentry->d_count));
 
+	if (new_inode)
+		nfs_dq_init(new_inode);
+
 	/*
 	 * For non-directories, check whether the target is busy and if so,
 	 * make a copy of the dentry and then do a silly-rename. If the
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/direct.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/direct.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/direct.c	2014-12-12 23:29:30.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/direct.c	2015-01-21 12:02:43.342223240 +0300
@@ -48,6 +48,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/task_io_accounting_ops.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -418,6 +419,8 @@ static ssize_t nfs_direct_read(struct ki
 	struct nfs_direct_req *dreq;
 	struct nfs_lock_context *l_ctx;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	dreq = nfs_direct_req_alloc();
 	if (dreq == NULL)
 		goto out;
@@ -436,6 +439,9 @@ static ssize_t nfs_direct_read(struct ki
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+	if (result > 0)
+		task_io_account_read(result);
+
 out_release:
 	nfs_direct_req_release(dreq);
 out:
@@ -804,6 +810,8 @@ static ssize_t nfs_direct_write(struct k
 	struct nfs_direct_req *dreq;
 	struct nfs_lock_context *l_ctx;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
 		goto out;
@@ -822,6 +830,8 @@ static ssize_t nfs_direct_write(struct k
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+	if (result > 0)
+		task_io_account_write(result);
 out_release:
 	nfs_direct_req_release(dreq);
 out:
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/file.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/file.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/file.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/file.c	2015-01-21 12:02:53.516953124 +0300
@@ -57,7 +57,7 @@ static ssize_t nfs_file_write(struct kio
 				unsigned long nr_segs, loff_t pos);
 static int  nfs_file_flush(struct file *, fl_owner_t id);
 static int  nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
-static int nfs_check_flags(int flags);
+static int nfs_set_flags(struct file *file, int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
@@ -79,7 +79,7 @@ const struct file_operations nfs_file_op
 	.flock		= nfs_flock,
 	.splice_read	= nfs_file_splice_read,
 	.splice_write	= nfs_file_splice_write,
-	.check_flags	= nfs_check_flags,
+	.set_flags	= nfs_set_flags,
 	.setlease	= nfs_setlease,
 };
 
@@ -106,12 +106,12 @@ const struct inode_operations nfs3_file_
 # define IS_SWAPFILE(inode)	(0)
 #endif
 
-static int nfs_check_flags(int flags)
+static int nfs_set_flags(struct file * filp, int flags)
 {
 	if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
 		return -EINVAL;
 
-	return 0;
+	return generic_set_file_flags(filp, flags);
 }
 
 /*
@@ -126,7 +126,7 @@ nfs_file_open(struct inode *inode, struc
 			filp->f_path.dentry->d_parent->d_name.name,
 			filp->f_path.dentry->d_name.name);
 
-	res = nfs_check_flags(filp->f_flags);
+	res = nfs_set_flags(filp, filp->f_flags);
 	if (res)
 		return res;
 
@@ -564,6 +564,11 @@ static int nfs_vm_page_mkwrite(struct vm
 	int ret = VM_FAULT_NOPAGE;
 	struct address_space *mapping;
 
+	if (filp->f_op->get_host) {
+		filp = filp->f_op->get_host(filp);
+		dentry = filp->f_path.dentry;
+	}
+
 	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		filp->f_mapping->host->i_ino,
@@ -618,6 +623,7 @@ static ssize_t nfs_file_write(struct kio
 	struct inode * inode = dentry->d_inode;
 	ssize_t result;
 	size_t count = iov_length(iov, nr_segs);
+	long prealloc_blocks;
 
 	result = nfs_key_timeout_notify(iocb->ki_filp, inode);
 	if (result)
@@ -646,6 +652,10 @@ static ssize_t nfs_file_write(struct kio
 	if (!count)
 		goto out;
 
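+	/*
+	 * Reserve local (VE) quota for the blocks this write may
+	 * consume before it is sent to the server; the reservation is
+	 * dropped below if the write fails.
+	 */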
+	prealloc_blocks = nfs_dq_prealloc_space(inode, pos, count);
+	if (prealloc_blocks < 0)
+		return prealloc_blocks;
+
 	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
 	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
 	/* Return error values for O_SYNC and IS_SYNC() */
@@ -654,6 +664,9 @@ static ssize_t nfs_file_write(struct kio
 		if (err < 0)
 			result = err;
 	}
+
+	if (result < 0)
+		nfs_dq_release_preallocated_blocks(inode, prealloc_blocks);
 out:
 	return result;
 
@@ -763,7 +776,7 @@ do_unlk(struct file *filp, int cmd, stru
 	 * Use local locking if mounted with "-onolock" or with appropriate
 	 * "-olocal_lock="
 	 */
-	if (!is_local)
+	if (!is_local && !(fl->fl_flags & FL_LOCAL))
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
 		status = do_vfs_lock(filp, fl);
@@ -793,7 +806,7 @@ do_setlk(struct file *filp, int cmd, str
 	 * Use local locking if mounted with "-onolock" or with appropriate
 	 * "-olocal_lock="
 	 */
-	if (!is_local)
+	if (!is_local && !(fl->fl_flags & FL_LOCAL))
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
 		status = do_vfs_lock(filp, fl);
@@ -927,7 +940,7 @@ const struct file_operations nfs4_file_o
 	.flock		= nfs_flock,
 	.splice_read	= nfs_file_splice_read,
 	.splice_write	= nfs_file_splice_write,
-	.check_flags	= nfs_check_flags,
+	.set_flags	= nfs_set_flags,
 	.setlease	= nfs_setlease,
 };
 #endif /* CONFIG_NFS_V4 */
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/fscache.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/fscache.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/fscache.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/fscache.c	2015-01-21 12:02:50.169041999 +0300
@@ -55,6 +55,16 @@ void nfs_fscache_release_client_cookie(s
 	clp->fscache = NULL;
 }
 
+void nfs_fscache_dup_uniq_id(char *dst, struct super_block *sb)
+{
+	struct nfs_fscache_key *key;
+	struct nfs_server *nfss = NFS_SB(sb);
+
+	key = nfss->fscache_key;
+	strcpy(dst, key->key.uniquifier);
+}
+EXPORT_SYMBOL(nfs_fscache_dup_uniq_id);
+
 /*
  * Get the cache cookie for an NFS superblock.  We have to handle
  * uniquification here because the cache doesn't do it for us.
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/getroot.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/getroot.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/getroot.c	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/getroot.c	2015-01-21 12:02:53.483954002 +0300
@@ -94,7 +94,7 @@ struct dentry *nfs_get_root(struct super
 		goto out;
 	}
 
-	inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
+	inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
 	if (IS_ERR(inode)) {
 		dprintk("nfs_get_root: get root inode failed\n");
 		ret = ERR_CAST(inode);
@@ -201,7 +201,7 @@ struct dentry *nfs4_get_root(struct supe
 		goto out;
 	}
 
-	inode = nfs_fhget(sb, mntfh, fattr);
+	inode = nfs_fhget(sb, mntfh, fattr, NULL);
 	if (IS_ERR(inode)) {
 		dprintk("nfs_get_root: get root inode failed\n");
 		ret = ERR_CAST(inode);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/inode.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/inode.c	2015-01-21 12:02:53.575951560 +0300
@@ -40,6 +40,7 @@
 #include <linux/compat.h>
 #include <linux/freezer.h>
 #include <linux/crc32.h>
+#include <linux/quotaops.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -79,7 +80,7 @@ int nfs_wait_bit_killable(void *word)
 {
 	if (fatal_signal_pending(current))
 		return -ERESTARTSYS;
-	freezable_schedule();
+	schedule();
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
@@ -112,6 +113,12 @@ void nfs_clear_inode(struct inode *inode
 	/*
 	 * The following should never happen...
 	 */
+#ifdef CONFIG_NFS_QUOTA
+	BUG_ON(!list_empty(&NFS_I(inode)->prealloc));
+	WARN(NFS_I(inode)->i_reserved_quota,
+	     "%s: inode (ino: %ld) reserved bytes: %Ld\n", __func__,
+	     inode->i_ino, NFS_I(inode)->i_reserved_quota);
+#endif
 	BUG_ON(nfs_have_writebacks(inode));
 	BUG_ON(!list_empty(&NFS_I(inode)->open_files));
 	nfs_zap_acl_cache(inode);
@@ -261,7 +268,8 @@ nfs_init_locked(struct inode *inode, voi
  * instead of inode number.
  */
 struct inode *
-nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr,
+		struct inode *dummy)
 {
 	struct nfs_find_desc desc = {
 		.fh	= fh,
@@ -293,6 +301,8 @@ nfs_fhget(struct super_block *sb, struct
 		 * such as stat(2) */
 		inode->i_ino = hash;
 
+		nfs_dq_swap_inode(inode, dummy);
+
 		/* We can't support update_atime(), since the server will reset it */
 		inode->i_flags |= S_NOATIME|S_NOCMTIME;
 		inode->i_mode = fattr->mode;
@@ -335,6 +345,10 @@ nfs_fhget(struct super_block *sb, struct
 		inode->i_uid = -2;
 		inode->i_gid = -2;
 		inode->i_blocks = 0;
+		/*
+		 * report the blocks in 512byte units
+		 */
+		inode->i_blkbits = 9;
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
 
 		nfsi->read_cache_jiffies = fattr->time_start;
@@ -374,11 +388,12 @@ nfs_fhget(struct super_block *sb, struct
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
 		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
 			inode->i_blocks = fattr->du.nfs2.blocks;
-		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
-			/*
-			 * report the blocks in 512byte units
-			 */
-			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
+		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED)
+			inode->i_blocks = nfs_calc_block_size(inode, fattr->du.nfs3.used);
+
+		if (dummy) {
+			inode->i_blocks = 0;
+			nfs_dq_sync_blocks(inode, fattr, NFS_DQ_SYNC_PREALLOC_RELEASE);
 		}
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
@@ -429,6 +444,10 @@ nfs_setattr(struct dentry *dentry, struc
 	if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
 		return 0;
 
+	error = nfs_dq_transfer_inode(inode, attr);
+	if (error)
+		return error;
+
 	/* Write all dirty data */
 	if (S_ISREG(inode->i_mode))
 		nfs_wb_all(inode);
@@ -1239,6 +1258,8 @@ int nfs_refresh_inode(struct inode *inod
 	status = nfs_refresh_inode_locked(inode, fattr);
 	spin_unlock(&inode->i_lock);
 
+	if (status == 0)
+		nfs_dq_sync_blocks(inode, fattr, NFS_DQ_SYNC_PREALLOC_RELEASE);
 	return status;
 }
 
@@ -1275,6 +1296,8 @@ int nfs_post_op_update_inode(struct inod
 	spin_lock(&inode->i_lock);
 	status = nfs_post_op_update_inode_locked(inode, fattr);
 	spin_unlock(&inode->i_lock);
+	if (status == 0)
+		nfs_dq_sync_blocks(inode, fattr, NFS_DQ_SYNC_PREALLOC_RELEASE);
 	return status;
 }
 
@@ -1326,6 +1349,8 @@ int nfs_post_op_update_inode_force_wcc(s
 out_noforce:
 	status = nfs_post_op_update_inode_locked(inode, fattr);
 	spin_unlock(&inode->i_lock);
+	if (status == 0)
+		nfs_dq_sync_blocks(inode, fattr, NFS_DQ_SYNC_PREALLOC_HOLD);
 	return status;
 }
 
@@ -1485,15 +1510,20 @@ static int nfs_update_inode(struct inode
 	} else if (server->caps & NFS_CAP_NLINK)
 		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
 				| NFS_INO_REVAL_FORCED);
-
-	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
-		/*
-		 * report the blocks in 512byte units
-		 */
-		inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- 	}
-	if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
-		inode->i_blocks = fattr->du.nfs2.blocks;
+	/*
+	 * Incredibly ugly. Must be thrown away with a proper NFS quota
+	 * reimplementation.
+	 */
+	if (!sb_any_quota_active(inode->i_sb)) {
+		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+			/*
+			 * report the blocks in 512byte units
+			 */
+			inode->i_blocks = nfs_calc_block_size(inode, fattr->du.nfs3.used);
+		}
+		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+			inode->i_blocks = fattr->du.nfs2.blocks;
+	}
 
 	/* Update attrtimeo value if we're out of the unstable period */
 	if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1575,6 +1605,7 @@ struct inode *nfs_alloc_inode(struct sup
 #ifdef CONFIG_NFS_V4
 	nfsi->nfs4_acl = NULL;
 #endif /* CONFIG_NFS_V4 */
+	nfs_dq_init_nfs_inode(nfsi);
 	return &nfsi->vfs_inode;
 }
 
@@ -1583,6 +1614,13 @@ void nfs_destroy_inode(struct inode *ino
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
 
+void nfs_delete_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+	nfs_dq_delete_inode(inode);
+	clear_inode(inode);
+}
+
 static inline void nfs4_init_once(struct nfs_inode *nfsi)
 {
 #ifdef CONFIG_NFS_V4
@@ -1630,16 +1668,18 @@ static void nfs_destroy_inodecache(void)
 	kmem_cache_destroy(nfs_inode_cachep);
 }
 
-struct workqueue_struct *nfsiod_workqueue;
+#ifndef CONFIG_VE
+struct workqueue_struct *_nfsiod_workqueue;
+#endif
 
 /*
  * start up the nfsiod workqueue
  */
-static int nfsiod_start(void)
+int nfsiod_start(void)
 {
 	struct workqueue_struct *wq;
 	dprintk("RPC:       creating workqueue nfsiod\n");
-	wq = create_singlethread_workqueue("nfsiod");
+	wq = create_singlethread_workqueue_ve("nfsiod", get_exec_env());
 	if (wq == NULL)
 		return -ENOMEM;
 	nfsiod_workqueue = wq;
@@ -1649,7 +1689,7 @@ static int nfsiod_start(void)
 /*
  * Destroy the nfsiod workqueue
  */
-static void nfsiod_stop(void)
+void nfsiod_stop(void)
 {
 	struct workqueue_struct *wq;
 
@@ -1667,6 +1707,8 @@ static int __init init_nfs_fs(void)
 {
 	int err;
 
+	ve0_nfs_data_init();
+
 	err = nfs_idmap_init();
 	if (err < 0)
 		goto out9;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/internal.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/internal.h
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/internal.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/internal.h	2015-01-21 12:02:53.563951877 +0300
@@ -5,6 +5,7 @@
 #include "nfs4_fs.h"
 #include <linux/mount.h>
 #include <linux/security.h>
+#include "ve.h"
 
 #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
 
@@ -143,6 +144,7 @@ extern void nfs_umount(const struct nfs_
 
 /* client.c */
 extern struct rpc_program nfs_program;
+extern spinlock_t nfs_client_lock;
 
 extern void nfs_cleanup_cb_ident_idr(void);
 extern void nfs_put_client(struct nfs_client *);
@@ -168,6 +170,9 @@ extern int nfs4_check_client_ready(struc
 extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 					     const struct sockaddr *ds_addr,
 					     int ds_addrlen, int ds_proto);
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
+				    unsigned int timeo, unsigned int retrans);
+
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
@@ -238,6 +243,77 @@ extern const u32 nfs41_maxread_overhead;
 extern const u32 nfs41_maxwrite_overhead;
 #endif
 
+/* quota.c */
+typedef enum {
+	NFS_DQ_SYNC_PREALLOC_RELEASE,
+	NFS_DQ_SYNC_PREALLOC_HOLD,
+} nfs_dq_sync_flags_t;
+
+#ifdef CONFIG_NFS_QUOTA
+extern void nfs_dq_init(struct inode *inode);
+extern struct inode * nfs_dq_reserve_inode(struct inode * dir);
+extern void nfs_dq_release_inode(struct inode *inode);
+extern void nfs_dq_swap_inode(struct inode *inode, struct inode *dummy);
+extern int nfs_dq_transfer_inode(struct inode *inode, struct iattr *attr);
+extern void nfs_dq_delete_inode(struct inode *);
+
+extern void nfs_dq_init_sb(struct super_block *sb);
+extern void nfs_dq_init_nfs_inode(struct nfs_inode *nfsi);
+
+extern long nfs_dq_prealloc_space(struct inode *inode, loff_t pos, size_t size);
+extern void nfs_dq_release_preallocated_blocks(struct inode *inode,
+						blkcnt_t blocks);
+extern void nfs_dq_sync_blocks(struct inode *inode, struct nfs_fattr *fattr,
+						nfs_dq_sync_flags_t flag);
+extern void nfs_dq_init_prealloc_list(struct nfs_server *server);
+
+extern blkcnt_t nfs_quota_reserve_barrier;
+#else
+static inline void nfs_dq_init(struct inode *inode)
+{
+}
+static inline struct inode *nfs_dq_reserve_inode(struct inode *dir)
+{
+	return NULL;
+}
+static inline void nfs_dq_release_inode(struct inode *inode)
+{
+}
+static inline void nfs_dq_swap_inode(struct inode *inode, struct inode *dummy)
+{
+}
+static inline int nfs_dq_transfer_inode(struct inode *inode, struct iattr *attr)
+{
+	return 0;
+}
+static inline void nfs_dq_delete_inode(struct inode *inode)
+{
+}
+static inline void nfs_dq_init_sb(struct super_block *sb)
+{
+}
+static inline void nfs_dq_init_nfs_inode(struct nfs_inode *nfsi)
+{
+}
+static inline long nfs_dq_prealloc_space(struct inode *inode, loff_t pos,
+						size_t size)
+{
+	return 0;
+}
+static inline void nfs_dq_release_preallocated_blocks(struct inode *inode,
+						blkcnt_t blocks)
+{
+}
+static inline void nfs_dq_sync_blocks(struct inode *inode,
+					struct nfs_fattr *fattr,
+					nfs_dq_sync_flags_t flag)
+{
+}
+static inline void nfs_dq_init_prealloc_list(struct nfs_server *server)
+{
+}
+#endif
+
 /* nfs4proc.c */
 #ifdef CONFIG_NFS_V4
 extern struct rpc_procinfo nfs4_procedures[];
@@ -258,9 +334,11 @@ extern int nfs_access_cache_shrinker(str
 					int nr_to_scan, gfp_t gfp_mask);
 
 /* inode.c */
-extern struct workqueue_struct *nfsiod_workqueue;
+extern int nfsiod_start(void);
+extern void nfsiod_stop(void);
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
+extern void nfs_delete_inode(struct inode *);
 extern int nfs_write_inode(struct inode *, struct writeback_control *);
 extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
@@ -284,6 +362,8 @@ extern void nfs_sb_active(struct super_b
 extern void nfs_sb_deactive(struct super_block *sb);
 extern void nfs_sb_deactive_async(struct super_block *sb);
 
+extern int nfs_enable_v4_in_ct;
+
 /* namespace.c */
 extern char *nfs_path(const char *base,
 		      const struct dentry *droot,
@@ -436,9 +516,9 @@ unsigned long nfs_block_bits(unsigned lo
 /*
  * Calculate the number of 512byte blocks used.
  */
-static inline blkcnt_t nfs_calc_block_size(u64 tsize)
+static inline blkcnt_t nfs_calc_block_size(struct inode *inode, u64 tsize)
 {
-	blkcnt_t used = (tsize + 511) >> 9;
+	blkcnt_t used = (tsize + (1 << inode->i_blkbits) - 1) >> inode->i_blkbits;
 	return (used > ULONG_MAX) ? ULONG_MAX : used;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/namespace.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/namespace.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/namespace.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/namespace.c	2015-01-21 12:02:48.536085350 +0300
@@ -15,13 +15,14 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/vfs.h>
 #include <linux/sunrpc/gss_api.h>
+#include <linux/module.h>
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
 static void nfs_expire_automounts(struct work_struct *work);
 
-static LIST_HEAD(nfs_automount_list);
+LIST_HEAD(nfs_automount_list);
 static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/nfs3proc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/nfs3proc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/nfs3proc.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/nfs3proc.c	2015-01-21 12:02:53.496953655 +0300
@@ -295,11 +295,18 @@ static struct nfs3_createdata *nfs3_allo
 static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
 {
 	int status;
+	struct inode *dummy;
+
+	dummy = nfs_dq_reserve_inode(dir);
+	if (IS_ERR(dummy))
+		return -EDQUOT;
 
 	status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
 	nfs_post_op_update_inode(dir, data->res.dir_attr);
 	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh,
+						data->res.fattr, dummy);
+	nfs_dq_release_inode(dummy);
 	return status;
 }
 
@@ -433,8 +440,10 @@ nfs3_proc_unlink_done(struct rpc_task *t
 	struct nfs_removeres *res;
 	if (nfs3_async_handle_jukebox(task, dir))
 		return 0;
-	res = task->tk_msg.rpc_resp;
-	nfs_post_op_update_inode(dir, res->dir_attr);
+	if (task->tk_status >= 0) {
+		res = task->tk_msg.rpc_resp;
+		nfs_post_op_update_inode(dir, res->dir_attr);
+	}
 	return 1;
 }
 
@@ -805,8 +814,10 @@ static int nfs3_read_done(struct rpc_tas
 	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 
-	nfs_invalidate_atime(inode);
-	nfs_refresh_inode(inode, &data->fattr);
+	if (task->tk_status >= 0) {
+		nfs_invalidate_atime(inode);
+		nfs_refresh_inode(inode, &data->fattr);
+	}
 	return 0;
 }
 
@@ -852,7 +863,8 @@ static int nfs3_commit_done(struct rpc_t
 {
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
-	nfs_refresh_inode(data->inode, data->res.fattr);
+	if (task->tk_status >= 0)
+		nfs_refresh_inode(data->inode, data->res.fattr);
 	return 0;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/nfs4proc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/nfs4proc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/nfs4proc.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/nfs4proc.c	2015-01-21 12:02:53.485953948 +0300
@@ -1143,7 +1143,7 @@ static struct nfs4_state *nfs4_opendata_
 	ret = -EAGAIN;
 	if (!(data->f_attr.valid & NFS_ATTR_FATTR))
 		goto err;
-	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
+	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, NULL);
 	ret = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto err;
@@ -1455,7 +1455,7 @@ static int _nfs4_proc_open_confirm(struc
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_open_confirm_ops,
 		.callback_data = data,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(data->dir->d_inode),
 		.flags = RPC_TASK_ASYNC,
 	};
 	int status;
@@ -1611,7 +1611,7 @@ static int nfs4_run_open_task(struct nfs
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_open_ops,
 		.callback_data = data,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(data->dir->d_inode),
 		.flags = RPC_TASK_ASYNC,
 	};
 	int status;
@@ -2226,7 +2226,7 @@ int nfs4_do_close(struct nfs4_state *sta
 		.rpc_client = server->client,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_close_ops,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(state->inode),
 		.flags = RPC_TASK_ASYNC,
 	};
 	int status = -ENOMEM;
@@ -3067,7 +3067,8 @@ static int nfs4_do_create(struct inode *
 				    &data->arg.seq_args, &data->res.seq_res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &data->res.dir_cinfo);
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh,
+						data->res.fattr, NULL);
 	}
 	return status;
 }
@@ -4584,7 +4585,7 @@ static struct rpc_task *nfs4_do_unlck(st
 		.rpc_client = NFS_CLIENT(lsp->ls_state->inode),
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_locku_ops,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(lsp->ls_state->inode),
 		.flags = RPC_TASK_ASYNC,
 	};
 
@@ -4823,7 +4824,7 @@ static int _nfs4_do_setlk(struct nfs4_st
 		.rpc_client = NFS_CLIENT(state->inode),
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_lock_ops,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(state->inode),
 		.flags = RPC_TASK_ASYNC,
 	};
 	int ret;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/nfs4renewd.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/nfs4renewd.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/nfs4renewd.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/nfs4renewd.c	2015-01-21 12:02:46.929128011 +0300
@@ -88,8 +88,11 @@ nfs4_renew_state(struct work_struct *wor
 			}
 			nfs_expire_all_delegations(clp);
 		} else {
+			struct ve_struct *ve;
 			/* Queue an asynchronous RENEW. */
+			ve = set_exec_env(clp->cl_rpcclient->cl_xprt->owner_env);
 			ops->sched_state_renewal(clp, cred, renew_flags);
+			(void)set_exec_env(ve);
 			put_rpccred(cred);
 			goto out_exp;
 		}
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/nfs4state.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/nfs4state.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/nfs4state.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/nfs4state.c	2015-01-21 12:02:47.685107941 +0300
@@ -1152,9 +1152,11 @@ void nfs4_schedule_state_manager(struct 
 		return;
 	__module_get(THIS_MODULE);
 	atomic_inc(&clp->cl_count);
-	task = kthread_run(nfs4_run_state_manager, clp, "%s-manager",
+	task = kthread_run_ve(clp->owner_env, nfs4_run_state_manager, clp,
+				"%s-manager/%d",
 				rpc_peeraddr2str(clp->cl_rpcclient,
-							RPC_DISPLAY_ADDR));
+							RPC_DISPLAY_ADDR),
+				clp->owner_env->veid);
 	if (!IS_ERR(task))
 		return;
 	nfs4_clear_state_manager_bit(clp);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/proc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/proc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/proc.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/proc.c	2015-01-21 12:02:53.485953948 +0300
@@ -244,7 +244,7 @@ nfs_proc_create(struct inode *dir, struc
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
 	nfs_free_createdata(data);
 out:
 	dprintk("NFS reply create: %d\n", status);
@@ -291,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
 	nfs_free_createdata(data);
 out:
 	dprintk("NFS reply mknod: %d\n", status);
@@ -434,7 +434,7 @@ nfs_proc_symlink(struct inode *dir, stru
 	 * should fill in the data with a LOOKUP call on the wire.
 	 */
 	if (status == 0)
-		status = nfs_instantiate(dentry, fh, fattr);
+		status = nfs_instantiate(dentry, fh, fattr, NULL);
 
 out_free:
 	nfs_free_fattr(fattr);
@@ -463,7 +463,7 @@ nfs_proc_mkdir(struct inode *dir, struct
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
 	nfs_free_createdata(data);
 out:
 	dprintk("NFS reply mkdir: %d\n", status);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/quota.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/quota.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/quota.c	2015-01-21 12:02:53.496953655 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/quota.c	2015-01-21 12:02:53.581951399 +0300
@@ -0,0 +1,398 @@
+#include <linux/quotaops.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_QUOTA
+
+static void nfs_dq_free_blocks(struct inode *inode, blkcnt_t blocks);
+void nfs_dq_release_preallocated_blocks(struct inode *inode, blkcnt_t blocks);
+static blkcnt_t nfs_dq_get_reserved_blocks(struct inode *inode);
+static void nfs_dq_update_shrink(struct inode *inode, blkcnt_t shrink,
+				 blkcnt_t reserved_blocks);
+static void nfs_dq_add_to_prealloc_list(struct inode *inode);
+static void nfs_dq_remove_from_prealloc_list(struct inode *inode);
+static int nfs_dq_try_to_release_quota(struct inode *inode);
+
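+/*
+ * Inodes holding at least this many reserved blocks are put on the
+ * per-server "fat inodes" list and become candidates for forced
+ * revalidation when a later reservation fails (tunable through the
+ * "quota_reserve_blocks_thresh" sysctl added in fs/nfs/sysctl.c).
+ */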
+blkcnt_t nfs_quota_reserve_barrier = 1024;
+
+inline void nfs_dq_init(struct inode *inode)
+{
+	vfs_dq_init(inode);
+}
+
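+/*
+ * Charge inode quota before the create goes on the wire. The real inode
+ * does not exist until the server replies, so a dummy in-core inode is
+ * allocated and charged instead; on success the charge is moved to the
+ * real inode (see nfs_dq_swap_inode), otherwise the dummy is released
+ * via nfs_dq_release_inode().
+ */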
+struct inode * nfs_dq_reserve_inode(struct inode * dir)
+{
+	struct inode * inode;
+	struct nfs_inode *nfsi;
+
+	if (!sb_any_quota_active(dir->i_sb))
+		return NULL;
+
+	/* Allocate a dummy "quota" inode and initialize the required fields */
+	inode = new_inode(dir->i_sb);
+	if (inode == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	nfsi = NFS_I(inode);
+	nfsi->access_cache = RB_ROOT;
+#ifdef CONFIG_NFS_FSCACHE
+	nfsi->fscache = NULL;
+#endif
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	/* Mirror setgid directory semantics (unclear whether this is strictly required) */
+	if (dir->i_mode & S_ISGID)
+		inode->i_gid = dir->i_gid;
+
+	if (vfs_dq_alloc_inode(inode) == NO_QUOTA)
+		goto err_drop;
+
+	dprintk("NFS: DQ reserve inode (ino: %ld)\n", inode->i_ino);
+
+	return inode;
+
+err_drop:
+	vfs_dq_drop(inode);
+	inode->i_flags |= S_NOQUOTA;
+	iput(inode);
+	return ERR_PTR(-EDQUOT);
+}
+
+void nfs_dq_release_inode(struct inode *inode)
+{
+	if (inode) {
+		dprintk("NFS: DQ release inode (ino: %ld)\n", inode->i_ino);
+		vfs_dq_free_inode(inode);
+		vfs_dq_drop(inode);
+		inode->i_flags |= S_NOQUOTA;
+		iput(inode);
+	}
+}
+
+void nfs_dq_swap_inode(struct inode * inode, struct inode * dummy)
+{
+	if (dummy) {
+		dprintk("NFS: DQ swap inodes (ino: %ld to ino: %ld)\n",
+						dummy->i_ino, inode->i_ino);
+		DQUOT_SWAP(inode, dummy);
+	}
+}
+
+int nfs_dq_transfer_inode(struct inode *inode, struct iattr *attr)
+{
+	if (((attr->ia_valid & ATTR_UID) && attr->ia_uid != inode->i_uid) ||
+	    ((attr->ia_valid & ATTR_GID) && attr->ia_gid != inode->i_gid)) {
+		dprintk("NFS: DQ transfer inode (ino: %ld)\n", inode->i_ino);
+		return vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+	}
+	return 0;
+}
+
+static int nfs_dq_drop_inode(struct inode *inode)
+{
+	if (is_bad_inode(inode))
+		return 0;
+
+	if (!sb_any_quota_active(inode->i_sb))
+		return 0;
+
+	mutex_lock(&NFS_I(inode)->quota_sync);
+	nfs_dq_update_shrink(inode, inode->i_blocks,
+			     nfs_dq_get_reserved_blocks(inode));
+	mutex_unlock(&NFS_I(inode)->quota_sync);
+	nfs_dq_remove_from_prealloc_list(inode);
+	dprintk("NFS: DQ drop inode (ino: %ld)\n", inode->i_ino);
+	return 1;
+}
+
+/* Added only to hook vfs_dq_free_inode. --ANK */
+void nfs_dq_delete_inode(struct inode * inode)
+{
+	if (!nfs_dq_drop_inode(inode))
+		return;
+
+	dprintk("NFS: DQ delete inode (ino: %ld)\n", inode->i_ino);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
+	inode->i_flags |= S_NOQUOTA;
+}
+
+static qsize_t *nfs_get_reserved_space(struct inode *inode)
+{
+	return &NFS_I(inode)->i_reserved_quota;
+}
+
+static const struct dquot_operations nfs_dquot_operations = {
+	.reserve_space		= dquot_reserve_space,
+	.get_reserved_space	= nfs_get_reserved_space,
+	.drop			= nfs_dq_drop_inode,
+};
+
+inline void nfs_dq_init_sb(struct super_block *sb)
+{
+	sb->dq_op = &nfs_dquot_operations;
+}
+
+inline void nfs_dq_init_nfs_inode(struct nfs_inode *nfsi)
+{
+	nfsi->i_reserved_quota = 0;
+	INIT_LIST_HEAD(&nfsi->prealloc);
+	mutex_init(&nfsi->quota_sync);
+}
+
+/*
+ * Calculate the number of pages used.
+ */
+static inline blkcnt_t nfs_calc_page_size(u64 tsize)
+{
+	blkcnt_t used = (tsize + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+	return (used > ULONG_MAX) ? ULONG_MAX : used;
+}
+
+static int nfs_dq_get_new_blocks(struct inode *inode, loff_t pos, size_t size)
+{
+	int new_pages;
+
+	/*
+	 * Quota is preallocated in page-size chunks. We always preallocate
+	 * at least one chunk; if the write request crosses page boundaries,
+	 * we preallocate correspondingly more pages.
+	 */
+
+	/*
+	 * The situation here is complicated: we know nothing about the
+	 * current file layout. The file may be sparse, in which case we
+	 * cannot tell whether quota was already allocated for the blocks
+	 * being written. We therefore have to preallocate and claim quota
+	 * for every write operation; surplus preallocated quota is freed
+	 * later, after inode revalidation. This guarantees that we never
+	 * cross the quota limit, but on the other hand many small write
+	 * requests can drive the preallocation to the limit very quickly.
+	 *
+	 * TODO: keep a per-NFS-inode RB tree of blocks that already carry a
+	 * quota charge. That would avoid excess preallocation; the tree can
+	 * be shrunk to its root while the inode's blocks are synced.
+	 */
+	new_pages = nfs_calc_page_size(pos + size) - nfs_calc_page_size(pos);
+	if (!new_pages)
+		new_pages = 1;
+	return nfs_calc_block_size(inode, new_pages << PAGE_SHIFT);
+}
+
+long nfs_dq_prealloc_space(struct inode *inode, loff_t pos, size_t size)
+{
+	blkcnt_t new_blocks;
+
+	if (!sb_any_quota_active(inode->i_sb))
+		return 0;
+
+	new_blocks = nfs_dq_get_new_blocks(inode, pos, size);
+	if (new_blocks == 0)
+		return 0;
+
+	dprintk("NFS: DQ prealloc %ld blocks (ino: %ld)\n", new_blocks,
+							inode->i_ino);
+
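+	/*
+	 * If the reservation fails, try to reclaim surplus reservations from
+	 * other inodes on this server (by revalidating the "fattest" one)
+	 * and retry; give up once no reserved space is left to reclaim.
+	 */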
+	while (vfs_dq_reserve_block(inode, new_blocks)) {
+		if (nfs_dq_try_to_release_quota(inode) < 0)
+			return -EDQUOT;
+	}
+
+	nfs_dq_add_to_prealloc_list(inode);
+
+	return new_blocks;
+}
+
+void nfs_dq_release_preallocated_blocks(struct inode *inode, blkcnt_t blocks)
+{
+	if (!sb_any_quota_active(inode->i_sb))
+		return;
+
+	if (blocks == 0)
+		return;
+
+	dprintk("NFS: DQ release %ld reservation blocks (ino: %ld)\n",
+		       			blocks, inode->i_ino);
+	vfs_dq_release_reservation_block(inode, blocks);
+}
+
+static void nfs_dq_claim_preallocated_blocks(struct inode *inode, long new_blocks)
+{
+	if (new_blocks == 0)
+		return;
+
+	dprintk("NFS: DQ claim %ld reserved blocks (ino: %ld)\n",
+					new_blocks, inode->i_ino);
+	if (vfs_dq_claim_block(inode, new_blocks))
+		BUG();
+}
+
+static void nfs_dq_free_blocks(struct inode *inode, blkcnt_t blocks)
+{
+	if (blocks == 0)
+		return;
+
+	dprintk("NFS: DQ free %ld blocks (ino: %ld)\n",
+			blocks, inode->i_ino);
+	vfs_dq_free_block_nodirty(inode, blocks);
+}
+
+static qsize_t nfs_inode_rsv_space(struct nfs_inode *nfs_inode)
+{
+	return inode_get_rsv_space(&nfs_inode->vfs_inode);
+}
+
+static blkcnt_t nfs_dq_get_reserved_blocks(struct inode *inode)
+{
+	qsize_t reserve;
+
+	reserve = inode_get_rsv_space(inode);
+	return nfs_calc_block_size(inode, reserve);
+}
+
+static void nfs_dq_update_grow(struct inode *inode, blkcnt_t grow,
+			       nfs_dq_sync_flags_t flag,
+			       blkcnt_t reserved_blocks)
+{
+	dprintk("NFS: DQ grow %ld blocks (ino: %ld, reserved blocks: %ld)\n",
+			grow, inode->i_ino, reserved_blocks);
+
+	if (reserved_blocks >= grow) {
+		nfs_dq_claim_preallocated_blocks(inode, grow);
+		if (flag == NFS_DQ_SYNC_PREALLOC_RELEASE)
+			nfs_dq_release_preallocated_blocks(inode, reserved_blocks - grow);
+	} else {
+		blkcnt_t blocks_to_alloc = grow - reserved_blocks;
+
+		nfs_dq_claim_preallocated_blocks(inode, reserved_blocks);
+
+		if (blocks_to_alloc) {
+			dprintk("NFS: DQ alloc %ld blocks (ino: %ld)\n",
+							blocks_to_alloc,
+							inode->i_ino);
+			vfs_dq_alloc_block_nofail(inode, blocks_to_alloc);
+		}
+	}
+}
+
+static void nfs_dq_update_shrink(struct inode *inode, blkcnt_t shrink,
+				 blkcnt_t reserved_blocks)
+{
+	if (!reserved_blocks && !shrink)
+		return;
+
+	dprintk("NFS: DQ shrink %ld blocks (ino: %ld, reserved blocks: %ld)\n",
+			shrink, inode->i_ino, reserved_blocks);
+
+	nfs_dq_release_preallocated_blocks(inode, reserved_blocks);
+	nfs_dq_free_blocks(inode, shrink);
+}
+
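+/*
+ * Reconcile the local quota charge with the block usage the server
+ * reports: claim reserved blocks that became real usage, and release or
+ * free whatever the server says is no longer occupied.
+ */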
+void nfs_dq_sync_blocks(struct inode *inode, struct nfs_fattr *fattr,
+				nfs_dq_sync_flags_t flag)
+{
+	blkcnt_t blocks, reserved_blocks;
+
+	if (!sb_any_quota_active(inode->i_sb))
+		return;
+
+	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+		return;
+
+	nfs_dq_remove_from_prealloc_list(inode);
+
+	blocks = inode->i_blocks;
+
+	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED)
+		blocks = nfs_calc_block_size(inode, fattr->du.nfs3.used);
+	if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+		blocks = fattr->du.nfs2.blocks;
+
+	mutex_lock(&NFS_I(inode)->quota_sync);
+	reserved_blocks = nfs_dq_get_reserved_blocks(inode);
+	if (blocks > inode->i_blocks)
+		nfs_dq_update_grow(inode, blocks - inode->i_blocks,
+				   flag, reserved_blocks);
+	else
+		nfs_dq_update_shrink(inode, inode->i_blocks - blocks,
+				     reserved_blocks);
+	mutex_unlock(&NFS_I(inode)->quota_sync);
+}
+
+inline void nfs_dq_init_prealloc_list(struct nfs_server *server)
+{
+	INIT_LIST_HEAD(&server->prealloc_list);
+	spin_lock_init(&server->prealloc_lock);
+}
+
+static void nfs_dq_add_to_prealloc_list(struct inode *inode)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	/*
+	 * Inodes that have preallocated fewer than "nfs_quota_reserve_barrier"
+	 * blocks are not added to the "quota fat inodes" list.
+	 */
+	if (nfs_dq_get_reserved_blocks(inode) < nfs_quota_reserve_barrier)
+		return;
+
+	spin_lock(&server->prealloc_lock);
+	if (list_empty(&nfsi->prealloc)) {
+		dprintk("NFS: DQ add inode %ld to prealloc list\n", inode->i_ino);
+		list_add(&nfsi->prealloc, &server->prealloc_list);
+	}
+	spin_unlock(&server->prealloc_lock);
+}
+
+static void nfs_dq_remove_from_prealloc_list(struct inode *inode)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (!list_empty(&nfsi->prealloc)) {
+		dprintk("NFS: DQ remove inode %ld from prealloc list\n", inode->i_ino);
+		spin_lock(&server->prealloc_lock);
+		list_del_init(&nfsi->prealloc);
+		spin_unlock(&server->prealloc_lock);
+	}
+}
+
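+/*
+ * Find the inode with the most reserved space on this server and force a
+ * revalidation of it, so that its surplus preallocation is returned.
+ * Returns -EDQUOT if nothing is reserved anywhere, i.e. retrying the
+ * reservation would be pointless.
+ */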
+static int nfs_dq_try_to_release_quota(struct inode *inode)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_inode *fattest = NFS_I(inode);
+	struct nfs_inode *tmp;
+	struct inode *rev_inode;
+
+	dprintk("NFS: DQ trying to release quota (ino: %ld)\n", inode->i_ino);
+
+	spin_lock(&server->prealloc_lock);
+	list_for_each_entry(tmp, &server->prealloc_list, prealloc) {
+		if (nfs_inode_rsv_space(tmp) > nfs_inode_rsv_space(fattest))
+			fattest = tmp;
+	}
+	spin_unlock(&server->prealloc_lock);
+
+	rev_inode = &fattest->vfs_inode;
+	dprintk("NFS: DQ fattest inode: %ld (preallocated blocks: %ld)\n",
+		rev_inode->i_ino,
+		nfs_calc_block_size(rev_inode, nfs_inode_rsv_space(fattest)));
+
+	if (!nfs_inode_rsv_space(fattest))
+		return -EDQUOT;
+
+	/*
+	 * We found the inode with the largest non-zero preallocation (or, at
+	 * minimum, the current inode itself has some preallocated space).
+	 * Try to revalidate it, in the hope that refreshing its attributes
+	 * releases some of the reserved quota.
+	 */
+	dprintk("NFS: DQ trying to revalidate quota (ino: %ld)\n",
+						rev_inode->i_ino);
+	return __nfs_revalidate_inode(server, rev_inode);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/read.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/read.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/read.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/read.c	2015-01-21 12:02:46.929128011 +0300
@@ -219,7 +219,7 @@ int nfs_initiate_read(struct rpc_clnt *c
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_data = data,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(inode),
 		.flags = RPC_TASK_ASYNC | swap_flags,
 	};
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/super.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/super.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/super.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/super.c	2015-01-21 12:02:53.555952091 +0300
@@ -54,6 +54,9 @@
 #include <linux/magic.h>
 #include <linux/parser.h>
 #include <linux/kthread.h>
+#include <linux/ve_proto.h>
+#include <linux/vzcalluser.h>
+#include <linux/writeback.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -277,21 +280,23 @@ struct file_system_type nfs_fs_type = {
 	.name		= "nfs",
 	.get_sb		= nfs_get_sb,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE|
+			  FS_VIRTUALIZED,
 };
-EXPORT_SYMBOL_GPL(nfs_fs_type);
 
 struct file_system_type nfs_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs",
 	.get_sb		= nfs_xdev_get_sb,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE|
+			  FS_VIRTUALIZED,
 };
 
 static const struct super_operations nfs_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
+	.delete_inode	= nfs_delete_inode,
 	.write_inode	= nfs_write_inode,
 	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
@@ -319,12 +324,13 @@ static int nfs4_remote_referral_get_sb(s
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs4_kill_super(struct super_block *sb);
 
-static struct file_system_type nfs4_fs_type = {
+struct file_system_type nfs4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
 	.get_sb		= nfs4_get_sb,
 	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE|
+			  FS_VIRTUALIZED,
 };
 
 static struct file_system_type nfs4_remote_fs_type = {
@@ -332,7 +338,8 @@ static struct file_system_type nfs4_remo
 	.name		= "nfs4",
 	.get_sb		= nfs4_remote_get_sb,
 	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE|
+			  FS_VIRTUALIZED,
 };
 
 struct file_system_type nfs4_xdev_fs_type = {
@@ -340,7 +347,8 @@ struct file_system_type nfs4_xdev_fs_typ
 	.name		= "nfs4",
 	.get_sb		= nfs4_xdev_get_sb,
 	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE|
+			  FS_VIRTUALIZED,
 };
 
 static struct file_system_type nfs4_remote_referral_fs_type = {
@@ -348,7 +356,8 @@ static struct file_system_type nfs4_remo
 	.name		= "nfs4",
 	.get_sb		= nfs4_remote_referral_get_sb,
 	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE|
+			  FS_VIRTUALIZED,
 };
 
 struct file_system_type nfs4_referral_fs_type = {
@@ -356,7 +365,8 @@ struct file_system_type nfs4_referral_fs
 	.name		= "nfs4",
 	.get_sb		= nfs4_referral_get_sb,
 	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA|FS_WEAK_REVALIDATE|
+			  FS_VIRTUALIZED,
 };
 
 static const struct super_operations nfs4_sops = {
@@ -398,6 +408,7 @@ int __init register_nfs_fs(void)
 		goto error_2;
 #endif
 	register_shrinker(&acl_shrinker);
+	ve_register_nfs_hooks();
 	return 0;
 
 #ifdef CONFIG_NFS_V4
@@ -415,6 +426,7 @@ error_0:
  */
 void __exit unregister_nfs_fs(void)
 {
+	ve_unregister_nfs_hooks();
 	unregister_shrinker(&acl_shrinker);
 #ifdef CONFIG_NFS_V4
 	unregister_filesystem(&nfs4_fs_type);
@@ -645,7 +657,9 @@ static void nfs_show_mountd_options(stru
 
 	if (nfss->mountd_version || showdefaults)
 		seq_printf(m, ",mountvers=%u", nfss->mountd_version);
-	if (nfss->mountd_port || showdefaults)
+	if ((nfss->mountd_port &&
+		nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
+		showdefaults)
 		seq_printf(m, ",mountport=%u", nfss->mountd_port);
 
 	nfs_show_mountd_netid(m, nfss, showdefaults);
@@ -1864,6 +1878,98 @@ static int nfs_parse_devname(const char 
 					 export_path, maxpathlen);
 }
 
+int nfs_enable_v4_in_ct = 0;
+EXPORT_SYMBOL(nfs_enable_v4_in_ct);
+
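+/*
+ * Rebuild nfs_parsed_mount_data from a binary nfs_mount_data_dump,
+ * presumably produced when a container's mount was checkpointed, so that
+ * a migrated container re-mounts with exactly the parameters it had
+ * before; NFS_MOUNT_RESTORE marks the mount as restored.
+ */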
+static int nfs_absorb_migrated_mount_data(void *options_dump,
+					  struct nfs_parsed_mount_data *args,
+					  struct nfs_fh *mntfh)
+{
+	struct nfs_mount_data_dump *dump = options_dump;
+
+	memset(args, 0, sizeof(*args));
+
+	args->version = dump->mount_server.version;
+	args->minorversion = dump->minorversion;
+
+	args->flags     = dump->flags | NFS_MOUNT_RESTORE;
+
+	args->rsize     = dump->rsize;
+	args->wsize     = dump->wsize;
+	args->timeo     = dump->timeo;
+	args->retrans   = dump->retrans;
+	args->acregmin  = dump->acregmin;
+	args->acregmax  = dump->acregmax;
+	args->acdirmin  = dump->acdirmin;
+	args->acdirmax  = dump->acdirmax;
+	args->namlen	= dump->namlen;
+	args->options   = dump->options;
+	args->bsize     = dump->bsize;
+	args->auth_flavor_len = 1;
+	args->auth_flavors[0] = dump->auth_flavors;
+
+	args->mount_server.addrlen = dump->mount_server.addrlen;
+	memcpy(&args->mount_server.address, &dump->mount_server.address,
+			args->mount_server.addrlen);
+
+	args->mount_server.version = dump->mount_server.version;
+	args->mount_server.port = dump->mount_server.port;
+	args->mount_server.protocol = dump->mount_server.protocol;
+
+	args->client_address = kstrdup(dump->client_address, GFP_KERNEL);
+	if (!args->client_address) {
+		printk(KERN_ERR "%s: client_address duplication failed\n", __func__);
+		return -ENOMEM;
+	}
+
+#ifdef CONFIG_NFS_FSCACHE
+	if (strlen(dump->fscache_uniq)) {
+		args->fscache_uniq = kstrdup(dump->fscache_uniq, GFP_KERNEL);
+		if (!args->fscache_uniq) {
+			printk(KERN_ERR "%s: fscache_uniq duplication failed\n", __func__);
+			goto err_fscache;
+		}
+	}
+#endif
+
+	args->nfs_server.addrlen = dump->nfs_server.addrlen;
+	memcpy(&args->nfs_server.address, &dump->nfs_server.address,
+			args->nfs_server.addrlen);
+	args->nfs_server.hostname = kstrdup(dump->nfs_server.hostname, GFP_KERNEL);
+	if (!args->nfs_server.hostname) {
+		printk(KERN_ERR "%s: nfs_server.hostname duplication failed\n", __func__);
+		goto err_hostname;
+	}
+
+	args->nfs_server.export_path = kstrdup(dump->nfs_server.export_path, GFP_KERNEL);
+	if (!args->nfs_server.export_path) {
+		printk(KERN_ERR "%s: nfs_server.export_path duplication failed\n", __func__);
+		goto err_export;
+	}
+
+	args->nfs_server.port = dump->nfs_server.port;
+	args->nfs_server.protocol = dump->nfs_server.protocol;
+
+	if (mntfh) {
+		mntfh->size = dump->root.size;
+		memcpy(mntfh->data, dump->root.data, mntfh->size);
+		if (mntfh->size < sizeof(mntfh->data))
+			memset(mntfh->data + mntfh->size, 0,
+			       sizeof(mntfh->data) - mntfh->size);
+	}
+	return 0;
+
+err_export:
+	kfree(args->nfs_server.hostname);
+err_hostname:
+#ifdef CONFIG_NFS_FSCACHE
+	kfree(args->fscache_uniq);
+#endif
+err_fscache:
+	kfree(args->client_address);
+	return -ENOMEM;
+}
+
 /*
  * Validate the NFS2/NFS3 mount data
  * - fills in the mount root filehandle
@@ -1987,6 +2093,8 @@ static int nfs_validate_mount_data(void 
 		}
 
 		break;
+	case NFS_MOUNT_MIGRATED:
+		return nfs_absorb_migrated_mount_data(options, args, mntfh);
 	default: {
 		int status;
 
@@ -1996,13 +2104,16 @@ static int nfs_validate_mount_data(void 
 		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
-		if (args->version == 4)
+		if (args->version == 4) {
+			if (!nfs_enable_v4_in_ct && !ve_is_super(get_exec_env()))
+				goto out_v4_not_compiled;
 #ifdef CONFIG_NFS_V4
 			return nfs4_validate_text_mount_data(options,
 							     args, dev_name);
 #else
 			goto out_v4_not_compiled;
 #endif
+		}
 
 		nfs_set_port(sap, &args->nfs_server.port, 0);
 
@@ -2031,6 +2142,11 @@ static int nfs_validate_mount_data(void 
 		goto out_v3_not_compiled;
 #endif /* !CONFIG_NFS_V3 */
 
+	if (!(args->flags & NFS_MOUNT_VER3)) {
+		printk("NFSv2 is broken and not supported\n");
+		return -EPROTONOSUPPORT;
+	}
+
 	return 0;
 
 out_no_data:
@@ -2052,11 +2168,9 @@ out_v3_not_compiled:
 	return -EPROTONOSUPPORT;
 #endif /* !CONFIG_NFS_V3 */
 
-#ifndef CONFIG_NFS_V4
 out_v4_not_compiled:
 	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
 	return -EPROTONOSUPPORT;
-#endif /* !CONFIG_NFS_V4 */
 
 out_nomem:
 	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
@@ -2169,6 +2283,8 @@ static inline void nfs_initialise_sb(str
 {
 	struct nfs_server *server = NFS_SB(sb);
 
+	nfs_dq_init_sb(sb);
+
 	sb->s_magic = NFS_SUPER_MAGIC;
 
 	/* We probably want something more informative here */
@@ -2326,6 +2442,10 @@ static int nfs_compare_super(struct supe
 	struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb);
 	int mntflags = sb_mntdata->mntflags;
 
+	if (!ve_accessible_strict(old->client->cl_xprt->owner_env,
+				  get_exec_env()))
+		return 0;
+
 	if (!nfs_compare_super_address(old, server))
 		return 0;
 	/* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
@@ -2354,6 +2474,11 @@ static int nfs_get_sb(struct file_system
 		.mntflags = flags,
 	};
 	int error = -ENOMEM;
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (!(ve->features & VE_FEATURE_NFS))
+		return -ENODEV;
 
 	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
 	mntfh = nfs_alloc_fhandle();
@@ -2700,6 +2825,8 @@ static int nfs4_validate_mount_data(void
 		nfs_validate_transport_protocol(args);
 
 		break;
+	case NFS_MOUNT_MIGRATED:
+		return nfs_absorb_migrated_mount_data(options, args, NULL);
 	default:
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
@@ -2742,6 +2869,11 @@ static int nfs4_remote_get_sb(struct fil
 		.mntflags = flags,
 	};
 	int error = -ENOMEM;
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (!(ve->features & VE_FEATURE_NFS))
+		return -ENODEV;
 
 	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
@@ -3012,6 +3144,11 @@ static int nfs4_get_sb(struct file_syste
 {
 	struct nfs_parsed_mount_data *data;
 	int error = -ENOMEM;
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (!(ve->features & VE_FEATURE_NFS))
+		return -ENODEV;
 
 	data = nfs_alloc_parsed_mount_data(4);
 	if (data == NULL)
@@ -3154,9 +3291,14 @@ static int nfs4_remote_referral_get_sb(s
 		.mntflags = flags,
 	};
 	int error = -ENOMEM;
+	struct ve_struct *ve;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
+	ve = get_exec_env();
+	if (!(ve->features & VE_FEATURE_NFS))
+		return -ENODEV;
+
 	mntfh = nfs_alloc_fhandle();
 	if (mntfh == NULL)
 		goto out_err_nofh;
@@ -3254,9 +3396,14 @@ static int nfs4_referral_get_sb(struct f
 	char *export_path;
 	struct vfsmount *root_mnt;
 	int error;
+	struct ve_struct *ve;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
+	ve = get_exec_env();
+	if (!(ve->features & VE_FEATURE_NFS))
+		return -ENODEV;
+
 	export_path = data->mnt_path;
 	data->mnt_path = "/";
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/sysctl.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/sysctl.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/sysctl.c	2014-12-12 23:29:14.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/sysctl.c	2015-01-21 12:02:53.563951877 +0300
@@ -14,6 +14,7 @@
 #include <linux/nfs_fs.h>
 
 #include "callback.h"
+#include "internal.h"
 
 #ifdef CONFIG_NFS_V4
 static const int nfs_set_port_min = 0;
@@ -60,6 +61,24 @@ static ctl_table nfs_cb_sysctls[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nfs4_ct_enable",
+		.data		= &nfs_enable_v4_in_ct,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#ifdef CONFIG_NFS_QUOTA
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "quota_reserve_blocks_thresh",
+		.data		= &nfs_quota_reserve_barrier,
+		/* blkcnt_t is unsigned long here, so plain proc_dointvec
+		 * would update only part of the value */
+		.maxlen		= sizeof(nfs_quota_reserve_barrier),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/unlink.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/unlink.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/unlink.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/unlink.c	2015-01-21 12:02:49.435061484 +0300
@@ -20,15 +20,6 @@
 #include "iostat.h"
 #include "delegation.h"
 
-struct nfs_unlinkdata {
-	struct hlist_node list;
-	struct nfs_removeargs args;
-	struct nfs_removeres res;
-	struct inode *dir;
-	struct rpc_cred	*cred;
-	struct nfs_fattr dir_attr;
-};
-
 /**
  * nfs_free_unlinkdata - release data from a sillydelete operation.
  * @data: pointer to unlink structure.
@@ -139,7 +130,7 @@ static int nfs_do_call_unlink(struct den
 		.rpc_message = &msg,
 		.callback_ops = &nfs_unlink_ops,
 		.callback_data = data,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(dir),
 		.flags = RPC_TASK_ASYNC,
 	};
 	struct rpc_task *task;
@@ -424,7 +415,7 @@ nfs_async_rename(struct inode *old_dir, 
 	struct rpc_task_setup task_setup_data = {
 		.rpc_message = &msg,
 		.callback_ops = &nfs_rename_ops,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(old_dir),
 		.rpc_client = NFS_CLIENT(old_dir),
 		.flags = RPC_TASK_ASYNC,
 	};
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/ve.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/ve.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/ve.c	2015-01-21 12:02:47.686107915 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/ve.c	2015-01-21 12:02:47.686107915 +0300
@@ -0,0 +1,227 @@
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/vzcalluser.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+
+#include "internal.h"
+
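+/*
+ * Per-VE init hook: allocate the VE's private NFS state (nfsiod
+ * workqueue plus NFSv4 callback data) for containers with the NFS
+ * feature enabled, and pin the module while that state exists.
+ */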
+static int ve_nfs_init(void *data)
+{
+	int err;
+	struct ve_nfs_data *nfs_data;
+	struct ve_struct *ve = (struct ve_struct *) data;
+
+	if (!(ve->features & VE_FEATURE_NFS))
+		return 0;
+
+	nfs_data = kzalloc(sizeof(struct ve_nfs_data), GFP_KERNEL);
+	if (nfs_data == NULL)
+		return -ENOMEM;
+	ve_nfs_data_init(nfs_data);
+	err = nfsiod_start();
+	if (err)
+		goto err_nfsiod;
+	__module_get(THIS_MODULE);
+	return 0;
+
+err_nfsiod:
+	kfree(ve->nfs_data);
+	return err;
+}
+
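+/*
+ * Drop a reference on the VE's NFS state; the final put stops the VE's
+ * nfsiod workqueue and unpins the module. Runs with the exec environment
+ * switched to the target VE so the teardown happens in its context.
+ */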
+void ve_nfs_data_put(struct ve_struct *ve)
+{
+	struct ve_struct *curr_ve;
+
+	curr_ve = set_exec_env(ve);
+	if (atomic_dec_and_test(&ve->nfs_data->_users)) {
+		nfsiod_stop();
+		kfree(ve->nfs_data);
+		ve->nfs_data = NULL;
+		module_put(THIS_MODULE);
+	}
+	(void)set_exec_env(curr_ve);
+}
+
+static void ve_nfs_fini(void *data)
+{
+	struct ve_struct *ve = data;
+
+	if (ve->nfs_data == NULL)
+		return;
+
+	umount_ve_fs_type(&nfs_fs_type, ve->veid);
+	umount_ve_fs_type(&nfs4_fs_type, ve->veid);
+
+	ve_nfs_data_put(ve);
+	if (ve->nfs_data)
+		printk(KERN_WARNING "CT%d: NFS mounts used outside CT. Release "
+				"all external references to CT's NFS mounts to "
+				"continue shutdown.\n", ve->veid);
+}
+
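+/*
+ * Report whether @mnt is on the NFS automount expiry list; the list is
+ * walked under vfsmount_lock since submounts can expire concurrently.
+ */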
+inline int is_nfs_automount(struct vfsmount *mnt)
+{
+	struct vfsmount *submnt;
+
+	spin_lock(&vfsmount_lock);
+	list_for_each_entry(submnt, &nfs_automount_list, mnt_expire) {
+		if (mnt == submnt) {
+			spin_unlock(&vfsmount_lock);
+			return 1;
+		}
+	}
+	spin_unlock(&vfsmount_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(is_nfs_automount);
+
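+/*
+ * Sync every writable NFS superblock belonging to the given VE. Each
+ * superblock is pinned individually so sb_lock can be dropped around
+ * __sync_filesystem(); the rescan label restarts the walk whenever the
+ * list may have changed while the lock was released.
+ */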
+static int ve_nfs_sync_fs(struct file_system_type *fs, struct ve_struct *env, int wait)
+{
+	struct super_block *sb;
+	int ret = 0;
+
+	spin_lock(&sb_lock);
+rescan:
+	list_for_each_entry(sb, &fs->fs_supers, s_instances) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+
+		down_read(&sb->s_umount);
+		if (sb->s_root && !(sb->s_flags & MS_RDONLY)) {
+			struct rpc_clnt *clnt = NFS_SB(sb)->client;
+			struct ve_struct *owner_env = clnt->cl_xprt->owner_env;
+			if (ve_accessible_strict(owner_env, env)) {
+				ret = __sync_filesystem(sb, NULL, wait);
+				if (ret < 0) {
+					up_read(&sb->s_umount);
+					put_super(sb);
+					return ret;
+				}
+			}
+		}
+		up_read(&sb->s_umount);
+
+		spin_lock(&sb_lock);
+
+		/* This logic is taken from sync_inodes()  */
+		if (__put_super_and_need_restart(sb))
+			goto rescan;
+	}
+
+	spin_unlock(&sb_lock);
+	return ret;
+}
+
+int ve_nfs_sync(struct ve_struct *env, int wait)
+{
+	int ret;
+
+	ret = ve_nfs_sync_fs(&nfs_fs_type, env, wait);
+	if (!ret)
+		ret = ve_nfs_sync_fs(&nfs4_fs_type, env, wait);
+	return ret;
+}
+EXPORT_SYMBOL(ve_nfs_sync);
+
+static void ve_nfs_umount_begin(struct ve_struct *ve, struct file_system_type *nfs)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &nfs->fs_supers, s_instances)
+		if (ve_accessible_strict(NFS_SB(sb)->nfs_client->owner_env, ve))
+			sb->s_op->umount_begin(sb);
+	spin_unlock(&sb_lock);
+}
+
+static void ve_nfs_stop(void *data)
+{
+	struct ve_struct *ve = data;
+
+	if (ve->nfs_data == NULL)
+		return;
+
+	ve_nfs_umount_begin(ve, &nfs_fs_type);
+	ve_nfs_umount_begin(ve, &nfs4_fs_type);
+}
+
+static struct ve_hook nfs_ss_hook = {
+	.init	  = ve_nfs_init,
+	.fini	  = ve_nfs_fini,
+	.owner	  = THIS_MODULE,
+	.priority = HOOK_PRIO_NET_POST,
+};
+
+static struct ve_hook nfs_hook = {
+	.fini	  = ve_nfs_stop,
+	.owner	  = THIS_MODULE,
+	.priority = HOOK_PRIO_NET_POST,
+};
+
+void ve_register_nfs_hooks(void)
+{
+	ve_hook_register(VE_SS_CHAIN, &nfs_ss_hook);
+	ve_hook_register(VE_INIT_EXIT_CHAIN, &nfs_hook);
+}
+
+void ve_unregister_nfs_hooks(void)
+{
+	ve_hook_unregister(&nfs_hook);
+	ve_hook_unregister(&nfs_ss_hook);
+}
+
+static void nfs_client_update_params(struct nfs_client *nfs_client,
+				     const struct rpc_timeout *timeparams)
+{
+	struct rpc_clnt *clnt = nfs_client->cl_rpcclient;
+
+	spin_lock_bh(&clnt->cl_xprt->transport_lock);
+	clnt->cl_timeout_default = *timeparams;
+	spin_unlock_bh(&clnt->cl_xprt->transport_lock);
+}
+
+static void nfs_update_one_server(struct nfs_server *nfs_server,
+				  const struct rpc_timeout *timeparams)
+{
+	struct rpc_clnt *clnt = nfs_server->client;
+
+	nfs_server->flags &= ~NFS_MOUNT_RESTORE;
+	if (!(nfs_server->flags & NFS_MOUNT_SOFT))
+		clnt->cl_softrtry = 0;
+
+	spin_lock_bh(&clnt->cl_xprt->transport_lock);
+	clnt->cl_timeout_default = *timeparams;
+	rpc_init_rtt(&clnt->cl_rtt_default, timeparams->to_initval);
+	spin_unlock_bh(&clnt->cl_xprt->transport_lock);
+}
+
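+/*
+ * Re-apply RPC timeout parameters; judging by the NFS_MOUNT_RESTORE
+ * handling above, this is used on container restore. Rebuild the
+ * rpc_timeout from timeo/retrans and push it to the nfs_client and to
+ * every superblock that shares it.
+ */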
+void nfs_change_server_params(void *data, int timeo, int retrans)
+{
+	struct nfs_server *nfs_server = data;
+	struct nfs_client *nfs_client = nfs_server->nfs_client;
+	int proto = (nfs_server->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP
+							: IPPROTO_UDP;
+	struct rpc_timeout timeparams;
+
+	nfs_init_timeout_values(&timeparams, proto, timeo, retrans);
+
+	spin_lock(&nfs_client_lock);
+	nfs_client_update_params(nfs_server->nfs_client, &timeparams);
+	list_for_each_entry(nfs_server, &nfs_client->cl_superblocks, client_link) {
+		nfs_update_one_server(nfs_server, &timeparams);
+	}
+	spin_unlock(&nfs_client_lock);
+}
+EXPORT_SYMBOL(nfs_change_server_params);
+
+void ve0_nfs_data_init(void)
+{
+	static struct ve_nfs_data ve0_nfs_data;
+
+	ve_nfs_data_init(&ve0_nfs_data);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/ve.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/ve.h
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/ve.h	2015-01-21 12:02:47.686107915 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/ve.h	2015-01-21 12:02:47.686107915 +0300
@@ -0,0 +1,94 @@
+/*
+ * fs/nfs/ve.h
+ *
+ * VE context for NFS
+ *
+ * Copyright (C) 2007 SWsoft
+ */
+
+#ifndef __VE_NFS_H__
+#define __VE_NFS_H__
+
+#ifdef CONFIG_NFS_V4
+#include <linux/nfs4.h>
+
+#define nfs_callback_tcpport	NFS_CTX_FIELD(nfs_callback_tcpport)
+#define nfs_callback_tcpport6	NFS_CTX_FIELD(nfs_callback_tcpport6)
+
+struct nfs_callback_data {
+	unsigned int users;
+	struct svc_serv *serv;
+	struct svc_rqst *rqst;
+	struct task_struct *task;
+};
+#endif
+
+struct ve_nfs_data {
+	struct workqueue_struct *_nfsiod_workqueue;
+	atomic_t		_users;
+#ifdef CONFIG_NFS_V4
+	struct nfs_callback_data _nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
+	struct mutex		_nfs_callback_mutex;
+
+	unsigned short		_nfs_callback_tcpport;
+	unsigned short		_nfs_callback_tcpport6;
+#endif
+};
+
+#ifdef CONFIG_VE
+
+#include <linux/ve.h>
+
+#define NFS_CTX_FIELD(arg)	(get_exec_env()->nfs_data->_##arg)
+
+static inline void ve_nfs_data_init(struct ve_nfs_data *data)
+{
+	atomic_set(&data->_users, 1);
+#ifdef CONFIG_NFS_V4
+	mutex_init(&data->_nfs_callback_mutex);
+#endif
+	get_exec_env()->nfs_data = data;
+}
+
+static inline void ve_nfs_data_get(void)
+{
+	atomic_inc(&get_exec_env()->nfs_data->_users);
+}
+
+extern void ve_nfs_data_put(struct ve_struct *ve);
+extern void ve0_nfs_data_init(void);
+extern void ve_register_nfs_hooks(void);
+extern void ve_unregister_nfs_hooks(void);
+
+static inline struct workqueue_struct *inode_nfsiod_wq(struct inode *inode)
+{
+	return NFS_SERVER(inode)->nfs_client->owner_env->nfs_data->_nfsiod_workqueue;
+}
+
+#else /* CONFIG_VE */
+
+#define NFS_CTX_FIELD(arg)	_##arg
+
+static inline void ve_nfs_data_init(struct ve_nfs_data *data)
+{}
+static inline void ve_nfs_data_get(void)
+{}
+static inline void ve_nfs_data_put(struct ve_struct *ve)
+{}
+static inline void ve0_nfs_data_init(void)
+{}
+static inline void ve_register_nfs_hooks(void)
+{}
+static inline void ve_unregister_nfs_hooks(void)
+{}
+
+extern struct workqueue_struct *nfsiod_workqueue;
+#define inode_nfsiod_wq(inode)	nfsiod_workqueue
+
+#endif /* CONFIG_VE */
+
+#define nfsiod_workqueue	NFS_CTX_FIELD(nfsiod_workqueue)
+#define nfs_callback_info	NFS_CTX_FIELD(nfs_callback_info)
+#define nfs_callback_mutex	NFS_CTX_FIELD(nfs_callback_mutex)
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfs/write.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/write.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfs/write.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfs/write.c	2015-01-21 12:02:46.931127957 +0300
@@ -992,11 +992,14 @@ int nfs_initiate_write(struct rpc_clnt *
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_data = data,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(inode),
 		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
 	int ret = 0;
+	struct ve_struct *ve;
+
+	ve = set_exec_env(NFS_SERVER(inode)->nfs_client->owner_env);
 
 	/* Set up the initial task struct.  */
 	NFS_PROTO(inode)->write_setup(data, &msg);
@@ -1021,6 +1024,7 @@ int nfs_initiate_write(struct rpc_clnt *
 	}
 	rpc_put_task(task);
 out:
+	(void)set_exec_env(ve);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nfs_initiate_write);
@@ -1464,21 +1468,27 @@ int nfs_initiate_commit(struct rpc_clnt 
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_data = data,
-		.workqueue = nfsiod_workqueue,
+		.workqueue = inode_nfsiod_wq(data->inode),
 		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
+	struct ve_struct *ve;
+
+	ve = set_exec_env(NFS_SERVER(data->inode)->nfs_client->owner_env);
 	/* Set up the initial task struct.  */
 	NFS_PROTO(data->inode)->commit_setup(data, &msg);
 
 	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
 	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
+	if (IS_ERR(task)) {
+		(void)set_exec_env(ve);
 		return PTR_ERR(task);
+	}
 	if (how & FLUSH_SYNC)
 		rpc_wait_for_completion_task(task);
 	rpc_put_task(task);
+	(void)set_exec_env(ve);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_initiate_commit);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/export.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/export.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/export.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/export.c	2015-01-21 12:02:53.128963425 +0300
@@ -15,6 +15,7 @@
 #include <linux/namei.h>
 #include <linux/module.h>
 #include <linux/exportfs.h>
+#include <linux/quotaops.h>
 
 #include <linux/nfsd/syscall.h>
 #include <net/ipv6.h>
@@ -39,7 +40,6 @@ typedef struct svc_export	svc_export;
 #define	EXPKEY_HASHBITS		8
 #define	EXPKEY_HASHMAX		(1 << EXPKEY_HASHBITS)
 #define	EXPKEY_HASHMASK		(EXPKEY_HASHMAX -1)
-static struct cache_head *expkey_table[EXPKEY_HASHMAX];
 
 static void expkey_put(struct kref *ref)
 {
@@ -74,7 +74,10 @@ static int expkey_upcall(struct cache_de
 
 static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old);
 static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *);
-static struct cache_detail svc_expkey_cache;
+#ifndef CONFIG_VE
+static struct cache_detail *_svc_expkey_cache;
+static struct cache_detail *_svc_export_cache;
+#endif
 
 static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 {
@@ -165,7 +168,7 @@ static int expkey_parse(struct cache_det
 	cache_flush();
  out:
 	if (ek)
-		cache_put(&ek->h, &svc_expkey_cache);
+		cache_put(&ek->h, svc_expkey_cache);
 	if (dom)
 		auth_domain_put(dom);
 	kfree(buf);
@@ -243,10 +246,9 @@ static struct cache_head *expkey_alloc(v
 		return NULL;
 }
 
-static struct cache_detail svc_expkey_cache = {
+static struct cache_detail __svc_expkey_cache = {
 	.owner		= THIS_MODULE,
 	.hash_size	= EXPKEY_HASHMAX,
-	.hash_table	= expkey_table,
 	.name		= "nfsd.fh",
 	.cache_put	= expkey_put,
 	.cache_upcall	= expkey_upcall,
@@ -277,7 +279,7 @@ svc_expkey_lookup(struct svc_expkey *ite
 	struct cache_head *ch;
 	int hash = svc_expkey_hash(item);
 
-	ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h,
+	ch = sunrpc_cache_lookup(svc_expkey_cache, &item->h,
 				 hash);
 	if (ch)
 		return container_of(ch, struct svc_expkey, h);
@@ -291,7 +293,7 @@ svc_expkey_update(struct svc_expkey *new
 	struct cache_head *ch;
 	int hash = svc_expkey_hash(new);
 
-	ch = sunrpc_cache_update(&svc_expkey_cache, &new->h,
+	ch = sunrpc_cache_update(svc_expkey_cache, &new->h,
 				 &old->h, hash);
 	if (ch)
 		return container_of(ch, struct svc_expkey, h);
@@ -304,8 +306,6 @@ svc_expkey_update(struct svc_expkey *new
 #define	EXPORT_HASHMAX		(1<< EXPORT_HASHBITS)
 #define	EXPORT_HASHMASK		(EXPORT_HASHMAX -1)
 
-static struct cache_head *export_table[EXPORT_HASHMAX];
-
 static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
 {
 	int i;
@@ -384,7 +384,8 @@ static int check_export(struct inode *in
 	 * 2:  We must be able to find an inode from a filehandle.
 	 *       This means that s_export_op must be set.
 	 */
-	if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
+	if (!(inode->i_sb->s_type->fs_flags &
+			(FS_REQUIRES_DEV | FS_NFS_EXPORTABLE)) &&
 	    !(*flags & NFSEXP_FSID) &&
 	    uuid == NULL) {
 		dprintk("exp_export: export of non-dev fs without fsid\n");
@@ -534,6 +535,8 @@ static int svc_export_parse(struct cache
 	if (err)
 		goto out1;
 
+	vfs_dq_init(exp.ex_path.dentry->d_inode);
+
 	exp.ex_client = dom;
 
 	err = -ENOMEM;
@@ -729,10 +732,9 @@ static struct cache_head *svc_export_all
 		return NULL;
 }
 
-struct cache_detail svc_export_cache = {
+struct cache_detail __svc_export_cache = {
 	.owner		= THIS_MODULE,
 	.hash_size	= EXPORT_HASHMAX,
-	.hash_table	= export_table,
 	.name		= "nfsd.export",
 	.cache_put	= svc_export_put,
 	.cache_upcall	= svc_export_upcall,
@@ -744,6 +746,21 @@ struct cache_detail svc_export_cache = {
 	.alloc		= svc_export_alloc,
 };
 
+void exp_put(struct svc_export *exp)
+{
+	cache_put(&exp->h, svc_export_cache);
+}
+
+dev_t exp_get_dev(struct svc_export *ex)
+{
+	/*
+	 * Return the device number as stat() reports it inside the
+	 * container, i.e. the superblock's s_dev.
+	 */
+
+	return ex->ex_path.mnt->mnt_sb->s_dev;
+}
+
 static int
 svc_export_hash(struct svc_export *exp)
 {
@@ -761,7 +778,7 @@ svc_export_lookup(struct svc_export *exp
 	struct cache_head *ch;
 	int hash = svc_export_hash(exp);
 
-	ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h,
+	ch = sunrpc_cache_lookup(svc_export_cache, &exp->h,
 				 hash);
 	if (ch)
 		return container_of(ch, struct svc_export, h);
@@ -775,7 +792,7 @@ svc_export_update(struct svc_export *new
 	struct cache_head *ch;
 	int hash = svc_export_hash(old);
 
-	ch = sunrpc_cache_update(&svc_export_cache, &new->h,
+	ch = sunrpc_cache_update(svc_export_cache, &new->h,
 				 &old->h,
 				 hash);
 	if (ch)
@@ -801,7 +818,7 @@ exp_find_key(svc_client *clp, int fsid_t
 	ek = svc_expkey_lookup(&key);
 	if (ek == NULL)
 		return ERR_PTR(-ENOMEM);
-	err = cache_check(&svc_expkey_cache, &ek->h, reqp);
+	err = cache_check(svc_expkey_cache, &ek->h, reqp);
 	if (err)
 		return ERR_PTR(err);
 	return ek;
@@ -824,7 +841,7 @@ static int exp_set_key(svc_client *clp, 
 	if (ek)
 		ek = svc_expkey_update(&key,ek);
 	if (ek) {
-		cache_put(&ek->h, &svc_expkey_cache);
+		cache_put(&ek->h, svc_expkey_cache);
 		return 0;
 	}
 	return -ENOMEM;
@@ -875,7 +892,7 @@ static svc_export *exp_get_by_name(svc_c
 	exp = svc_export_lookup(&key);
 	if (exp == NULL)
 		return ERR_PTR(-ENOMEM);
-	err = cache_check(&svc_export_cache, &exp->h, reqp);
+	err = cache_check(svc_export_cache, &exp->h, reqp);
 	if (err)
 		return ERR_PTR(err);
 	return exp;
@@ -956,8 +973,8 @@ static void exp_fsid_unhash(struct svc_e
 
 	ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
 	if (!IS_ERR(ek)) {
-		sunrpc_invalidate(&ek->h, &svc_expkey_cache);
-		cache_put(&ek->h, &svc_expkey_cache);
+		sunrpc_invalidate(&ek->h, svc_expkey_cache);
+		cache_put(&ek->h, svc_expkey_cache);
 	}
 }
 
@@ -976,8 +993,9 @@ static int exp_hash(struct auth_domain *
 {
 	u32 fsid[2];
 	struct inode *inode = exp->ex_path.dentry->d_inode;
-	dev_t dev = inode->i_sb->s_dev;
+	dev_t dev;
 
+	dev = exp_get_dev(exp);
 	if (old_valid_dev(dev)) {
 		mk_fsid(FSID_DEV, fsid, dev, inode->i_ino, 0, NULL);
 		return exp_set_key(clp, FSID_DEV, fsid, exp);
@@ -990,11 +1008,13 @@ static void exp_unhash(struct svc_export
 {
 	struct svc_expkey *ek;
 	struct inode *inode = exp->ex_path.dentry->d_inode;
+	dev_t ex_dev;
 
-	ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
+	ex_dev = exp_get_dev(exp);
+	ek = exp_get_key(exp->ex_client, ex_dev, inode->i_ino);
 	if (!IS_ERR(ek)) {
-		sunrpc_invalidate(&ek->h, &svc_expkey_cache);
-		cache_put(&ek->h, &svc_expkey_cache);
+		sunrpc_invalidate(&ek->h, svc_expkey_cache);
+		cache_put(&ek->h, svc_expkey_cache);
 	}
 }
 	
@@ -1068,6 +1088,8 @@ exp_export(struct nfsctl_export *nxp)
 
 	dprintk("nfsd: creating export entry %p for client %p\n", exp, clp);
 
+	vfs_dq_init(path.dentry->d_inode);
+
 	new.h.expiry_time = NEVER;
 	new.h.flags = 0;
 	new.ex_pathname = kstrdup(nxp->ex_path, GFP_KERNEL);
@@ -1099,7 +1121,7 @@ finish:
 	if (!IS_ERR_OR_NULL(exp))
 		exp_put(exp);
 	if (!IS_ERR_OR_NULL(fsid_key))
-		cache_put(&fsid_key->h, &svc_expkey_cache);
+		cache_put(&fsid_key->h, svc_expkey_cache);
 	path_put(&path);
 out_put_clp:
 	auth_domain_put(clp);
@@ -1116,7 +1138,7 @@ out:
 static void
 exp_do_unexport(svc_export *unexp)
 {
-	sunrpc_invalidate(&unexp->h, &svc_export_cache);
+	sunrpc_invalidate(&unexp->h, svc_export_cache);
 	exp_unhash(unexp);
 	exp_fsid_unhash(unexp);
 }
@@ -1226,7 +1248,7 @@ static struct svc_export *exp_find(struc
 		return ERR_CAST(ek);
 
 	exp = exp_get_by_name(clp, &ek->ek_path, reqp);
-	cache_put(&ek->h, &svc_expkey_cache);
+	cache_put(&ek->h, svc_expkey_cache);
 
 	if (IS_ERR(exp))
 		return ERR_CAST(exp);
@@ -1384,25 +1406,25 @@ static void *e_start(struct seq_file *m,
 	struct cache_head *ch;
 	
 	exp_readlock();
-	read_lock(&svc_export_cache.hash_lock);
+	read_lock(&svc_export_cache->hash_lock);
 	if (!n--)
 		return SEQ_START_TOKEN;
 	hash = n >> 32;
 	export = n & ((1LL<<32) - 1);
 
 	
-	for (ch=export_table[hash]; ch; ch=ch->next)
+	for (ch=svc_export_cache->hash_table[hash]; ch; ch=ch->next)
 		if (!export--)
 			return ch;
 	n &= ~((1LL<<32) - 1);
 	do {
 		hash++;
 		n += 1LL<<32;
-	} while(hash < EXPORT_HASHMAX && export_table[hash]==NULL);
+	} while(hash < EXPORT_HASHMAX && svc_export_cache->hash_table[hash]==NULL);
 	if (hash >= EXPORT_HASHMAX)
 		return NULL;
 	*pos = n+1;
-	return export_table[hash];
+	return svc_export_cache->hash_table[hash];
 }
 
 static void *e_next(struct seq_file *m, void *p, loff_t *pos)
@@ -1420,20 +1442,20 @@ static void *e_next(struct seq_file *m, 
 		return ch->next;
 	}
 	*pos &= ~((1LL<<32) - 1);
-	while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) {
+	while (hash < EXPORT_HASHMAX && svc_export_cache->hash_table[hash] == NULL) {
 		hash++;
 		*pos += 1LL<<32;
 	}
 	if (hash >= EXPORT_HASHMAX)
 		return NULL;
 	++*pos;
-	return export_table[hash];
+	return svc_export_cache->hash_table[hash];
 }
 
 static void e_stop(struct seq_file *m, void *p)
 	__releases(svc_export_cache.hash_lock)
 {
-	read_unlock(&svc_export_cache.hash_lock);
+	read_unlock(&svc_export_cache->hash_lock);
 	exp_readunlock();
 }
 
@@ -1557,10 +1579,10 @@ static int e_show(struct seq_file *m, vo
 	}
 
 	cache_get(&exp->h);
-	if (cache_check(&svc_export_cache, &exp->h, NULL))
+	if (cache_check(svc_export_cache, &exp->h, NULL))
 		return 0;
-	cache_put(&exp->h, &svc_export_cache);
-	return svc_export_show(m, &svc_export_cache, cp);
+	cache_put(&exp->h, svc_export_cache);
+	return svc_export_show(m, svc_export_cache, cp);
 }
 
 const struct seq_operations nfs_exports_op = {
@@ -1668,17 +1690,28 @@ exp_verify_string(char *cp, int max)
 int
 nfsd_export_init(void)
 {
-	int rv;
-	dprintk("nfsd: initializing export module.\n");
+	struct cache_detail *exp, *key;
 
-	rv = cache_register(&svc_export_cache);
-	if (rv)
-		return rv;
-	rv = cache_register(&svc_expkey_cache);
-	if (rv)
-		cache_unregister(&svc_export_cache);
-	return rv;
+	exp = cache_alloc(&__svc_export_cache, EXPORT_HASHMAX);
+	if (exp == NULL)
+		goto err_exp;
+	cache_register(exp);
+
+	key = cache_alloc(&__svc_expkey_cache, EXPKEY_HASHMAX);
+	if (key == NULL)
+		goto err_key;
 
+	cache_register(key);
+
+	svc_export_cache = exp;
+	svc_expkey_cache = key;
+
+	return 0;
+
+err_key:
+	cache_free(exp);
+err_exp:
+	return -ENOMEM;
 }
 
 /*
@@ -1688,8 +1721,8 @@ void
 nfsd_export_flush(void)
 {
 	exp_writelock();
-	cache_purge(&svc_expkey_cache);
-	cache_purge(&svc_export_cache);
+	cache_purge(svc_expkey_cache);
+	cache_purge(svc_export_cache);
 	exp_writeunlock();
 }
 
@@ -1699,15 +1732,9 @@ nfsd_export_flush(void)
 void
 nfsd_export_shutdown(void)
 {
-
-	dprintk("nfsd: shutting down export module.\n");
-
 	exp_writelock();
-
-	cache_unregister(&svc_expkey_cache);
-	cache_unregister(&svc_export_cache);
+	cache_free(svc_expkey_cache);
+	cache_free(svc_export_cache);
 	svcauth_unix_purge();
-
 	exp_writeunlock();
-	dprintk("nfsd: export shutdown complete.\n");
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfs3proc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfs3proc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfs3proc.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfs3proc.c	2015-01-21 12:02:42.464246551 +0300
@@ -571,6 +571,9 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqst
 			resp->f_properties = NFS3_FSF_BILLYBOY;
 		}
 		resp->f_maxfilesize = sb->s_maxbytes;
+		resp->f_time_gran = 0;
+		if (!strcmp(sb->s_type->name, "ext4"))
+			resp->f_time_gran = sb->s_time_gran;
 	}
 
 	fh_put(&argp->fh);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfs3xdr.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfs3xdr.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfs3xdr.c	2014-12-12 23:28:59.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfs3xdr.c	2015-01-21 12:02:46.931127957 +0300
@@ -144,7 +144,7 @@ static __be32 *encode_fsid(__be32 *p, st
 	default:
 	case FSIDSOURCE_DEV:
 		p = xdr_encode_hyper(p, (u64)huge_encode_dev
-				     (fhp->fh_dentry->d_inode->i_sb->s_dev));
+				     (exp_get_dev(fhp->fh_export)));
 		break;
 	case FSIDSOURCE_FSID:
 		p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
@@ -1056,8 +1056,13 @@ nfs3svc_encode_fsinfores(struct svc_rqst
 		*p++ = htonl(resp->f_wtmult);
 		*p++ = htonl(resp->f_dtpref);
 		p = xdr_encode_hyper(p, resp->f_maxfilesize);
-		*p++ = xdr_one;
-		*p++ = xdr_zero;
+		if (resp->f_time_gran) {
+			*p++ = xdr_zero;
+			*p++ = htonl(resp->f_time_gran);
+		} else {
+			*p++ = xdr_one;
+			*p++ = xdr_zero;
+		}
 		*p++ = htonl(resp->f_properties);
 	}
 
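The nfs3xdr.c hunk above stops hard-coding FSINFO's time_delta as one second and instead advertises the granularity the filesystem reported (for now only ext4, per the nfs3proc.c hunk): {0 s, gran ns} when f_time_gran is set, the legacy {1 s, 0 ns} otherwise. A minimal userspace sketch of that two-word XDR encoding; f_time_gran follows the field added in the xdr3.h hunk further down, everything else here is illustrative:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Encode the FSINFO time_delta pair the way the hunk above does:
 * {1s, 0ns} unless a finer granularity (in ns) was reported. */
static uint32_t *encode_time_delta(uint32_t *p, uint32_t f_time_gran)
{
	if (f_time_gran) {		/* e.g. 1 for ext4's 1ns stamps */
		*p++ = htonl(0);	/* seconds */
		*p++ = htonl(f_time_gran);
	} else {
		*p++ = htonl(1);	/* legacy 1-second granularity */
		*p++ = htonl(0);
	}
	return p;
}

int main(void)
{
	uint32_t buf[2], *end = encode_time_delta(buf, 1);
	printf("%zu words: %08x %08x\n",
	       (size_t)(end - buf), ntohl(buf[0]), ntohl(buf[1]));
	return 0;
}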
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsctl.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsctl.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsctl.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsctl.c	2015-01-21 12:02:47.671108312 +0300
@@ -14,6 +14,9 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/gss_api.h>
 
+#include <linux/ve_proto.h>
+#include <linux/vzcalluser.h>
+
 #include "nfsd.h"
 #include "cache.h"
 
@@ -451,7 +454,7 @@ static ssize_t write_getfs(struct file *
 
 	ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
 
-	clp = auth_unix_lookup(&init_net, &in6);
+	clp = auth_unix_lookup(current->nsproxy->net_ns, &in6);
 	if (!clp)
 		err = -EPERM;
 	else {
@@ -514,7 +517,7 @@ static ssize_t write_getfd(struct file *
 
 	ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
 
-	clp = auth_unix_lookup(&init_net, &in6);
+	clp = auth_unix_lookup(current->nsproxy->net_ns, &in6);
 	if (!clp)
 		err = -EPERM;
 	else {
@@ -841,7 +844,7 @@ static ssize_t __write_versions(struct f
 	ssize_t tlen = 0;
 	char *sep;
 
-	if (size>0) {
+	if (size>0 && ve_is_super(get_exec_env())) {
 		if (nfsd_serv)
 			/* Cannot change versions without updating
 			 * nfsd_serv->sv_xdrsize, and reallocing
@@ -1050,12 +1053,12 @@ static ssize_t __write_ports_addxprt(cha
 	if (err != 0)
 		return err;
 
-	err = svc_create_xprt(nfsd_serv, transport, &init_net,
+	err = svc_create_xprt(nfsd_serv, transport, current->nsproxy->net_ns,
 				PF_INET, port, SVC_SOCK_ANONYMOUS);
 	if (err < 0)
 		goto out_err;
 
-	err = svc_create_xprt(nfsd_serv, transport, &init_net,
+	err = svc_create_xprt(nfsd_serv, transport, current->nsproxy->net_ns,
 				PF_INET6, port, SVC_SOCK_ANONYMOUS);
 	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_close;
@@ -1405,32 +1408,45 @@ static ssize_t write_recoverydir(struct 
 
 static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 {
+#define NFSD_DEPR_FILES							\
+	[NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},		\
+	[NFSD_Add] = {".add", &transaction_ops, S_IWUSR},		\
+	[NFSD_Del] = {".del", &transaction_ops, S_IWUSR},		\
+	[NFSD_Export] = {".export", &transaction_ops, S_IWUSR},		\
+	[NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},	\
+	[NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},	\
+	[NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}
+
+#define NFSD_V3_FILES							\
+	[NFSD_List] = {"exports", &exports_operations, S_IRUGO},	\
+	[NFSD_Export_features] = {"export_features",			\
+				&export_features_operations, S_IRUGO},	\
+	[NFSD_FO_UnlockIP] = {"unlock_ip",				\
+				&transaction_ops, S_IWUSR|S_IRUSR},	\
+	[NFSD_FO_UnlockFS] = {"unlock_filesystem",			\
+				&transaction_ops, S_IWUSR|S_IRUSR},	\
+	[NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},	\
+	[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},\
+	[NFSD_Pool_Threads] = {"pool_threads",				\
+				&transaction_ops, S_IWUSR|S_IRUSR},	\
+	[NFSD_Pool_Stats] = {"pool_stats",				\
+				&pool_stats_operations, S_IRUGO},	\
+	[NFSD_Reply_Cache_Stats] = {"reply_cache_stats",		\
+				&reply_cache_stats_operations, S_IRUGO},\
+	[NFSD_Versions] = {"versions",					\
+				&transaction_ops, S_IWUSR|S_IRUSR},	\
+	[NFSD_Ports] = {"portlist",					\
+				&transaction_ops, S_IWUSR|S_IRUGO},	\
+	[NFSD_MaxBlkSize] = {"max_block_size",				\
+				&transaction_ops, S_IWUSR|S_IRUGO},	\
+	[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes",		\
+				&supported_enctypes_ops, S_IRUGO}
+
 	static struct tree_descr nfsd_files[] = {
 #ifdef CONFIG_NFSD_DEPRECATED
-		[NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
-		[NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
-		[NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
-		[NFSD_Export] = {".export", &transaction_ops, S_IWUSR},
-		[NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
-		[NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
-		[NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
+		NFSD_DEPR_FILES,
 #endif
-		[NFSD_List] = {"exports", &exports_operations, S_IRUGO},
-		[NFSD_Export_features] = {"export_features",
-					&export_features_operations, S_IRUGO},
-		[NFSD_FO_UnlockIP] = {"unlock_ip",
-					&transaction_ops, S_IWUSR|S_IRUSR},
-		[NFSD_FO_UnlockFS] = {"unlock_filesystem",
-					&transaction_ops, S_IWUSR|S_IRUSR},
-		[NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
-		[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
-		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
-		[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
-		[NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
-		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
-		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
-		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
-		[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
+		NFSD_V3_FILES,
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
@@ -1438,7 +1454,16 @@ static int nfsd_fill_super(struct super_
 #endif
 		/* last one */ {""}
 	};
-	return simple_fill_super(sb, 0x6e667364, nfsd_files);
+	static struct tree_descr ve_nfsd_files[] = {
+#ifdef CONFIG_NFSD_DEPRECATED
+		NFSD_DEPR_FILES,
+#endif
+		NFSD_V3_FILES,
+		/* last one */ {""}
+	};
+
+	return simple_fill_super(sb, 0x6e667364,
+		ve_is_super(get_exec_env()) ? nfsd_files : ve_nfsd_files);
 }
 
 static int nfsd_get_sb(struct file_system_type *fs_type,
@@ -1457,28 +1482,144 @@ static struct file_system_type nfsd_fs_t
 #ifdef CONFIG_PROC_FS
 static int create_proc_exports_entry(void)
 {
-	struct proc_dir_entry *entry;
+	struct proc_dir_entry *root, *entry;
 
-	entry = proc_mkdir("fs/nfs", NULL);
+	root = get_exec_env()->proc_root;
+	entry = proc_mkdir("fs/nfs", root);
 	if (!entry)
 		return -ENOMEM;
 	entry = proc_create("exports", 0, entry, &exports_operations);
-	if (!entry)
+	if (!entry) {
+		remove_proc_entry("fs/nfs", root);
 		return -ENOMEM;
+	}
 	return 0;
 }
+
+void remove_proc_exports_entry(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = get_exec_env()->proc_root;
+	remove_proc_entry("fs/nfs/exports", entry);
+	remove_proc_entry("fs/nfs", entry);
+}
 #else /* CONFIG_PROC_FS */
 static int create_proc_exports_entry(void)
 {
 	return 0;
 }
+
+void remove_proc_exports_entry(void)
+{
+}
 #endif
 
+static int ve_init_nfsctl(void *data)
+{
+	struct ve_struct *ve = data;
+	struct ve_nfsd_data *d;
+	int err = -ENOMEM;
+
+	if (!(ve->features & VE_FEATURE_NFSD))
+		return 0;
+
+	d = kzalloc(sizeof(struct ve_nfsd_data), GFP_KERNEL);
+	if (d == NULL)
+		goto err_data;
+	ve->nfsd_data = d;
+
+	err = create_proc_exports_entry();
+	if (err)
+		goto err_proc;
+
+	err = nfsd_export_init();
+	if (err)
+		goto err_exp;
+
+	err = nfsd_stat_init();
+	if (err)
+		goto err_stat;
+
+	err = register_ve_fs_type(ve, &nfsd_fs_type, &d->nfsd_fs, NULL);
+	if (err) {
+		printk("Can't register nfsdfs\n");
+		goto err_nfsdfs;
+	}
+
+	return 0;
+
+err_nfsdfs:
+	nfsd_stat_shutdown();
+err_stat:
+	nfsd_export_shutdown();
+err_exp:
+	remove_proc_exports_entry();
+err_proc:
+	kfree(d);
+err_data:
+	return err;
+}
+
+static void ve_exit_nfsctl(void *data)
+{
+	struct ve_struct *ve = data;
+	struct ve_nfsd_data *d = ve->nfsd_data;
+
+	if (d == NULL)
+		return;
+
+	if (nfsd_up)
+		wait_for_completion(&nfsd_exited);
+
+	nfsd_stat_shutdown();
+
+	unregister_ve_fs_type(d->nfsd_fs, NULL);
+	nfsd_export_shutdown();
+	remove_proc_exports_entry();
+
+	ve->nfsd_data = NULL;
+	kfree(d);
+}
+
+static struct ve_hook nfsd_ctl_hook = {
+	.init = ve_init_nfsctl,
+	.fini = ve_exit_nfsctl,
+	.owner	  = THIS_MODULE,
+	.priority = HOOK_PRIO_NET_POST,
+};
+
+static struct ve_nfsd_data ve0_nfsd_data;
+
+int report_stale = 0;
+static struct ctl_table_header *nfs_ctl;
+static ctl_table debug_table[] = {
+	{
+		.ctl_name	= 9475,
+		.procname	= "nfs_stale",
+		.data		= &report_stale,
+		.maxlen		= sizeof(report_stale),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+static ctl_table root_table[] = {
+	{
+		.ctl_name	= CTL_DEBUG,
+		.procname	= "debug",
+		.mode		= 0555,
+		.child		= debug_table,
+	},
+	{ .ctl_name = 0 }
+};
+
 static int __init init_nfsd(void)
 {
 	int retval;
 	printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
 
+	get_ve0()->nfsd_data = &ve0_nfsd_data;
 	retval = nfs4_state_init(); /* nfs4 locking state */
 	if (retval)
 		return retval;
@@ -1499,10 +1640,11 @@ static int __init init_nfsd(void)
 	retval = register_filesystem(&nfsd_fs_type);
 	if (retval)
 		goto out_free_all;
+	ve_hook_register(VE_SS_CHAIN, &nfsd_ctl_hook);
+	nfs_ctl = register_sysctl_table(root_table);
 	return 0;
 out_free_all:
-	remove_proc_entry("fs/nfs/exports", NULL);
-	remove_proc_entry("fs/nfs", NULL);
+	remove_proc_exports_entry();
 out_free_idmap:
 	nfsd_idmap_shutdown();
 out_free_lockd:
@@ -1513,20 +1655,23 @@ out_free_cache:
 out_free_stat:
 	nfsd_stat_shutdown();
 	nfsd4_free_slabs();
+	get_ve0()->nfsd_data = NULL;
 	return retval;
 }
 
 static void __exit exit_nfsd(void)
 {
+	unregister_sysctl_table(nfs_ctl);
+	ve_hook_unregister(&nfsd_ctl_hook);
 	nfsd_export_shutdown();
 	nfsd_reply_cache_shutdown();
-	remove_proc_entry("fs/nfs/exports", NULL);
-	remove_proc_entry("fs/nfs", NULL);
+	remove_proc_exports_entry();
 	nfsd_stat_shutdown();
 	nfsd_lockd_shutdown();
 	nfsd_idmap_shutdown();
 	nfsd4_free_slabs();
 	unregister_filesystem(&nfsd_fs_type);
+	get_ve0()->nfsd_data = NULL;
 }
 
 MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsd.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsd.h
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsd.h	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsd.h	2015-01-21 12:02:47.671108312 +0300
@@ -14,6 +14,9 @@
 #include <linux/nfsd/debug.h>
 #include <linux/nfsd/export.h>
 #include <linux/nfsd/stats.h>
+
+#include <linux/sched.h>
+#include "ve.h"
 /*
  * nfsd version
  */
@@ -29,7 +32,6 @@ extern struct svc_version	nfsd_version2,
 				nfsd_version4;
 extern u32			nfsd_supported_minorversion;
 extern struct mutex		nfsd_mutex;
-extern struct svc_serv		*nfsd_serv;
 extern spinlock_t		nfsd_drc_lock;
 extern unsigned long		nfsd_drc_max_mem;
 extern unsigned long		nfsd_drc_mem_used;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsfh.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsfh.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsfh.c	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsfh.c	2015-01-21 12:02:47.671108312 +0300
@@ -13,6 +13,7 @@
 #include "nfsd.h"
 #include "vfs.h"
 #include "auth.h"
+#include "ve.h"
 
 #define NFSDDBG_FACILITY		NFSDDBG_FH
 
@@ -145,6 +146,7 @@ static __be32 nfsd_set_fh_dentry(struct 
 	int fileid_type;
 	int data_left = fh->fh_size/4;
 	__be32 error;
+	int err_line = 0;
 
 	error = nfserr_stale;
 	if (rqstp->rq_vers > 2)
@@ -155,13 +157,16 @@ static __be32 nfsd_set_fh_dentry(struct 
 	if (fh->fh_version == 1) {
 		int len;
 
-		if (--data_left < 0)
-			return error;
-		if (fh->fh_auth_type != 0)
-			return error;
+		if (--data_left < 0) {
+			err_line = __LINE__; goto out_err;
+		}
+		if (fh->fh_auth_type != 0) {
+			err_line = __LINE__; goto out_err;
+		}
 		len = key_len(fh->fh_fsid_type) / 4;
-		if (len == 0)
-			return error;
+		if (len == 0) {
+			err_line = __LINE__; goto out_err;
+		}
 		if  (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
 			/* deprecated, convert to type 3 */
 			len = key_len(FSID_ENCODE_DEV)/4;
@@ -170,8 +175,9 @@ static __be32 nfsd_set_fh_dentry(struct 
 			fh->fh_fsid[1] = fh->fh_fsid[2];
 		}
 		data_left -= len;
-		if (data_left < 0)
-			return error;
+		if (data_left < 0) {
+			err_line = __LINE__; goto out_err;
+		}
 		exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth);
 		fid = (struct fid *)(fh->fh_auth + len);
 	} else {
@@ -179,8 +185,9 @@ static __be32 nfsd_set_fh_dentry(struct 
 		dev_t xdev;
 		ino_t xino;
 
-		if (fh->fh_size != NFS_FHSIZE)
-			return error;
+		if (fh->fh_size != NFS_FHSIZE) {
+			err_line = __LINE__; goto out_err;
+		}
 		/* assume old filehandle format */
 		xdev = old_decode_dev(fh->ofh_xdev);
 		xino = u32_to_ino_t(fh->ofh_xino);
@@ -189,8 +196,9 @@ static __be32 nfsd_set_fh_dentry(struct 
 	}
 
 	error = nfserr_stale;
-	if (PTR_ERR(exp) == -ENOENT)
-		return error;
+	if (PTR_ERR(exp) == -ENOENT) {
+		err_line = __LINE__; goto out_err;
+	}
 
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
@@ -215,8 +223,10 @@ static __be32 nfsd_set_fh_dentry(struct 
 		put_cred(new);
 	} else {
 		error = nfsd_setuser_and_check_port(rqstp, exp);
-		if (error)
+		if (error) {
+			err_line = __LINE__;
 			goto out;
+		}
 	}
 
 	/*
@@ -245,12 +255,20 @@ static __be32 nfsd_set_fh_dentry(struct 
 		dentry = exportfs_decode_fh(exp->ex_path.mnt, fid,
 				data_left, fileid_type,
 				nfsd_acceptable, exp);
+		if (dentry == NULL)
+			printk(KERN_ERR "%s: exportfs_decode_fh failed\n", __func__);
 	}
-	if (dentry == NULL)
+	if (dentry == NULL) {
+		err_line = __LINE__;
 		goto out;
+	}
 	if (IS_ERR(dentry)) {
 		if (PTR_ERR(dentry) != -EINVAL)
 			error = nfserrno(PTR_ERR(dentry));
+		if (PTR_ERR(dentry) != -ESTALE)
+			printk(KERN_ERR "%s: encoded dentry is err: %ld\n",
+					__func__, PTR_ERR(dentry));
+		err_line = __LINE__;
 		goto out;
 	}
 
@@ -265,6 +283,9 @@ static __be32 nfsd_set_fh_dentry(struct 
 	return 0;
 out:
 	exp_put(exp);
+out_err:
+	if (error == nfserr_badhandle)
+		printk(KERN_ERR "%s: return BAD_HANDLE, line: %d\n", __func__, err_line);
 	return error;
 }
 
@@ -371,8 +392,16 @@ skip_pseudoflavor_check:
 			access, ntohl(error));
 	}
 out:
-	if (error == nfserr_stale)
+	if (error == nfserr_stale) {
+		extern int report_stale;
+
+		if (report_stale) {
+			printk("%s: return STALE in %d\n", __func__, get_exec_env()->veid);
+			dump_stack();
+		}
+
 		nfsdstats.fh_stale++;
+	}
 	return error;
 }
 
@@ -429,7 +458,7 @@ static bool fsid_type_ok_for_exp(u8 fsid
 {
 	switch (fsid_type) {
 	case FSID_DEV:
-		if (!old_valid_dev(exp_sb(exp)->s_dev))
+		if (!old_valid_dev(exp_get_dev(exp)))
 			return 0;
 		/* FALL THROUGH */
 	case FSID_MAJOR_MINOR:
@@ -494,7 +523,7 @@ retry:
 			else
 				fsid_type = FSID_UUID4_INUM;
 		}
-	} else if (!old_valid_dev(exp_sb(exp)->s_dev))
+	} else if (!old_valid_dev(exp_get_dev(exp)))
 		/* for newer device numbers, we must use a newer fsid format */
 		fsid_type = FSID_ENCODE_DEV;
 	else
@@ -519,7 +548,9 @@ fh_compose(struct svc_fh *fhp, struct sv
 	struct inode * inode = dentry->d_inode;
 	struct dentry *parent = dentry->d_parent;
 	__u32 *datap;
-	dev_t ex_dev = exp_sb(exp)->s_dev;
+	dev_t ex_dev;
+
+	ex_dev = exp_get_dev(exp);
 
 	dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
 		MAJOR(ex_dev), MINOR(ex_dev),
@@ -640,7 +671,7 @@ fh_put(struct svc_fh *fhp)
 	}
 	fh_drop_write(fhp);
 	if (exp) {
-		cache_put(&exp->h, &svc_export_cache);
+		exp_put(exp);
 		fhp->fh_export = NULL;
 	}
 	return;
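The nfsd_set_fh_dentry() changes above thread an err_line variable through every bail-out so that a single printk at the error exit can pinpoint which validity check rejected the filehandle. A tiny standalone sketch of the same diagnostic pattern; the function and the values here are made up for illustration:

#include <stdio.h>

/* Record __LINE__ at each bail-out so one message at the single
 * error exit identifies which check fired. */
static int parse(int value)
{
	int err_line = 0;

	if (value < 0) { err_line = __LINE__; goto out_err; }
	if (value > 100) { err_line = __LINE__; goto out_err; }
	return 0;

out_err:
	fprintf(stderr, "%s: rejected, line %d\n", __func__, err_line);
	return -1;
}

int main(void)
{
	return parse(1000) ? 1 : 0;
}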
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsproc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsproc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsproc.c	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsproc.c	2015-01-21 12:02:46.932127930 +0300
@@ -5,6 +5,7 @@
  */
 
 #include <linux/namei.h>
+#include <linux/sched.h>
 
 #include "cache.h"
 #include "xdr.h"
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfssvc.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfssvc.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfssvc.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfssvc.c	2015-01-21 12:02:47.671108312 +0300
@@ -23,7 +23,6 @@
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
-extern struct svc_program	nfsd_program;
 static int			nfsd(void *vrqstp);
 struct timeval			nfssvc_boot;
 
@@ -50,7 +49,6 @@ struct timeval			nfssvc_boot;
  *	nfsd_versions
  */
 DEFINE_MUTEX(nfsd_mutex);
-struct svc_serv 		*nfsd_serv;
 
 /*
  * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
@@ -111,17 +109,31 @@ struct svc_program		nfsd_program = {
 	.pg_vers		= nfsd_versions,	/* version table */
 	.pg_name		= "nfsd",		/* program name */
 	.pg_class		= "nfsd",		/* authentication class */
-	.pg_stats		= &nfsd_svcstats,	/* version table */
 	.pg_authenticate	= &svc_set_client,	/* export authentication */
 
 };
 
+struct svc_program		ve_nfsd_program = {
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+	.pg_next		= &nfsd_acl_program,
+#endif
+	.pg_prog		= NFS_PROGRAM,
+	.pg_nvers		= NFSD_NRVERS - 1,	/* no nfsdv4 for ct */
+	.pg_vers		= nfsd_versions,
+	.pg_name		= "nfsd",
+	.pg_class		= "nfsd",
+	.pg_authenticate	= &svc_set_client,
+
+};
+
 u32 nfsd_supported_minorversion;
 
 int nfsd_vers(int vers, enum vers_op change)
 {
 	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
 		return 0;
+	if ((vers == 4) && !ve_is_super(get_exec_env()))
+		return 0;
 	switch(change) {
 	case NFSD_SET:
 		nfsd_versions[vers] = nfsd_version[vers];
@@ -140,7 +152,8 @@ int nfsd_vers(int vers, enum vers_op cha
 	case NFSD_TEST:
 		return nfsd_versions[vers] != NULL;
 	case NFSD_AVAIL:
-		return nfsd_version[vers] != NULL;
+		if ((vers != 4) || ve_is_super(get_exec_env()))
+			return (nfsd_version[vers] != NULL);
 	}
 	return 0;
 }
@@ -200,7 +213,10 @@ static int nfsd_init_socks(int port)
 	return 0;
 }
 
-static bool nfsd_up = false;
+#ifndef CONFIG_VE
+static bool _nfsd_up = false;
+static DECLARE_COMPLETION(_nfsd_exited);
+#endif
 
 static int nfsd_startup(unsigned short port, int nrservs)
 {
@@ -222,9 +238,11 @@ static int nfsd_startup(unsigned short p
 	ret = lockd_up();
 	if (ret)
 		goto out_racache;
-	ret = nfs4_state_start();
-	if (ret)
-		goto out_lockd;
+	if (ve_is_super(get_exec_env())) {
+		ret = nfs4_state_start();
+		if (ret)
+			goto out_lockd;
+	}
 	nfsd_up = true;
 	return 0;
 out_lockd:
@@ -244,10 +262,10 @@ static void nfsd_shutdown(void)
 	 */
 	if (!nfsd_up)
 		return;
-	nfs4_state_shutdown();
+	if (ve_is_super(get_exec_env()))
+		nfs4_state_shutdown();
 	lockd_down();
 	nfsd_racache_shutdown();
-	nfsd_up = false;
 }
 
 static void nfsd_last_thread(struct svc_serv *serv)
@@ -258,9 +276,10 @@ static void nfsd_last_thread(struct svc_
 
 	svc_rpcb_cleanup(serv);
 
-	printk(KERN_WARNING "nfsd: last server has exited, flushing export "
-			    "cache\n");
 	nfsd_export_flush();
+
+	nfsd_up = false;
+	complete(&nfsd_exited);
 }
 
 void nfsd_reset_versions(void)
@@ -333,13 +352,17 @@ int nfsd_create_serv(void)
 	}
 	nfsd_reset_versions();
 
-	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
-				      nfsd_last_thread, nfsd, THIS_MODULE);
+	nfsd_serv = svc_create_pooled(ve_is_super(get_exec_env()) ?
+					&nfsd_program : &ve_nfsd_program,
+				      nfsd_max_blksize,
+				      nfsd_last_thread, nfsd, THIS_MODULE,
+				      get_exec_env()->nfsd_data->svc_stat);
 	if (nfsd_serv == NULL)
 		return -ENOMEM;
 
 	set_max_drc();
 	do_gettimeofday(&nfssvc_boot);		/* record boot time */
+	init_completion(&nfsd_exited);
 	return err;
 }
 
@@ -456,8 +479,11 @@ nfsd_svc(unsigned short port, int nrserv
 	 */
 	error = nfsd_serv->sv_nrthreads - 1;
 out_shutdown:
-	if (error < 0 && !nfsd_up_before)
+	if (error < 0 && !nfsd_up_before) {
 		nfsd_shutdown();
+		nfsd_up = false;
+		complete(&nfsd_exited);
+	}
 out_destroy:
 	svc_destroy(nfsd_serv);		/* Release server */
 out:
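The nfssvc.c hunks replace the ad-hoc nfsd_up bookkeeping with a completion-based handshake: nfsd_last_thread() clears nfsd_up and signals nfsd_exited, and the container teardown path (ve_exit_nfsctl() in the nfsctl.c hunk) waits on it before freeing per-container state. A kernel-style sketch of that handshake, with illustrative names rather than the patch's exact symbols:

#include <linux/completion.h>
#include <linux/types.h>

/* One per server instance; re-armed every time the service starts. */
static DECLARE_COMPLETION(svc_exited);
static bool svc_up;

static void svc_start(void)
{
	init_completion(&svc_exited);	/* re-arm before workers run */
	svc_up = true;
}

/* Called by the last worker thread on its way out. */
static void svc_last_thread(void)
{
	svc_up = false;
	complete(&svc_exited);
}

/* Teardown: never free shared state while a worker is still running. */
static void svc_fini(void)
{
	if (svc_up)
		wait_for_completion(&svc_exited);
	/* ... safe to free per-instance data here ... */
}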
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsxdr.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsxdr.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/nfsxdr.c	2014-12-12 23:28:51.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/nfsxdr.c	2015-01-21 12:02:46.933127903 +0300
@@ -184,6 +184,7 @@ encode_fattr(struct svc_rqst *rqstp, __b
 	*p++ = htonl((u32) stat->ino);
 	*p++ = htonl((u32) stat->atime.tv_sec);
 	*p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
+	time = stat->mtime;
 	lease_get_mtime(dentry->d_inode, &time); 
 	*p++ = htonl((u32) time.tv_sec);
 	*p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/stats.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/stats.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/stats.c	2014-12-12 23:28:51.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/stats.c	2015-01-21 12:02:47.671108312 +0300
@@ -28,10 +28,9 @@
 
 #include "nfsd.h"
 
-struct nfsd_stats	nfsdstats;
-struct svc_stat		nfsd_svcstats = {
-	.program	= &nfsd_program,
-};
+#ifndef CONFIG_VE
+struct nfsd_stats	_nfsdstats;
+#endif
 
 static int nfsd_proc_show(struct seq_file *seq, void *v)
 {
@@ -63,7 +62,7 @@ static int nfsd_proc_show(struct seq_fil
 	seq_putc(seq, '\n');
 	
 	/* show my rpc info */
-	svc_seq_show(seq, &nfsd_svcstats);
+	svc_seq_show(seq, get_exec_env()->nfsd_data->svc_stat);
 
 #ifdef CONFIG_NFSD_V4
 	/* Show count for individual nfsv4 operations */
@@ -91,14 +90,27 @@ static const struct file_operations nfsd
 	.release = single_release,
 };
 
-void
+int
 nfsd_stat_init(void)
 {
-	svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops);
+	struct ve_nfsd_data *d;
+
+	d = get_exec_env()->nfsd_data;
+	d->svc_stat = kzalloc(sizeof(struct svc_stat), GFP_KERNEL);
+	if (d->svc_stat == NULL)
+		return -ENOMEM;
+
+	d->svc_stat->program = &nfsd_program;
+	svc_proc_register(d->svc_stat, &nfsd_proc_fops);
+	return 0;
 }
 
 void
 nfsd_stat_shutdown(void)
 {
+	struct ve_nfsd_data *d;
+
+	d = get_exec_env()->nfsd_data;
 	svc_proc_unregister("nfsd");
+	kfree(d->svc_stat);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/ve.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/ve.h
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/ve.h	2015-01-21 12:02:47.671108312 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/ve.h	2015-01-21 12:02:47.671108312 +0300
@@ -0,0 +1,48 @@
+/*
+ * fs/nfsd/ve.h
+ *
+ * VE context for NFSd
+ *
+ */
+
+#ifndef __VE_NFSD_H__
+#define __VE_NFSD_H__
+
+#define VE_RAPARM_SIZE	2048
+
+struct ve_nfsd_data {
+	struct file_system_type *nfsd_fs;
+	struct cache_detail *_svc_export_cache;
+	struct cache_detail *_svc_expkey_cache;
+	struct svc_serv *_nfsd_serv;
+	struct nfsd_stats _nfsdstats;
+	struct svc_stat *svc_stat;
+	char _raparm_hash[VE_RAPARM_SIZE];
+	struct completion _nfsd_exited;
+	bool _nfsd_up;
+};
+
+#ifdef CONFIG_VE
+
+#include <linux/ve.h>
+
+#define NFSD_CTX_FIELD(arg)	(get_exec_env()->nfsd_data->_##arg)
+
+#else
+
+#define NFSD_CTX_FIELD(arg)	_##arg
+
+#endif
+
+#define svc_export_cache	NFSD_CTX_FIELD(svc_export_cache)
+#define svc_expkey_cache	NFSD_CTX_FIELD(svc_expkey_cache)
+
+#define nfsd_serv		NFSD_CTX_FIELD(nfsd_serv)
+#define nfsd_up			NFSD_CTX_FIELD(nfsd_up)
+#define nfsd_exited		NFSD_CTX_FIELD(nfsd_exited)
+#define nfsdstats		NFSD_CTX_FIELD(nfsdstats)
+struct raparm_hbucket;
+#define raparm_hash		((struct raparm_hbucket *)NFSD_CTX_FIELD(raparm_hash))
+
+
+#endif /* __VE_NFSD_H__ */
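The ve.h header above converts what used to be file-scope globals (nfsd_serv, nfsd_up, nfsdstats, ...) into per-container fields without touching any of their users: each name becomes a macro expanding to a field of the current execution environment. A standalone toy showing the same token-pasting trick; get_ctx() stands in for get_exec_env() and all names here are invented:

#include <stdio.h>

struct nfsd_data { int _nfsd_up; };
struct ctx { struct nfsd_data *nfsd_data; };

/* stand-in for get_exec_env(): the current task's container context */
static struct ctx *get_ctx(void)
{
	static struct nfsd_data d;
	static struct ctx c = { &d };
	return &c;
}

/* same shape as NFSD_CTX_FIELD() in the ve.h hunk above */
#define CTX_FIELD(arg)	(get_ctx()->nfsd_data->_##arg)
#define nfsd_up		CTX_FIELD(nfsd_up)

int main(void)
{
	nfsd_up = 1;	/* reads like a global, lands in the context */
	printf("nfsd_up = %d\n", nfsd_up);
	return 0;
}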
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/vfs.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/vfs.c
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/vfs.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/vfs.c	2015-01-21 12:02:53.129963398 +0300
@@ -26,6 +26,7 @@
 #include <linux/xattr.h>
 #include <linux/jhash.h>
 #include <linux/ima.h>
+#include <linux/vzquota.h>
 #include <asm/uaccess.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
@@ -70,7 +71,10 @@ struct raparm_hbucket {
 #define RAPARM_HASH_BITS	4
 #define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
 #define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
-static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
+
+#ifndef CONFIG_VE
+static struct raparm_hbucket	_raparm_hash[RAPARM_HASH_SIZE];
+#endif
 
 /* 
  * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
@@ -414,6 +418,7 @@ nfsd_setattr(struct svc_rqst *rqstp, str
 			put_write_access(inode);
 			goto out_nfserr;
 		}
+		vzquota_cur_qmblk_set(fhp->fh_export->ex_path.dentry->d_inode);
 		vfs_dq_init(inode);
 	}
 
@@ -813,6 +818,7 @@ nfsd_open(struct svc_rqst *rqstp, struct
 		else
 			flags = O_WRONLY|O_LARGEFILE;
 
+		vzquota_cur_qmblk_set(fhp->fh_export->ex_path.dentry->d_inode);
 		vfs_dq_init(inode);
 	}
 	*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
@@ -827,6 +833,13 @@ nfsd_open(struct svc_rqst *rqstp, struct
 			(*filp)->f_mode |= FMODE_64BITHASH;
 		else
 			(*filp)->f_mode |= FMODE_32BITHASH;
+
+		/* Update fmode for underlying file */
+		if ((*filp)->f_op->get_host) {
+			struct file *host = (*filp)->f_op->get_host(*filp);
+
+			host->f_mode |= (*filp)->f_mode & (FMODE_32BITHASH | FMODE_64BITHASH);
+		}
 	}
 
 out_nfserr:
@@ -869,7 +882,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
 		if (ra->p_count == 0)
 			frap = rap;
 	}
-	depth = nfsdstats.ra_size*11/10;
+	depth = nfsdstats.ra_size;
 	if (!frap) {	
 		spin_unlock(&rab->pb_lock);
 		return NULL;
@@ -1015,15 +1028,18 @@ static void kill_suid(struct dentry *den
  * better tool (separate unstable writes and commits) for solving this
  * problem.
  */
-static int wait_for_concurrent_writes(struct file *file)
+static int wait_for_concurrent_writes(struct file *file, struct svc_export *exp)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	static ino_t last_ino;
 	static dev_t last_dev;
 	int err = 0;
+	dev_t		ex_dev;
 
+	BUG_ON(file->f_vfsmnt != exp->ex_path.mnt);
+	ex_dev = exp_get_dev(exp);
 	if (atomic_read(&inode->i_writecount) > 1
-	    || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
+	    || (last_ino == inode->i_ino && last_dev == ex_dev)) {
 		dprintk("nfsd: write defer %d\n", task_pid_nr(current));
 		msleep(10);
 		dprintk("nfsd: write resume %d\n", task_pid_nr(current));
@@ -1034,7 +1050,7 @@ static int wait_for_concurrent_writes(st
 		err = vfs_fsync(file, file->f_path.dentry, 0);
 	}
 	last_ino = inode->i_ino;
-	last_dev = inode->i_sb->s_dev;
+	last_dev = ex_dev;
 	return err;
 }
 
@@ -1102,7 +1118,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, s
 		kill_suid(dentry);
 
 	if (stable && use_wgather)
-		host_err = wait_for_concurrent_writes(file);
+		host_err = wait_for_concurrent_writes(file, exp);
 
 out_nfserr:
 	dprintk("nfsd: write complete host_err=%d\n", host_err);
@@ -1126,6 +1142,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp,
 	struct inode *inode;
 	struct raparms	*ra;
 	__be32 err;
+	dev_t		ex_dev;
 
 	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
 	if (err)
@@ -1134,7 +1151,9 @@ __be32 nfsd_read(struct svc_rqst *rqstp,
 	inode = file->f_path.dentry->d_inode;
 
 	/* Get readahead parameters */
-	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
+	BUG_ON(file->f_vfsmnt != fhp->fh_export->ex_path.mnt);
+	ex_dev = exp_get_dev(fhp->fh_export);
+	ra = nfsd_get_raparms(ex_dev, inode->i_ino);
 
 	if (ra && ra->p_set)
 		file->f_ra = ra->p_ra;
@@ -2240,6 +2259,7 @@ nfsd_racache_init(int cache_size)
 	int	nperbucket;
 	struct raparms **raparm = NULL;
 
+	BUILD_BUG_ON(sizeof(struct raparm_hbucket) * RAPARM_HASH_SIZE > VE_RAPARM_SIZE);
 
 	if (raparm_hash[0].pb_head)
 		return 0;
@@ -2255,7 +2275,7 @@ nfsd_racache_init(int cache_size)
 
 		raparm = &raparm_hash[i].pb_head;
 		for (j = 0; j < nperbucket; j++) {
-			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
+			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL_UBC);
 			if (!*raparm)
 				goto out_nomem;
 			raparm = &(*raparm)->p_next;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/nfsd/xdr3.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/xdr3.h
--- linux-2.6.32-504.3.3.el6.orig/fs/nfsd/xdr3.h	2014-12-12 23:28:51.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/nfsd/xdr3.h	2015-01-21 12:02:42.464246551 +0300
@@ -202,6 +202,7 @@ struct nfsd3_fsinfores {
 	__u32			f_dtpref;
 	__u64			f_maxfilesize;
 	__u32			f_properties;
+	__u32			f_time_gran;
 };
 
 struct nfsd3_pathconfres {
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/notify/inode_mark.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/notify/inode_mark.c
--- linux-2.6.32-504.3.3.el6.orig/fs/notify/inode_mark.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/notify/inode_mark.c	2015-01-21 12:02:47.928101490 +0300
@@ -90,6 +90,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/writeback.h> /* for inode_lock */
+#include <linux/mount.h>
 
 #include <asm/atomic.h>
 
@@ -270,6 +271,23 @@ void fsnotify_clear_marks_by_inode(struc
 	}
 }
 
+static void fsnotify_detach_mnt(struct inode *inode)
+{
+	struct fsnotify_mark_entry *entry;
+	struct hlist_node *pos;
+	struct fsnotify_group *group;
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
+		spin_lock(&entry->lock);
+		group = entry->group;
+		if (group->ops->detach_mnt)
+			group->ops->detach_mnt(entry);
+		spin_unlock(&entry->lock);
+	}
+	spin_unlock(&inode->i_lock);
+}
+
 /*
  * given a group and inode, find the mark associated with that combination.
  * if found take a reference to that mark and return it, else return NULL
@@ -370,7 +388,7 @@ int fsnotify_add_mark(struct fsnotify_ma
  * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
  * We temporarily drop inode_lock, however, and CAN block.
  */
-void fsnotify_unmount_inodes(struct super_block *sb)
+static void fsnotify_unmount(struct super_block *sb, struct vfsmount *mnt)
 {
 	struct list_head *list = &sb->s_inodes;
 	struct inode *inode, *next_i, *need_iput = NULL;
@@ -423,10 +441,14 @@ void fsnotify_unmount_inodes(struct supe
 		if (need_iput_tmp)
 			iput(need_iput_tmp);
 
-		/* for each watch, send FS_UNMOUNT and then remove it */
-		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+		if (mnt)
+			fsnotify_detach_mnt(inode);
+		else {
+			/* for each watch, send FS_UNMOUNT and then remove it */
+			fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 
-		fsnotify_inode_delete(inode);
+			fsnotify_inode_delete(inode);
+		}
 
 		iput(inode);
 
@@ -437,3 +459,15 @@ void fsnotify_unmount_inodes(struct supe
 	wait_event(sb->s_fsnotify_marks_wq, !atomic_read(&sb->s_fsnotify_marks));
 	spin_lock(&inode_lock);
 }
+
+void fsnotify_unmount_inodes(struct super_block *sb)
+{
+	fsnotify_unmount(sb, NULL);
+}
+
+void fsnotify_unmount_mnt(struct vfsmount *mnt)
+{
+	spin_lock(&inode_lock);
+	fsnotify_unmount(mnt->mnt_sb, mnt);
+	spin_unlock(&inode_lock);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/notify/inotify/inotify.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/notify/inotify/inotify.h
--- linux-2.6.32-504.3.3.el6.orig/fs/notify/inotify/inotify.h	2014-12-12 23:29:03.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/notify/inotify/inotify.h	2015-01-21 12:02:50.892022806 +0300
@@ -13,6 +13,10 @@ struct inotify_inode_mark_entry {
 	/* fsnotify_mark_entry MUST be the first thing */
 	struct fsnotify_mark_entry fsn_entry;
 	int wd;
+
+	/* for checkpoint/restore */
+	char *cpt_wd_path;
+	struct vfsmount *cpt_wd_mnt;
 };
 
 extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/notify/inotify/inotify_fsnotify.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/notify/inotify/inotify_fsnotify.c
--- linux-2.6.32-504.3.3.el6.orig/fs/notify/inotify/inotify_fsnotify.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/notify/inotify/inotify_fsnotify.c	2015-01-21 12:02:50.892022806 +0300
@@ -29,6 +29,7 @@
 #include <linux/slab.h> /* kmem_* */
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/mount.h>
 
 #include "inotify.h"
 
@@ -165,10 +166,25 @@ void inotify_free_event_priv(struct fsno
 	kmem_cache_free(event_priv_cachep, event_priv);
 }
 
+static void inotify_detach_mnt(struct fsnotify_mark_entry *fe)
+{
+	struct inotify_inode_mark_entry *e;
+
+	e = container_of(fe, struct inotify_inode_mark_entry, fsn_entry);
+	if (e->cpt_wd_path) {
+		kfree(e->cpt_wd_path);
+		e->cpt_wd_path = NULL;
+		mnt_unpin(e->cpt_wd_mnt);
+		mntput(e->cpt_wd_mnt);
+		e->cpt_wd_mnt = NULL;
+	}
+}
+
 const struct fsnotify_ops inotify_fsnotify_ops = {
 	.handle_event = inotify_handle_event,
 	.should_send_event = inotify_should_send_event,
 	.free_group_priv = inotify_free_group_priv,
 	.free_event_priv = inotify_free_event_priv,
 	.freeing_mark = inotify_freeing_mark,
+	.detach_mnt = inotify_detach_mnt,
 };
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/notify/inotify/inotify_user.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/notify/inotify/inotify_user.c
--- linux-2.6.32-504.3.3.el6.orig/fs/notify/inotify/inotify_user.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/notify/inotify/inotify_user.c	2015-01-21 12:02:50.895022728 +0300
@@ -40,6 +40,7 @@
 #include <linux/uaccess.h>
 #include <linux/poll.h>
 #include <linux/wait.h>
+#include <linux/module.h>
 
 #include "inotify.h"
 
@@ -340,7 +341,7 @@ static long inotify_ioctl(struct file *f
 	return ret;
 }
 
-static const struct file_operations inotify_fops = {
+const struct file_operations inotify_fops = {
 	.poll		= inotify_poll,
 	.read		= inotify_read,
 	.fasync		= inotify_fasync,
@@ -348,6 +349,7 @@ static const struct file_operations inot
 	.unlocked_ioctl	= inotify_ioctl,
 	.compat_ioctl	= inotify_ioctl,
 };
+EXPORT_SYMBOL(inotify_fops);
 
 
 /*
@@ -461,6 +463,12 @@ static void inotify_free_mark(struct fsn
 {
 	struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry;
 
+	if (ientry->cpt_wd_path) {
+		kfree(ientry->cpt_wd_path);
+		mnt_unpin(ientry->cpt_wd_mnt);
+		mntput(ientry->cpt_wd_mnt);
+	}
+
 	kmem_cache_free(inotify_inode_mark_cachep, ientry);
 }
 
@@ -527,26 +535,48 @@ static int inotify_update_existing_watch
 	return ret;
 }
 
-static int inotify_new_watch(struct fsnotify_group *group,
-			     struct inode *inode,
-			     u32 arg)
+int __inotify_new_watch(struct fsnotify_group *group,
+			     struct path *path, __u32 mask, int wd)
 {
 	struct inotify_inode_mark_entry *tmp_ientry;
-	__u32 mask;
-	int ret, wd;
+	char *kwd_path = NULL, *wd_path = NULL;
+	u32 start_wd;
+	int ret, wd_local;
 
-	/* don't allow invalid bits: we don't want flags set */
-	mask = inotify_arg_to_mask(arg);
 	if (unlikely(!mask))
 		return -EINVAL;
 
+	if (!ve_is_super(get_exec_env())) {
+		kwd_path = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (!kwd_path)
+			return -ENOMEM;
+
+		wd_path = d_path(path, kwd_path, PATH_MAX);
+		if (IS_ERR(wd_path)) {
+			kfree(kwd_path);
+			return PTR_ERR(wd_path);
+		}
+
+		wd_path = kstrdup(wd_path, GFP_KERNEL);
+		if (!wd_path) {
+			kfree(kwd_path);
+			return -ENOMEM;
+		}
+
+		kfree(kwd_path);
+	}
+
 	tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
-	if (unlikely(!tmp_ientry))
+	if (unlikely(!tmp_ientry)) {
+		kfree(wd_path);
 		return -ENOMEM;
+	}
 
 	fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
 	tmp_ientry->fsn_entry.mask = mask;
 	tmp_ientry->wd = -1;
+	tmp_ientry->cpt_wd_path = NULL;
+	tmp_ientry->cpt_wd_mnt = NULL;
 
 	ret = -ENOSPC;
 	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
@@ -559,12 +589,16 @@ retry:
 	/* we are putting the mark on the idr, take a reference */
 	fsnotify_get_mark(&tmp_ientry->fsn_entry);
 
+	if (wd == -1)
+		start_wd = group->inotify_data.last_wd + 1;
+	else
+		start_wd = wd;
+
 	spin_lock(&group->inotify_data.idr_lock);
 	ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
-				group->inotify_data.last_wd +  1,
-				&tmp_ientry->wd);
+				start_wd, &tmp_ientry->wd);
 	if (!ret) {
-		wd = tmp_ientry->wd;
+		wd_local = tmp_ientry->wd;
 		/* update the idr hint, who cares about races, it's just a hint */
 		group->inotify_data.last_wd = tmp_ientry->wd;
 
@@ -584,16 +618,29 @@ retry:
 		goto out_err;
 	}
 
+	if (wd != -1 && tmp_ientry->wd != wd) {
+		ret = -EBUSY;
+		fsnotify_put_mark(&tmp_ientry->fsn_entry);
+		inotify_remove_from_idr(group, tmp_ientry);
+		goto out_err;
+	}
+
 	/* we are on the idr, now get on the inode */
-	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, path->dentry->d_inode);
 	if (ret) {
 		/* we failed to get on the inode, get off the idr */
 		inotify_remove_from_idr(group, tmp_ientry);
 		goto out_err;
 	}
 
+	if (!ve_is_super(get_exec_env())) {
+		tmp_ientry->cpt_wd_path = wd_path;
+		mnt_pin(path->mnt);
+		tmp_ientry->cpt_wd_mnt = path->mnt;
+	}
+
 	/* return the watch descriptor for this new entry */
-	ret = wd;
+	ret = wd_local;
 
 	/* match the ref from fsnotify_init_markentry() */
 	fsnotify_put_mark(&tmp_ientry->fsn_entry);
@@ -603,22 +650,31 @@ retry:
 		fsnotify_recalc_group_mask(group);
 
 out_err:
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(wd_path);
 		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
+	}
 
 	return ret;
 }
+EXPORT_SYMBOL(__inotify_new_watch);
 
-static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+static int inotify_new_watch(struct fsnotify_group *group,
+			     struct path *path, u32 arg)
+{
+	return __inotify_new_watch(group, path, inotify_arg_to_mask(arg), -1);
+}
+
+static int inotify_update_watch(struct fsnotify_group *group, struct path *path, u32 arg)
 {
 	int ret = 0;
 
 retry:
 	/* try to update an existing watch with the new arg */
-	ret = inotify_update_existing_watch(group, inode, arg);
+	ret = inotify_update_existing_watch(group, path->dentry->d_inode, arg);
 	/* no mark present, try to add a new one */
 	if (ret == -ENOENT)
-		ret = inotify_new_watch(group, inode, arg);
+		ret = inotify_new_watch(group, path, arg);
 	/*
 	 * inotify_new_watch could race with another thread which did an
 	 * inotify_new_watch between the update_existing and the add watch
@@ -630,7 +686,8 @@ retry:
 	return ret;
 }
 
-static struct fsnotify_group *inotify_new_group(unsigned int max_events)
+static struct fsnotify_group *
+inotify_new_group(unsigned int max_events, int flags)
 {
 	struct fsnotify_group *group;
 	unsigned int grp_num;
@@ -650,7 +707,7 @@ static struct fsnotify_group *inotify_ne
 	group->inotify_data.user = get_current_user();
 
 	if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
-	    inotify_max_user_instances) {
+	    inotify_max_user_instances && !(flags & O_DIRECT)) {
 		fsnotify_put_group(group);
 		return ERR_PTR(-EMFILE);
 	}
@@ -658,66 +715,75 @@ static struct fsnotify_group *inotify_ne
 	return group;
 }
 
-
-/* inotify syscalls */
-SYSCALL_DEFINE1(inotify_init1, int, flags)
+struct file *inotify_create(int flags)
 {
 	struct fsnotify_group *group;
 	struct file *filp;
 	struct path path;
-	int fd, ret;
 
-	/* Check the IN_* constants for consistency.  */
-	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
-	BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
-
-	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
-		return -EINVAL;
-
-	/* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
-	group = inotify_new_group(inotify_max_queued_events);
+	/*
+	 * fsnotify_obtain_group took a reference to the group;
+	 * we put it when the file is finally released.
+	 */
+	group = inotify_new_group(inotify_max_queued_events, flags);
 	if (IS_ERR(group))
-		return PTR_ERR(group);
-
-	ret = get_unused_fd_flags(flags & O_CLOEXEC);
-	if (ret < 0)
-		goto out_group;
-	fd = ret;
+		return ERR_CAST(group);
 
 	path.mnt = inotify_mnt;
 	path.dentry = inotify_mnt->mnt_root;
 	path_get(&path);
 
-	ret = -ENFILE;
 	filp = alloc_file(&path, FMODE_READ, &inotify_fops);
 	if (!filp)
-		goto out_fd;
+		goto out_file;
 
 	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
 	filp->private_data = group;
+	return filp;
 
-	fd_install(fd, filp);
-
-	return fd;
-
-out_fd:
+out_file:
 	path_put(&path);
-	put_unused_fd(fd);
-out_group:
 	fsnotify_put_group(group);
-	return ret;
+	return ERR_PTR(-ENFILE);
+}
+EXPORT_SYMBOL_GPL(inotify_create);
+
+/* inotify syscalls */
+SYSCALL_DEFINE1(inotify_init1, int, flags)
+{
+	struct file *filp;
+	int fd;
+
+	/* Check the IN_* constants for consistency.  */
+	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
+		return -EINVAL;
+
+	filp = inotify_create(flags);
+	if (IS_ERR(filp))
+		return PTR_ERR(filp);
+
+	fd = get_unused_fd_flags(flags & O_CLOEXEC);
+	if (fd < 0)
+		fput(filp);
+	else
+		fd_install(fd, filp);
+
+	return fd;
 }
 
 SYSCALL_DEFINE0(inotify_init)
 {
 	return sys_inotify_init1(0);
 }
+EXPORT_SYMBOL(sys_inotify_init);
 
 SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 		u32, mask)
 {
 	struct fsnotify_group *group;
-	struct inode *inode;
 	struct path path;
 	struct file *filp;
 	int ret, fput_needed;
@@ -742,12 +808,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, 
 	if (ret)
 		goto fput_and_out;
 
-	/* inode held in place by reference to path; group by fget on fd */
-	inode = path.dentry->d_inode;
 	group = filp->private_data;
 
 	/* create/update an inode mark */
-	ret = inotify_update_watch(group, inode, mask);
+	ret = inotify_update_watch(group, &path, mask);
 	if (unlikely(ret))
 		goto path_put_and_out;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ocfs2/export.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ocfs2/export.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ocfs2/export.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ocfs2/export.c	2015-01-21 12:02:52.021992810 +0300
@@ -201,8 +201,12 @@ static int ocfs2_encode_fh(struct dentry
 		   dentry->d_name.len, dentry->d_name.name,
 		   fh, len, connectable);
 
-	if (len < 3 || (connectable && len < 6)) {
-		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+	if (connectable && (len < 6)) {
+		*max_len = 6;
+		type = 255;
+		goto bail;
+	} else if (len < 3) {
+		*max_len = 3;
 		type = 255;
 		goto bail;
 	}
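The ocfs2_encode_fh() fix above adopts the exportfs convention for undersized buffers: report the required length back through *max_len and return type 255 instead of logging an error, so the caller can retry with a bigger buffer. A generic kernel-style sketch of that contract; the word counts mirror ocfs2 but the function itself is illustrative:

#include <linux/types.h>

#define FILEID_TOO_SMALL 255	/* conventional "buffer too small" type */

/* fh: u32 buffer; *max_len: its size in u32 words, updated on failure */
static int example_encode_fh(u32 *fh, int *max_len, int connectable)
{
	int need = connectable ? 6 : 3;	/* parent info costs 3 more words */

	if (*max_len < need) {
		*max_len = need;	/* tell the caller what to retry with */
		return FILEID_TOO_SMALL;
	}
	/* ... fill fh[0..need-1] with inode/generation data ... */
	*max_len = need;
	return connectable ? 2 : 1;	/* fileid type actually encoded */
}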
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/open.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/open.c
--- linux-2.6.32-504.3.3.el6.orig/fs/open.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/open.c	2015-01-21 12:02:57.965835033 +0300
@@ -511,15 +511,21 @@ out:
 	return err;
 }
 
-SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
+static int do_fchmodat(int dfd, const char __user *filename, mode_t mode, int flag)
 {
 	struct path path;
 	struct inode *inode;
 	int error;
 	struct iattr newattrs;
 	unsigned int lookup_flags = LOOKUP_FOLLOW;
+	int follow;
 retry:
-	error = user_path_at(dfd, filename, lookup_flags, &path);
+	error = -EINVAL;
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : lookup_flags;
+	error = user_path_at(dfd, filename, follow, &path);
 	if (error)
 		goto out;
 	inode = path.dentry->d_inode;
@@ -545,9 +551,20 @@ out:
 	return error;
 }
 
+SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
+{
+	return do_fchmodat(dfd, filename, mode, 0);
+}
+
 SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
 {
-	return sys_fchmodat(AT_FDCWD, filename, mode);
+	return do_fchmodat(AT_FDCWD, filename, mode, 0);
+}
+EXPORT_SYMBOL_GPL(sys_chmod);
+
+SYSCALL_DEFINE2(lchmod, const char __user *, filename, mode_t, mode)
+{
+	return do_fchmodat(AT_FDCWD, filename, mode, AT_SYMLINK_NOFOLLOW);
 }
 
 static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
@@ -609,6 +626,7 @@ SYSCALL_DEFINE3(chown, const char __user
 {
 	return sys_fchownat(AT_FDCWD, filename, user, group, 0);
 }
+EXPORT_SYMBOL(sys_chown);
 
 SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
 {
@@ -676,6 +694,11 @@ static struct file *__dentry_open(struct
 	struct inode *inode;
 	int error;
 
+	if (!may_use_odirect())
+		f->f_flags &= ~O_DIRECT;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		f->f_flags &= ~O_SYNC;
+
 	f->f_mode = (__force fmode_t)((f->f_flags+1) & O_ACCMODE) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 	inode = dentry->d_inode;
@@ -692,7 +715,7 @@ static struct file *__dentry_open(struct
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
 	f->f_op = fops_get(inode->i_fop);
-	file_move(f, &inode->i_sb->s_files);
+	file_sb_list_add(f, inode->i_sb);
 
 	error = security_dentry_open(f, cred);
 	if (error)
@@ -738,7 +761,7 @@ cleanup_all:
 			__mnt_drop_write(mnt);
 		}
 	}
-	file_kill(f);
+	file_sb_list_del(f);
 	f->f_path.dentry = NULL;
 	f->f_path.mnt = NULL;
 cleanup_file:
@@ -810,6 +833,7 @@ struct file *nameidata_to_filp(struct na
 		path_put(&nd->path);
 	return filp;
 }
+EXPORT_SYMBOL_GPL(nameidata_to_filp);
 
 /*
  * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an
@@ -926,6 +950,7 @@ SYSCALL_DEFINE3(open, const char __user 
 	asmlinkage_protect(3, ret, filename, flags, mode);
 	return ret;
 }
+EXPORT_SYMBOL(sys_open);
 
 SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
 		int, mode)
@@ -1057,3 +1082,181 @@ int nonseekable_open(struct inode *inode
 }
 
 EXPORT_SYMBOL(nonseekable_open);
+
+/*
+ * Caller must hold inode->i_mutex or the inode must be unreachable.
+ */
+int open_inode_peer(struct inode *inode, struct path *path,
+		    const struct cred *cred)
+{
+	struct inode *peer = path->dentry->d_inode;
+	struct address_space *mapping;
+	struct file *file;
+	struct user_beancounter *cur_ub;
+	const struct cred *cur_cred;
+	int err;
+
+	/*
+	 * We cannot open peers in the middle of a transaction;
+	 * this shouldn't happen at all.
+	 */
+	err = -EDEADLK;
+	if (WARN_ON_ONCE(current->journal_info))
+		goto out_err;
+
+	err = -EBUSY;
+	if (inode->i_peer_file)
+		goto out_err;
+
+	err = -EINVAL;
+	if (!S_ISREG(inode->i_mode) || !S_ISREG(peer->i_mode) ||
+	    peer == inode || i_size_read(peer) != i_size_read(inode))
+		goto out_err;
+
+restart:
+	rcu_read_lock();
+	file = rcu_dereference(peer->i_peer_file);
+	if (file && file->f_mapping != peer->i_mapping) {
+		rcu_read_unlock();
+		err = -EMLINK;
+		goto out_err;
+	}
+	if (file && atomic_long_inc_not_zero(&file->f_count)) {
+		rcu_read_unlock();
+		path_put(path);
+		goto install;
+	}
+	rcu_read_unlock();
+
+	cur_ub = set_exec_ub(&ub0);
+	cur_cred = override_creds(cred);
+	file = dentry_open(path->dentry, path->mnt,
+			   O_RDONLY|O_LARGEFILE, cred);
+	revert_creds(cur_cred);
+	set_exec_ub(cur_ub);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	spin_lock(&peer->i_lock);
+	if (atomic_read(&peer->i_writecount) > 0) {
+		spin_unlock(&peer->i_lock);
+		fput(file);
+		return -ETXTBSY;
+	}
+	if (peer->i_size != inode->i_size) {
+		spin_unlock(&peer->i_lock);
+		fput(file);
+		return -EINVAL;
+	}
+	if (peer->i_peer_file && file_count(peer->i_peer_file)) {
+		spin_unlock(&peer->i_lock);
+		*path = file->f_path;
+		path_get(path);
+		fput(file);
+		goto restart;
+	}
+	atomic_dec(&peer->i_writecount);
+	rcu_assign_pointer(peer->i_peer_file, file);
+	spin_unlock(&peer->i_lock);
+
+install:
+	spin_lock(&inode->i_lock);
+	if (inode->i_peer_file)
+		goto undo;
+	rcu_assign_pointer(inode->i_peer_file, file);
+	spin_unlock(&inode->i_lock);
+
+	mapping = file->f_mapping;
+	spin_lock(&mapping->i_mmap_lock);
+	list_add(&inode->i_mapping->i_peer_list, &mapping->i_peer_list);
+	spin_unlock(&mapping->i_mmap_lock);
+
+	/* update peer atime */
+	file_accessed(file);
+
+	/* prune unused pages */
+	invalidate_mapping_pages(inode->i_mapping, 0, -1);
+
+	return 0;
+
+undo:
+	spin_unlock(&inode->i_lock);
+	peer_fput(file);
+
+	return -EBUSY;
+
+out_err:
+	path_put(path);
+	return err;
+}
+EXPORT_SYMBOL(open_inode_peer);
+
+static void __peer_fput(struct work_struct *work)
+{
+	struct file *file = container_of(work, struct file, f_work);
+
+	/* update peer atime */
+	file_accessed(file);
+
+	__fput(file);
+}
+
+void peer_fput(struct file *file)
+{
+	if (atomic_long_dec_and_test(&file->f_count)) {
+		struct inode *peer = file->f_mapping->host;
+
+		spin_lock(&peer->i_lock);
+		if (peer->i_peer_file == file)
+			rcu_assign_pointer(peer->i_peer_file, NULL);
+		atomic_inc(&peer->i_writecount);
+		spin_unlock(&peer->i_lock);
+
+		/*
+		 * We cannot fput the file if we are in the middle of
+		 * an fs transaction, so schedule the fput to keventd.
+		 */
+		if (current->journal_info) {
+			INIT_WORK(&file->f_work, __peer_fput);
+			schedule_work(&file->f_work);
+		} else
+			__peer_fput(&file->f_work);
+	}
+}
+
+/*
+ * Caller must hold inode->i_mutex or the inode must be unreachable.
+ */
+void close_inode_peer(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct file *file = inode->i_peer_file;
+
+	if (!file)
+		return;
+
+	spin_lock(&inode->i_lock);
+	BUG_ON(inode->i_peer_file != file);
+	BUG_ON(file->f_mapping == mapping);
+	rcu_assign_pointer(inode->i_peer_file, NULL);
+	spin_unlock(&inode->i_lock);
+
+	if (mapping_mapped(mapping)) {
+		struct zap_details details = {
+			.check_mapping = file->f_mapping,
+			.first_index = 0,
+			.last_index = ULONG_MAX,
+		};
+
+		synchronize_mapping_faults(mapping);
+		zap_mapping_range(mapping, &details);
+	}
+
+	mapping = file->f_mapping;
+	spin_lock(&mapping->i_mmap_lock);
+	list_del_init(&inode->i_mapping->i_peer_list);
+	spin_unlock(&mapping->i_mmap_lock);
+
+	peer_fput(file);
+}
+EXPORT_SYMBOL(close_inode_peer);
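open_inode_peer() above relies on the standard RCU lookup-then-refcount idiom: the peer file pointer is published with rcu_assign_pointer(), readers dereference it under rcu_read_lock() and take a reference only via atomic_long_inc_not_zero(), so a file whose count already hit zero is never resurrected. A distilled kernel-style sketch of just that step; peer_file here stands in for inode->i_peer_file:

#include <linux/fs.h>
#include <linux/rcupdate.h>

/* Grab a reference to a file published via rcu_assign_pointer(), or
 * return NULL if it is gone or its last reference is being dropped. */
static struct file *get_peer_file(struct file **slot)
{
	struct file *file;

	rcu_read_lock();
	file = rcu_dereference(*slot);
	if (file && !atomic_long_inc_not_zero(&file->f_count))
		file = NULL;	/* f_count already hit zero: treat as gone */
	rcu_read_unlock();

	return file;		/* caller does fput() when finished */
}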
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/partitions/check.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/partitions/check.c
--- linux-2.6.32-504.3.3.el6.orig/fs/partitions/check.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/partitions/check.c	2015-01-21 12:02:57.854837977 +0300
@@ -20,6 +20,7 @@
 #include <linux/ctype.h>
 #include <linux/genhd.h>
 #include <linux/blktrace_api.h>
+#include <linux/sysfs.h>
 
 #include "check.h"
 
@@ -132,6 +133,7 @@ char *disk_name(struct gendisk *hd, int 
 
 	return buf;
 }
+EXPORT_SYMBOL(disk_name);
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
@@ -216,7 +218,7 @@ ssize_t part_size_show(struct device *de
 		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
-	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
+	return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
 }
 
 ssize_t part_alignment_offset_show(struct device *dev,
@@ -374,6 +376,43 @@ void delete_partition(struct gendisk *di
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
+#if BITS_PER_LONG == 32 && defined(CONFIG_LBDAF)
+void part_nr_sects_write(struct hd_struct *part, sector_t val)
+{
+	write_seqcount_begin(&part->seq);
+	part->nr_sects = val;
+	write_seqcount_end(&part->seq);
+}
+
+/*
+ * Any access of part->nr_sects which is not protected by partition
+ * bd_mutex or gendisk bdev bd_mutex should be done using this accessor
+ * function.
+ */
+sector_t part_nr_sects_read(struct hd_struct *part)
+{
+	sector_t nr_sects;
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&part->seq);
+		nr_sects = part->nr_sects;
+	} while (read_seqcount_retry(&part->seq, seq));
+
+	return nr_sects;
+}
+#else
+void part_nr_sects_write(struct hd_struct *part, sector_t val)
+{
+	part->nr_sects = val;
+}
+
+sector_t part_nr_sects_read(struct hd_struct *part)
+{
+	return part->nr_sects;
+}
+#endif
+
 static ssize_t whole_disk_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
 {
@@ -416,6 +455,7 @@ struct hd_struct *add_partition(struct g
 		queue_limit_alignment_offset(&disk->queue->limits, start);
 	p->discard_alignment =
 		queue_limit_discard_alignment(&disk->queue->limits, start);
+	seqcount_init(&p->seq);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
@@ -496,14 +536,16 @@ void register_disk(struct gendisk *disk)
 
 	if (device_add(ddev))
 		return;
-#ifndef CONFIG_SYSFS_DEPRECATED
-	err = sysfs_create_link(block_depr, &ddev->kobj,
-				kobject_name(&ddev->kobj));
-	if (err) {
-		device_del(ddev);
-		return;
+
+	if (!sysfs_deprecated) {
+		err = sysfs_create_link(block_depr, &ddev->kobj,
+					kobject_name(&ddev->kobj));
+		if (err) {
+			device_del(ddev);
+			return;
+		}
 	}
-#endif
+
 	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
 	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
@@ -684,9 +726,8 @@ void del_gendisk(struct gendisk *disk)
 	kobject_put(disk->part0.holder_dir);
 	kobject_put(disk->slave_dir);
 	disk->driverfs_dev = NULL;
-#ifndef CONFIG_SYSFS_DEPRECATED
-	sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
-#endif
+	if (!sysfs_deprecated)
+		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 	device_del(disk_to_dev(disk));
 	blk_free_devt(disk_to_dev(disk)->devt);
 }
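The partitions/check.c hunk closes a 32-bit torn-read hole: with CONFIG_LBDAF, nr_sects is 64 bits wide and cannot be loaded in one instruction, so writers wrap the store in a seqcount and readers retry while a write is in flight. A distilled kernel-style sketch of the same pattern; on 64-bit builds the plain accessors above suffice:

#include <linux/seqlock.h>
#include <linux/types.h>

struct sized_thing {
	seqcount_t seq;		/* seqcount_init() at setup time */
	sector_t nr_sects;	/* 64-bit value, two 32-bit stores here */
};

/* Writers are serialised by the caller's mutex, as in the hunk above. */
static void thing_set_size(struct sized_thing *t, sector_t val)
{
	write_seqcount_begin(&t->seq);
	t->nr_sects = val;
	write_seqcount_end(&t->seq);
}

static sector_t thing_get_size(struct sized_thing *t)
{
	sector_t val;
	unsigned seq;

	do {			/* retry if a writer raced with us */
		seq = read_seqcount_begin(&t->seq);
		val = t->nr_sects;
	} while (read_seqcount_retry(&t->seq, seq));

	return val;
}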
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/pipe.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/pipe.c
--- linux-2.6.32-504.3.3.el6.orig/fs/pipe.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/pipe.c	2015-01-21 12:02:57.965835033 +0300
@@ -22,6 +22,8 @@
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
+#include <bc/kmem.h>
+
 /*
  * We use a start+len construction, which provides full use of the 
  * allocated memory.
@@ -532,7 +534,7 @@ redo1:
 			int error, atomic = 1;
 
 			if (!page) {
-				page = alloc_page(GFP_HIGHUSER);
+				page = alloc_page(GFP_HIGHUSER | __GFP_UBC);
 				if (unlikely(!page)) {
 					ret = ret ? : -ENOMEM;
 					break;
@@ -691,7 +693,7 @@ pipe_poll(struct file *filp, poll_table 
 	return mask;
 }
 
-static int
+int
 pipe_release(struct inode *inode, int decr, int decw)
 {
 	struct pipe_inode_info *pipe;
@@ -712,6 +714,7 @@ pipe_release(struct inode *inode, int de
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(pipe_release);
 
 static int
 pipe_read_fasync(int fd, struct file *filp, int on)
@@ -883,7 +886,7 @@ struct pipe_inode_info * alloc_pipe_info
 {
 	struct pipe_inode_info *pipe;
 
-	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_UBC);
 	if (pipe) {
 		init_waitqueue_head(&pipe->wait);
 		pipe->r_counter = pipe->w_counter = 1;
@@ -892,6 +895,7 @@ struct pipe_inode_info * alloc_pipe_info
 
 	return pipe;
 }
+EXPORT_SYMBOL_GPL(alloc_pipe_info);
 
 void __free_pipe_info(struct pipe_inode_info *pipe)
 {
@@ -913,6 +917,30 @@ void free_pipe_info(struct inode *inode)
 	inode->i_pipe = NULL;
 }
 
+static void __swap_pipe_info(struct inode *to, struct inode *from)
+{
+	BUG_ON(!from->i_pipe);
+	BUG_ON(!to->i_pipe);
+	swap(to->i_pipe, from->i_pipe);
+	swap(to->i_pipe->inode, from->i_pipe->inode);
+	swap(to->i_pipe->readers, from->i_pipe->readers);
+	swap(to->i_pipe->writers, from->i_pipe->writers);
+	swap(to->i_pipe->r_counter, from->i_pipe->r_counter);
+	swap(to->i_pipe->w_counter, from->i_pipe->w_counter);
+}
+
+void swap_pipe_info(struct inode *to, struct inode *from)
+{
+	BUG_ON(!S_ISFIFO(to->i_mode));
+	BUG_ON(!S_ISFIFO(from->i_mode));
+	mutex_lock(&from->i_mutex);
+	mutex_lock(&to->i_mutex);
+	__swap_pipe_info(to, from);
+	mutex_unlock(&to->i_mutex);
+	mutex_unlock(&from->i_mutex);
+}
+EXPORT_SYMBOL_GPL(swap_pipe_info);
+
 static struct vfsmount *pipe_mnt __read_mostly;
 static int pipefs_delete_dentry(struct dentry *dentry)
 {
@@ -1093,6 +1121,7 @@ int do_pipe_flags(int *fd, int flags)
 	free_write_pipe(fw);
 	return error;
 }
+EXPORT_SYMBOL(do_pipe_flags);
 
 /*
  * sys_pipe() is the normal C calling standard for creating
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/pramcache.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/pramcache.c
--- linux-2.6.32-504.3.3.el6.orig/fs/pramcache.c	2015-01-21 12:02:52.678975369 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/pramcache.c	2015-01-21 12:02:53.043965680 +0300
@@ -0,0 +1,874 @@
+#include <linux/buffer_head.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/mmgang.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/pram.h>
+#include <linux/pramcache.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/sysctl.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/writeback.h> /* for inode_lock, oddly enough.. */
+
+static int pramcache_feature_nosync;
+
+#define PRAMCACHE_PAGE_CACHE	"page_cache"
+#define PRAMCACHE_BDEV_CACHE	"bdev_cache"
+
+#define PRAMCACHE_MAGIC		0x70667363
+#define PRAMCACHE_VERSION	3
+
+#define PRAMCACHE_FHANDLE_MAX	256
+
+struct pramcache_header {
+	__u32 magic;
+	__u32 version;
+	__u32 mnt_count;
+};
+
+struct page_state {
+	__u64 index;
+
+	__u32 flags;
+#define PAGE_STATE_UPTODATE	0x01
+#define PAGE_STATE_DIRTY	0x02
+
+	__u32 buffers_uptodate;
+#define MAX_PAGE_BUFFERS	32
+};
+
+static int pramcache_enabled;	/* if set, page & bdev caches
+				   will be saved to pram on umount */
+
+int pramcache_ploop_nosync = 1;
+
+/*
+ * pram_write() and pram_push_page() cannot fail once pram_prealloc()
+ * has succeeded. These macros eliminate the then-redundant retval checks.
+ */
+#define pramcache_write(s, b, c) do {				\
+	if (unlikely(pram_write((s), (b), (c)) != (c)))		\
+		BUG();						\
+} while (0)
+#define pramcache_push_page(s, p) do {				\
+	if (unlikely(pram_push_page((s), (p), NULL) != 0))	\
+		BUG();						\
+} while (0)
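
A minimal usage sketch of the prealloc contract these wrappers rely on (save_record() is illustrative only; save_header() further down follows the same shape):

	static int save_record(struct pram_stream *stream,
			       const void *rec, size_t size)
	{
		/* Reserve pram space first; after that the write cannot fail. */
		if (pram_prealloc(GFP_NOWAIT | __GFP_HIGHMEM, size) != 0)
			return -ENOMEM;	/* not fatal: the record is simply skipped */
		pramcache_write(stream, rec, size);
		pram_prealloc_end();
		return 0;
	}
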
+
+static void pramcache_msg(struct super_block *sb, const char *prefix,
+			  const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	printk("%sPRAMCACHE (%s): ", prefix, sb->s_id);
+	vprintk(fmt, ap);
+	printk("\n");
+	va_end(ap);
+}
+
+static char *pramcache_pram_basename(struct super_block *sb,
+				     char *buf, size_t size)
+{
+	snprintf(buf, size, "pramcache.%pU.", sb->s_uuid);
+	return buf;
+}
+
+/*
+ * Meta and data streams must be opened and closed atomically, otherwise we can
+ * get a data storage without corresponding meta storage, which will lead to
+ * open_streams() failures.
+ */
+static DEFINE_MUTEX(streams_mutex);
+
+static int open_streams(struct super_block *sb, const char *name, int mode,
+			struct pram_stream *meta_stream,
+			struct pram_stream *data_stream)
+{
+	char *buf;
+	size_t basename_len;
+	int err = -ENOMEM;
+
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
+		goto out;
+
+	pramcache_pram_basename(sb, buf, PAGE_SIZE);
+	strlcat(buf, name, PAGE_SIZE);
+	basename_len = strlen(buf);
+
+	mutex_lock(&streams_mutex);
+
+	/*
+	 * Since loss of several pages is not critical when saving
+	 * page cache, we will be using GFP_NOWAIT & pram_prealloc()
+	 */
+
+	strlcat(buf, ".meta", PAGE_SIZE);
+	err = __pram_open(buf, mode, GFP_NOWAIT | __GFP_HIGHMEM, meta_stream);
+	if (err)
+		goto out_unlock;
+
+	buf[basename_len] = '\0';
+	strlcat(buf, ".data", PAGE_SIZE);
+	err = __pram_open(buf, mode, GFP_NOWAIT | __GFP_HIGHMEM, data_stream);
+	if (err)
+		goto out_close_meta;
+
+	mutex_unlock(&streams_mutex);
+
+	free_page((unsigned long)buf);
+	return 0;
+
+out_close_meta:
+	pram_close(meta_stream, -1);
+out_unlock:
+	mutex_unlock(&streams_mutex);
+	free_page((unsigned long)buf);
+out:
+	return err;
+}
+
+static void close_streams(struct pram_stream *meta_stream,
+			  struct pram_stream *data_stream, int err)
+{
+	mutex_lock(&streams_mutex);
+	pram_close(meta_stream, err);
+	pram_close(data_stream, err);
+	mutex_unlock(&streams_mutex);
+}
+
+/* returns non-zero if page should be saved */
+static int get_page_state(struct page *page, struct page_state *state)
+{
+	struct buffer_head *head, *bh;
+	int i;
+
+	if (PageWriteback(page))
+		return 0;
+
+	state->index = page->index;
+	state->flags = 0;
+	if (PageDirty(page))
+		state->flags |= PAGE_STATE_DIRTY;
+	if (PageUptodate(page)) {
+		state->flags |= PAGE_STATE_UPTODATE;
+		state->buffers_uptodate = ~0;
+		return 1;
+	}
+
+	if (!page_has_buffers(page))
+		return 0;
+
+	i = 0;
+	state->buffers_uptodate = 0;
+	head = bh = page_buffers(page);
+	do {
+		if (WARN_ON_ONCE(i >= MAX_PAGE_BUFFERS))
+			return 0;
+		if (buffer_uptodate(bh))
+			state->buffers_uptodate |= 1 << i;
+		bh = bh->b_this_page;
+		i++;
+	} while (bh != head);
+
+	return !!state->buffers_uptodate;
+}
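
The per-buffer state is a plain bitmap: bit i of buffers_uptodate mirrors the i-th buffer_head on the page, and an all-ones mask denotes a fully uptodate page. A worked example, assuming a 4K page over a 1K-block filesystem (four buffers, numbers illustrative):

	/* Page at index 42 with buffers 0 and 2 uptodate, 1 and 3 not: */
	struct page_state st = {
		.index            = 42,
		.flags            = 0,	/* not dirty, not fully uptodate */
		.buffers_uptodate = (1 << 0) | (1 << 2),	/* 0x5 */
	};
	/* A zero mask is never saved, which is why load_page() below can
	 * reuse it as the 'end of mapping' sentinel. */
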
+
+static void make_page_uptodate(struct page *page, struct page_state *state)
+{
+	struct buffer_head *head, *bh;
+	int i;
+
+	WARN_ON_ONCE(PageDirty(page));
+	WARN_ON_ONCE(page_has_private(page));
+
+	if (state->flags & PAGE_STATE_UPTODATE) {
+		SetPageUptodate(page);
+		return;
+	}
+
+	ClearPageUptodate(page);
+	create_empty_buffers(page,
+		page->mapping->host->i_sb->s_blocksize, 0);
+
+	i = 0;
+	bh = head = page_buffers(page);
+	do {
+		if (WARN_ON_ONCE(i >= MAX_PAGE_BUFFERS))
+			break;
+		if (state->buffers_uptodate & (1 << i))
+			set_buffer_uptodate(bh);
+		bh = bh->b_this_page;
+		i++;
+	} while (bh != head);
+}
+
+/* returns non-zero if page was saved */
+static int save_page(struct page *page,
+		     struct page_state *state,
+		     struct pram_stream *meta_stream,
+		     struct pram_stream *data_stream)
+{
+	/* if prealloc fails, silently skip the page */
+	if (pram_prealloc2(GFP_NOWAIT | __GFP_HIGHMEM,
+			   sizeof(*state), PAGE_SIZE) == 0) {
+		pramcache_write(meta_stream, state, sizeof(*state));
+		pramcache_push_page(data_stream, page);
+		pram_prealloc_end();
+		return 1;
+	}
+	return 0;
+}
+
+static struct page *load_page(struct page_state *state,
+			      struct pram_stream *meta_stream,
+			      struct pram_stream *data_stream)
+{
+	struct page *page;
+	ssize_t ret;
+
+	ret = pram_read(meta_stream, state, sizeof(*state));
+	if (!ret)
+		return NULL;
+	if (ret != sizeof(*state))
+		return ERR_PTR(-EIO);
+
+	/* since we do not save outdated pages, an empty uptodate mask
+	 * can be used as the 'end of mapping' mark */
+	if (!state->buffers_uptodate)
+		return NULL;
+
+	page = pram_pop_page(data_stream);
+	if (IS_ERR_OR_NULL(page))
+		return ERR_PTR(-EIO);
+
+	return page;
+}
+
+static int write_page(struct address_space *mapping, loff_t filesize,
+		      struct page *page, pgoff_t index)
+{
+	loff_t pos = (loff_t)index << PAGE_SHIFT; /* avoid 32-bit pgoff_t overflow */
+	unsigned len = PAGE_SIZE;
+	struct page *page2;
+	void *fsdata;
+	int status;
+
+	WARN_ON_ONCE(pos >= filesize);
+	if (pos + len > filesize)
+		len = filesize - pos;
+
+	status = pagecache_write_begin(NULL, mapping,
+				       pos, len, 0, &page2, &fsdata);
+	if (status)
+		return status;
+
+	if (unlikely(page2 != page))
+		copy_highpage(page2, page);
+
+	status = pagecache_write_end(NULL, mapping,
+				     pos, len, len, page2, fsdata);
+	if (unlikely(status < 0))
+		return status;
+
+	return 0;
+}
+
+static int insert_page(struct address_space *mapping, loff_t filesize,
+		       struct page *page, struct page_state *state)
+{
+	int err;
+
+	if (!pram_page_dirty(page)) {
+		err = add_to_page_cache_lru(page, mapping,
+					    state->index, GFP_KERNEL);
+	} else {
+		/* page already accounted and in lru */
+		__set_page_locked(page);
+		err = add_to_page_cache_nogang(page, mapping,
+					       state->index, GFP_KERNEL);
+		if (err)
+			__clear_page_locked(page);
+	}
+	if (!err) {
+		make_page_uptodate(page, state);
+		unlock_page(page);
+	} else if (err != -EEXIST)
+		goto out;
+
+	err = 0;
+	if (state->flags & PAGE_STATE_DIRTY)
+		err = write_page(mapping, filesize, page, state->index);
+out:
+	put_page(page);
+	return err;
+}
+
+static void evict_page(struct page *page)
+{
+	if (page_has_private(page)) {
+		do_invalidatepage(page, 0);
+		if (page_has_private(page))
+			return;
+	}
+	cancel_dirty_page(page, PAGE_CACHE_SIZE);
+	remove_from_page_cache(page);
+	page_cache_release(page);
+}
+
+static void save_invalidate_page(struct page *page, int nosync,
+				 struct pram_stream *meta_stream,
+				 struct pram_stream *data_stream)
+{
+	int evict = 1;
+	struct page_state state;
+
+	if (!get_page_state(page, &state))
+		goto invalidate;
+
+	if (state.flags & PAGE_STATE_DIRTY) {
+		/* for the sake of simplicity, if nosync,
+		 * evict only those dirty pages that are
+		 * fully uptodate */
+		if (!nosync || !(state.flags & PAGE_STATE_UPTODATE)) {
+			/* treat the page as clean because
+			 * it will be synced soon */
+			state.flags &= ~PAGE_STATE_DIRTY;
+			evict = 0;
+		}
+	}
+
+	if (!save_page(page, &state, meta_stream, data_stream))
+		goto invalidate;
+
+	if (evict)
+		evict_page(page);
+	return;
+
+invalidate:
+	invalidate_inode_page(page);
+}
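
Spelled out as a table, the branches above reduce to (derived from the code, shown here for reference):

	/*  dirty  fully-uptodate  nosync |  saved as  evicted
	 *  -----  --------------  ------ |  --------  -------
	 *    no         *           *    |  clean     yes
	 *    yes       yes          yes  |  dirty     yes  (writeback skipped)
	 *    yes       yes          no   |  clean     no   (writeback will run)
	 *    yes       no           *    |  clean     no
	 *  (and if save_page() fails, the page is simply invalidated)
	 */
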
+
+static void save_invalidate_mapping_pages(struct address_space *mapping,
+					  int nosync,
+					  struct pram_stream *meta_stream,
+					  struct pram_stream *data_stream)
+{
+	struct pagevec pvec;
+	pgoff_t next = 0;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t index;
+
+			lock_page(page);
+			if (unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			index = page->index;
+			if (index > next)
+				next = index;
+			next++;
+
+			save_invalidate_page(page, nosync,
+					     meta_stream, data_stream);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+static long load_mapping_pages(struct address_space *mapping,
+			       loff_t filesize,
+			       struct pram_stream *meta_stream,
+			       struct pram_stream *data_stream)
+{
+	struct page_state state;
+	struct page *page;
+	long loaded = 0;
+	int err;
+
+next:
+	page = load_page(&state, meta_stream, data_stream);
+	if (!page)
+		return loaded;
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	err = insert_page(mapping, filesize, page, &state);
+	if (err)
+		return err;
+
+	loaded++;
+	goto next;
+}
+
+static void save_invalidate_inode(struct inode *inode,
+				  int *first, int nosync,
+				  void *buf, size_t bufsize,
+				  struct pram_stream *meta_stream,
+				  struct pram_stream *data_stream)
+{
+	const struct page_state eof = { 0, };
+	struct dentry *dentry;
+	__u64 filesize;
+	int len;	/* signed: vfs_inode_fhandle() may return an error */
+
+	if (hlist_unhashed(&inode->i_hash))
+		goto invalidate;
+
+	len = vfs_inode_fhandle(inode, buf, bufsize);
+	if (len < 0)
+		goto invalidate;
+
+	dentry = vfs_fhandle_to_dentry(inode->i_sb, buf);
+	if (IS_ERR(dentry))
+		goto invalidate;
+	dput(dentry);
+
+	if (pram_prealloc(GFP_NOWAIT | __GFP_HIGHMEM,
+			  sizeof(eof) + sizeof(filesize) + len) != 0)
+		goto invalidate;
+
+	/* if we have already saved inodes, write the 'end of mapping'
+	 * mark (see load_page()) */
+	if (!*first)
+		pramcache_write(meta_stream, &eof, sizeof(eof));
+
+	pramcache_write(meta_stream, buf, len);
+
+	/* since filesystems usually write the file size to disk on page
+	 * writeback, and we may skip writeback by evicting dirty pages,
+	 * save the file size to pram */
+	filesize = i_size_read(inode);
+	pramcache_write(meta_stream, &filesize, sizeof(filesize));
+
+	pram_prealloc_end();
+
+	save_invalidate_mapping_pages(&inode->i_data, nosync,
+				      meta_stream, data_stream);
+	*first = 0;
+	return;
+
+invalidate:
+	invalidate_mapping_pages(&inode->i_data, 0, ~0UL);
+}
+
+static long load_inode(struct super_block *sb,
+		       void *buf, size_t bufsize,
+		       struct pram_stream *meta_stream,
+		       struct pram_stream *data_stream)
+{
+	struct file_handle *handle;
+	struct dentry *dentry;
+	__u64 filesize;
+	ssize_t ret;
+	int err;
+
+	if (bufsize < sizeof(*handle))
+		return -ENOBUFS;
+
+	handle = buf;
+	ret = pram_read(meta_stream, handle, sizeof(*handle));
+	if (!ret)
+		return -ENODATA;
+
+	err = -EIO;
+	if (ret != sizeof(*handle))
+		goto out;
+	if (handle->handle_bytes > bufsize - sizeof(*handle))
+		goto out;
+
+	if (pram_read(meta_stream, (char *)buf + sizeof(*handle),
+		      handle->handle_bytes) != handle->handle_bytes)
+		goto out;
+
+	dentry = vfs_fhandle_to_dentry(sb, handle);
+	err = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto out;
+
+	if (pram_read(meta_stream, &filesize,
+		      sizeof(filesize)) != sizeof(filesize))
+		goto out_dput;
+
+	err = load_mapping_pages(&dentry->d_inode->i_data, filesize,
+				 meta_stream, data_stream);
+out_dput:
+	dput_nocache(dentry, 1);
+out:
+	return err;
+}
+
+static int save_header(struct super_block *sb,
+		       struct pram_stream *stream)
+{
+	struct pramcache_header hdr;
+	int err;
+
+	hdr.magic = PRAMCACHE_MAGIC;
+	hdr.version = PRAMCACHE_VERSION;
+	hdr.mnt_count = sb->s_mnt_count;
+
+	err = pram_prealloc(GFP_KERNEL | __GFP_HIGHMEM, sizeof(hdr));
+	if (!err) {
+		pramcache_write(stream, &hdr, sizeof(hdr));
+		pram_prealloc_end();
+	}
+	return err;
+}
+
+static int check_header(struct super_block *sb,
+			struct pram_stream *stream)
+{
+	struct pramcache_header hdr;
+
+	if (pram_read(stream, &hdr, sizeof(hdr)) != sizeof(hdr))
+		return -EIO;
+
+	if (hdr.magic != PRAMCACHE_MAGIC) {
+		pramcache_msg(sb, KERN_ERR, "wrong magic");
+		return -EINVAL;
+	}
+
+	if (hdr.version != PRAMCACHE_VERSION) {
+		pramcache_msg(sb, KERN_ERR, "bad version (%d)",
+			      (int)hdr.version);
+		return -EINVAL;
+	}
+
+	if (!(sb->s_flags & MS_RDONLY))
+		hdr.mnt_count++;
+
+	if (sb->s_mnt_count != hdr.mnt_count) {
+		pramcache_msg(sb, KERN_ERR,
+			      "mnt count should be %d, but was %d",
+			      (int)hdr.mnt_count, sb->s_mnt_count);
+		return -EINVAL;
+	}
+
+	return 0;
+}
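
To illustrate the mount-count check: the cache is written at umount, and mounting the filesystem read-write bumps s_mnt_count once more, hence the increment above. With illustrative numbers:

	/* saved at umount:         hdr.mnt_count == 7
	 * next mount, read-write:  sb->s_mnt_count == 8, hdr.mnt_count is
	 *                          bumped to 8 above  -> match, cache used
	 * mounted twice meanwhile: sb->s_mnt_count == 9 -> mismatch, rejected
	 */
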
+
+static void pramcache_prune(struct super_block *sb, const char *name)
+{
+	struct pram_stream meta_stream, data_stream;
+	int err;
+
+retry:
+	/* first, destroy the cache */
+	err = open_streams(sb, name, PRAM_READ, &meta_stream, &data_stream);
+	if (!err)
+		close_streams(&meta_stream, &data_stream, 0);
+	if (err == -ENOENT)
+		err = 0;
+	if (err)
+		goto out;
+
+	/* then, create an empty one */
+	err = open_streams(sb, name, PRAM_WRITE, &meta_stream, &data_stream);
+	if (!err)
+		close_streams(&meta_stream, &data_stream, 0);
+out:
+	if (err == -EBUSY || err == -EEXIST) {
+		/* someone is writing to the cache, let them finish */
+		schedule_timeout_uninterruptible(1);
+		goto retry;
+	}
+	if (err) {
+		pramcache_msg(sb, KERN_ERR,
+			      "prune failed (%d), "
+			      "data corruption possible!", err);
+	}
+}
+
+static void save_invalidate_page_cache(struct super_block *sb, int nosync)
+{
+	struct pram_stream meta_stream, data_stream;
+	struct inode *inode, *old_inode = NULL;
+	int first = 1;
+	void *buf;
+	int err;
+
+	err = open_streams(sb, PRAMCACHE_PAGE_CACHE, PRAM_WRITE,
+			   &meta_stream, &data_stream);
+	if (err)
+		goto out;
+
+	err = save_header(sb, &meta_stream);
+	if (err)
+		goto out_close_streams;
+
+	err = -ENOMEM;
+	buf = kmalloc(PRAMCACHE_FHANDLE_MAX, GFP_KERNEL);
+	if (!buf)
+		goto out_close_streams;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		if (!inode->i_nlink)
+			continue;
+		if (!inode->i_data.nrpages)
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_lock);
+
+		/* We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the inode_lock.
+		 * We cannot iput the inode now as we can be holding the last
+		 * reference and we cannot iput it under inode_lock. So we
+		 * keep the reference and iput it later. */
+		iput(old_inode);
+		old_inode = inode;
+
+		save_invalidate_inode(inode, &first, nosync,
+				      buf, PRAMCACHE_FHANDLE_MAX,
+				      &meta_stream, &data_stream);
+
+		spin_lock(&inode_lock);
+	}
+	spin_unlock(&inode_lock);
+	iput(old_inode);
+	err = 0;
+
+	kfree(buf);
+out_close_streams:
+	close_streams(&meta_stream, &data_stream, err);
+out:
+	if (err)
+		pramcache_msg(sb, KERN_ERR,
+			      "Failed to save page cache: %d", err);
+	if (err == -EEXIST) {
+		pramcache_msg(sb, KERN_ERR,
+			      "Filesystem UUID collision detected, "
+			      "run `tune2fs -U' to update UUID");
+		pramcache_prune(sb, PRAMCACHE_PAGE_CACHE);
+	}
+}
+
+void pramcache_load_page_cache(struct super_block *sb)
+{
+	struct pram_stream meta_stream, data_stream;
+	long ret, loaded = 0;
+	void *buf;
+	int err;
+
+	BUG_ON(!sb->s_bdev);
+
+	if (sb->s_flags & MS_RDONLY)
+		/* will load on remount rw, since dirty pages
+		 * can't be populated right now */
+		return;
+
+	err = open_streams(sb, PRAMCACHE_PAGE_CACHE, PRAM_READ,
+			   &meta_stream, &data_stream);
+	if (err)
+		goto out;
+
+	err = check_header(sb, &meta_stream);
+	if (err)
+		goto out_close_streams;
+
+	err = -ENOMEM;
+	buf = kmalloc(PRAMCACHE_FHANDLE_MAX, GFP_KERNEL);
+	if (!buf)
+		goto out_close_streams;
+
+next:
+	ret = load_inode(sb, buf, PRAMCACHE_FHANDLE_MAX,
+			 &meta_stream, &data_stream);
+	if (ret < 0) {
+		err = ret;
+		if (err == -ENODATA)
+			err = 0;
+		goto out_free_buf;
+	}
+	loaded += ret;
+	goto next;
+
+out_free_buf:
+	kfree(buf);
+out_close_streams:
+	close_streams(&meta_stream, &data_stream, 0);
+out:
+	if (!err)
+		pramcache_msg(sb, KERN_INFO,
+			      "loaded page cache (%ld pages)", loaded);
+	else if (err != -ENOENT)
+		pramcache_msg(sb, KERN_ERR,
+			      "Failed to load page cache: %d", err);
+}
+EXPORT_SYMBOL(pramcache_load_page_cache);
+
+static void save_invalidate_bdev_cache(struct super_block *sb)
+{
+	struct pram_stream meta_stream, data_stream;
+	int err;
+
+	err = open_streams(sb, PRAMCACHE_BDEV_CACHE, PRAM_WRITE,
+			   &meta_stream, &data_stream);
+	if (err)
+		goto out;
+
+	err = save_header(sb, &meta_stream);
+	if (err)
+		goto out_close_streams;
+
+	save_invalidate_mapping_pages(sb->s_bdev->bd_inode->i_mapping, 0,
+				      &meta_stream, &data_stream);
+out_close_streams:
+	close_streams(&meta_stream, &data_stream, err);
+out:
+	if (err)
+		pramcache_msg(sb, KERN_ERR,
+			      "Failed to save bdev cache: %d", err);
+	if (err == -EEXIST) {
+		pramcache_msg(sb, KERN_ERR,
+			      "Filesystem UUID collision detected, "
+			      "run `tune2fs -U' to update UUID");
+		pramcache_prune(sb, PRAMCACHE_BDEV_CACHE);
+	}
+}
+
+void pramcache_load_bdev_cache(struct super_block *sb)
+{
+	struct pram_stream meta_stream, data_stream;
+	long loaded = 0;
+	int err;
+
+	BUG_ON(!sb->s_bdev);
+
+	err = open_streams(sb, PRAMCACHE_BDEV_CACHE, PRAM_READ,
+			   &meta_stream, &data_stream);
+	if (err)
+		goto out;
+
+	err = check_header(sb, &meta_stream);
+	if (err)
+		goto out_close_streams;
+
+	loaded = load_mapping_pages(sb->s_bdev->bd_inode->i_mapping, 0,
+				    &meta_stream, &data_stream);
+	if (loaded < 0)
+		err = loaded;
+
+out_close_streams:
+	close_streams(&meta_stream, &data_stream, 0);
+out:
+	if (!err)
+		pramcache_msg(sb, KERN_INFO,
+			      "loaded bdev cache (%ld pages)", loaded);
+	else if (err != -ENOENT)
+		pramcache_msg(sb, KERN_ERR,
+			      "Failed to load bdev cache: %d", err);
+}
+EXPORT_SYMBOL(pramcache_load_bdev_cache);
+
+void pramcache_save_page_cache(struct super_block *sb, int nosync)
+{
+	BUG_ON(!sb->s_bdev);
+
+	if (pramcache_ploop_nosync &&
+	    !strncmp(sb->s_bdev->bd_disk->disk_name, "ploop", 5))
+		nosync = 1;
+
+	if (pramcache_feature_nosync < CONFIG_PRAMCACHE_FEATURE_NOSYNC)
+		nosync = 0;
+
+	/*
+	 * To avoid collisions with a not-yet-loaded page cache (it is loaded
+	 * on mount/remount rw - see pramcache_load_page_cache()), do not save
+	 * the page cache of a filesystem mounted read-only.
+	 */
+	if (pramcache_enabled && !(sb->s_flags & MS_RDONLY))
+		save_invalidate_page_cache(sb, nosync);
+}
+EXPORT_SYMBOL(pramcache_save_page_cache);
+
+void pramcache_save_bdev_cache(struct super_block *sb)
+{
+	BUG_ON(!sb->s_bdev);
+
+	if (pramcache_enabled)
+		save_invalidate_bdev_cache(sb);
+}
+EXPORT_SYMBOL(pramcache_save_bdev_cache);
+
+static ssize_t pramcache_show(struct kobject *kobj,
+			      struct kobj_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "%d\n", pramcache_enabled);
+}
+
+static ssize_t pramcache_store(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 10, &val) != 0)
+		return -EINVAL;
+	val = !!val;
+	if (pramcache_enabled != val) {
+		pramcache_enabled = val;
+		printk(KERN_INFO "PRAMCACHE: %s\n",
+		       pramcache_enabled ? "enabled" : "disabled");
+	}
+	return count;
+}
+
+static struct kobj_attribute pramcache_attr =
+	__ATTR(pramcache, 0644, pramcache_show, pramcache_store);
+
+static struct attribute *pramcache_attrs[] = {
+	&pramcache_attr.attr,
+	NULL,
+};
+
+static struct attribute_group pramcache_attr_group = {
+	.attrs = pramcache_attrs,
+};
+
+#ifdef CONFIG_SYSCTL
+ctl_table pramcache_table[] = {
+	{
+		.procname	= "nosync",
+		.data		= &pramcache_feature_nosync,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
+
+static int __init pramcache_init(void)
+{
+	sysfs_update_group(kernel_kobj, &pramcache_attr_group);
+	return 0;
+}
+module_init(pramcache_init);
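
Once loaded, the feature is toggled at runtime: writing 1 to /sys/kernel/pramcache (the attribute registered above) arms saving of the page and bdev caches on umount, while the nosync entry from pramcache_table controls whether fully uptodate dirty pages may be carried over without being synced first (where exactly that table is mounted under /proc/sys depends on registration glue outside this hunk).
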
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/array.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/array.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/array.c	2014-12-12 23:29:11.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/array.c	2015-01-21 12:02:54.174935658 +0300
@@ -84,6 +84,8 @@
 #include <linux/tracehook.h>
 #include <linux/utrace.h>
 
+#include <bc/beancounter.h>
+
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include "internal.h"
@@ -155,6 +157,18 @@ static inline const char *get_task_state
 	return *p;
 }
 
+static int task_virtual_pid(struct task_struct *t)
+{
+	struct pid *pid;
+
+	pid = task_pid(t);
+	/*
+	 * this will give a wrong result for tasks
+	 * that failed to enter the VE, but that's OK
+	 */
+	return pid ? pid->numbers[pid->level].nr : 0;
+}
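
struct pid carries one upid per namespace level, numbers[pid->level] being the innermost, container-visible one. Assuming the task is attached to its VE's pid namespace, the same value could be obtained through the generic helpers, e.g.:

	static int task_virtual_pid_alt(struct task_struct *t)
	{
		struct pid *pid = task_pid(t);

		/* ns_of_pid() returns the namespace of the innermost upid,
		 * so this reads the same numbers[pid->level].nr slot. */
		return pid ? pid_nr_ns(pid, ns_of_pid(pid)) : 0;
	}
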
+
 static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *p)
 {
@@ -162,17 +176,18 @@ static inline void task_state(struct seq
 	int g;
 	struct fdtable *fdt = NULL;
 	const struct cred *cred;
-	pid_t ppid, tpid;
+	pid_t ppid, tpid, vpid;
 
 	rcu_read_lock();
-	ppid = pid_alive(p) ?
-		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+	ppid = pid_alive(p) ? ve_task_ppid_nr_ns(p, ns) : 0;
+
 	tpid = 0;
 	if (pid_alive(p)) {
 		struct task_struct *tracer = tracehook_tracer_task(p);
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
+	vpid = task_virtual_pid(p);
 	cred = get_task_cred(p);
 	seq_printf(m,
 		"State:\t%s\n"
@@ -208,6 +223,10 @@ static inline void task_state(struct seq
 	put_cred(cred);
 
 	seq_printf(m, "\n");
+
+	seq_printf(m, "envID:\t%d\nVPid:\t%d\n",
+			p->ve_task_info.owner_env->veid, vpid);
+	seq_printf(m, "StopState:\t%u\n", p->stopped_state);
 }
 
 static void render_sigset_t(struct seq_file *m, const char *header,
@@ -247,10 +266,10 @@ static void collect_sigign_sigcatch(stru
 	}
 }
 
-static inline void task_sig(struct seq_file *m, struct task_struct *p)
+void task_sig(struct seq_file *m, struct task_struct *p)
 {
 	unsigned long flags;
-	sigset_t pending, shpending, blocked, ignored, caught;
+	sigset_t pending, shpending, blocked, ignored, caught, saved;
 	int num_threads = 0;
 	unsigned long qsize = 0;
 	unsigned long qlim = 0;
@@ -260,11 +279,13 @@ static inline void task_sig(struct seq_f
 	sigemptyset(&blocked);
 	sigemptyset(&ignored);
 	sigemptyset(&caught);
+	sigemptyset(&saved);
 
 	if (lock_task_sighand(p, &flags)) {
 		pending = p->pending.signal;
 		shpending = p->signal->shared_pending.signal;
 		blocked = p->blocked;
+		saved = p->saved_sigmask;
 		collect_sigign_sigcatch(p, &ignored, &caught);
 		num_threads = atomic_read(&p->signal->count);
 		qsize = atomic_read(&__task_cred(p)->user->sigpending);
@@ -281,6 +302,7 @@ static inline void task_sig(struct seq_f
 	render_sigset_t(m, "SigBlk:\t", &blocked);
 	render_sigset_t(m, "SigIgn:\t", &ignored);
 	render_sigset_t(m, "SigCgt:\t", &caught);
+	render_sigset_t(m, "SigSvd:\t", &saved);
 }
 
 static void render_cap_t(struct seq_file *m, const char *header,
@@ -315,6 +337,20 @@ static inline void task_cap(struct seq_f
 	render_cap_t(m, "CapBnd:\t", &cap_bset);
 }
 
+#ifdef CONFIG_BEANCOUNTERS
+static inline void ub_dump_task_info(struct task_struct *tsk,
+		char *stsk, int ltsk, char *smm, int lmm)
+{
+	snprintf(stsk, ltsk, "%u", tsk->task_bc.task_ub->ub_uid);
+	task_lock(tsk);
+	if (tsk->mm)
+		snprintf(smm, lmm, "%u", tsk->mm->mm_ub->ub_uid);
+	else
+		strncpy(smm, "N/A", lmm);
+	task_unlock(tsk);
+}
+#endif
+
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
@@ -328,6 +364,9 @@ int proc_pid_status(struct seq_file *m, 
 			struct pid *pid, struct task_struct *task)
 {
 	struct mm_struct *mm = get_task_mm(task);
+#ifdef CONFIG_BEANCOUNTERS
+	char tsk_ub_info[64], mm_ub_info[64];
+#endif
 
 	task_name(m, task);
 	task_state(m, ns, pid, task);
@@ -340,6 +379,14 @@ int proc_pid_status(struct seq_file *m, 
 	task_cap(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
+#ifdef CONFIG_BEANCOUNTERS
+	ub_dump_task_info(task,
+			tsk_ub_info, sizeof(tsk_ub_info),
+			mm_ub_info, sizeof(mm_ub_info));
+
+	seq_printf(m, "TaskUB:\t%s\n", tsk_ub_info);
+	seq_printf(m, "MMUB:\t%s\n", mm_ub_info);
+#endif
 	return 0;
 }
 
@@ -363,6 +410,11 @@ static int do_task_stat(struct seq_file 
 	unsigned long rsslim = 0;
 	char tcomm[sizeof(task->comm)];
 	unsigned long flags;
+#ifdef CONFIG_BEANCOUNTERS
+	char ub_task_info[64];
+	char ub_mm_info[64];
+#endif
+	int is_super = ve_is_super(get_exec_env());
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
@@ -420,7 +472,7 @@ static int do_task_stat(struct seq_file 
 		}
 
 		sid = task_session_nr_ns(task, ns);
-		ppid = task_tgid_nr_ns(task->real_parent, ns);
+		ppid = ve_task_ppid_nr_ns(task, ns);
 		pgid = task_pgrp_nr_ns(task, ns);
 
 		unlock_task_sighand(task, &flags);
@@ -445,12 +497,38 @@ static int do_task_stat(struct seq_file 
 	start_time =
 		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
 				+ task->real_start_time.tv_nsec;
+#ifdef CONFIG_VE
+	if (!is_super) {
+		struct timespec *ve_start_ts =
+				&get_exec_env()->real_start_timespec;
+		start_time -=
+			(unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
+				+ ve_start_ts->tv_nsec;
+	}
+	/* tasks inside a CT can have negative start time e.g. if the CT was
+	 * migrated from another hw node, in which case we will report 0 in
+	 * order not to confuse userspace */
+	if ((s64)start_time < 0)
+		start_time = 0;
+#endif
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
+#ifdef CONFIG_BEANCOUNTERS
+	ub_dump_task_info(task, ub_task_info, sizeof(ub_task_info),
+				ub_mm_info, sizeof(ub_mm_info));
+#endif
+
 	seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld"
+#ifdef CONFIG_VE
+	" 0 0 0 0 0 %d %u"
+#endif
+#ifdef CONFIG_BEANCOUNTERS
+	" %s %s"
+#endif
+	"\n",
 		pid_nr_ns(pid, ns),
 		tcomm,
 		state,
@@ -492,12 +570,21 @@ static int do_task_stat(struct seq_file 
 		0UL,
 		0UL,
 		task->exit_signal,
-		task_cpu(task),
+		is_super ? task_cpu(task) : task_vcpu_id(task),
 		task->rt_priority,
 		task->policy,
 		(unsigned long long)delayacct_blkio_ticks(task),
 		cputime_to_clock_t(gtime),
-		cputime_to_clock_t(cgtime));
+		cputime_to_clock_t(cgtime)
+#ifdef CONFIG_VE
+		, task_pid_nr_ns(task, task_active_pid_ns(task)),
+		VEID(VE_TASK_INFO(task)->owner_env)
+#endif
+#ifdef CONFIG_BEANCOUNTERS
+		, ub_task_info,
+		ub_mm_info
+#endif
+		);
 	if (mm)
 		mmput(mm);
 	return 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/base.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/base.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/base.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/base.c	2015-01-21 12:02:57.966835006 +0300
@@ -49,6 +49,7 @@
 
 #include <asm/uaccess.h>
 
+#include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
@@ -84,6 +85,8 @@
 #include <linux/fs_struct.h>
 #include "internal.h"
 
+#include <bc/oom_kill.h>
+
 /* NOTE:
  *	Implementing inode permission operations in /proc is almost
  *	certainly an error.  Permission checks need to happen during
@@ -150,21 +153,20 @@ static unsigned int pid_entry_count_dirs
 	return count;
 }
 
-static int get_fs_path(struct task_struct *task, struct path *path, bool root)
+static int get_task_root(struct task_struct *task, struct path *root)
 {
-	struct fs_struct *fs;
 	int result = -ENOENT;
 
 	task_lock(task);
-	fs = task->fs;
-	if (fs) {
-		read_lock(&fs->lock);
-		*path = root ? fs->root : fs->pwd;
-		path_get(path);
-		read_unlock(&fs->lock);
-		result = 0;
-	}
-	task_unlock(task);
+	if (task->fs) {
+		get_fs_root(task->fs, root);
+		task_unlock(task);
+
+		result = d_root_check(root);
+		if (result)
+			path_put(root);
+	} else
+		task_unlock(task);
 	return result;
 }
 
@@ -186,7 +188,16 @@ static int proc_cwd_link(struct inode *i
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 0);
+		task_lock(task);
+		if (task->fs) {
+			get_fs_pwd(task->fs, path);
+			task_unlock(task);
+
+			result = d_root_check(path);
+			if (result)
+				path_put(path);
+		} else
+			task_unlock(task);
 		put_task_struct(task);
 	}
 	return result;
@@ -198,7 +209,7 @@ static int proc_root_link(struct inode *
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 1);
+		result = get_task_root(task, path);
 		put_task_struct(task);
 	}
 	return result;
@@ -495,14 +506,15 @@ static const struct file_operations proc
 
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
-	unsigned long points = 0;
+	int points = 0;
 
 	read_lock(&tasklist_lock);
-	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL,
-					totalram_pages + total_swap_pages);
+	if (pid_alive(task)) {
+		points = oom_badness(task, ub_oom_total_pages(get_exec_ub()), NULL);
+		points = clamp(points, 0, 1000);
+	}
 	read_unlock(&tasklist_lock);
-	return sprintf(buffer, "%lu\n", points);
+	return sprintf(buffer, "%d\n", points);
 }
 
 struct limit_names {
@@ -674,17 +686,36 @@ static int proc_pid_syscall(struct task_
 static int proc_fd_access_allowed(struct inode *inode)
 {
 	struct task_struct *task;
-	int allowed = 0;
+	int err;
+
 	/* Allow access to a task's file descriptors if it is us or we
 	 * may use ptrace attach to the process and find out that
 	 * information.
 	 */
+	err = -ENOENT;
 	task = get_proc_task(inode);
 	if (task) {
-		allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+		if (task->flags & PF_KTHREAD)
+			/*
+			 * Always allow access to kernel threads' /proc entries.
+			 */
+			err = 0;
+		else if (ptrace_may_access(task, PTRACE_MODE_READ))
+			err = 0;
+		else
+			/*
+			 * The clever ptrace_may_access() check may play a
+			 * trick on us. If the task is a zombie it considers
+			 * the task not dumpable at all and denies any
+			 * ptracing in a VE. Not a big deal for ptrace(), but
+			 * following the link will then fail with -EACCES.
+			 * Some software cannot stand such a swindle and
+			 * refuses to work :(
+			 */
+			err = (task->mm ? -EACCES : -ENOENT);
 		put_task_struct(task);
 	}
-	return allowed;
+	return err;
 }
 
 int proc_setattr(struct dentry *dentry, struct iattr *attr)
@@ -770,7 +801,7 @@ static int mounts_open_common(struct ino
 				get_mnt_ns(ns);
 		}
 		rcu_read_unlock();
-		if (ns && get_fs_path(task, &root, 1) == 0)
+		if (ns && get_task_root(task, &root) == 0)
 			ret = 0;
 		put_task_struct(task);
 	}
@@ -794,6 +825,10 @@ static int mounts_open_common(struct ino
 	p->ns = ns;
 	p->root = root;
 	p->event = ns->event;
+	p->iter = NULL;
+	p->iter_pos = 0;
+	p->iter_advanced = 0;
+	register_mounts_reader(p);
 
 	return 0;
 
@@ -810,6 +845,7 @@ static int mounts_open_common(struct ino
 static int mounts_release(struct inode *inode, struct file *file)
 {
 	struct proc_mounts *p = file->private_data;
+	unregister_mounts_reader(p);
 	path_put(&p->root);
 	put_mnt_ns(p->ns);
 	return seq_release(inode, file);
@@ -1216,6 +1252,8 @@ static ssize_t oom_adjust_write(struct f
 		err = -EINVAL;
 		goto out;
 	}
+	if (!ve_is_super(get_exec_env()))
+		goto out;
 
 	task = get_proc_task(file->f_path.dentry->d_inode);
 	if (!task) {
@@ -1239,13 +1277,6 @@ static ssize_t oom_adjust_write(struct f
 		goto err_sighand;
 	}
 
-	if (oom_adjust != task->signal->oom_adj) {
-		if (oom_adjust == OOM_DISABLE)
-			atomic_inc(&task->mm->oom_disable_count);
-		if (task->signal->oom_adj == OOM_DISABLE)
-			atomic_dec(&task->mm->oom_disable_count);
-	}
-
 	task->signal->oom_adj = oom_adjust;
 	/*
 	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
@@ -1282,7 +1313,7 @@ static ssize_t oom_score_adj_read(struct
 	if (!task)
 		return -ESRCH;
 	if (lock_task_sighand(task, &flags)) {
-		oom_score_adj = task->signal->oom_score_adj;
+		oom_score_adj = get_task_oom_score_adj(task);
 		unlock_task_sighand(task, &flags);
 	}
 	put_task_struct(task);
@@ -1339,15 +1370,10 @@ static ssize_t oom_score_adj_write(struc
 		goto err_sighand;
 	}
 
-	if (oom_score_adj != task->signal->oom_score_adj) {
-		if (oom_score_adj == OOM_SCORE_ADJ_MIN)
-			atomic_inc(&task->mm->oom_disable_count);
-		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-			atomic_dec(&task->mm->oom_disable_count);
-	}
 	task->signal->oom_score_adj = oom_score_adj;
 	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
 		task->signal->oom_score_adj_min = oom_score_adj;
+
 	/*
 	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
 	 * always attainable.
@@ -1741,6 +1767,7 @@ void set_mm_exe_file(struct mm_struct *m
 	mm->exe_file = new_exe_file;
 	mm->num_exe_file_vmas = 0;
 }
+EXPORT_SYMBOL(set_mm_exe_file);
 
 struct file *get_mm_exe_file(struct mm_struct *mm)
 {
@@ -1779,10 +1806,15 @@ static int proc_exe_link(struct inode *i
 	exe_file = get_mm_exe_file(mm);
 	mmput(mm);
 	if (exe_file) {
-		*exe_path = exe_file->f_path;
-		path_get(&exe_file->f_path);
+		int result;
+
+		result = d_root_check(&exe_file->f_path);
+		if (result == 0) {
+			*exe_path = exe_file->f_path;
+			path_get(&exe_file->f_path);
+		}
 		fput(exe_file);
-		return 0;
+		return result;
 	} else
 		return -ENOENT;
 }
@@ -1790,13 +1822,14 @@ static int proc_exe_link(struct inode *i
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
-	int error = -EACCES;
+	int error;
 
 	/* We don't need a base pointer in the /proc filesystem */
 	path_put(&nd->path);
 
 	/* Are we allowed to snoop on the tasks file descriptors? */
-	if (!proc_fd_access_allowed(inode))
+	error = proc_fd_access_allowed(inode);
+	if (error < 0)
 		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
@@ -1830,12 +1863,13 @@ static int do_proc_readlink(struct path 
 
 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
 {
-	int error = -EACCES;
+	int error;
 	struct inode *inode = dentry->d_inode;
 	struct path path;
 
 	/* Are we allowed to snoop on the tasks file descriptors? */
-	if (!proc_fd_access_allowed(inode))
+	error = proc_fd_access_allowed(inode);
+	if (error < 0)
 		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(inode, &path);
@@ -2091,6 +2125,7 @@ static int proc_fd_info(struct inode *in
 	struct files_struct *files = NULL;
 	struct file *file;
 	int fd = proc_fd(inode);
+	int err = -ENOENT;
 
 	if (task) {
 		files = get_files_struct(task);
@@ -2103,7 +2138,8 @@ static int proc_fd_info(struct inode *in
 		 */
 		spin_lock(&files->file_lock);
 		file = fcheck_files(files, fd);
-		if (file) {
+		err = -EACCES;
+		if (file && !d_root_check(&file->f_path)) {
 			if (path) {
 				*path = file->f_path;
 				path_get(&file->f_path);
@@ -2121,7 +2157,7 @@ static int proc_fd_info(struct inode *in
 		spin_unlock(&files->file_lock);
 		put_files_struct(files);
 	}
-	return -ENOENT;
+	return err;
 }
 
 static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2912,7 +2948,7 @@ static int do_io_accounting(struct task_
 		struct task_struct *t = task;
 
 		task_io_accounting_add(&acct, &task->signal->ioac);
-		while_each_thread(task, t)
+		while_each_thread_ve(task, t)
 			task_io_accounting_add(&acct, &t->ioac);
 
 		unlock_task_sighand(task, &flags);
@@ -3647,3 +3683,42 @@ static const struct file_operations proc
 	.read		= generic_read_dir,
 	.readdir	= proc_task_readdir,
 };
+
+/* Check whether dentry belongs to a task that already died */
+int proc_dentry_of_dead_task(struct dentry *dentry)
+{
+	struct proc_dir_entry *de = PDE(dentry->d_inode);
+
+	if (de && de->data == &dummy_proc_pid_file_operations)
+		return 1;
+
+	while (de && de->parent != &proc_root && (dentry != dentry->d_parent)) {
+		dentry = dentry->d_parent;
+		de = PDE(dentry->d_inode);
+	}
+
+	return (dentry->d_op == &pid_dentry_operations &&
+		 proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first == NULL);
+}
+EXPORT_SYMBOL(proc_dentry_of_dead_task);
+
+/* Placed here to avoid pinning the vzrst module refcount */
+static ssize_t dummy_proc_pid_read(struct file * file, char __user * buf,
+				 size_t count, loff_t *ppos)
+{
+	return -ESRCH;
+}
+
+static ssize_t dummy_proc_pid_write(struct file *file, const char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	return -ESRCH;
+}
+
+struct file_operations dummy_proc_pid_file_operations = {
+	.read		= dummy_proc_pid_read,
+	.write		= dummy_proc_pid_write,
+};
+
+EXPORT_SYMBOL(dummy_proc_pid_file_operations);
+
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/cmdline.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/cmdline.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/cmdline.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/cmdline.c	2015-01-21 12:02:43.998205823 +0300
@@ -2,10 +2,12 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/sched.h>
 
 static int cmdline_proc_show(struct seq_file *m, void *v)
 {
-	seq_printf(m, "%s\n", saved_command_line);
+	seq_printf(m, "%s\n",
+		ve_is_super(get_exec_env()) ? saved_command_line : "quiet");
 	return 0;
 }
 
@@ -23,7 +25,7 @@ static const struct file_operations cmdl
 
 static int __init proc_cmdline_init(void)
 {
-	proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+	proc_create("cmdline", 0, &glob_proc_root, &cmdline_proc_fops);
 	return 0;
 }
 module_init(proc_cmdline_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/cpuinfo.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/cpuinfo.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/cpuinfo.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/cpuinfo.c	2015-01-21 12:02:43.998205823 +0300
@@ -18,7 +18,7 @@ static const struct file_operations proc
 
 static int __init proc_cpuinfo_init(void)
 {
-	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+	proc_create("cpuinfo", 0, &glob_proc_root, &proc_cpuinfo_operations);
 	return 0;
 }
 module_init(proc_cpuinfo_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/devices.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/devices.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/devices.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/devices.c	2015-01-21 12:02:44.394195310 +0300
@@ -2,6 +2,7 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/sched.h>
 
 static int devinfo_show(struct seq_file *f, void *v)
 {
@@ -64,7 +65,7 @@ static const struct file_operations proc
 
 static int __init proc_devices_init(void)
 {
-	proc_create("devices", 0, NULL, &proc_devinfo_operations);
+	proc_create("devices", 0, &glob_proc_root, &proc_devinfo_operations);
 	return 0;
 }
 module_init(proc_devices_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/generic.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/generic.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/generic.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/generic.c	2015-01-21 12:02:44.387195495 +0300
@@ -249,12 +249,26 @@ static const struct file_operations proc
 	.write		= proc_file_write,
 };
 
+static int proc_dir_is_ve_owner(struct proc_dir_entry *de,
+				struct proc_dir_entry *lde)
+{
+	return (lde && (lde == de));
+}
+
 static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
 	struct proc_dir_entry *de = PDE(inode);
 	int error;
 
+	if (iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
+
+		/* Reject VE attempts to change perms of the node's proc files */
+		if (!ve_is_super(get_exec_env()) &&
+			!proc_dir_is_ve_owner(de, LPDE(inode)))
+			return -EPERM;
+	}
+
 	error = inode_change_ok(inode, iattr);
 	if (error)
 		goto out;
@@ -263,9 +277,12 @@ static int proc_notify_change(struct den
 	if (error)
 		goto out;
 	
-	de->uid = inode->i_uid;
-	de->gid = inode->i_gid;
-	de->mode = inode->i_mode;
+	if (iattr->ia_valid & ATTR_UID)
+		de->uid = inode->i_uid;
+	if (iattr->ia_valid & ATTR_GID)
+		de->gid = inode->i_gid;
+	if (iattr->ia_valid & ATTR_MODE)
+		de->mode = inode->i_mode;
 out:
 	return error;
 }
@@ -274,11 +291,22 @@ static int proc_getattr(struct vfsmount 
 			struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
-	struct proc_dir_entry *de = PROC_I(inode)->pde;
-	if (de && de->nlink)
-		inode->i_nlink = de->nlink;
+	struct proc_dir_entry *de = PDE(inode);
+	struct proc_dir_entry *lde = LPDE(inode);
 
 	generic_fillattr(inode, stat);
+
+	if (de && de->nlink)
+		stat->nlink = de->nlink;
+	/* If the dentry is found in both trees and is a directory,
+	 * the inode's nlink count must be adjusted, because the local
+	 * and global subtrees may differ.
+	 * On the other hand, they may intersect, so the exact nlink
+	 * value is difficult to calculate - an upper estimate is used
+	 * instead.
+	 */
+	if (lde && lde != de && lde->nlink > 1)
+		stat->nlink += lde->nlink - 2;
 	return 0;
 }
 
@@ -421,28 +449,64 @@ static const struct dentry_operations pr
 	.d_delete	= proc_delete_dentry,
 };
 
+struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir,
+		const char *name, int namelen)
+{
+	struct proc_dir_entry *de;
+
+	if (PROC_IS_HARDLINK(dir))
+		dir = (struct proc_dir_entry *) dir->data;
+
+	for (de = dir->subdir; de ; de = de->next) {
+		if (de->namelen != namelen)
+			continue;
+		if (memcmp(de->name, name, namelen))
+			continue;
+		break;
+	}
+	return de;
+}
+EXPORT_SYMBOL(__proc_lookup);
+
 /*
  * Don't create negative dentries here, return -ENOENT by hand
  * instead.
  */
-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
-		struct dentry *dentry)
+struct dentry *proc_lookup_de(struct proc_dir_entry *de,
+		struct proc_dir_entry *lde,
+		struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = NULL;
 	int error = -ENOENT;
 
 	spin_lock(&proc_subdir_lock);
-	for (de = de->subdir; de ; de = de->next) {
-		if (de->namelen != dentry->d_name.len)
-			continue;
-		if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
+	de = __proc_lookup(de, dentry->d_name.name, dentry->d_name.len);
+	if (lde != NULL)
+		lde = __proc_lookup(lde, dentry->d_name.name,
+				dentry->d_name.len);
+
+	if (de == NULL)
+		de = lde;
+
+	if (de != NULL) {
+		/*
+		 * de     lde    meaning   inode(g,l)
+		 * ------------------------------------
+		 * NULL   NULL   -ENOENT   *
+		 * X      NULL   global    X NULL
+		 * NULL   X      local     X X
+		 * X      Y      both      X Y
+		 */
+		{
 			unsigned int ino;
 
 			ino = de->low_ino;
 			de_get(de);
+			if (lde != NULL)
+				de_get(lde);
 			spin_unlock(&proc_subdir_lock);
 			error = -EINVAL;
-			inode = proc_get_inode(dir->i_sb, ino, de);
+			inode = proc_get_inode(dir->i_sb, ino, de, lde);
 			goto out_unlock;
 		}
 	}
@@ -456,13 +520,15 @@ out_unlock:
 	}
 	if (de)
 		de_put(de);
+	if (lde)
+		de_put(lde);
 	return ERR_PTR(error);
 }
 
 struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
 		struct nameidata *nd)
 {
-	return proc_lookup_de(PDE(dir), dir, dentry);
+	return proc_lookup_de(PDE(dir), LPDE(dir), dir, dentry);
 }
 
 /*
@@ -474,13 +540,14 @@ struct dentry *proc_lookup(struct inode 
  * value of the readdir() call, as long as it's non-negative
  * for success..
  */
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
-		filldir_t filldir)
+int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lde,
+		struct file *filp, void *dirent, filldir_t filldir)
 {
 	unsigned int ino;
 	int i;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = 0;
+	struct proc_dir_entry *ode = de, *fde = NULL;
 
 	ino = inode->i_ino;
 	i = filp->f_pos;
@@ -501,25 +568,21 @@ int proc_readdir_de(struct proc_dir_entr
 			/* fall through */
 		default:
 			spin_lock(&proc_subdir_lock);
-			de = de->subdir;
 			i -= 2;
-			for (;;) {
-				if (!de) {
-					ret = 1;
-					spin_unlock(&proc_subdir_lock);
-					goto out;
-				}
-				if (!i)
-					break;
-				de = de->next;
-				i--;
-			}
-
-			do {
+repeat:
+			if (PROC_IS_HARDLINK(de))
+				de = (struct proc_dir_entry *) de->data;
+			de = de->subdir;
+			while (de != NULL) {
 				struct proc_dir_entry *next;
 
-				/* filldir passes info to user space */
 				de_get(de);
+				if (i-- > 0 || (fde != NULL &&
+							__proc_lookup(fde,
+							de->name, de->namelen)))
+					goto skip;
+
+				/* filldir passes info to user space */
 				spin_unlock(&proc_subdir_lock);
 				if (filldir(dirent, de->name, de->namelen, filp->f_pos,
 					    de->low_ino, de->mode >> 12) < 0) {
@@ -528,10 +591,17 @@ int proc_readdir_de(struct proc_dir_entr
 				}
 				spin_lock(&proc_subdir_lock);
 				filp->f_pos++;
+skip:
 				next = de->next;
 				de_put(de);
 				de = next;
-			} while (de);
+			}
+
+			if (fde == NULL && lde != NULL && lde != ode) {
+				de = lde;
+				fde = ode;
+				goto repeat;
+			}
 			spin_unlock(&proc_subdir_lock);
 	}
 	ret = 1;
@@ -543,7 +613,7 @@ int proc_readdir(struct file *filp, void
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
-	return proc_readdir_de(PDE(inode), filp, dirent, filldir);
+	return proc_readdir_de(PDE(inode), LPDE(inode), filp, dirent, filldir);
 }
 
 /*
@@ -869,6 +939,8 @@ void remove_proc_entry(const char *name,
 	WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
 			"'%s/%s', leaking at least '%s'\n", __func__,
 			de->parent->name, de->name, de->subdir->name);
+	if (PROC_IS_HARDLINK(de))
+		de_put((struct proc_dir_entry *) de->data);
 	if (atomic_dec_and_test(&de->count))
 		free_proc_entry(de);
 }
@@ -926,3 +998,41 @@ int remove_proc_subtree(const char *name
 	return 0;
 }
 EXPORT_SYMBOL(remove_proc_subtree);
+
+const struct inode_operations proc_hard_inode_operations;
+
+struct proc_dir_entry *create_proc_hardlink(const char *name, mode_t mode,
+						struct proc_dir_entry *parent,
+						struct proc_dir_entry *link)
+{
+	struct proc_dir_entry *ent;
+	mode &= (~S_IFMT);
+	mode |= link->mode & S_IFMT;
+	ent = __proc_create(&parent, name, mode, 1);
+	if (!ent)
+		return ent;
+	ent->data = link;
+	ent->proc_iops = &proc_hard_inode_operations;
+	if (proc_register(parent, ent) < 0) {
+		kfree(ent);
+		ent = NULL;
+	}
+	de_get(link);
+	return ent;
+}
+EXPORT_SYMBOL(create_proc_hardlink);
+
+struct proc_dir_entry *proc_lookup_entry(const char *name,
+					struct proc_dir_entry *parent)
+{
+	const char *fn = name;
+	int len;
+
+	if (xlate_proc_name(name, &parent, &fn) != 0)
+		return NULL;
+
+	len = strlen(fn);
+
+	return __proc_lookup(parent, fn, len);
+}
+EXPORT_SYMBOL(proc_lookup_entry);
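
A usage sketch of the two helpers above, with hypothetical names: publishing an existing global entry inside a VE-local proc subtree. Note that proc_lookup_entry() returns a borrowed pointer (no de_get()), so this assumes the looked-up entry cannot disappear concurrently:

	/* Hypothetical: alias the global "cpuinfo" entry into a local tree. */
	struct proc_dir_entry *link, *alias;

	link = proc_lookup_entry("cpuinfo", &glob_proc_root);
	if (link) {
		alias = create_proc_hardlink("cpuinfo", 0444,
					     local_proc_root /* hypothetical */,
					     link);
		if (!alias)
			printk(KERN_WARNING "cpuinfo hardlink failed\n");
	}
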
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/inode.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/inode.c	2015-01-21 12:02:44.420194619 +0300
@@ -65,6 +65,10 @@ static void proc_delete_inode(struct ino
 	de = PROC_I(inode)->pde;
 	if (de)
 		de_put(de);
+	de = PROC_I(inode)->lpde;
+	if (de)
+		de_put(de);
+
 	if (PROC_I(inode)->sysctl)
 		sysctl_head_put(PROC_I(inode)->sysctl);
 	clear_inode(inode);
@@ -91,6 +95,7 @@ static struct inode *proc_alloc_inode(st
 	ei->fd = 0;
 	ei->op.proc_get_link = NULL;
 	ei->pde = NULL;
+	ei->lpde = NULL;
 	ei->sysctl = NULL;
 	ei->sysctl_entry = NULL;
 	ei->ns = NULL;
@@ -468,22 +473,30 @@ static const struct file_operations proc
 #endif
 
 struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
-				struct proc_dir_entry *de)
+		struct proc_dir_entry *de, struct proc_dir_entry *lde)
 {
 	struct inode * inode;
+	struct proc_dir_entry *de_lnk = de;
 
-	inode = iget_locked(sb, ino);
-	if (!inode)
-		return NULL;
-	if (inode->i_state & I_NEW) {
+	inode = new_inode_pseudo(sb);
+	if (inode) {
+		if (PROC_IS_HARDLINK(de_lnk))
+			de = de_lnk->data;
+		if (lde)
+			WARN_ON(PROC_IS_HARDLINK(de_lnk));
+
+		inode->i_ino = ino;
 		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 		PROC_I(inode)->fd = 0;
 		PROC_I(inode)->pde = de;
+#ifdef CONFIG_VE
+		PROC_I(inode)->lpde = lde;
+#endif
 
-		if (de->mode) {
-			inode->i_mode = de->mode;
-			inode->i_uid = de->uid;
-			inode->i_gid = de->gid;
+		if (de_lnk->mode) {
+			inode->i_mode = de_lnk->mode;
+			inode->i_uid = de_lnk->uid;
+			inode->i_gid = de_lnk->gid;
 		}
 		if (de->size)
 			inode->i_size = de->size;
@@ -504,11 +517,17 @@ struct inode *proc_get_inode(struct supe
 				inode->i_fop = de->proc_fops;
 			}
 		}
-		unlock_new_inode(inode);
-	} else
-	       de_put(de);
+		if (PROC_IS_HARDLINK(de_lnk)) {
+			de_get(de);
+			de_put(de_lnk);
+		}
+	} else {
+		de_put(de_lnk);
+		if (lde)
+			de_put(lde);
+	}
 	return inode;
-}			
+}
 
 int proc_fill_super(struct super_block *s)
 {
@@ -520,21 +539,25 @@ int proc_fill_super(struct super_block *
 	s->s_magic = PROC_SUPER_MAGIC;
 	s->s_op = &proc_sops;
 	s->s_time_gran = 1;
-	
-	de_get(&proc_root);
-	root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
-	if (!root_inode)
-		goto out_no_root;
+
+	de_get(get_exec_env()->proc_root);
+	de_get(&glob_proc_root);
+	root_inode = proc_get_inode(s, PROC_ROOT_INO,
+			&glob_proc_root, get_exec_env()->proc_root);
+	if (!root_inode) {
+		printk("proc_fill_super: get root inode failed\n");
+		de_put(get_exec_env()->proc_root);
+		de_put(&glob_proc_root);
+		return -ENOMEM;
+	}
 	root_inode->i_uid = 0;
 	root_inode->i_gid = 0;
 	s->s_root = d_alloc_root(root_inode);
-	if (!s->s_root)
-		goto out_no_root;
-	return 0;
+	if (!s->s_root) {
+		printk("proc_fill_super: allocate dentry failed\n");
+		iput(root_inode);
+		return -ENOMEM;
+	}
 
-out_no_root:
-	printk("proc_read_super: get root inode failed\n");
-	iput(root_inode);
-	de_put(&proc_root);
-	return -ENOMEM;
+	return 0;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/internal.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/internal.h
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/internal.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/internal.h	2015-01-21 12:02:44.373195867 +0300
@@ -12,6 +12,12 @@
 #include <linux/proc_fs.h>
 
 extern struct proc_dir_entry proc_root;
+#ifdef CONFIG_VE
+extern struct proc_dir_entry glob_proc_root;
+#else
+#define glob_proc_root	proc_root
+#endif
+
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
 #else
@@ -88,10 +94,11 @@ static inline int proc_fd(struct inode *
 	return PROC_I(inode)->fd;
 }
 
-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
+struct dentry *proc_lookup_de(struct proc_dir_entry *de,
+		struct proc_dir_entry *lpde, struct inode *ino,
 		struct dentry *dentry);
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
-		filldir_t filldir);
+int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lpde,
+		struct file *filp, void *dirent, filldir_t filldir);
 
 struct pde_opener {
 	struct inode *inode;
@@ -114,7 +121,8 @@ void de_put(struct proc_dir_entry *de);
 
 extern struct vfsmount *proc_mnt;
 int proc_fill_super(struct super_block *);
-struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
+struct inode *proc_get_inode(struct super_block *, unsigned int,
+		struct proc_dir_entry *, struct proc_dir_entry *);
 int proc_remount(struct super_block *sb, int *flags, char *data);
 
 /*
@@ -128,6 +136,10 @@ int proc_readdir(struct file *, void *, 
 struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
 
 
+extern const struct inode_operations proc_hard_inode_operations;
+
+#define PROC_IS_HARDLINK(d) (d->proc_iops == &proc_hard_inode_operations)
+
 
 /* Lookups */
 typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/kmsg.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/kmsg.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/kmsg.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/kmsg.c	2015-01-21 12:02:43.839210045 +0300
@@ -12,6 +12,10 @@
 #include <linux/poll.h>
 #include <linux/proc_fs.h>
 #include <linux/fs.h>
+#include <linux/veprintk.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -41,19 +45,20 @@ static ssize_t kmsg_read(struct file *fi
 
 static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 {
-	poll_wait(file, &log_wait, wait);
+	poll_wait(file, &ve_log_wait, wait);
 	if (do_syslog(9, NULL, 0))
 		return POLLIN | POLLRDNORM;
 	return 0;
 }
 
 
-static const struct file_operations proc_kmsg_operations = {
+const struct file_operations proc_kmsg_operations = {
 	.read		= kmsg_read,
 	.poll		= kmsg_poll,
 	.open		= kmsg_open,
 	.release	= kmsg_release,
 };
+EXPORT_SYMBOL(proc_kmsg_operations);
 
 static int __init proc_kmsg_init(void)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/loadavg.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/loadavg.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/loadavg.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/loadavg.c	2015-01-21 12:02:53.918942454 +0300
@@ -13,14 +13,25 @@
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
 	unsigned long avnrun[3];
+	long running, threads;
+	struct ve_struct *ve;
 
-	get_avenrun(avnrun, FIXED_1/200, 0);
+	ve = get_exec_env();
+	if (ve_is_super(ve)) {
+		get_avenrun(avnrun, FIXED_1/200, 0);
+		running = nr_running();
+		threads = nr_threads;
+	} else {
+		get_avenrun_ve(avnrun, FIXED_1/200, 0);
+		running = nr_running_ve();
+		threads = ve->pcounter;
+	}
 
-	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%ld %d\n",
 		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
 		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
 		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
-		nr_running(), nr_threads,
+		running, threads,
 		task_active_pid_ns(current)->last_pid);
 	return 0;
 }
@@ -39,7 +50,7 @@ static const struct file_operations load
 
 static int __init proc_loadavg_init(void)
 {
-	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+	proc_create("loadavg", 0, &glob_proc_root, &loadavg_proc_fops);
 	return 0;
 }
 module_init(proc_loadavg_init);
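
The avenrun values are fixed-point with FSHIFT (11) fraction bits, and LOAD_INT/LOAD_FRAC split them for display; get_avenrun_ve() above is simply the per-VE source of the same fixed-point numbers. A worked conversion:

	/* avnrun[0] == 154 raw (FIXED_1 == 1 << 11 == 2048):
	 *   LOAD_INT(154)  == 154 >> 11                  == 0
	 *   LOAD_FRAC(154) == ((154 & 2047) * 100) >> 11 == 7
	 * printed as "0.07" */
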
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/meminfo.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/meminfo.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/meminfo.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/meminfo.c	2015-01-21 12:02:58.939809182 +0300
@@ -5,11 +5,13 @@
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/mmzone.h>
+#include <linux/mmgang.h>
 #include <linux/proc_fs.h>
 #include <linux/quicklist.h>
 #include <linux/seq_file.h>
 #include <linux/swap.h>
 #include <linux/vmstat.h>
+#include <linux/virtinfo.h>
 #include <asm/atomic.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -35,9 +37,119 @@ int meminfo_legacy_layout_sysctl_handler
         return proc_dointvec(table, write, buffer, length, ppos);
 }
 
-static int meminfo_proc_show(struct seq_file *m, void *v)
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+
+void hugetlb_meminfo_mi(struct seq_file *m, struct meminfo *mi)
+{
+	struct hstate *h = &default_hstate;
+	unsigned long total, used, free;
+
+	if (!h->nr_huge_pages)
+		return;
+
+	total = min(mi->ub->ub_parms[UB_LOCKEDPAGES].limit >> h->order,
+		    h->nr_huge_pages);
+	used = mi->ub->ub_hugetlb_pages >> h->order;
+	free = min(total > used ? total - used : 0ul, h->free_huge_pages);
+
+	seq_printf(m,
+		"HugePages_Total:   %5lu\n"
+		"HugePages_Free:    %5lu\n"
+		"HugePages_Rsvd:    %5lu\n"
+		"HugePages_Surp:    %5lu\n"
+		"Hugepagesize:   %8lu kB\n",
+		total, free, 0ul, 0ul, K(1ul << h->order));
+}
+
+static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi)
+{
+	seq_printf(m,
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
+		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Shmem:          %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n",
+		K(mi->si->totalram),
+		K(mi->si->freeram),
+		K(mi->cached),
+		K(0L),
+		K(mi->pages[LRU_ACTIVE_ANON]   + mi->pages[LRU_ACTIVE_FILE]),
+		K(mi->pages[LRU_INACTIVE_ANON] + mi->pages[LRU_INACTIVE_FILE]),
+		K(mi->pages[LRU_ACTIVE_ANON]),
+		K(mi->pages[LRU_INACTIVE_ANON]),
+		K(mi->pages[LRU_ACTIVE_FILE]),
+		K(mi->pages[LRU_INACTIVE_FILE]),
+		K(mi->pages[LRU_UNEVICTABLE]),
+		K(mi->locked),
+		K(mi->si->totalswap),
+		K(mi->si->freeswap),
+		K(mi->dirty_pages),
+		K(mi->writeback_pages),
+		K(mi->pages[LRU_ACTIVE_ANON] + mi->pages[LRU_INACTIVE_ANON]),
+		K(mi->shmem),
+		K(mi->slab_reclaimable + mi->slab_unreclaimable),
+		K(mi->slab_reclaimable),
+		K(mi->slab_unreclaimable));
+
+	if (mi->meminfo_val != VE_MEMINFO_COMPLETE)
+		return 0;
+
+	seq_printf(m,
+		"MemCommitted:   %8lu kB\n"
+		"MemAvailable:   %8lu kB\n"
+		"MemPortion:     %8lu kB\n"
+		"Shadow:         %8lu kB\n"
+		"Shadow(anon):   %8lu kB\n"
+		"Shadow(file):   %8lu kB\n"
+#ifdef CONFIG_KSTALED
+		"IdleClean:      %8lu kB\n"
+		"IdleDirtyFile:  %8lu kB\n"
+		"IdleDirtySwap:  %8lu kB\n"
+#endif
+		,
+		K(get_ub_gs(mi->ub)->memory_committed),
+		K(get_ub_gs(mi->ub)->memory_available),
+		K(get_ub_gs(mi->ub)->memory_portion),
+		K(mi->shadow[LRU_ACTIVE_ANON] + mi->shadow[LRU_INACTIVE_ANON] +
+		  mi->shadow[LRU_ACTIVE_FILE] + mi->shadow[LRU_INACTIVE_FILE] +
+		  mi->shadow[LRU_UNEVICTABLE]),
+		K(mi->shadow[LRU_ACTIVE_ANON] + mi->shadow[LRU_INACTIVE_ANON]),
+		K(mi->shadow[LRU_ACTIVE_FILE] + mi->shadow[LRU_INACTIVE_FILE])
+#ifdef CONFIG_KSTALED
+		,
+		K(mi->idle_page_stats.idle_clean),
+		K(mi->idle_page_stats.idle_dirty_file),
+		K(mi->idle_page_stats.idle_dirty_swap)
+#endif
+		);
+
+	hugetlb_meminfo_mi(m, mi);
+
+	return 0;
+}
+
+int meminfo_proc_show_ub(struct seq_file *m, void *v,
+		struct user_beancounter *ub, unsigned long meminfo_val)
 {
+	int ret;
 	struct sysinfo i;
+	struct meminfo mi;
 	unsigned long committed;
 	struct vmalloc_info vmi;
 	long cached;
@@ -48,12 +160,23 @@ static int meminfo_proc_show(struct seq_
 	struct zone *zone;
 	int lru;
 
+	si_meminfo(&i);
+	si_swapinfo(&i);
+
+	memset(&mi, 0, sizeof(mi));
+	mi.si = &i;
+	mi.ub = ub;
+	mi.meminfo_val = meminfo_val;
+
+	ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi);
+	if (ret & NOTIFY_FAIL)
+		return 0;
+	if (ret & NOTIFY_OK)
+		return meminfo_proc_show_mi(m, &mi);
+
 /*
  * display in kilobytes.
  */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
-	si_meminfo(&i);
-	si_swapinfo(&i);
 	committed = percpu_counter_read_positive(&vm_committed_as);
 
 	cached = global_page_state(NR_FILE_PAGES) -
@@ -106,6 +229,12 @@ static int meminfo_proc_show(struct seq_
 		"Buffers:        %8lu kB\n"
 		"Cached:         %8lu kB\n"
 		"SwapCached:     %8lu kB\n"
+#ifdef CONFIG_MEMORY_GANGS
+		"MemCommitted:   %8lu kB\n"
+#endif
+#ifdef CONFIG_MEMORY_VSWAP
+		"VirtualSwap:    %8lu kB\n"
+#endif
 		"Active:         %8lu kB\n"
 		"Inactive:       %8lu kB\n"
 		"Active(anon):   %8lu kB\n"
@@ -158,6 +287,12 @@ static int meminfo_proc_show(struct seq_
 		K(i.bufferram),
 		K(cached),
 		K(total_swapcache_pages),
+#ifdef CONFIG_MEMORY_GANGS
+		K(total_committed_pages),
+#endif
+#ifdef CONFIG_MEMORY_VSWAP
+		K(global_page_state(NR_VSWAP)),
+#endif
 		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
 		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
 		K(pages[LRU_ACTIVE_ANON]),
@@ -229,6 +364,12 @@ static int meminfo_proc_show(struct seq_
 #undef K
 }
 
+static int meminfo_proc_show(struct seq_file *m, void *v)
+{
+	return meminfo_proc_show_ub(m, v, current->mm->mm_ub,
+			get_exec_env()->meminfo_val);
+}
+
 static int meminfo_proc_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, meminfo_proc_show, NULL);
@@ -243,7 +384,7 @@ static const struct file_operations memi
 
 static int __init proc_meminfo_init(void)
 {
-	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+	proc_create("meminfo", 0, &glob_proc_root, &meminfo_proc_fops);
 	return 0;
 }
 module_init(proc_meminfo_init);
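Throughout this file, K(x) converts page counts to kilobytes by shifting by PAGE_SHIFT - 10; the hunk above hoists the definition before meminfo_proc_show_mi() so the per-beancounter and global paths share it. A trivial sketch, assuming 4 KiB pages (PAGE_SHIFT == 12):

    #include <stdio.h>

    #define PAGE_SHIFT      12                              /* assumption: 4 KiB pages */
    #define K(x)            ((x) << (PAGE_SHIFT - 10))      /* pages -> kB */

    int main(void)
    {
            unsigned long pages = 262144;   /* e.g. totalram of a 1 GiB machine */

            printf("MemTotal: %8lu kB\n", K(pages));        /* 1048576 kB */
            return 0;
    }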
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/proc_net.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/proc_net.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/proc_net.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/proc_net.c	2015-01-21 12:02:45.354169824 +0300
@@ -126,7 +126,7 @@ static struct dentry *proc_tgid_net_look
 	de = ERR_PTR(-ENOENT);
 	net = get_proc_task_net(dir);
 	if (net != NULL) {
-		de = proc_lookup_de(net->proc_net, dir, dentry);
+		de = proc_lookup_de(net->proc_net, NULL, dir, dentry);
 		put_net(net);
 	}
 	return de;
@@ -164,7 +164,8 @@ static int proc_tgid_net_readdir(struct 
 	ret = -EINVAL;
 	net = get_proc_task_net(filp->f_path.dentry->d_inode);
 	if (net != NULL) {
-		ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
+		ret = proc_readdir_de(net->proc_net, NULL,
+				filp, dirent, filldir);
 		put_net(net);
 	}
 	return ret;
@@ -234,7 +235,7 @@ static struct pernet_operations __net_in
 
 int __init proc_net_init(void)
 {
-	proc_symlink("net", NULL, "self/net");
+	proc_symlink("net", &glob_proc_root, "self/net");
 
 	return register_pernet_subsys(&proc_net_ns_ops);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/proc_sysctl.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/proc_sysctl.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/proc_sysctl.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/proc_sysctl.c	2015-01-21 12:02:44.605189708 +0300
@@ -30,7 +30,7 @@ static struct inode *proc_sys_make_inode
 
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
-	inode->i_mode = table->mode;
+	inode->i_mode = table->mode & ~S_ISVTX;
 	if (!table->child) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
@@ -132,6 +132,7 @@ static ssize_t proc_sys_call_handler(str
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	struct ctl_table onstack;
 	ssize_t error;
 	size_t res;
 
@@ -151,6 +152,12 @@ static ssize_t proc_sys_call_handler(str
 	if (!table->proc_handler)
 		goto out;
 
+	table = sysctl_ve_table(table, &onstack, write);
+	if (table == NULL) {
+		error = write;
+		goto out;
+	}
+
 	/* careful: calling conventions are nasty here */
 	res = count;
 	error = table->proc_handler(table, write, buf, &res, ppos);
@@ -358,6 +365,7 @@ static const struct file_operations proc
 };
 
 static const struct file_operations proc_sys_dir_file_operations = {
+	.read		= generic_read_dir,
 	.readdir	= proc_sys_readdir,
 	.llseek		= generic_file_llseek,
 };
@@ -406,7 +414,7 @@ int __init proc_sys_init(void)
 {
 	struct proc_dir_entry *proc_sys_root;
 
-	proc_sys_root = proc_mkdir("sys", NULL);
+	proc_sys_root = proc_mkdir("sys", &glob_proc_root);
 	proc_sys_root->proc_iops = &proc_sys_dir_operations;
 	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
 	proc_sys_root->nlink = 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/proc_tty.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/proc_tty.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/proc_tty.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/proc_tty.c	2015-01-21 12:02:43.826210389 +0300
@@ -13,6 +13,7 @@
 #include <linux/stat.h>
 #include <linux/tty.h>
 #include <linux/seq_file.h>
+#include <linux/sched.h>
 #include <linux/bitops.h>
 
 /*
@@ -70,6 +71,9 @@ static int show_tty_driver(struct seq_fi
 	dev_t from = MKDEV(p->major, p->minor_start);
 	dev_t to = from + p->num;
 
+	if (!ve_accessible_strict(p->owner_env, get_exec_env()))
+		goto out;
+
 	if (&p->tty_drivers == tty_drivers.next) {
 		/* pseudo-drivers first */
 		seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
@@ -97,6 +101,7 @@ static int show_tty_driver(struct seq_fi
 	}
 	if (from != to)
 		show_tty_range(m, p, from, to - from);
+out:
 	return 0;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/root.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/root.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/root.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/root.c	2015-01-21 12:02:47.202120763 +0300
@@ -115,7 +115,7 @@ static int proc_get_sb(struct file_syste
 		return PTR_ERR(sb);
 
 	if (!sb->s_root) {
-		sb->s_flags = flags;
+		sb->s_flags = (flags & ~MS_RDONLY);
 		if (!proc_parse_options(options, ns)) {
 			deactivate_locked_super(sb);
 			return -EINVAL;
@@ -148,11 +148,12 @@ static void proc_kill_sb(struct super_bl
 	put_pid_ns(ns);
 }
 
-static struct file_system_type proc_fs_type = {
+struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.get_sb		= proc_get_sb,
 	.kill_sb	= proc_kill_sb,
 };
+EXPORT_SYMBOL(proc_fs_type);
 
 void __init proc_root_init(void)
 {
@@ -163,16 +164,24 @@ void __init proc_root_init(void)
 	if (err)
 		return;
 
-	proc_symlink("mounts", NULL, "self/mounts");
+#ifdef CONFIG_VE
+	get_ve0()->proc_root = &proc_root;
+#endif
+
+	proc_symlink("mounts", &glob_proc_root, "self/mounts");
+#ifdef CONFIG_VE
+	get_ve0()->proc_mnt = proc_mnt;
+#endif
 
 	proc_net_init();
 
 #ifdef CONFIG_SYSVIPC
-	proc_mkdir("sysvipc", NULL);
+	proc_mkdir("sysvipc", &glob_proc_root);
 #endif
-	proc_mkdir("fs", NULL);
+	proc_mkdir("fs", &glob_proc_root);
+	proc_mkdir("fs", NULL); /* care about proc_mkdir("fs/xxx", NULL); */
 	proc_mkdir("driver", NULL);
-	proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */
+	proc_mkdir("fs/nfsd", &glob_proc_root); /* somewhere for the nfsd filesystem to be mounted */
 #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
 	/* just give it a mountpoint */
 	proc_mkdir("openprom", NULL);
@@ -188,8 +197,19 @@ void __init proc_root_init(void)
 static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat
 )
 {
+	struct ve_struct *ve = get_exec_env();
+
 	generic_fillattr(dentry->d_inode, stat);
-	stat->nlink = proc_root.nlink + nr_processes();
+	stat->nlink = glob_proc_root.nlink;
+	if (ve_is_super(ve))
+		stat->nlink += nr_processes();
+#ifdef CONFIG_VE
+	else
+		/* thread count, not really a process count */
+		stat->nlink += ve->pcounter;
+	/* the same logic as in proc_getattr */
+	stat->nlink += ve->proc_root->nlink - 2;
+#endif
 	return 0;
 }
 
@@ -252,22 +272,40 @@ struct proc_dir_entry proc_root = {
 	.parent		= &proc_root,
 };
 
+#ifdef CONFIG_VE
+struct proc_dir_entry glob_proc_root = {
+	.low_ino	= PROC_ROOT_INO, 
+	.namelen	= 5, 
+	.name		= "/proc",
+	.mode		= S_IFDIR | S_IRUGO | S_IXUGO, 
+	.nlink		= 2, 
+	.count		= ATOMIC_INIT(1),
+	.proc_iops	= &proc_root_inode_operations, 
+	.proc_fops	= &proc_root_operations,
+	.parent		= &glob_proc_root,
+};
+
+EXPORT_SYMBOL(glob_proc_root);
+#endif
+
 int pid_ns_prepare_proc(struct pid_namespace *ns)
 {
 	struct vfsmount *mnt;
 
-	mnt = kern_mount_data(&proc_fs_type, ns);
+	mnt = kern_mount_data(get_exec_env()->proc_fstype, ns);
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
 	ns->proc_mnt = mnt;
 	return 0;
 }
+EXPORT_SYMBOL(pid_ns_prepare_proc);
 
 void pid_ns_release_proc(struct pid_namespace *ns)
 {
 	mntput(ns->proc_mnt);
 }
+EXPORT_SYMBOL(pid_ns_release_proc);
 
 EXPORT_SYMBOL(proc_symlink);
 EXPORT_SYMBOL(proc_mkdir);
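With the change to proc_root_getattr() above, stat() on /proc reports a link count built from the static glob_proc_root entries plus, on the host, the number of processes or, inside a VE, the VE's thread counter (ve->pcounter). A userspace sketch that simply reads the resulting value:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/stat.h>

    int main(void)
    {
            struct stat st;

            if (stat("/proc", &st) != 0) {
                    perror("stat /proc");
                    return 1;
            }
            /* On the host: glob_proc_root.nlink + number of processes.
             * Inside a VE: the static entries plus the VE's thread counter. */
            printf("/proc nlink = %lu\n", (unsigned long)st.st_nlink);
            return 0;
    }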
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/stat.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/stat.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/stat.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/stat.c	2015-01-21 12:02:53.919942427 +0300
@@ -10,8 +10,11 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/irqnr.h>
+#include <linux/fairsched.h>
 #include <asm/cputime.h>
 #include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
 
 #ifndef arch_irq_stat_cpu
 #define arch_irq_stat_cpu(cpu) 0
@@ -77,19 +80,32 @@ static cputime64_t get_iowait_time(int c
 static int show_stat(struct seq_file *p, void *v)
 {
 	int i, j;
-	unsigned long jif;
+	unsigned long jif, realjif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
 	cputime64_t guest;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
 	struct timespec boottime;
+	struct ve_struct *ve;
+
+	getboottime(&boottime);
+	jif = boottime.tv_sec;
+
+	getrealboottime(&boottime);
+	realjif = boottime.tv_sec;
+
+	ve = get_exec_env();
+	if (!ve_is_super(ve)) {
+		int ret;
+		ret = fairsched_show_stat(p, ve->veid);
+		if (ret != -ENOSYS)
+			return ret;
+	}
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
 	guest = cputime64_zero;
-	getboottime(&boottime);
-	jif = boottime.tv_sec;
 
 	for_each_possible_cpu(i) {
 		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
@@ -153,14 +169,22 @@ static int show_stat(struct seq_file *p,
 	for_each_irq_nr(j)
 		seq_printf(p, " %u", kstat_irqs(j));
 
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	seq_printf(p, "\nswap %lu %lu", vm_events(PSWPIN), vm_events(PSWPOUT));
+#else
+	seq_printf(p, "\nswap 0 0");
+#endif
+
 	seq_printf(p,
 		"\nctxt %llu\n"
 		"btime %lu\n"
+		"realbtime %lu\n"
 		"processes %lu\n"
 		"procs_running %lu\n"
 		"procs_blocked %lu\n",
 		nr_context_switches(),
 		(unsigned long)jif,
+		(unsigned long)realjif,
 		total_forks,
 		nr_running(),
 		nr_iowait());
@@ -207,7 +231,7 @@ static const struct file_operations proc
 
 static int __init proc_stat_init(void)
 {
-	proc_create("stat", 0, NULL, &proc_stat_operations);
+	proc_create("stat", 0, &glob_proc_root, &proc_stat_operations);
 	return 0;
 }
 module_init(proc_stat_init);
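show_stat() now emits both btime (from getboottime(), which may be virtualized inside a VE) and the realbtime line added by this patch (from getrealboottime(), apparently the host's real boot time). A minimal userspace sketch that picks both out of /proc/stat:

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            unsigned long btime = 0, realbtime = 0;
            FILE *f = fopen("/proc/stat", "r");

            if (!f) {
                    perror("fopen /proc/stat");
                    return 1;
            }
            while (fgets(line, sizeof(line), f)) {
                    /* each sscanf() only matches its own line prefix */
                    sscanf(line, "btime %lu", &btime);
                    sscanf(line, "realbtime %lu", &realbtime);
            }
            fclose(f);
            printf("btime=%lu realbtime=%lu\n", btime, realbtime);
            return 0;
    }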
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/task_mmu.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/task_mmu.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/task_mmu.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/task_mmu.c	2015-01-21 12:02:41.773264895 +0300
@@ -50,6 +50,7 @@ void task_mem(struct seq_file *m, struct
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n"
+		"VmPTD:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
@@ -59,6 +60,7 @@ void task_mem(struct seq_file *m, struct
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+		mm->nr_ptds << (PAGE_SHIFT-10),
 		swap << (PAGE_SHIFT-10));
 }
 
@@ -209,7 +211,7 @@ static void show_map_vma(struct seq_file
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
-	int flags = vma->vm_flags;
+	vm_flags_t flags = vma->vm_flags;
 	unsigned long ino = 0;
 	unsigned long long pgoff = 0;
 	unsigned long start;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/uptime.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/uptime.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/uptime.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/uptime.c	2015-01-21 12:02:54.165935898 +0300
@@ -5,21 +5,48 @@
 #include <linux/seq_file.h>
 #include <linux/time.h>
 #include <linux/kernel_stat.h>
+#include <linux/fairsched.h>
+#include <linux/cgroup.h>
 #include <asm/cputime.h>
 
-static int uptime_proc_show(struct seq_file *m, void *v)
+static inline void get_ve0_idle(struct timespec *idle)
 {
-	struct timespec uptime;
-	struct timespec idle;
-	int i;
 	cputime_t idletime = cputime_zero;
+	int i;
 
 	for_each_possible_cpu(i)
 		idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
 
+	cputime_to_timespec(idletime, idle);
+}
+
+static inline void get_veX_idle(struct timespec *idle, struct cgroup* cgrp)
+{
+	struct kernel_cpustat kstat;
+
+	cpu_cgroup_get_stat(cgrp, &kstat);
+	*idle = ns_to_timespec(kstat.cpustat[IDLE]);
+}
+
+static int uptime_proc_show(struct seq_file *m, void *v)
+{
+	struct timespec uptime;
+	struct timespec idle;
+
+	if (ve_is_super(get_exec_env()))
+		get_ve0_idle(&idle);
+	else
+		get_veX_idle(&idle, task_cgroup(current, cpu_cgroup_subsys_id));
+
 	do_posix_clock_monotonic_gettime(&uptime);
 	monotonic_to_bootbased(&uptime);
-	cputime_to_timespec(idletime, &idle);
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env())) {
+		set_normalized_timespec(&uptime,
+		      uptime.tv_sec - get_exec_env()->real_start_timespec.tv_sec,
+		      uptime.tv_nsec - get_exec_env()->real_start_timespec.tv_nsec);
+	}
+#endif
 	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
 			(unsigned long) uptime.tv_sec,
 			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
@@ -42,7 +69,7 @@ static const struct file_operations upti
 
 static int __init proc_uptime_init(void)
 {
-	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	proc_create("uptime", 0, &glob_proc_root, &uptime_proc_fops);
 	return 0;
 }
 module_init(proc_uptime_init);
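Inside a VE, uptime_proc_show() subtracts the VE's real_start_timespec from the bootbased uptime field by field, so set_normalized_timespec() is needed to pull tv_nsec back into [0, NSEC_PER_SEC). A standalone sketch of that normalization (normalize() here is a hypothetical stand-in for the kernel helper):

    #include <stdio.h>

    #define NSEC_PER_SEC    1000000000L

    struct timespec_s { long tv_sec; long tv_nsec; };

    /* Re-establish tv_nsec in [0, NSEC_PER_SEC) after a raw field-wise
     * subtraction, as set_normalized_timespec() does in the kernel. */
    static void normalize(struct timespec_s *ts)
    {
            while (ts->tv_nsec < 0) {
                    ts->tv_nsec += NSEC_PER_SEC;
                    ts->tv_sec--;
            }
            while (ts->tv_nsec >= NSEC_PER_SEC) {
                    ts->tv_nsec -= NSEC_PER_SEC;
                    ts->tv_sec++;
            }
    }

    int main(void)
    {
            /* host uptime 100.2 s, VE started at 40.9 s -> VE uptime 59.3 s */
            struct timespec_s up = { 100 - 40, 200000000L - 900000000L };

            normalize(&up);
            printf("%ld.%02ld\n", up.tv_sec, up.tv_nsec / (NSEC_PER_SEC / 100));
            return 0;
    }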
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/proc/version.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/version.c
--- linux-2.6.32-504.3.3.el6.orig/fs/proc/version.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/proc/version.c	2015-01-21 12:02:44.000205770 +0300
@@ -28,7 +28,7 @@ static const struct file_operations vers
 
 static int __init proc_version_init(void)
 {
-	proc_create("version", 0, NULL, &version_proc_fops);
+	proc_create("version", 0, &glob_proc_root, &version_proc_fops);
 	return 0;
 }
 module_init(proc_version_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/Kconfig	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/Kconfig	2015-01-21 12:02:53.121963610 +0300
@@ -26,7 +26,7 @@ config QUOTA_NETLINK_INTERFACE
 config PRINT_QUOTA_WARNING
 	bool "Print quota warnings to console (OBSOLETE)"
 	depends on QUOTA
-	default y
+	default n
 	help
 	  If you say Y here, quota warnings (about exceeding softlimit, reaching
 	  hardlimit, etc.) will be printed to the process' controlling terminal.
@@ -41,6 +41,15 @@ config QUOTA_DEBUG
 	  If you say Y here, quota subsystem will perform some additional
 	  sanity checks of quota internal structures. If unsure, say N.
 
+config QUOTA_COMPAT
+	bool "Compatibility with older quotactl interface"
+	depends on QUOTA
+	help
+	  This option enables a compatibility layer for an older version
+	  of the quotactl interface with byte granularity (QUOTAON at 0x0100,
+	  GETQUOTA at 0x0D00).  Interface versions older than that one, with
+	  block granularity, are still not supported.
+
 # Generic support for tree structured quota files. Selected when needed.
 config QUOTA_TREE
 	 tristate
@@ -61,6 +70,31 @@ config QFMT_V2
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
 	  need this functionality say Y here.
 
+config VZ_QUOTA
+	tristate "Virtuozzo Disk Quota support"
+	select QUOTA
+	select QUOTA_COMPAT
+	select VZ_DEV
+	default m
+	help
+	  Virtuozzo Disk Quota imposes a disk quota on a directory together
+	  with all its files and subdirectories.  Such a disk quota is used to
+	  account and limit disk usage by a Virtuozzo VPS, but it may also be
+	  used separately.
+
+config VZ_QUOTA_UNLOAD
+	bool "Unloadable Virtuozzo Disk Quota module"
+	depends on VZ_QUOTA=m
+	default n
+	help
+	  Makes the Virtuozzo Disk Quota module unloadable.
+	  This does not work reliably yet.
+
+config VZ_QUOTA_UGID
+	bool "Per-user and per-group quota in Virtuozzo quota partitions"
+	depends on VZ_QUOTA!=n
+	default y
+
 config QUOTACTL
 	bool
 	depends on XFS_QUOTA || QUOTA
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/Makefile
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/Makefile	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/Makefile	2015-01-21 12:02:53.117963716 +0300
@@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
 obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
+
+obj-y				+= vzdquota/
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/dquot.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/dquot.c
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/dquot.c	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/dquot.c	2015-01-21 12:02:53.135963238 +0300
@@ -166,8 +166,9 @@ static struct quota_format_type *find_qu
 	struct quota_format_type *actqf;
 
 	spin_lock(&dq_list_lock);
-	for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
-	     actqf = actqf->qf_next)
+	for (actqf = quota_formats;
+		 actqf && (actqf->qf_fmt_id != id || actqf->qf_ops == NULL);
+						 actqf = actqf->qf_next)
 		;
 	if (!actqf || !try_module_get(actqf->qf_owner)) {
 		int qm;
@@ -225,8 +226,6 @@ static struct hlist_head *dquot_hash;
 struct dqstats dqstats;
 EXPORT_SYMBOL(dqstats);
 
-static qsize_t inode_get_rsv_space(struct inode *inode);
-
 static inline unsigned int
 hashfn(const struct super_block *sb, unsigned int id, int type)
 {
@@ -1280,6 +1279,9 @@ int dquot_initialize(struct inode *inode
 		/* Avoid races with quotaoff() */
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
+		/* We could race with quotaon or dqget() could have failed */
+		if (!got[cnt])
+			continue;
 		if (!inode->i_dquot[cnt]) {
 			inode->i_dquot[cnt] = got[cnt];
 			got[cnt] = NULL;
@@ -1382,7 +1384,7 @@ void inode_sub_rsv_space(struct inode *i
 }
 EXPORT_SYMBOL(inode_sub_rsv_space);
 
-static qsize_t inode_get_rsv_space(struct inode *inode)
+qsize_t inode_get_rsv_space(struct inode *inode)
 {
 	qsize_t ret;
 
@@ -1393,8 +1395,9 @@ static qsize_t inode_get_rsv_space(struc
 	spin_unlock(&inode->i_lock);
 	return ret;
 }
+EXPORT_SYMBOL(inode_get_rsv_space);
 
-static void inode_incr_space(struct inode *inode, qsize_t number,
+void inode_incr_space(struct inode *inode, qsize_t number,
 				int reserve)
 {
 	if (reserve)
@@ -1402,14 +1405,16 @@ static void inode_incr_space(struct inod
 	else
 		inode_add_bytes(inode, number);
 }
+EXPORT_SYMBOL(inode_incr_space);
 
-static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
+void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
 {
 	if (reserve)
 		inode_sub_rsv_space(inode, number);
 	else
 		inode_sub_bytes(inode, number);
 }
+EXPORT_SYMBOL(inode_decr_space);
 
 /*
  * Following four functions update i_blocks+i_bytes fields and
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/quota.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/quota.c
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/quota.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/quota.c	2015-01-21 12:02:53.103964088 +0300
@@ -16,6 +16,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
+#include <linux/device_cgroup.h>
 #include <linux/quotaops.h>
 #include <linux/types.h>
 #include <net/netlink.h>
@@ -85,11 +86,11 @@ static int generic_quotactl_valid(struct
 	if (cmd == Q_GETQUOTA) {
 		if (((type == USRQUOTA && current_euid() != id) ||
 		     (type == GRPQUOTA && !in_egroup_p(id))) &&
-		    !capable(CAP_SYS_ADMIN))
+		    !capable(CAP_VE_SYS_ADMIN))
 			return -EPERM;
 	}
 	else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO)
-		if (!capable(CAP_SYS_ADMIN))
+		if (!capable(CAP_VE_SYS_ADMIN))
 			return -EPERM;
 
 	return 0;
@@ -137,10 +138,10 @@ static int xqm_quotactl_valid(struct sup
 	if (cmd == Q_XGETQUOTA) {
 		if (((type == XQM_USRQUOTA && current_euid() != id) ||
 		     (type == XQM_GRPQUOTA && !in_egroup_p(id))) &&
-		     !capable(CAP_SYS_ADMIN))
+		     !capable(CAP_VE_SYS_ADMIN))
 			return -EPERM;
 	} else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
-		if (!capable(CAP_SYS_ADMIN))
+		if (!capable(CAP_VE_SYS_ADMIN))
 			return -EPERM;
 	}
 
@@ -166,7 +167,7 @@ void sync_quota_sb(struct super_block *s
 {
 	int cnt;
 
-	if (!sb->s_qcop->quota_sync)
+	if (!sb->s_qcop || !sb->s_qcop->quota_sync)
 		return;
 
 	sb->s_qcop->quota_sync(sb, type);
@@ -190,6 +191,8 @@ void sync_quota_sb(struct super_block *s
 			continue;
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
+		if (!sb_dqopt(sb)->files[cnt])
+			continue;
 		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
 				  I_MUTEX_QUOTA);
 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -378,6 +381,7 @@ static struct super_block *quotactl_bloc
 	struct block_device *bdev;
 	struct super_block *sb;
 	struct filename *tmp = getname(special);
+	int error;
 
 	if (IS_ERR(tmp))
 		return ERR_CAST(tmp);
@@ -385,6 +389,13 @@ static struct super_block *quotactl_bloc
 	putname(tmp);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
+
+	error = devcgroup_inode_permission(bdev->bd_inode, MAY_QUOTACTL);
+	if (error) {
+		bdput(bdev);
+		return ERR_PTR(error);
+	}
+
 	if (quotactl_cmd_write(cmd))
 		sb = get_super_thawed(bdev);
 	else
@@ -399,6 +410,231 @@ static struct super_block *quotactl_bloc
 #endif
 }
 
+#ifdef CONFIG_QUOTA_COMPAT
+
+#define QC_QUOTAON  0x0100	/* enable quotas */
+#define QC_QUOTAOFF 0x0200	/* disable quotas */
+/* GETQUOTA, SETQUOTA and SETUSE, which were at 0x0300-0x0500, now have other parameters */
+#define QC_SYNC     0x0600	/* sync disk copy of a filesystem's quotas */
+#define QC_SETQLIM  0x0700	/* set limits */
+/* GETSTATS at 0x0800 is now longer... */
+#define QC_GETINFO  0x0900	/* get info about quotas - graces, flags... */
+#define QC_SETINFO  0x0A00	/* set info about quotas */
+#define QC_SETGRACE 0x0B00	/* set inode and block grace */
+#define QC_SETFLAGS 0x0C00	/* set flags for quota */
+#define QC_GETQUOTA 0x0D00	/* get limits and usage */
+#define QC_SETQUOTA 0x0E00	/* set limits and usage */
+#define QC_SETUSE   0x0F00	/* set usage */
+/* 0x1000 used by old RSQUASH */
+#define QC_GETSTATS 0x1100	/* get collected stats */
+
+struct compat_dqblk {
+	unsigned int dqb_ihardlimit;
+	unsigned int dqb_isoftlimit;
+	unsigned int dqb_curinodes;
+	unsigned int dqb_bhardlimit;
+	unsigned int dqb_bsoftlimit;
+	qsize_t dqb_curspace;
+	__kernel_time_t dqb_btime;
+	__kernel_time_t dqb_itime;
+};
+
+#ifdef CONFIG_COMPAT
+
+struct compat_compat_dqblk {
+	compat_uint_t	dqb_ihardlimit;
+	compat_uint_t	dqb_isoftlimit;
+	compat_uint_t	dqb_curinodes;
+	compat_uint_t	dqb_bhardlimit;
+	compat_uint_t	dqb_bsoftlimit;
+	compat_u64	dqb_curspace;
+	compat_time_t	dqb_btime;
+	compat_time_t	dqb_itime;
+};
+
+#endif
+
+struct compat_dqinfo {
+	unsigned int dqi_bgrace;
+	unsigned int dqi_igrace;
+	unsigned int dqi_flags;
+	unsigned int dqi_blocks;
+	unsigned int dqi_free_blk;
+	unsigned int dqi_free_entry;
+};
+
+struct compat_dqstats {
+	__u32 lookups;
+	__u32 drops;
+	__u32 reads;
+	__u32 writes;
+	__u32 cache_hits;
+	__u32 allocated_dquots;
+	__u32 free_dquots;
+	__u32 syncs;
+	__u32 version;
+};
+
+asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr);
+
+static long compat_quotactl(unsigned int cmds, unsigned int type,
+		const char __user *special, qid_t id,
+		void __user *addr)
+{
+	struct super_block *sb;
+	long ret;
+
+	sb = NULL;
+	switch (cmds) {
+		case QC_QUOTAON:
+			return sys_quotactl(QCMD(Q_QUOTAON, type),
+					special, id, addr);
+
+		case QC_QUOTAOFF:
+			return sys_quotactl(QCMD(Q_QUOTAOFF, type),
+					special, id, addr);
+
+		case QC_SYNC:
+			return sys_quotactl(QCMD(Q_SYNC, type),
+					special, id, addr);
+
+		case QC_GETQUOTA: {
+			struct if_dqblk idq;
+			struct compat_dqblk cdq;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id);
+			if (ret)
+				break;
+			ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
+			if (ret)
+				break;
+			cdq.dqb_ihardlimit = idq.dqb_ihardlimit;
+			cdq.dqb_isoftlimit = idq.dqb_isoftlimit;
+			cdq.dqb_curinodes = idq.dqb_curinodes;
+			cdq.dqb_bhardlimit = idq.dqb_bhardlimit;
+			cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit;
+			cdq.dqb_curspace = idq.dqb_curspace;
+			cdq.dqb_btime = idq.dqb_btime;
+			cdq.dqb_itime = idq.dqb_itime;
+			ret = 0;
+			if (copy_to_user(addr, &cdq, sizeof(cdq)))
+				ret = -EFAULT;
+			break;
+		}
+
+		case QC_SETQUOTA:
+		case QC_SETUSE:
+		case QC_SETQLIM: {
+			struct if_dqblk idq = {};
+			struct compat_dqblk cdq;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id);
+			if (ret)
+				break;
+			ret = -EFAULT;
+			if (copy_from_user(&cdq, addr, sizeof(cdq)))
+				break;
+			idq.dqb_ihardlimit = cdq.dqb_ihardlimit;
+			idq.dqb_isoftlimit = cdq.dqb_isoftlimit;
+			idq.dqb_curinodes = cdq.dqb_curinodes;
+			idq.dqb_bhardlimit = cdq.dqb_bhardlimit;
+			idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit;
+			idq.dqb_curspace = cdq.dqb_curspace;
+			idq.dqb_valid = 0;
+			if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM)
+				idq.dqb_valid |= QIF_LIMITS;
+			if (cmds == QC_SETQUOTA || cmds == QC_SETUSE)
+				idq.dqb_valid |= QIF_USAGE;
+			ret = sb->s_qcop->set_dqblk(sb, type, id, &idq);
+			break;
+		}
+
+		case QC_GETINFO: {
+			struct if_dqinfo iinf;
+			struct compat_dqinfo cinf;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id);
+			if (ret)
+				break;
+			ret = sb->s_qcop->get_info(sb, type, &iinf);
+			if (ret)
+				break;
+			cinf.dqi_bgrace = iinf.dqi_bgrace;
+			cinf.dqi_igrace = iinf.dqi_igrace;
+			cinf.dqi_flags = 0;
+			if (iinf.dqi_flags & DQF_INFO_DIRTY)
+				cinf.dqi_flags |= 0x0010;
+			cinf.dqi_blocks = 0;
+			cinf.dqi_free_blk = 0;
+			cinf.dqi_free_entry = 0;
+			ret = 0;
+			if (copy_to_user(addr, &cinf, sizeof(cinf)))
+				ret = -EFAULT;
+			break;
+		}
+
+		case QC_SETINFO:
+		case QC_SETGRACE:
+		case QC_SETFLAGS: {
+			struct if_dqinfo iinf;
+			struct compat_dqinfo cinf;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_valid(sb, type, Q_SETINFO, id);
+			if (ret)
+				break;
+			ret = -EFAULT;
+			if (copy_from_user(&cinf, addr, sizeof(cinf)))
+				break;
+			iinf.dqi_bgrace = cinf.dqi_bgrace;
+			iinf.dqi_igrace = cinf.dqi_igrace;
+			iinf.dqi_flags = cinf.dqi_flags;
+			iinf.dqi_valid = 0;
+			if (cmds == QC_SETINFO || cmds == QC_SETGRACE)
+				iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE;
+			if (cmds == QC_SETINFO || cmds == QC_SETFLAGS)
+				iinf.dqi_valid |= IIF_FLAGS;
+			ret = sb->s_qcop->set_info(sb, type, &iinf);
+			break;
+		}
+
+		case QC_GETSTATS: {
+			struct compat_dqstats stat;
+
+			memset(&stat, 0, sizeof(stat));
+			stat.version = 6*10000+5*100+0;
+			ret = 0;
+			if (copy_to_user(addr, &stat, sizeof(stat)))
+				ret = -EFAULT;
+			break;
+		}
+
+		default:
+			ret = -ENOSYS;
+			break;
+	}
+	if (sb && !IS_ERR(sb))
+		drop_super(sb);
+	return ret;
+}
+
+#endif
+
 /*
  * This is the system call interface. This communicates with
  * the user-level programs. Currently this only supports diskquota
@@ -415,6 +651,11 @@ SYSCALL_DEFINE4(quotactl, unsigned int, 
 	cmds = cmd >> SUBCMDSHIFT;
 	type = cmd & SUBCMDMASK;
 
+#ifdef CONFIG_QUOTA_COMPAT
+	if (cmds >= 0x0100 && cmds < 0x3000)
+		return compat_quotactl(cmds, type, special, id, addr);
+#endif
+
 	if (cmds != Q_SYNC || special) {
 		sb = quotactl_block(special, cmds);
 		if (IS_ERR(sb))
@@ -479,6 +720,11 @@ asmlinkage long sys32_quotactl(unsigned 
 	compat_uint_t data;
 	u16 xdata;
 	long ret;
+#ifdef CONFIG_QUOTA_COMPAT
+	struct compat_dqblk __user *cdq;
+	struct compat_compat_dqblk __user *compat_cdq;
+	compat_time_t time;
+#endif
 
 	cmds = cmd >> SUBCMDSHIFT;
 
@@ -539,6 +785,43 @@ asmlinkage long sys32_quotactl(unsigned 
 			break;
 		ret = 0;
 		break;
+#ifdef CONFIG_QUOTA_COMPAT
+	case QC_GETQUOTA:
+		cdq = compat_alloc_user_space(sizeof(struct compat_dqblk));
+		compat_cdq = addr;
+		ret = sys_quotactl(cmd, special, id, cdq);
+		if (ret)
+			break;
+		ret = -EFAULT;
+		if (copy_in_user(compat_cdq, cdq, sizeof(struct compat_compat_dqblk) -
+				offsetof(struct compat_compat_dqblk, dqb_curspace)) ||
+			copy_in_user(&compat_cdq->dqb_curspace, &cdq->dqb_curspace,
+				sizeof(cdq->dqb_curspace)) ||
+			get_user(time, &cdq->dqb_btime) ||
+			put_user(time, &compat_cdq->dqb_btime) ||
+			get_user(time, &cdq->dqb_itime) ||
+			put_user(time, &compat_cdq->dqb_itime))
+			break;
+		ret = 0;
+		break;
+	case QC_SETQUOTA:
+	case QC_SETUSE:
+	case QC_SETQLIM:
+		cdq = compat_alloc_user_space(sizeof(struct compat_dqblk));
+		compat_cdq = addr;
+		ret = -EFAULT;
+		if (copy_in_user(cdq, compat_cdq, sizeof(struct compat_compat_dqblk) -
+				offsetof(struct compat_compat_dqblk, dqb_curspace)) ||
+			copy_in_user(&cdq->dqb_curspace, &compat_cdq->dqb_curspace,
+				sizeof(cdq->dqb_curspace)) ||
+			get_user(time, &compat_cdq->dqb_btime) ||
+			put_user(time, &cdq->dqb_btime) ||
+			get_user(time, &compat_cdq->dqb_itime) ||
+			put_user(time, &cdq->dqb_itime))
+			break;
+		ret = sys_quotactl(cmd, special, id, cdq);
+		break;
+#endif
 	default:
 		ret = sys_quotactl(cmd, special, id, addr);
 	}
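The compat path is reached when the command byte extracted by cmd >> SUBCMDSHIFT falls into the 0x0100-0x2FFF window, i.e. when userspace packs one of the QC_* values into QCMD(). A hedged userspace sketch, assuming glibc's quotactl() wrapper and the struct compat_dqblk layout shown above (the device path and uid are illustrative; qsize_t is assumed to be 64 bits wide and __kernel_time_t to match time_t):

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/quota.h>  /* QCMD, USRQUOTA, quotactl() */

    /* Assumed to mirror the in-kernel struct compat_dqblk above. */
    struct compat_dqblk {
            unsigned int dqb_ihardlimit;
            unsigned int dqb_isoftlimit;
            unsigned int dqb_curinodes;
            unsigned int dqb_bhardlimit;
            unsigned int dqb_bsoftlimit;
            long long dqb_curspace;         /* qsize_t, byte granularity */
            time_t dqb_btime;
            time_t dqb_itime;
    };

    #define QC_GETQUOTA 0x0D00              /* from the table above */

    int main(void)
    {
            struct compat_dqblk dq;

            /* QCMD() shifts the command left by SUBCMDSHIFT (8), so the kernel
             * routes this call into compat_quotactl(). */
            if (quotactl(QCMD(QC_GETQUOTA, USRQUOTA), "/dev/sda1", 1000,
                         (caddr_t)&dq) != 0) {
                    perror("quotactl");
                    return 1;
            }
            printf("inodes: %u, space: %lld bytes\n",
                   dq.dqb_curinodes, dq.dqb_curspace);
            return 0;
    }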
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/Makefile
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/Makefile	2015-01-21 12:02:53.117963716 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/Makefile	2015-01-21 12:02:53.117963716 +0300
@@ -0,0 +1,4 @@
+obj-$(CONFIG_VZ_QUOTA)		+= vzdquota.o
+vzdquota-y			+= vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o
+vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o
+vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_file.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_file.c
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_file.c	2015-01-21 12:02:53.118963690 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_file.c	2015-01-21 12:02:53.411955913 +0300
@@ -0,0 +1,950 @@
+/*
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ * 
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file implements Virtuozzo quota files as proc entries.
+ * It is required for standard quota tools to work correctly, as they expect
+ * the aquota.user and aquota.group files.
+ */
+
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+#include <linux/mount.h>
+#include <linux/mnt_namespace.h>
+#include "../quotaio_v2.h"
+#include "../quota_tree.h"
+#include <asm/uaccess.h>
+
+#include <linux/sched.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/vzdq_tree.h>
+#include <linux/vzquota.h>
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+/* ----------------------------------------------------------------------
+ *
+ * File read operation
+ *
+ * FIXME: functions in this section (as well as many functions in vzdq_ugid.c,
+ * perhaps) abuse vz_quota_mutex.
+ * Taking a global mutex for lengthy and user-controlled operations inside
+ * VPSs is not a good idea in general.
+ * In this case, the reasons for taking this mutex are completely unclear,
+ * especially since the only function whose comments say it must be called
+ * under this mutex (create_proc_quotafile) is actually called OUTSIDE it.
+ *
+ * --------------------------------------------------------------------- */
+
+#define DQBLOCK_SIZE		1024
+#define DQUOTBLKNUM		21U
+#define DQTREE_DEPTH		4
+#define TREENUM_2_BLKNUM(num)	(((num) + 1) << 1)
+#define ISINDBLOCK(num)		((num)%2 != 0)
+#define FIRST_DATABLK	  	2  /* first even number */
+#define LAST_IND_LEVEL		(DQTREE_DEPTH - 1)
+#define CONVERT_LEVEL(level)	((level) * (QUOTAID_EBITS/QUOTAID_BBITS))
+#define GETLEVINDX(ind, lev)	(((ind) >> QUOTAID_BBITS*(lev)) \
+					& QUOTATREE_BMASK)
+
+#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH)
+#error xBITS and DQTREE_DEPTH do not correspond
+#endif
+
+#define BLOCK_NOT_FOUND	1
+
+/* data for quota file -- one per proc entry */
+struct quotatree_data {
+	struct list_head	list;
+	struct vz_quota_master	*qmblk;
+	int			type;	/* type of the tree */
+};
+
+/* serialized by vz_quota_mutex */
+static LIST_HEAD(qf_data_head);
+
+#define V2_REV0_INITQVERSIONS {\
+	0,		/* USRQUOTA */\
+	0		/* GRPQUOTA */\
+}
+
+static const u_int32_t vzquota_magics[] = V2_INITQMAGICS;
+static const u_int32_t vzquota_versions[] = V2_REV0_INITQVERSIONS;
+static const char aquota_user[] = "aquota.user";
+static const char aquota_group[] = "aquota.group";
+
+
+static inline loff_t get_depoff(int depth)
+{
+	loff_t res = 1;
+	while (depth) {
+		res += (1 << ((depth - 1)*QUOTAID_EBITS + 1));
+		depth--;
+	}
+	return res;
+}
+
+static inline loff_t get_blknum(loff_t num, int depth)
+{
+	loff_t res;
+	res = (num << 1) + get_depoff(depth);
+	return res;
+}
+
+static int get_depth(loff_t num)
+{
+	int i;
+	for (i = 0; i < DQTREE_DEPTH; i++) {
+		if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1
+				|| num < get_depoff(i + 1)))
+			return i;
+	}
+	return -1;
+}
+
+static inline loff_t get_offset(loff_t num)
+{
+	loff_t res, tmp;
+
+	tmp = get_depth(num);
+	if (tmp < 0)
+		return -1;
+	num -= get_depoff(tmp);
+	BUG_ON(num < 0);
+	res = num >> 1;
+
+	return res;
+}
+
+static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level)
+{
+	/* return maximum available block num */
+	return tree->levels[level].freenum;
+}
+
+static inline loff_t get_block_num(struct quotatree_tree *tree)
+{
+	loff_t ind_blk_num, quot_blk_num, max_ind, max_quot;
+
+	quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1);
+	max_quot = TREENUM_2_BLKNUM(quot_blk_num);
+	ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1));
+	max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL)
+		: get_blknum(ind_blk_num, 0);
+
+	return (max_ind > max_quot) ? max_ind + 1 : max_quot + 1;
+}
+
+/* Fill in the quota file header */
+static int read_header(void *buf, struct quotatree_tree *tree,
+	struct dq_kinfo *dq_ugid_info, int type)
+{
+	struct v2_disk_dqheader *dqh;
+	struct v2_disk_dqinfo *dq_disk_info;
+
+	dqh = buf;
+	dq_disk_info = buf + sizeof(struct v2_disk_dqheader);
+
+	dqh->dqh_magic = vzquota_magics[type];
+	dqh->dqh_version = vzquota_versions[type];
+
+	dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire;
+	dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire;
+	dq_disk_info->dqi_flags = 0;	/* no flags */
+	dq_disk_info->dqi_blocks = get_block_num(tree);
+	dq_disk_info->dqi_free_blk = 0;	/* first block in the file */
+	dq_disk_info->dqi_free_entry = FIRST_DATABLK;
+
+	return 0;
+}
+
+static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf)
+{
+	int i, j, lev_num;
+
+	lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1;
+	for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) {
+		struct quotatree_node *next, *parent;
+
+		parent = p;
+		next = p;
+		for (j = lev_num; j >= 0; j--) {
+			if (!next->blocks[GETLEVINDX(i,j)]) {
+				buf[i] = 0;
+				goto bad_branch;
+			}
+			parent = next;
+			next = next->blocks[GETLEVINDX(i,j)];
+		}
+		buf[i] = (depth == DQTREE_DEPTH - 1) ?
+			TREENUM_2_BLKNUM(parent->num)
+			: get_blknum(next->num, depth + 1);
+
+	bad_branch:
+		;
+	}
+
+	return 0;
+}
+
+/*
+ * Build an index block image in @buf
+ * @buf is 256*sizeof(u_int32_t) bytes long
+ */
+static int read_index_block(int num, u_int32_t *buf,
+		struct quotatree_tree *tree)
+{
+	struct quotatree_node *p;
+	u_int32_t index;
+	loff_t off;
+	int depth, res;
+
+	res = BLOCK_NOT_FOUND; 
+	index = 0;
+	depth = get_depth(num);
+	off = get_offset(num);
+	if (depth < 0 || off < 0)
+		return -EINVAL;
+
+	list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh,
+			list) {
+		if (p->num >= off)
+			res = 0;
+		if (p->num != off)
+			continue;
+		get_block_child(depth, p, buf);
+		break;
+	}
+
+	return res;
+}
+
+static inline void convert_quot_format(struct v2r0_disk_dqblk *dq,
+		struct vz_quota_ugid *vzq)
+{
+	dq->dqb_id = vzq->qugid_id;
+	dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit;
+	dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit;
+	dq->dqb_curinodes = vzq->qugid_stat.icurrent;
+	dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE;
+	dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE;
+	dq->dqb_curspace = vzq->qugid_stat.bcurrent;
+	dq->dqb_btime = vzq->qugid_stat.btime;
+	dq->dqb_itime = vzq->qugid_stat.itime;
+}
+
+static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree)
+{
+	int res, i, entries = 0;
+	struct qt_disk_dqdbheader *dq_header;
+	struct quotatree_node *p;
+	struct v2r0_disk_dqblk *blk = buf + sizeof(struct qt_disk_dqdbheader);
+
+	res = BLOCK_NOT_FOUND;
+	dq_header = buf;
+	memset(dq_header, 0, sizeof(*dq_header));
+
+	list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh),
+			list) {
+		if (TREENUM_2_BLKNUM(p->num) >= num)
+			res = 0;
+		if (TREENUM_2_BLKNUM(p->num) != num)
+			continue;
+
+		for (i = 0; i < QUOTATREE_BSIZE; i++) {
+			if (!p->blocks[i])
+				continue;
+			convert_quot_format(blk + entries,
+					(struct vz_quota_ugid *)p->blocks[i]);
+			entries++;
+			res = 0;
+		}
+		break;
+	}
+	dq_header->dqdh_entries = entries;
+
+	return res;
+}
+
+static int read_block(int num, void *buf, struct quotatree_tree *tree,
+	struct dq_kinfo *dq_ugid_info, int magic)
+{
+	int res;
+
+	memset(buf, 0, DQBLOCK_SIZE);
+	if (!num)
+		res = read_header(buf, tree, dq_ugid_info, magic);
+	else if (ISINDBLOCK(num))
+		res = read_index_block(num, (u_int32_t*)buf, tree);
+	else
+		res = read_dquot(num, buf, tree);
+
+	return res;
+}
+
+/*
+ * FIXME: this function can handle quota files up to 2GB only.
+ */
+static int read_proc_quotafile(char *page, off_t off, int count,
+		int *eof, void *data)
+{
+	off_t blk_num, blk_off, buf_off;
+	char *tmp;
+	size_t buf_size;
+	struct quotatree_data *qtd;
+	struct quotatree_tree *tree;
+	struct dq_kinfo *dqi;
+	int res;
+
+	tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	qtd = data;
+	mutex_lock(&vz_quota_mutex);
+	mutex_lock(&qtd->qmblk->dq_mutex);
+
+	res = 0;
+	tree = QUGID_TREE(qtd->qmblk, qtd->type);
+	if (!tree) {
+		*eof = 1;
+		goto out_dq;
+	}
+
+	dqi = &qtd->qmblk->dq_ugid_info[qtd->type];
+
+	buf_off = 0;
+	buf_size = count;
+	blk_num = off / DQBLOCK_SIZE;
+	blk_off = off % DQBLOCK_SIZE;
+
+	while (buf_size > 0) {
+		off_t len;
+
+		len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size);
+		res = read_block(blk_num, tmp, tree, dqi, qtd->type);
+		if (res < 0)
+			goto out_dq;
+		if (res == BLOCK_NOT_FOUND) {
+			*eof = 1;
+			break;
+		} 
+		memcpy(page + buf_off, tmp + blk_off, len);
+
+		blk_num++;
+		buf_size -= len;
+		blk_off = 0;
+		buf_off += len;
+	}
+	res = buf_off;
+
+out_dq:
+	mutex_unlock(&qtd->qmblk->dq_mutex);
+	mutex_unlock(&vz_quota_mutex);
+	kfree(tmp);
+
+	return res;
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * /proc/vz/vzaquota/QID/aquota.* files
+ *
+ * FIXME: this code lacks serialization of read/readdir/lseek.
+ * However, this should be fixed only after the mainstream issue of the
+ * apparently non-atomic read and update of the file position in sys_read
+ * is resolved.
+ *
+ * --------------------------------------------------------------------- */
+
+static inline unsigned long vzdq_aquot_getino(dev_t dev)
+{
+	return 0xec000000UL + dev;
+}
+
+static inline dev_t vzdq_aquot_getidev(struct inode *inode)
+{
+	return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link;
+}
+
+static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev)
+{
+	PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev;
+}
+
+static ssize_t vzdq_aquotf_read(struct file *file,
+		char __user *buf, size_t size, loff_t *ppos)
+{
+	char *page;
+	size_t bufsize;
+	ssize_t l, l2, copied;
+	struct inode *inode;
+	struct block_device *bdev;
+	struct super_block *sb;
+	struct quotatree_data data;
+	int eof, err;
+
+	err = -ENOMEM;
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out_err;
+
+	err = -ENODEV;
+	inode = file->f_dentry->d_inode;
+	bdev = bdget(vzdq_aquot_getidev(inode));
+	if (bdev == NULL)
+		goto out_err;
+	sb = get_super(bdev);
+	bdput(bdev);
+	if (sb == NULL)
+		goto out_err;
+	data.qmblk = vzquota_find_qmblk(sb);
+	data.type = PROC_I(inode)->fd - 1;
+	drop_super(sb);
+	if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD)
+		goto out_err;
+
+	copied = 0;
+	l = l2 = 0;
+	while (1) {
+		bufsize = min(size, (size_t)PAGE_SIZE);
+		if (bufsize <= 0)
+			break;
+
+		l = read_proc_quotafile(page, *ppos, bufsize,
+				&eof, &data);
+		if (l <= 0)
+			break;
+
+		l2 = copy_to_user(buf, page, l);
+		copied += l - l2;
+		if (l2)
+			break;
+
+		buf += l;
+		size -= l;
+		*ppos += l;
+		l = l2 = 0;
+	}
+
+	qmblk_put(data.qmblk);
+	free_page((unsigned long)page);
+	if (copied)
+		return copied;
+	else if (l2)		/* last copy_to_user failed */
+		return -EFAULT;
+	else			/* read error or EOF */
+		return l;
+
+out_err:
+	if (page != NULL)
+		free_page((unsigned long)page);
+	return err;
+}
+
+static struct file_operations vzdq_aquotf_file_operations = {
+	.read		= &vzdq_aquotf_read,
+};
+
+static struct inode_operations vzdq_aquotf_inode_operations = {
+};
+
+
+/* ----------------------------------------------------------------------
+ *
+ * /proc/vz/vzaquota/QID directory
+ *
+ * --------------------------------------------------------------------- */
+
+static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler)
+{
+	loff_t n;
+	int err;
+
+	n = file->f_pos;
+	for (err = 0; !err; n++) {
+		/* ppc32 can't cmp 2 long long's in switch, calls __cmpdi2() */
+		switch ((unsigned long)n) {
+		case 0:
+			err = (*filler)(data, ".", 1, n,
+					file->f_dentry->d_inode->i_ino,
+					DT_DIR);
+			break;
+		case 1:
+			err = (*filler)(data, "..", 2, n,
+					parent_ino(file->f_dentry), DT_DIR);
+			break;
+		case 2:
+			err = (*filler)(data, aquota_user,
+					sizeof(aquota_user)-1, n,
+					file->f_dentry->d_inode->i_ino
+								+ USRQUOTA + 1,
+					DT_REG);
+			break;
+		case 3:
+			err = (*filler)(data, aquota_group,
+					sizeof(aquota_group)-1, n,
+					file->f_dentry->d_inode->i_ino 
+								+ GRPQUOTA + 1,
+					DT_REG);
+			break;
+		default:
+			goto out;
+		}
+	}
+out:
+	file->f_pos = n;
+	return err;
+}
+
+struct vzdq_aquotq_lookdata {
+	dev_t dev;
+	int type;
+	struct vz_quota_master *qmblk;
+};
+
+static int vzdq_aquotq_looktest(struct inode *inode, void *data)
+{
+	struct vzdq_aquotq_lookdata *d;
+
+	d = data;
+	return inode->i_op == &vzdq_aquotf_inode_operations &&
+	       vzdq_aquot_getidev(inode) == d->dev &&
+	       PROC_I(inode)->fd == d->type + 1;
+}
+
+static int vzdq_aquotq_lookset(struct inode *inode, void *data)
+{
+	struct vzdq_aquotq_lookdata *d;
+	struct quotatree_tree *tree;
+
+	d = data;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1;
+	inode->i_mode = S_IFREG | S_IRUSR;
+	inode->i_uid = 0;
+	inode->i_gid = 0;
+	inode->i_nlink = 1;
+	inode->i_op = &vzdq_aquotf_inode_operations;
+	inode->i_fop = &vzdq_aquotf_file_operations;
+	PROC_I(inode)->fd = d->type + 1;
+	vzdq_aquot_setidev(inode, d->dev);
+
+	/* Setting size */
+	tree = QUGID_TREE(d->qmblk, d->type);
+	inode->i_size = get_block_num(tree) * 1024;
+	return 0;
+}
+
+static int vzdq_aquotq_revalidate(struct dentry *vdentry, struct nameidata *nd)
+{
+	return 0;
+}
+
+static struct dentry_operations vzdq_aquotq_dentry_operations = {
+	.d_revalidate	= &vzdq_aquotq_revalidate,
+};
+
+static struct vz_quota_master *find_qmblk_by_dev(dev_t dev)
+{
+	struct super_block *sb;
+	struct vz_quota_master *qmblk;
+
+	qmblk = NULL;
+	sb = user_get_super(dev);
+	if (sb != NULL) {
+		qmblk = vzquota_find_qmblk(sb);
+		drop_super(sb);
+
+		if (qmblk == VZ_QUOTA_BAD)
+			qmblk = NULL;
+	}
+
+	return qmblk;
+}
+
+static struct dentry *vzdq_aquotq_lookup(struct inode *dir,
+		struct dentry *dentry,
+		struct nameidata *nd)
+{
+	struct inode *inode;
+	struct vzdq_aquotq_lookdata d;
+	int k;
+
+	if (dentry->d_name.len == sizeof(aquota_user)-1) {
+		if (memcmp(dentry->d_name.name, aquota_user,
+					sizeof(aquota_user)-1))
+			goto out;
+		k = USRQUOTA;
+	} else if (dentry->d_name.len == sizeof(aquota_group)-1) {
+		if (memcmp(dentry->d_name.name, aquota_group,
+					sizeof(aquota_group)-1))
+			goto out;
+		k = GRPQUOTA;
+	} else
+		goto out;
+	d.dev = vzdq_aquot_getidev(dir);
+	d.type = k;
+	d.qmblk = find_qmblk_by_dev(d.dev);
+	if (d.qmblk == NULL)
+		goto out;
+
+	inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1,
+			vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d);
+
+	/* qmblk ref is not needed, we used it for i_size calculation only */
+	qmblk_put(d.qmblk);
+	if (inode == NULL)
+		goto out;
+
+	if (inode->i_state & I_NEW)
+		unlock_new_inode(inode);
+	dentry->d_op = &vzdq_aquotq_dentry_operations;
+	d_add(dentry, inode);
+	return NULL;
+
+out:
+	return ERR_PTR(-ENOENT);
+}
+
+static struct file_operations vzdq_aquotq_file_operations = {
+	.read		= &generic_read_dir,
+	.readdir	= &vzdq_aquotq_readdir,
+};
+
+static struct inode_operations vzdq_aquotq_inode_operations = {
+	.lookup		= &vzdq_aquotq_lookup,
+};
+
+
+/* ----------------------------------------------------------------------
+ *
+ * /proc/vz/vzaquota directory
+ *
+ * --------------------------------------------------------------------- */
+
+struct vzdq_aquot_de {
+	struct list_head list;
+	struct vfsmount *mnt;
+};
+
+static int vzdq_aquot_buildmntlist(struct ve_struct *ve,
+		struct list_head *head)
+{
+	struct vfsmount *mnt;
+	struct path root;
+	struct vzdq_aquot_de *p;
+	int err;
+
+#ifdef CONFIG_VE
+	root = ve->root_path;
+	path_get(&root);
+#else
+	get_fs_root(current->fs, &root);
+#endif
+	mnt = root.mnt;
+	spin_lock(&vfsmount_lock);
+	while (1) {
+		list_for_each_entry(p, head, list) {
+			if (p->mnt->mnt_sb == mnt->mnt_sb)
+				goto skip;
+		}
+
+		err = -ENOMEM;
+		p = kmalloc(sizeof(*p), GFP_ATOMIC);
+		if (p == NULL)
+			goto out;
+		p->mnt = mntget(mnt);
+		list_add_tail(&p->list, head);
+
+skip:
+		err = 0;
+		if (list_empty(&mnt->mnt_mounts)) {
+			while (1) {
+				if (mnt == root.mnt)
+					goto out;
+				if (mnt->mnt_child.next !=
+						&mnt->mnt_parent->mnt_mounts)
+					break;
+				mnt = mnt->mnt_parent;
+			}
+			mnt = list_entry(mnt->mnt_child.next,
+					struct vfsmount, mnt_child);
+		} else
+			mnt = list_entry(mnt->mnt_mounts.next,
+					struct vfsmount, mnt_child);
+	}
+out:
+	spin_unlock(&vfsmount_lock);
+	path_put(&root);
+	return err;
+}
+
+static void vzdq_aquot_releasemntlist(struct ve_struct *ve,
+		struct list_head *head)
+{
+	struct vzdq_aquot_de *p;
+
+	while (!list_empty(head)) {
+		p = list_entry(head->next, typeof(*p), list);
+		mntput(p->mnt);
+		list_del(&p->list);
+		kfree(p);
+	}
+}
+
+static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler)
+{
+	struct ve_struct *ve, *old_ve;
+	struct list_head mntlist;
+	struct vzdq_aquot_de *de;
+	struct super_block *sb;
+	struct vz_quota_master *qmblk;
+	loff_t i, n;
+	char buf[24];
+	int l, err;
+
+	i = 0;
+	n = file->f_pos;
+	ve = file->f_dentry->d_sb->s_type->owner_env;
+	old_ve = set_exec_env(ve);
+
+	INIT_LIST_HEAD(&mntlist);
+#ifdef CONFIG_VE
+	/*
+	 * The only reason for disabling readdir on the host system is that
+	 * it can be slow and CPU-consuming with a large number of VPSs
+	 * (or just mount points).
+	 */
+	err = ve_is_super(ve);
+#else
+	err = 0;
+#endif
+	if (!err) {
+		err = vzdq_aquot_buildmntlist(ve, &mntlist);
+		if (err)
+			goto out_err;
+	}
+
+	if (i >= n) {
+		if ((*filler)(data, ".", 1, i,
+					file->f_dentry->d_inode->i_ino, DT_DIR))
+			goto out_fill;
+	}
+	i++;
+
+	if (i >= n) {
+		if ((*filler)(data, "..", 2, i,
+					parent_ino(file->f_dentry), DT_DIR))
+			goto out_fill;
+	}
+	i++;
+
+	list_for_each_entry (de, &mntlist, list) {
+		sb = de->mnt->mnt_sb;
+		if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL))
+			continue;
+
+		qmblk = vzquota_find_qmblk(sb);
+		if (qmblk == NULL || qmblk == VZ_QUOTA_BAD)
+			continue;
+
+		qmblk_put(qmblk);
+		i++;
+		if (i <= n)
+			continue;
+
+		l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev));
+		if ((*filler)(data, buf, l, i - 1,
+					vzdq_aquot_getino(sb->s_dev), DT_DIR))
+			break;
+	}
+
+out_fill:
+	err = 0;
+	file->f_pos = i;
+out_err:
+	vzdq_aquot_releasemntlist(ve, &mntlist);
+	(void)set_exec_env(old_ve);
+	return err;
+}
+
+static int vzdq_aquotd_looktest(struct inode *inode, void *data)
+{
+	return inode->i_op == &vzdq_aquotq_inode_operations &&
+	       vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data;
+}
+
+static int vzdq_aquotd_lookset(struct inode *inode, void *data)
+{
+	dev_t dev;
+
+	dev = (dev_t)(unsigned long)data;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	inode->i_ino = vzdq_aquot_getino(dev);
+	inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
+	inode->i_uid = 0;
+	inode->i_gid = 0;
+	inode->i_nlink = 2;
+	inode->i_op = &vzdq_aquotq_inode_operations;
+	inode->i_fop = &vzdq_aquotq_file_operations;
+	vzdq_aquot_setidev(inode, dev);
+	return 0;
+}
+
+static struct dentry *vzdq_aquotd_lookup(struct inode *dir,
+		struct dentry *dentry,
+		struct nameidata *nd)
+{
+	struct ve_struct *ve, *old_ve;
+	const unsigned char *s;
+	int l;
+	dev_t dev;
+	struct inode *inode;
+
+	ve = dir->i_sb->s_type->owner_env;
+	old_ve = set_exec_env(ve);
+#ifdef CONFIG_VE
+	/*
+	 * Lookup is much lighter than readdir, so it could be allowed on the
+	 * host system.  But it would be strange to allow lookup without
+	 * readdir...
+	 */
+	if (ve_is_super(ve))
+		goto out;
+#endif
+
+	dev = 0;
+	l = dentry->d_name.len;
+	if (l <= 0)
+		goto out;
+	for (s = dentry->d_name.name; l > 0; s++, l--) {
+		if (!isxdigit(*s))
+			goto out;
+		if (dev & ~(~0UL >> 4))
+			goto out;
+		dev <<= 4;
+		if (isdigit(*s))
+			dev += *s - '0';
+		else if (islower(*s))
+			dev += *s - 'a' + 10;
+		else
+			dev += *s - 'A' + 10;
+	}
+	dev = new_decode_dev(dev);
+
+	if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL))
+		goto out;
+
+	inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev),
+			vzdq_aquotd_looktest, vzdq_aquotd_lookset,
+			(void *)(unsigned long)dev);
+	if (inode == NULL)
+		goto out;
+
+	if (inode->i_state & I_NEW)
+		unlock_new_inode(inode);
+
+	d_add(dentry, inode);
+	(void)set_exec_env(old_ve);
+	return NULL;
+
+out:
+	(void)set_exec_env(old_ve);
+	return ERR_PTR(-ENOENT);
+}
+
+static int vzdq_aquotd_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		struct kstat *stat)
+{
+	struct ve_struct *ve, *old_ve;
+	struct list_head mntlist, *pos;
+
+	generic_fillattr(dentry->d_inode, stat);
+	ve = dentry->d_sb->s_type->owner_env;
+#ifdef CONFIG_VE
+	/*
+	 * The only reason for disabling getattr on the host system is that
+	 * it can be slow and CPU-consuming with a large number of VPSs
+	 * (or just mount points).
+	 */
+	if (ve_is_super(ve))
+		return 0;
+#endif
+	INIT_LIST_HEAD(&mntlist);
+	old_ve = set_exec_env(ve);
+	if (!vzdq_aquot_buildmntlist(ve, &mntlist))
+		list_for_each(pos, &mntlist)
+			stat->nlink++;
+	vzdq_aquot_releasemntlist(ve, &mntlist);
+	(void)set_exec_env(old_ve);
+	return 0;
+}
+
+static struct file_operations vzdq_aquotd_file_operations = {
+	.read		= &generic_read_dir,
+	.readdir	= &vzdq_aquotd_readdir,
+};
+
+static struct inode_operations vzdq_aquotd_inode_operations = {
+	.lookup		= &vzdq_aquotd_lookup,
+	.getattr	= &vzdq_aquotd_getattr,
+};
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Initialization and deinitialization
+ *
+ * --------------------------------------------------------------------- */
+static int fake_data;
+static struct ctl_table fake_table[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= ".fake",
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec,
+		.data		= &fake_data,
+		.maxlen		= sizeof(int),
+	},
+	{ }
+};
+
+static struct ctl_path fake_path[] = {
+	{ .ctl_name = CTL_FS, .procname = "fs", },
+	{ .ctl_name = FS_DQSTATS, .procname = "quota", },
+	{ }
+};
+
+/*
+ * FIXME: creation of proc entries here is unsafe with respect to module
+ * unloading.
+ */
+void vzaquota_init(void)
+{
+	struct proc_dir_entry *de;
+
+	de = proc_create("vzaquota", S_IFDIR | S_IRUSR | S_IXUSR,
+			glob_proc_vz_dir, &vzdq_aquotd_file_operations);
+	if (de != NULL)
+		de->proc_iops = &vzdq_aquotd_inode_operations;
+	else
+		printk(KERN_WARNING "VZDQ: vz/vzaquota creation failed\n");
+
+	register_sysctl_glob_paths(fake_path, fake_table, 1);
+}
+
+void vzaquota_fini(void)
+{
+	remove_proc_entry("vz/vzaquota", NULL);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_mgmt.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_mgmt.c
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_mgmt.c	2015-01-21 12:02:53.118963690 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_mgmt.c	2015-01-21 12:02:53.404956099 +0300
@@ -0,0 +1,1274 @@
+/*
+ * Copyright (C) 2001, 2002, 2004, 2005  SWsoft
+ * All rights reserved.
+ * 
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/quota.h>
+#include <linux/vzctl_quota.h>
+#include <linux/vzquota.h>
+#include <linux/buffer_head.h>
+
+
+/* ----------------------------------------------------------------------
+ * Switching quota on.
+ * --------------------------------------------------------------------- */
+
+/*
+ * check limits copied from user
+ */
+int vzquota_check_sane_limits(struct dq_kstat *qstat)
+{
+	int err;
+
+	err = -EINVAL;
+
+	/* the soft limit must not exceed the hard limit */
+	if (qstat->bsoftlimit > qstat->bhardlimit)
+		goto out;
+
+	if (qstat->isoftlimit > qstat->ihardlimit)
+		goto out;
+
+	err = 0;
+out:
+	return err;
+}
+
+/*
+ * check usage values copied from user
+ */
+int vzquota_check_sane_values(struct dq_kstat *qstat)
+{
+	int err;
+
+	err = -EINVAL;
+
+	/* expiration time must not be set if softlimit was not exceeded */
+	if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != 0)
+		goto out;
+
+	if (qstat->icurrent < qstat->isoftlimit && qstat->itime != 0)
+		goto out;
+
+	err = vzquota_check_sane_limits(qstat);
+out:
+	return err;
+}
+
+/*
+ * create a new quota master block
+ * this function should:
+ *  - copy limits and usage parameters from the user buffer;
+ *  - allocate and initialize the quota block and insert it into the hash.
+ */
+static int vzquota_create(unsigned int quota_id,
+		struct vz_quota_stat __user *u_qstat, int compat)
+{
+	int err;
+	struct vz_quota_stat uqstat;
+	struct vz_quota_kstat qstat;
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -EFAULT;
+	if (!compat) {
+		if (copy_from_user(&uqstat, u_qstat, sizeof(uqstat)))
+			goto out;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_vz_quota_stat cqstat;
+		if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat)))
+			goto out;
+		compat_dqstat2dqstat(&cqstat.dq_stat, &uqstat.dq_stat);
+		compat_dqinfo2dqinfo(&cqstat.dq_info, &uqstat.dq_info);
+#endif
+	}
+	user_dqstat2dqstat(&uqstat.dq_stat, &qstat.dq_stat);
+	user_dqinfo2dqinfo(&uqstat.dq_info, &qstat.dq_info);
+
+	err = -EINVAL;
+	if (quota_id == 0)
+		goto out;
+
+	if (vzquota_check_sane_values(&qstat.dq_stat))
+		goto out;
+	err = 0;
+	qmblk = vzquota_alloc_master(quota_id, &qstat);
+
+	if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */
+		err = PTR_ERR(qmblk);
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+/**
+ * vzquota_on - turn quota on
+ *
+ * This function should:
+ *  - find and take a reference on the directory entry for the quota root
+ *    and the corresponding mountpoint;
+ *  - find corresponding quota block and mark it with given path;
+ *  - check quota tree;
+ *  - initialize quota for the tree root.
+ */
+
+static int __vzquota_on(struct vz_quota_master *qmblk, struct path *path,
+		struct super_block **psb, char __user *buf)
+{
+	int err;
+
+	qmblk->dq_root_path = *path;
+	qmblk->dq_sb = path->dentry->d_inode->i_sb;
+
+	err = vzquota_get_super(qmblk->dq_sb);
+	if (err)
+		goto out_super;
+
+	/*
+	 * Serialization with quota initialization and operations is performed
+	 * through generation check: generation is memorized before qmblk is
+	 * found and compared under inode_qmblk_lock with assignment.
+	 *
+	 * Note that the dentry tree is shrunk only for high-level logical
+	 * serialization, purely as a courtesy to the user: to get consistent
+	 * quota statistics, files should be closed, etc., when quota is
+	 * turned on.
+	 */
+	err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_path.dentry->d_inode,
+			qmblk, buf);
+	if (err)
+		goto out_init;
+
+	qmblk->dq_state = VZDQ_WORKING;
+	return 0;
+
+out_init:
+	*psb = qmblk->dq_sb;
+out_super:
+	qmblk->dq_sb = NULL;
+	qmblk->dq_root_path.dentry = NULL;
+	qmblk->dq_root_path.mnt = NULL;
+
+	return err;
+}
+
+static int vzquota_on(unsigned int quota_id, const char __user *quota_root,
+					char __user *buf)
+{
+	int err;
+	struct path path;
+	struct vz_quota_master *qmblk;
+	struct super_block *dqsb;
+
+	dqsb = NULL;
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EBUSY;
+	if (qmblk->dq_state != VZDQ_STARTING)
+		goto out;
+
+	err = user_path(quota_root, &path);
+	if (err)
+		goto out;
+	/* init path must be a directory */
+	err = -ENOTDIR;
+	if (!S_ISDIR(path.dentry->d_inode->i_mode))
+		goto out_path;
+
+	err = __vzquota_on(qmblk, &path, &dqsb, buf);
+	if (err)
+		goto out_path;
+
+	mutex_unlock(&vz_quota_mutex);
+	return 0;
+
+out_path:
+	path_put(&path);
+out:
+	if (dqsb)
+		vzquota_put_super(dqsb);
+	mutex_unlock(&vz_quota_mutex);
+	return err;
+}
+
+static struct inode *vzquota_open_file(unsigned int quota_id, struct super_block *sb)
+{
+	struct dentry *qd;
+	int len;
+	char name[32];
+	struct inode *ino;
+
+	len = sprintf(name, VZQUOTA_ROOT_FILE, quota_id);
+	qd = lookup_one_len(name, sb->s_root, len);
+	if (IS_ERR(qd)) {
+		ino = (struct inode *)qd;
+		goto out;
+	}
+
+	if (qd->d_inode != NULL)
+		ino = igrab(qd->d_inode);
+	else
+		ino = ERR_PTR(-ENOENT);
+
+	dput(qd);
+out:
+	return ino;
+}
+
+static int vzquota_check_file(struct super_block *sb, struct inode *ino)
+{
+	struct vz_quota_hdr qhead;
+	ssize_t size;
+
+	/*
+	 * fixup changes on bdev
+	 */
+
+	invalidate_bdev(sb->s_bdev);
+
+	size = sb->s_op->quota_read_ino(sb, ino, (char *)&qhead, sizeof(qhead), 0);
+	if (size != sizeof(qhead))
+		return -EINVAL;
+
+	if (le32_to_cpu(qhead.magic) != VZQUOTA_MAGIC ||
+			le32_to_cpu(qhead.version) != VZQUOTA_VERSION_0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int vzquota_read_file(struct inode *ino, struct vz_quota_kstat *qstat)
+{
+	int ret;
+	struct super_block *sb = ino->i_sb;
+	struct vz_quota_stat_img dqstat;
+	ssize_t size;
+
+	if (sb->s_op->quota_read_ino == NULL)
+		return -EOPNOTSUPP;
+
+	ret = vzquota_check_file(sb, ino);
+	if (ret)
+		return ret;
+
+	size = sb->s_op->quota_read_ino(sb, ino,
+			(char *)&dqstat, sizeof(dqstat), VZQUOTA_STAT_OFF);
+	if (size != sizeof(dqstat))
+		return -EINVAL;
+
+	qstat->dq_stat.bhardlimit = le64_to_cpu(dqstat.bhardlimit);
+	qstat->dq_stat.bsoftlimit = le64_to_cpu(dqstat.bsoftlimit);
+	qstat->dq_stat.btime = le64_to_cpu(dqstat.btime);
+	qstat->dq_stat.bcurrent = le64_to_cpu(dqstat.bcurrent);
+
+	qstat->dq_stat.ihardlimit = le32_to_cpu(dqstat.ihardlimit);
+	qstat->dq_stat.isoftlimit = le32_to_cpu(dqstat.isoftlimit);
+	qstat->dq_stat.itime = le64_to_cpu(dqstat.itime);
+	qstat->dq_stat.icurrent = le32_to_cpu(dqstat.icurrent);
+
+	qstat->dq_info.bexpire = le64_to_cpu(dqstat.bexpire);
+	qstat->dq_info.iexpire = le64_to_cpu(dqstat.iexpire);
+	qstat->dq_info.flags = le32_to_cpu(dqstat.flags);
+
+	return 0;
+}
+
+static int vzquota_on_file(unsigned int quota_id, const char __user *quota_root,
+					char __user *buf)
+{
+	int ret;
+	struct path path;
+	struct inode *ino;
+	struct vz_quota_kstat qstat;
+	struct vz_quota_master *qmblk = NULL;
+	struct super_block *dqsb = NULL;
+
+	ret = -EINVAL;
+	if (quota_id == 0)
+		goto out;
+
+	ret = user_path(quota_root, &path);
+	if (ret)
+		goto out;
+
+	ret = -ENOTDIR;
+	if (!S_ISDIR(path.dentry->d_inode->i_mode))
+		goto out_path;
+
+	ino = vzquota_open_file(quota_id, path.dentry->d_sb);
+	if (IS_ERR(ino)) {
+		ret = PTR_ERR(ino);
+		goto out_path;
+	}
+
+	ret = vzquota_read_file(ino, &qstat);
+	if (ret < 0)
+		goto out_iput;
+
+	ret = vzquota_check_sane_values(&qstat.dq_stat);
+	if (ret)
+		goto out_iput;
+
+	mutex_lock(&vz_quota_mutex);
+	ret = -EBUSY;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk != NULL) {
+		/* vzquota_find_master() takes no reference; do not let
+		 * the common exit path drop one */
+		qmblk = NULL;
+		goto out_unlock;
+	}
+
+	qmblk = vzquota_alloc_master(quota_id, &qstat);
+	if (IS_ERR(qmblk)) {
+		qmblk = NULL;
+		ret = PTR_ERR(qmblk);
+		goto out_unlock;
+	}
+
+	ret = vzquota_read_ugid(qmblk, ino);
+	if (ret)
+		goto out_kill_qmblk;
+
+	ret = __vzquota_on(qmblk, &path, &dqsb, buf);
+	if (ret)
+		goto out_kill_qmblk;
+
+	qmblk->qfile = ino;
+	mutex_unlock(&vz_quota_mutex);
+	return 0;
+
+out_kill_qmblk:
+	list_del_init(&qmblk->dq_hash);
+out_unlock:
+	mutex_unlock(&vz_quota_mutex);
+out_iput:
+	iput(ino);
+out_path:
+	path_put(&path);
+out:
+	if (dqsb)
+		vzquota_put_super(dqsb);
+	if (qmblk)
+		qmblk_put(qmblk);
+	return ret;
+}
+
+int vzquota_on_cookie(struct super_block *sb, unsigned int cookie)
+{
+	int err = 0;
+	struct vz_quota_master *qmblk;
+	struct inode *ino;
+	struct vz_quota_kstat qstat;
+
+	mutex_lock(&vz_quota_mutex);
+	qmblk = vzquota_find_master(cookie);
+	if (qmblk != NULL) {
+		if (qmblk->dq_state == VZDQ_ORPHAN_CLEANUP)
+			goto out_ok;
+
+		printk(KERN_WARNING "VZDQ: tried to clean orphans on a qmblk in state %d\n",
+				qmblk->dq_state);
+		err = -EBUSY;
+		goto out;
+	}
+
+	ino = vzquota_open_file(cookie, sb);
+	if (IS_ERR(ino)) {
+		err = PTR_ERR(ino);
+		goto out;
+	}
+
+	err = vzquota_read_file(ino, &qstat);
+	if (err)
+		goto out_iput;
+
+	qmblk = vzquota_alloc_master(cookie, &qstat);
+	if (IS_ERR(qmblk)) {
+		err = PTR_ERR(qmblk);
+		goto out_iput;
+	}
+
+	err = vzquota_get_super(sb);
+	if (err)
+		goto out_qput;
+
+	qmblk->dq_sb = sb;
+	qmblk->qfile = ino;
+	qmblk->dq_state = VZDQ_ORPHAN_CLEANUP;
+out_ok:
+	vzquota_cur_qmblk_orphan_set(qmblk);
+out:
+	mutex_unlock(&vz_quota_mutex);
+	return err;
+
+out_qput:
+	qmblk_put(qmblk);
+out_iput:
+	iput(ino);
+	goto out;
+}
+
+void vzquota_off_cookies(struct super_block *sb)
+{
+	int i;
+	struct vz_quota_master *qmblk;
+
+	vzquota_cur_qmblk_orphan_set(NULL);
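+	/*
+	 * vz_quota_mutex has to be dropped before each qmblk is released
+	 * below, so the hash scan is restarted from scratch every time.
+	 */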
+again:
+	mutex_lock(&vz_quota_mutex);
+	for (i = 0; i < vzquota_hash_size; i++) {
+		list_for_each_entry(qmblk, &vzquota_hash_table[i], dq_hash) {
+			if (qmblk->dq_state != VZDQ_ORPHAN_CLEANUP)
+				continue;
+			if (qmblk->dq_sb != sb)
+				continue;
+
+			list_del_init(&qmblk->dq_hash);
+			vzquota_put_super(qmblk->dq_sb);
+			mutex_unlock(&vz_quota_mutex);
+
+			iput(qmblk->qfile);
+			qmblk_put(qmblk);
+
+			goto again;
+		}
+	}
+	mutex_unlock(&vz_quota_mutex);
+
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, 1);
+	sync_blockdev(sb->s_bdev);
+}
+
+/* ----------------------------------------------------------------------
+ * Switching quota off.
+ * --------------------------------------------------------------------- */
+
+static void vzquota_stat_dump(struct vz_quota_master *qmblk,
+		struct vz_quota_stat_img *img)
+{
+	img->bhardlimit = cpu_to_le64(qmblk->dq_stat.bhardlimit);
+	img->bsoftlimit = cpu_to_le64(qmblk->dq_stat.bsoftlimit);
+	img->btime = cpu_to_le64(qmblk->dq_stat.btime);
+	img->bcurrent = cpu_to_le64(qmblk->dq_stat.bcurrent);
+
+	img->ihardlimit = cpu_to_le32(qmblk->dq_stat.ihardlimit);
+	img->isoftlimit = cpu_to_le32(qmblk->dq_stat.isoftlimit);
+	img->itime = cpu_to_le64(qmblk->dq_stat.itime);
+	img->icurrent = cpu_to_le32(qmblk->dq_stat.icurrent);
+
+	img->bexpire = cpu_to_le64(qmblk->dq_info.bexpire);
+	img->iexpire = cpu_to_le64(qmblk->dq_info.iexpire);
+	img->flags = cpu_to_le64(qmblk->dq_info.flags);
+}
+
+static int vzquota_stat_write(struct inode *ino, struct vz_quota_stat_img *dqstat)
+{
+	ssize_t size;
+	struct super_block *sb = ino->i_sb;
+
+	size = sb->s_op->quota_write_ino(sb, ino,
+			(char *)dqstat, sizeof(*dqstat), VZQUOTA_STAT_OFF);
+
+	return (size == sizeof(*dqstat)) ? 0 : -EIO;
+}
+
+static int vzquota_write_file(struct vz_quota_master *qmblk,
+		struct inode *ino, struct vz_quota_ugid **ugid)
+{
+	struct vz_quota_stat_img dqstat;
+	struct vz_quota_uginfo_img uginfo;
+	unsigned char ubuf[VZQUOTA_UGID_ITEM_SIZE];
+	unsigned char gbuf[VZQUOTA_UGID_ITEM_SIZE];
+
+	/* FIXME - this locking is not very good */
+	mutex_lock(&qmblk->dq_write_lock);
+	qmblk_data_read_lock(qmblk);
+
+	vzquota_stat_dump(qmblk, &dqstat);
+	vzquota_uginfo_dump(qmblk, &uginfo);
+	if (ugid[0] != NULL)
+		vzquota_ugid_dump(ugid[0],
+				(struct vz_quota_ugid_stat_img *)ubuf);
+	if (ugid[1] != NULL)
+		vzquota_ugid_dump(ugid[1],
+				(struct vz_quota_ugid_stat_img *)gbuf);
+
+	qmblk_data_read_unlock(qmblk);
+
+	if (vzquota_stat_write(ino, &dqstat))
+		goto err;
+
+	if (vzquota_uginfo_write(ino, &uginfo))
+		goto err;
+
+	if (ugid[0] != NULL &&
+			vzquota_ugid_write(ino,
+				(struct vz_quota_ugid_stat_img *)ubuf,
+				ugid[0]->qugid_id, 0))
+		goto err;
+	if (ugid[1] != NULL &&
+			vzquota_ugid_write(ino,
+				(struct vz_quota_ugid_stat_img *)gbuf,
+				ugid[1]->qugid_id, 1))
+		goto err;
+
+	mutex_unlock(&qmblk->dq_write_lock);
+	return 0;
+
+err:
+	mutex_unlock(&qmblk->dq_write_lock);
+	return -1;
+}
+
+static int vzquota_write_ugids(struct vz_quota_master *qmblk,
+		struct inode *ino, struct vz_quota_ugid **ugid)
+{
+	unsigned char ubuf[2][VZQUOTA_UGID_ITEM_SIZE];
+	unsigned char gbuf[2][VZQUOTA_UGID_ITEM_SIZE];
+
+	/* FIXME - this locking is not very good */
+	mutex_lock(&qmblk->dq_write_lock);
+	qmblk_data_read_lock(qmblk);
+
+	if (ugid[0] != NULL) {
+		vzquota_ugid_dump(ugid[0],
+				(struct vz_quota_ugid_stat_img *)ubuf[0]);
+		vzquota_ugid_dump(ugid[0 + MAXQUOTAS],
+				(struct vz_quota_ugid_stat_img *)ubuf[1]);
+	}
+	if (ugid[1] != NULL) {
+		vzquota_ugid_dump(ugid[1],
+				(struct vz_quota_ugid_stat_img *)gbuf[0]);
+		vzquota_ugid_dump(ugid[1 + MAXQUOTAS],
+				(struct vz_quota_ugid_stat_img *)gbuf[1]);
+	}
+
+	qmblk_data_read_unlock(qmblk);
+
+	if (ugid[0] != NULL) {
+		if (vzquota_ugid_write(ino,
+				(struct vz_quota_ugid_stat_img *)ubuf[0],
+				ugid[0]->qugid_id, 0))
+			goto err;
+		if (vzquota_ugid_write(ino,
+				(struct vz_quota_ugid_stat_img *)ubuf[1],
+				ugid[0 + MAXQUOTAS]->qugid_id, 0))
+			goto err;
+	}
+
+	if (ugid[1] != NULL) {
+		if (vzquota_ugid_write(ino,
+				(struct vz_quota_ugid_stat_img *)gbuf[0],
+				ugid[1]->qugid_id, 1))
+			goto err;
+		if (vzquota_ugid_write(ino,
+				(struct vz_quota_ugid_stat_img *)gbuf[1],
+				ugid[1 + MAXQUOTAS]->qugid_id, 1))
+			goto err;
+	}
+
+	mutex_unlock(&qmblk->dq_write_lock);
+	return 0;
+
+err:
+	mutex_unlock(&qmblk->dq_write_lock);
+	return -1;
+}
+
+static void vzquota_sync_file(struct vz_quota_master *qmblk, struct inode *ino)
+{
+	struct super_block *sb = ino->i_sb;
+	struct vz_quota_ugid *no_ugids[2] = { NULL, NULL };
+
+	vzquota_write_file(qmblk, ino, no_ugids);
+
+	/*
+	 * FIXME - this is taken from quota.c, they know this is slow
+	 *         and don't know how to fix it :(
+	 */
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, 1);
+	sync_blockdev(sb->s_bdev);
+	invalidate_inode_pages(ino->i_mapping);
+}
+
+/*
+ * destroy quota block by ID
+ */
+static int __vzquota_destroy(struct vz_quota_master *qmblk)
+{
+	int err;
+	struct path root;
+	struct inode *qfile;
+
+	err = -EBUSY;
+	if (qmblk->dq_state == VZDQ_WORKING)
+		goto out;
+
+	qfile = qmblk->qfile;
+	qmblk->qfile = NULL;
+
+	list_del_init(&qmblk->dq_hash);
+	root = qmblk->dq_root_path;
+	qmblk->dq_root_path.dentry = NULL;
+	qmblk->dq_root_path.mnt = NULL;
+
+	if (qmblk->dq_sb)
+		vzquota_put_super(qmblk->dq_sb);
+	mutex_unlock(&vz_quota_mutex);
+
+	path_put(&root);
+
+	if (qfile != NULL) {
+		vzquota_sync_file(qmblk, qfile);
+		iput(qfile);
+	}
+	qmblk_put(qmblk);
+
+	return 0;
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	return err;
+}
+
+static int vzquota_destroy(unsigned int quota_id)
+{
+	int ret;
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex);
+	ret = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk)
+		return __vzquota_destroy(qmblk);
+
+	mutex_unlock(&vz_quota_mutex);
+	return ret;
+}
+
+/**
+ * vzquota_off - turn quota off
+ */
+
+static int __vzquota_sync_list(struct list_head *lh,
+		struct vz_quota_master *qmblk, int sync)
+{
+	LIST_HEAD(list);
+	struct vz_quota_ilink *qlnk;
+	struct inode *inode;
+	int err, ret;
+
+	err = ret = 0;
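+	/*
+	 * Processed ilinks are parked on a private list so that the walk
+	 * survives dropping inode_qmblk_lock around write_inode_now();
+	 * the list is spliced back once the whole chain has been written.
+	 */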
+	while (!list_empty(lh)) {
+		if (need_resched()) {
+			inode_qmblk_unlock(qmblk->dq_sb);
+			schedule();
+			inode_qmblk_lock(qmblk->dq_sb);
+			continue;
+		}
+
+		qlnk = list_first_entry(lh, struct vz_quota_ilink, list);
+		list_move(&qlnk->list, &list);
+
+		inode = igrab(QLNK_INODE(qlnk));
+		if (!inode)
+			continue;
+
+		inode_qmblk_unlock(qmblk->dq_sb);
+		ret = write_inode_now(inode, sync);
+		if (ret)
+			err = ret;
+		iput(inode);
+
+		inode_qmblk_lock(qmblk->dq_sb);
+	}
+
+	list_splice(&list, lh);
+	return err;
+}
+
+static int vzquota_sync_list(struct list_head *lh,
+		struct vz_quota_master *qmblk)
+{
+	(void)__vzquota_sync_list(lh, qmblk, 0);
+	return __vzquota_sync_list(lh, qmblk, 1);
+}
+
+static int vzquota_sync_inodes(struct vz_quota_master *qmblk)
+{
+	int err;
+	LIST_HEAD(qlnk_list);
+
+	list_splice_init(&qmblk->dq_ilink_list, &qlnk_list);
+	err = vzquota_sync_list(&qlnk_list, qmblk);
+	if (!err && !list_empty(&qmblk->dq_ilink_list))
+		err = -EBUSY;
+	list_splice(&qlnk_list, &qmblk->dq_ilink_list);
+
+	return err;
+}
+
+static int __vzquota_off(struct vz_quota_master *qmblk, char __user *buf, int force)
+{
+	int err, ret;
+
+	err = -EALREADY;
+	if (qmblk->dq_state != VZDQ_WORKING)
+		goto out;
+
+	inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */
+	ret = vzquota_sync_inodes(qmblk);
+	inode_qmblk_unlock(qmblk->dq_sb);
+
+	err = vzquota_off_qmblk(qmblk->dq_sb, qmblk, buf, force);
+	if (err)
+		goto out;
+
+	err = ret;
+	/* vzquota_destroy will free resources */
+	qmblk->dq_state = VZDQ_STOPING;
+
+out:
+	return err;
+}
+
+static int vzquota_off(unsigned int quota_id, char __user *buf, int force)
+{
+	int ret;
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex);
+	ret = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	ret = -EINVAL;
+	if (qmblk->qfile != NULL)
+		goto out;
+
+	ret = __vzquota_off(qmblk, buf, force);
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return ret;
+}
+
+static int vzquota_off_file(unsigned int quota_id, char __user *buf)
+{
+	int ret;
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex);
+	ret = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	ret = -EINVAL;
+	if (qmblk->qfile == NULL)
+		goto out;
+
+	ret = __vzquota_off(qmblk, buf, 0);
+	if (ret == 0)
+		return __vzquota_destroy(qmblk);
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	return ret;
+}
+
+void __vzquota_mark_dirty(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid **ugid)
+{
+	int ret;
+
+	ret = vzquota_write_file(qmblk, qmblk->qfile, ugid);
+	if (ret)
+		printk(KERN_ERR "vzdq: error writing quota; "
+				"quota will be inconsistent in case of a crash\n");
+}
+
+void __vzquota_mark_dirty_ugids(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid **ugids)
+{
+	int ret;
+
+	ret = vzquota_write_ugids(qmblk, qmblk->qfile, ugids);
+	if (ret)
+		printk(KERN_ERR "vzdq: error writing ugids; "
+				"quota will be inconsistent in case of a crash\n");
+}
+
+/* ----------------------------------------------------------------------
+ * Other VZQUOTA ioctl's.
+ * --------------------------------------------------------------------- */
+
+/*
+ * this function should:
+ * - set new limits/buffer under the quota master block lock
+ * - if the new soft limit is less than the usage, set the expiration time
+ * - no need to alloc ugid hash table - we'll do that on demand
+ */
+int vzquota_update_limit(struct dq_kstat *_qstat,
+		struct dq_kstat *qstat)
+{
+	int err;
+
+	err = -EINVAL;
+	if (vzquota_check_sane_limits(qstat))
+		goto out;
+
+	err = 0;
+
+	/* limits */
+	_qstat->bsoftlimit = qstat->bsoftlimit;
+	_qstat->bhardlimit = qstat->bhardlimit;
+	/*
+	 * If the soft limit is exceeded, administrator can override the moment
+	 * when the grace period for limit exceeding ends.
+	 * Specifying the moment may be useful if the soft limit is set to be
+	 * lower than the current usage.  In the latter case, if the grace
+	 * period end isn't specified, the grace period will start from the
+	 * moment of the first write operation.
+	 * There is a race with the user level.  The soft limit may already be
+	 * exceeded before the limit change, and the grace period end calculated
+	 * by the kernel will be overridden.  The user level may check whether
+	 * the limit is already exceeded, but the check and set calls are not
+	 * atomic.
+	 * This race isn't dangerous.  Under normal circumstances, the
+	 * difference between the grace period ends calculated by the kernel
+	 * and by the user level should be no greater than the difference
+	 * between the moments of the check and set calls, i.e. no bigger
+	 * than the quota timer resolution - 1 sec.
+	 */
+	if (qstat->btime != (time_t)0 &&
+			_qstat->bcurrent >= _qstat->bsoftlimit)
+		_qstat->btime = qstat->btime;
+
+	_qstat->isoftlimit = qstat->isoftlimit;
+	_qstat->ihardlimit = qstat->ihardlimit;
+	if (qstat->itime != (time_t)0 &&
+			_qstat->icurrent >= _qstat->isoftlimit)
+		_qstat->itime = qstat->itime;
+
+out:
+	return err;
+}
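+
+/*
+ * Illustrative sketch, not part of the patch: the btime override rule
+ * above, mirrored on a minimal userspace structure.  If the admin lowers
+ * the soft limit below the current usage and supplies a grace-period end,
+ * that end is taken verbatim; otherwise btime stays zero and the grace
+ * period starts from the first write.
+ */
+#if 0
+#include <stdio.h>
+#include <time.h>
+
+struct limit_sketch {
+	unsigned long long bcurrent, bsoftlimit;
+	time_t btime;				/* 0 == no grace end set */
+};
+
+static void set_limit_sketch(struct limit_sketch *q,
+		unsigned long long new_soft, time_t new_btime)
+{
+	q->bsoftlimit = new_soft;
+	if (new_btime != 0 && q->bcurrent >= q->bsoftlimit)
+		q->btime = new_btime;		/* admin overrides grace end */
+}
+
+int main(void)
+{
+	struct limit_sketch q = { 150, 200, 0 };
+
+	/* soft limit lowered below usage, grace ends in one hour */
+	set_limit_sketch(&q, 100, time(NULL) + 3600);
+	printf("soft=%llu, grace ends at %ld\n", q.bsoftlimit, (long)q.btime);
+	return 0;
+}
+#endif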
+
+/*
+ * set new quota limits.
+ * this function should:
+ *  copy new limits from user level
+ *  - find quota block
+ *  - set new limits and flags.
+ */
+static int vzquota_setlimit(unsigned int quota_id,
+		struct vz_quota_stat __user *u_qstat, int compat)
+{
+	int err;
+	struct vz_quota_stat uqstat;
+	struct vz_quota_kstat qstat;
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex); /* for hash list protection */
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EFAULT;
+	if (!compat) {
+		if (copy_from_user(&uqstat, u_qstat, sizeof(uqstat)))
+			goto out;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_vz_quota_stat cqstat;
+		if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat)))
+			goto out;
+		compat_dqstat2dqstat(&cqstat.dq_stat, &uqstat.dq_stat);
+		compat_dqinfo2dqinfo(&cqstat.dq_info, &uqstat.dq_info);
+#endif
+	}
+	user_dqstat2dqstat(&uqstat.dq_stat, &qstat.dq_stat);
+	user_dqinfo2dqinfo(&uqstat.dq_info, &qstat.dq_info);
+
+	qmblk_data_write_lock(qmblk);
+	err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat);
+	if (err == 0)
+		qmblk->dq_info = qstat.dq_info;
+	qmblk_data_write_unlock(qmblk);
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	return err;
+}
+
+/*
+ * get quota limits.
+ * very simple - just return stat buffer to user
+ */
+static int vzquota_getstat(unsigned int quota_id,
+		struct vz_quota_stat __user *u_qstat, int compat)
+{
+	int err;
+	struct vz_quota_stat uqstat;
+	struct vz_quota_kstat qstat;
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	qmblk_data_read_lock(qmblk);
+	/* copy whole buffer under lock */
+	memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat));
+	memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info));
+	qmblk_data_read_unlock(qmblk);
+	dqstat2user_dqstat(&qstat.dq_stat, &uqstat.dq_stat);
+	dqinfo2user_dqinfo(&qstat.dq_info, &uqstat.dq_info);
+	if (!compat) {
+		err = copy_to_user(u_qstat, &uqstat, sizeof(uqstat));
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_vz_quota_stat cqstat;
+		dqstat2compat_dqstat(&uqstat.dq_stat, &cqstat.dq_stat);
+		dqinfo2compat_dqinfo(&uqstat.dq_info, &cqstat.dq_info);
+		err = copy_to_user(u_qstat, &cqstat, sizeof(cqstat));
+#endif
+	}
+	if (err)
+		err = -EFAULT;
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	return err;
+}
+
+static int vzquota_get_status(unsigned int quota_id)
+{
+	int ret;
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex);
+
+	ret = -ESRCH;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk) {
+		if (qmblk->qfile != NULL)
+			ret = VZDQ_WORKING_JOURNAL;
+		else
+			ret = qmblk->dq_state;
+	}
+
+	mutex_unlock(&vz_quota_mutex);
+
+	return ret;
+}
+
+/*
+ * This is the system call entry point for per-VE disk quota control.
+ * Note that this call is allowed to run ONLY from VE0.
+ */
+long do_vzquotactl(int cmd, unsigned int quota_id,
+		struct vz_quota_stat __user *qstat, const char __user *ve_root,
+		int compat)
+{
+	int ret;
+	int force = 0;
+
+	ret = -EPERM;
+	/* access allowed only from root of VE0 */
+	if (!capable(CAP_SYS_RESOURCE) ||
+	    !capable(CAP_SYS_ADMIN))
+		goto out;
+
+	switch (cmd) {
+		case VZ_DQ_CREATE:
+			ret = vzquota_create(quota_id, qstat, compat);
+			break;
+		case VZ_DQ_DESTROY:
+			ret = vzquota_destroy(quota_id);
+			break;
+		case VZ_DQ_ON:
+			/* 
+			 * qstat is just a pointer to a userspace buffer used
+			 * to store busy file paths if vzquota_on fails
+			 */
+			ret = vzquota_on(quota_id, ve_root, (char __user *)qstat);
+			break;
+		case VZ_DQ_ON_FILE:
+			ret = vzquota_on_file(quota_id, ve_root, (char __user *)qstat);
+			break;
+		case VZ_DQ_OFF_FORCED:
+			force = 1;
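+			/* fall through */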
+		case VZ_DQ_OFF:
+			/* 
+			 * ve_root is just a pointer to a userspace buffer used
+			 * to store busy file paths if vzquota_off fails
+			 */
+			ret = vzquota_off(quota_id, (char __user *)ve_root, force);
+			break;
+		case VZ_DQ_OFF_FILE:
+			ret = vzquota_off_file(quota_id, (char __user *)ve_root);
+			break;
+		case VZ_DQ_SETLIMIT:
+			ret = vzquota_setlimit(quota_id, qstat, compat);
+			break;
+		case VZ_DQ_GETSTAT:
+			ret = vzquota_getstat(quota_id, qstat, compat);
+			break;
+		case VZ_DQ_STATUS:
+			ret = vzquota_get_status(quota_id);
+			break;
+
+		default:
+			ret = -EINVAL;
+			goto out;
+	}
+
+out:
+	return ret;
+}
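+
+/*
+ * Illustrative sketch, not part of the patch: the expected life cycle of
+ * the command codes above, assuming a userspace vzquotactl() wrapper over
+ * whatever transport the tree provides; the wrapper itself is hypothetical.
+ */
+#if 0
+	struct vz_quota_stat qstat;	/* limits filled in by the tool */
+	unsigned int id = 1001;
+
+	vzquotactl(VZ_DQ_CREATE, id, &qstat, NULL);	/* allocate master block */
+	vzquotactl(VZ_DQ_ON, id, NULL, "/vz/private/1001");	/* attach */
+	/* ... container runs, allocations are charged ... */
+	vzquotactl(VZ_DQ_OFF, id, NULL, NULL);		/* detach */
+	vzquotactl(VZ_DQ_DESTROY, id, NULL, NULL);	/* free master block */
+#endif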
+
+
+/* ----------------------------------------------------------------------
+ * Proc filesystem routines
+ * ---------------------------------------------------------------------*/
+
+#if defined(CONFIG_PROC_FS)
+
+#define QUOTA_UINT_LEN		15
+#define QUOTA_TIME_LEN_FMT_UINT	"%11u"
+#define QUOTA_NUM_LEN_FMT_UINT	"%15u"
+#define QUOTA_NUM_LEN_FMT_ULL	"%15Lu"
+#define QUOTA_TIME_LEN_FMT_STR	"%11s"
+#define QUOTA_NUM_LEN_FMT_STR	"%15s"
+#define QUOTA_PROC_MAX_LINE_LEN 2048
+
+/*
+ * prints /proc/ve_dq header line
+ */
+static int print_proc_header(char * buffer)
+{
+	return sprintf(buffer,
+		       "%-11s"
+		       QUOTA_NUM_LEN_FMT_STR
+		       QUOTA_NUM_LEN_FMT_STR
+		       QUOTA_NUM_LEN_FMT_STR
+		       QUOTA_TIME_LEN_FMT_STR
+		       QUOTA_TIME_LEN_FMT_STR
+		       "\n",
+		       "qid: path", 
+		       "usage", "softlimit", "hardlimit", "time", "expire");
+}
+
+/*
+ * prints proc master record id, dentry path
+ */
+static int print_proc_master_id(char * buffer, char * path_buf,
+		struct vz_quota_master * qp)
+{
+	char *path;
+	int over;
+
+	path = NULL;
+	switch (qp->dq_state) {
+		case VZDQ_WORKING:
+			if (!path_buf) {
+				path = "";
+				break;
+			}
+			path = d_path(&qp->dq_root_path, path_buf, PAGE_SIZE);
+			if (IS_ERR(path)) {
+				path = "";
+				break;
+			}
+			/* do not print a large path, truncate it */
+			over = strlen(path) -
+				(QUOTA_PROC_MAX_LINE_LEN - 3 - 3 -
+					QUOTA_UINT_LEN);
+			if (over > 0) {
+				path += over - 3;
+				path[0] = path[1] = path[2] = '.';
+			}
+			break;
+		case VZDQ_STARTING:
+			path = "-- started --";
+			break;
+		case VZDQ_STOPING:
+			path = "-- stopped --";
+			break;
+	}
+
+	return sprintf(buffer, "%u: %s\n", qp->dq_id, path);
+}
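+
+/*
+ * Illustrative sketch, not part of the patch: the truncation above keeps
+ * the tail of an over-long path and marks the cut with a leading "...".
+ * A standalone equivalent (maxlen is the budget excluding the dots):
+ */
+#if 0
+#include <string.h>
+
+static char *truncate_head(char *path, int maxlen)
+{
+	int over = (int)strlen(path) - maxlen;
+
+	if (over > 0) {
+		path += over - 3;	/* keep 3 bytes to hold the dots */
+		path[0] = path[1] = path[2] = '.';
+	}
+	return path;	/* "/very/long/path" -> ".../long/path" */
+}
+#endif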
+
+/*
+ * prints struct vz_quota_kstat data
+ */
+static int print_proc_stat(char * buffer, struct dq_kstat *qs,
+		struct dq_kinfo *qi)
+{
+	return sprintf(buffer,
+		       "%11s"
+		       QUOTA_NUM_LEN_FMT_ULL
+		       QUOTA_NUM_LEN_FMT_ULL
+		       QUOTA_NUM_LEN_FMT_ULL
+		       QUOTA_TIME_LEN_FMT_UINT
+		       QUOTA_TIME_LEN_FMT_UINT
+		       "\n"
+		       "%11s"
+		       QUOTA_NUM_LEN_FMT_UINT
+		       QUOTA_NUM_LEN_FMT_UINT
+		       QUOTA_NUM_LEN_FMT_UINT
+		       QUOTA_TIME_LEN_FMT_UINT
+		       QUOTA_TIME_LEN_FMT_UINT
+		       "\n",
+		       "1k-blocks",
+		       (unsigned long long)qs->bcurrent >> 10,
+		       (unsigned long long)qs->bsoftlimit >> 10,
+		       (unsigned long long)qs->bhardlimit >> 10,
+		       (unsigned int)qs->btime,
+		       (unsigned int)qi->bexpire,
+		       "inodes",
+		       qs->icurrent,
+		       qs->isoftlimit,
+		       qs->ihardlimit,
+		       (unsigned int)qs->itime,
+		       (unsigned int)qi->iexpire);
+}
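+
+/*
+ * For reference, a /proc record produced by the two printers above looks
+ * roughly like this (spacing governed by the format macros; values are
+ * made up):
+ *
+ *	qid: path    usage  softlimit  hardlimit  time  expire
+ *	1001: /vz/private/1001
+ *	1k-blocks    12345     204800     225280     0  259200
+ *	   inodes      321     180000     200000     0  259200
+ */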
+
+
+/*
+ * for /proc filesystem output
+ */
+static int vzquota_read_proc(char *page, char **start, off_t off, int count,
+			   int *eof, void *data)
+{
+	int len, i;
+	off_t printed = 0;
+	char *p = page;
+	struct vz_quota_master *qp;
+	struct vz_quota_ilink *ql2;
+	struct list_head *listp;
+	char *path_buf;
+
+	path_buf = (char *)__get_free_page(GFP_KERNEL);
+	if (path_buf == NULL)
+		return -ENOMEM;
+
+	len = print_proc_header(p);
+	printed += len;
+	if (off < printed) /* keep header in output */ {
+		*start = p + off;
+		p += len;
+	}
+
+	mutex_lock(&vz_quota_mutex);
+
+	/* traverse master hash table for all records */
+	for (i = 0; i < vzquota_hash_size; i++) {
+		list_for_each(listp, &vzquota_hash_table[i]) {
+			qp = list_entry(listp,
+					struct vz_quota_master, dq_hash);
+
+			/* Skip other VE's information if not root of VE0 */
+			if ((!capable(CAP_SYS_ADMIN) ||
+			     !capable(CAP_SYS_RESOURCE))) {
+				ql2 = INODE_QLNK(current->fs->root.dentry->d_inode);
+				if (ql2 == NULL || qp != ql2->qmblk)
+					continue;
+			}
+			/*
+			 * Now print the next record
+			 */
+			len = 0;
+			/* we print quotaid and path only in VE0 */
+			if (capable(CAP_SYS_ADMIN))
+				len += print_proc_master_id(p+len,path_buf, qp);
+			len += print_proc_stat(p+len, &qp->dq_stat,
+					&qp->dq_info);
+			printed += len;
+			/* skip unnecessary lines */
+			if (printed <= off)
+				continue;
+			p += len;
+			/* provide start offset */
+			if (*start == NULL)
+				*start = p + (off - printed);
+			/* have we printed all requested size? */
+			if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN ||
+			    (p - *start) >= count)
+				goto out;
+		}
+	}
+
+	*eof = 1; /* checked all hash */
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	len = 0;
+	if (*start != NULL) {
+		len = (p - *start);
+		if (len > count)
+			len = count;
+	}
+
+	if (path_buf)
+		free_page((unsigned long) path_buf);
+
+	return len;
+}
+
+/*
+ * Register procfs read callback
+ */
+int vzquota_proc_init(void)
+{
+	struct proc_dir_entry *de;
+
+	de = proc_create("vzquota", S_IFREG|S_IRUSR, proc_vz_dir, NULL);
+	if (de == NULL)
+		return -EBUSY;
+
+	de->read_proc = vzquota_read_proc;
+	de->data = NULL;
+	return 0;
+}
+
+void vzquota_proc_release(void)
+{
+	/* Unregister procfs read callback */
+	remove_proc_entry("vzquota", proc_vz_dir);
+}
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_ops.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_ops.c
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_ops.c	2015-01-21 12:02:53.118963690 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_ops.c	2015-01-21 12:02:53.398956258 +0300
@@ -0,0 +1,883 @@
+/*
+ * Copyright (C) 2001, 2002, 2004, 2005  SWsoft
+ * All rights reserved.
+ * 
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/vzquota.h>
+#include <linux/quotaops.h>
+#include <linux/vzsnap.h>
+
+/* ----------------------------------------------------------------------
+ * Quota superblock operations - helper functions.
+ * --------------------------------------------------------------------- */
+
+static inline void vzquota_incr_inodes(struct dq_kstat *dqstat,
+		unsigned long number)
+{
+	dqstat->icurrent += number;
+}
+
+static inline void vzquota_add_space(struct dq_kstat *dqstat,
+				__u64 number)
+{
+	dqstat->bcurrent += number;
+}
+
+static inline void vzquota_rsv_space(struct dq_kstat *dqstat,
+				__u64 number)
+{
+	dqstat->breserved += number;
+}
+
+static inline void vzquota_incr_space(struct dq_kstat *dqstat, __u64 number,
+				int reserved)
+{
+	if (reserved)
+		vzquota_rsv_space(dqstat, number);
+	else
+		vzquota_add_space(dqstat, number);
+}
+
+
+static inline void vzquota_decr_inodes(struct dq_kstat *dqstat,
+		__u64 number)
+{
+	if (dqstat->icurrent > number)
+		dqstat->icurrent -= number;
+	else
+		dqstat->icurrent = 0;
+	if (dqstat->icurrent < dqstat->isoftlimit)
+		dqstat->itime = (time_t) 0;
+}
+
+static inline void vzquota_free_space(struct dq_kstat *dqstat,
+		__u64 number)
+{
+	if (dqstat->bcurrent > number)
+		dqstat->bcurrent -= number;
+	else
+		dqstat->bcurrent = 0;
+	if (dqstat->bcurrent < dqstat->bsoftlimit)
+		dqstat->btime = (time_t) 0;
+}
+
+static inline void vzquota_free_rsv_space(struct dq_kstat *dqstat, unsigned long  number)
+{
+	if (dqstat->breserved > number)
+		dqstat->breserved -= number;
+	else
+		dqstat->breserved = 0;
+}
+
+static inline void vzquota_decr_space(struct dq_kstat *dqstat, __u64 number,
+	int reserved)
+{
+	if (reserved)
+		vzquota_free_rsv_space(dqstat, number);
+	else
+		vzquota_free_space(dqstat, number);
+}
+
+static inline void vzquota_claim_rsv_space(struct dq_kstat *dqstat,
+					__u64 number)
+{
+	if (dqstat->breserved > number)
+		dqstat->breserved -= number;
+	else
+		dqstat->breserved = 0;
+	dqstat->bcurrent += number;
+}
+
+/*
+ * TODO: print a better message, or use a /proc/vzquotamsg interface
+ * similar to /proc/kmsg
+ */
+static inline void vzquota_warn(struct dq_kinfo *dq_info, int dq_id, int flag,
+		const char *fmt)
+{
+	/* warning already printed for this masterblock */
+	if (dq_info->flags & flag)
+		return;
+	printk(fmt, dq_id);
+	dq_info->flags |= flag;
+}
+
+/*
+ * ignore_hardlimit -
+ *
+ * Intended to allow the superuser of VE0 to override hard limits.
+ *
+ * ignore_hardlimit() has a very bad feature:
+ *
+ *	a writepage() operation on a writable mapping of a file with holes
+ *	may trigger get_block() with the wrong current task and, as a
+ *	consequence, opens up the possibility of overcommitting hard limits
+ */
+/* for the reason above, it is disabled now */
+static inline int ignore_hardlimit(struct dq_kinfo *dqstat)
+{
+#if 0
+	return	ve_is_super(get_exec_env()) &&
+		capable(CAP_SYS_RESOURCE) &&
+		(dqstat->options & VZ_QUOTA_OPT_RSQUASH);
+#else
+	return 0;
+#endif
+}
+
+static int vzquota_check_inodes(struct dq_kinfo *dq_info,
+		struct dq_kstat *dqstat,
+		unsigned long number, int dq_id)
+{
+	if (number == 0)
+		return QUOTA_OK;
+
+	if (dqstat->icurrent + number > dqstat->ihardlimit &&
+	    !ignore_hardlimit(dq_info)) {
+		vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
+			   "VZ QUOTA: file hardlimit reached for id=%d\n");
+		return NO_QUOTA;
+	}
+
+	if (dqstat->icurrent + number > dqstat->isoftlimit) {
+		if (dqstat->itime == (time_t)0) {
+			vzquota_warn(dq_info, dq_id, 0,
+				"VZ QUOTA: file softlimit exceeded "
+				"for id=%d\n");
+			dqstat->itime = CURRENT_TIME_SECONDS +
+				dq_info->iexpire;
+		} else if (CURRENT_TIME_SECONDS >= dqstat->itime &&
+			   !ignore_hardlimit(dq_info)) {
+			vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
+				"VZ QUOTA: file softlimit expired "
+				"for id=%d\n");
+			return NO_QUOTA;
+		}
+	}
+
+	return QUOTA_OK;
+}
+
+static int vzquota_check_space(struct dq_kinfo *dq_info,
+		struct dq_kstat *dqstat,
+		__u64 number, int dq_id, char prealloc)
+{
+	__u64 bcurr = dqstat->bcurrent + dqstat->breserved;
+	if (number == 0)
+		return QUOTA_OK;
+
+	if (prealloc & DQUOT_SPACE_NOFAIL)
+		return QUOTA_OK;
+
+	if (bcurr + number > dqstat->bhardlimit && !ignore_hardlimit(dq_info)) {
+		if (!prealloc)
+			vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
+				"VZ QUOTA: disk hardlimit reached "
+				"for id=%d\n");
+		return NO_QUOTA;
+	}
+
+	if (bcurr + number > dqstat->bsoftlimit) {
+		if (dqstat->btime == (time_t)0) {
+			if (!prealloc) {
+				vzquota_warn(dq_info, dq_id, 0,
+					"VZ QUOTA: disk softlimit exceeded "
+					"for id=%d\n");
+				dqstat->btime = CURRENT_TIME_SECONDS
+							+ dq_info->bexpire;
+			} else {
+				/*
+				 * The original Linux quota doesn't allow
+				 * preallocation to exceed the soft limit,
+				 * so such a preallocation is simply refused
+				 */
+				return NO_QUOTA;
+			}
+		} else if (CURRENT_TIME_SECONDS >= dqstat->btime &&
+			   !ignore_hardlimit(dq_info)) {
+			if (!prealloc)
+				vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
+					"VZ QUOTA: disk quota "
+					"softlimit expired "
+					"for id=%d\n");
+			return NO_QUOTA;
+		}
+	}
+
+	return QUOTA_OK;
+}
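+
+/*
+ * Worked example for the checks above: with bsoftlimit = 100MB,
+ * bhardlimit = 120MB and bexpire = 3 days, the first allocation pushing
+ * usage past 100MB only warns and stamps btime = now + 3 days; further
+ * allocations keep succeeding until usage plus reservation would cross
+ * 120MB or the three days run out, after which NO_QUOTA is returned.
+ */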
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid[],
+		int type, unsigned long number)
+{
+	struct dq_kinfo *dqinfo;
+	struct dq_kstat *dqstat;
+
+	if (qugid[type] == NULL)
+		return QUOTA_OK;
+	if (qugid[type] == VZ_QUOTA_UGBAD)
+		return NO_QUOTA;
+
+	if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
+		return QUOTA_OK;
+	if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
+		return QUOTA_OK;
+	if (number == 0)
+		return QUOTA_OK;
+
+	dqinfo = &qmblk->dq_ugid_info[type];
+	dqstat = &qugid[type]->qugid_stat;
+
+	if (dqstat->ihardlimit != 0 &&
+	    dqstat->icurrent + number > dqstat->ihardlimit)
+		return NO_QUOTA;
+
+	if (dqstat->isoftlimit != 0 &&
+	    dqstat->icurrent + number > dqstat->isoftlimit) {
+		if (dqstat->itime == (time_t)0)
+			dqstat->itime = CURRENT_TIME_SECONDS +
+				dqinfo->iexpire;
+		else if (CURRENT_TIME_SECONDS >= dqstat->itime)
+			return NO_QUOTA;
+	}
+
+	return QUOTA_OK;
+}
+
+static int vzquota_check_ugid_space(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid[],
+		int type, __u64 number, char prealloc)
+{
+	struct dq_kinfo *dqinfo;
+	struct dq_kstat *dqstat;
+	qsize_t btotal;
+
+	if (prealloc & DQUOT_SPACE_NOFAIL)
+		return QUOTA_OK;
+
+	if (qugid[type] == NULL)
+		return QUOTA_OK;
+	if (qugid[type] == VZ_QUOTA_UGBAD)
+		return NO_QUOTA;
+
+	if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
+		return QUOTA_OK;
+	if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
+		return QUOTA_OK;
+	if (number == 0)
+		return QUOTA_OK;
+
+	dqinfo = &qmblk->dq_ugid_info[type];
+	dqstat = &qugid[type]->qugid_stat;
+	btotal = dqstat->bcurrent + dqstat->breserved + number;
+
+	if (dqstat->bhardlimit != 0 &&
+	    btotal > dqstat->bhardlimit)
+		return NO_QUOTA;
+
+	if (dqstat->bsoftlimit != 0 &&
+	    btotal > dqstat->bsoftlimit) {
+		if (dqstat->btime == (time_t)0) {
+			if (!prealloc)
+				dqstat->btime = CURRENT_TIME_SECONDS
+							+ dqinfo->bexpire;
+			else
+				/*
+				 * The original Linux quota doesn't allow
+				 * preallocation to exceed the soft limit,
+				 * so such a preallocation is simply refused
+				 */
+				return NO_QUOTA;
+		} else if (CURRENT_TIME_SECONDS >= dqstat->btime)
+			return NO_QUOTA;
+	}
+
+	return QUOTA_OK;
+}
+#endif
+
+/* ----------------------------------------------------------------------
+ * Quota superblock operations
+ * --------------------------------------------------------------------- */
+
+/*
+ * S_NOQUOTA note.
+ * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for
+ *  - quota file (absent in our case)
+ *  - after explicit DQUOT_DROP (earlier than clear_inode) in functions like
+ *    filesystem-specific new_inode, before the inode gets outside links.
+ * For the latter case, the only quota operation where care about S_NOQUOTA
+ * might be required is vzquota_drop, but there S_NOQUOTA has already been
+ * checked in DQUOT_DROP().
+ * So, S_NOQUOTA may be ignored for now in the VZDQ code.
+ *
+ * The above note is not entirely correct.
+ * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from
+ * delete_inode if new_inode fails (for example, because of inode quota
+ * limits), so S_NOQUOTA check is needed in free_inode.
+ * This seems to be a dark corner of the current quota API.
+ */
+
+/*
+ * Initialize quota operations for the specified inode.
+ */
+static int vzquota_initialize(struct inode *inode, int type)
+{
+	vzquota_inode_init_call(inode);
+	return 0; /* ignored by caller */
+}
+
+/*
+ * Release quota for the specified inode.
+ */
+static int vzquota_drop(struct inode *inode)
+{
+	vzquota_inode_drop_call(inode);
+	return 0; /* ignored by caller */
+}
+
+/*
+ * Allocate block callback.
+ *
+ * If (prealloc) disk quota exceeding warning is not printed.
+ * See Linux quota to know why.
+ *
+ * Return:
+ *	QUOTA_OK == 0 on SUCCESS
+ *	NO_QUOTA == 1 if allocation should fail
+ */
+static int __vzquota_alloc_space(struct inode *inode,
+			qsize_t number, int prealloc, int rsv)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+	struct vzsnap_struct *vzs = NULL;
+	int ret = QUOTA_OK;
+
+	qmblk = vzquota_inode_data(inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return NO_QUOTA;
+	if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid[MAXQUOTAS];
+#endif
+
+		/* checking first */
+		ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat,
+				number, qmblk->dq_id, prealloc);
+		if (ret == NO_QUOTA)
+			goto no_quota;
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+			ret = vzquota_check_ugid_space(qmblk, qugid,
+					cnt, number, prealloc);
+			if (ret == NO_QUOTA)
+				goto no_quota;
+		}
+		/* check ok, may increment */
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (qugid[cnt] == NULL)
+				continue;
+			vzquota_incr_space(&qugid[cnt]->qugid_stat, number,
+					rsv);
+			if (!rsv)
+				__vzquota_get_ugid(qugid[cnt]);
+		}
+#endif
+		vzquota_incr_space(&qmblk->dq_stat, number, rsv);
+		if (qmblk->dq_snap && !rsv)
+			vzs = vzsnap_get(qmblk->dq_snap);
+		vzquota_data_unlock(inode, &data);
+		/* Reservation doesn't change the state of the on-disk quota
+		   data, so skip quota dirtying */
+		if (rsv)
+			goto out;
+		vzquota_mark_dirty(qmblk, qugid);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (qugid[cnt] == NULL)
+				continue;
+			vzquota_put_ugid(qmblk, qugid[cnt]);
+		}
+#endif
+	}
+out:
+	inode_incr_space(inode, number, rsv);
+	if (vzs)
+		vzs->ops->addblock(vzs, inode);
+	might_sleep();
+	return QUOTA_OK;
+
+no_quota:
+	vzquota_data_unlock(inode, &data);
+	return NO_QUOTA;
+}
+
+static int vzquota_alloc_space(struct inode *inode, qsize_t number, int warn)
+{
+	return __vzquota_alloc_space(inode, number, warn, 0);
+}
+
+static int vzquota_reserve_space(struct inode *inode, qsize_t number, int warn)
+{
+	return __vzquota_alloc_space(inode, number, warn, 1);
+}
+
+/* Claim reserved space callback */
+static int vzquota_claim_reserved_space(struct inode *inode, qsize_t number)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+	struct vzsnap_struct *vzs = NULL;
+
+	qmblk = vzquota_inode_data(inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return NO_QUOTA; /* isn't checked by the caller */
+	if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid[MAXQUOTAS];
+#endif
+
+		vzquota_claim_rsv_space(&qmblk->dq_stat, number);
+		if (qmblk->dq_snap)
+			vzs = vzsnap_get(qmblk->dq_snap);
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+			if (qugid[cnt] == NULL || qugid[cnt] == VZ_QUOTA_UGBAD)
+				continue;
+			vzquota_claim_rsv_space(&qugid[cnt]->qugid_stat,
+						number);
+			__vzquota_get_ugid(qugid[cnt]);
+		}
+#endif
+		vzquota_data_unlock(inode, &data);
+		vzquota_mark_dirty(qmblk, qugid);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (qugid[cnt] == NULL)
+				continue;
+			vzquota_put_ugid(qmblk, qugid[cnt]);
+		}
+#endif
+	}
+	/* Update inode bytes */
+	inode_claim_rsv_space(inode, number);
+	if (vzs)
+		vzs->ops->addblock(vzs, inode);
+	might_sleep();
+	return QUOTA_OK;
+}
+
+/*
+ * Allocate inodes callback.
+ *
+ * Return:
+ *	QUOTA_OK == 0 on SUCCESS
+ *	NO_QUOTA == 1 if allocation should fail
+ */
+static int vzquota_alloc_inode(const struct inode *inode, qsize_t number)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+	int ret = QUOTA_OK;
+
+	qmblk = vzquota_inode_data((struct inode *)inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return NO_QUOTA;
+	if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid *qugid[MAXQUOTAS];
+#endif
+
+		/* checking first */
+		ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat,
+				number, qmblk->dq_id);
+		if (ret == NO_QUOTA)
+			goto no_quota;
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+			ret = vzquota_check_ugid_inodes(qmblk, qugid,
+					cnt, number);
+			if (ret == NO_QUOTA)
+				goto no_quota;
+		}
+		/* check ok, may increment */
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (qugid[cnt] == NULL)
+				continue;
+			vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number);
+			__vzquota_get_ugid(qugid[cnt]);
+		}
+#endif
+		vzquota_incr_inodes(&qmblk->dq_stat, number);
+		vzquota_data_unlock((struct inode *)inode, &data);
+
+		vzquota_mark_dirty(qmblk, qugid);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (qugid[cnt] == NULL)
+				continue;
+			vzquota_put_ugid(qmblk, qugid[cnt]);
+		}
+#endif
+	}
+
+	might_sleep();
+	return QUOTA_OK;
+
+no_quota:
+	vzquota_data_unlock((struct inode *)inode, &data);
+	return NO_QUOTA;
+}
+
+/*
+ * Free space callback.
+ */
+static int __vzquota_free_space(struct inode *inode, qsize_t number, int rsv)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+
+	qmblk = vzquota_inode_data(inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return NO_QUOTA; /* isn't checked by the caller */
+	if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid[MAXQUOTAS];
+#endif
+
+		vzquota_decr_space(&qmblk->dq_stat, number, rsv);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+			if (qugid[cnt] == NULL || qugid[cnt] == VZ_QUOTA_UGBAD)
+				continue;
+			vzquota_decr_space(&qugid[cnt]->qugid_stat, number,
+					rsv);
+			if (!rsv)
+				__vzquota_get_ugid(qugid[cnt]);
+		}
+#endif
+		vzquota_data_unlock(inode, &data);
+		/* Reservation doesn't change the state of the on-disk quota
+		   data, so skip quota dirtying */
+		if (rsv)
+			goto out;
+		vzquota_mark_dirty(qmblk, qugid);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (qugid[cnt] == NULL)
+				continue;
+			vzquota_put_ugid(qmblk, qugid[cnt]);
+		}
+#endif
+	}
+out:
+	inode_decr_space(inode, number, rsv);
+	might_sleep();
+	return QUOTA_OK;
+}
+
+static int vzquota_release_space(struct inode *inode, qsize_t number)
+{
+	return  __vzquota_free_space(inode, number, 0);
+}
+
+/*
+ * Release reserved quota space
+ */
+static void vzquota_release_reserved_space(struct inode *inode, qsize_t number)
+{
+	__vzquota_free_space(inode, number, 1);
+}
+
+/*
+ * Free inodes callback.
+ */
+static int vzquota_free_inode(const struct inode *inode, qsize_t number)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+
+	qmblk = vzquota_inode_data((struct inode *)inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return NO_QUOTA;
+	if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid[MAXQUOTAS];
+#endif
+
+		vzquota_decr_inodes(&qmblk->dq_stat, number);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+			if (qugid[cnt] == NULL || qugid[cnt] == VZ_QUOTA_UGBAD)
+				continue;
+			vzquota_decr_inodes(&qugid[cnt]->qugid_stat, number);
+			__vzquota_get_ugid(qugid[cnt]);
+		}
+#endif
+		vzquota_data_unlock((struct inode *)inode, &data);
+
+		vzquota_mark_dirty(qmblk, qugid);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (qugid[cnt] == NULL)
+				continue;
+			vzquota_put_ugid(qmblk, qugid[cnt]);
+		}
+#endif
+	}
+	might_sleep();
+	return QUOTA_OK;
+}
+
+void vzquota_inode_off(struct inode * inode)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+
+	/* The call is made through virtinfo; the inode may be one
+	 * not controlled by vzquota.
+	 */
+	if (!IS_VZ_QUOTA(inode->i_sb))
+		return;
+
+	qmblk = vzquota_inode_data(inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return;
+
+	if (qmblk == NULL) {
+		/* Tricky place. If qmblk == NULL, it means that this inode
+		 * is not in an area controlled by vzquota (except for the
+		 * rare case of S_NOQUOTA already being set). But we have to
+		 * set S_NOQUOTA in any case because vzquota can be turned
+		 * on later, when this inode would be invalid from vzquota's
+		 * viewpoint.
+		 *
+		 * To be safe, we reacquire vzquota lock.
+		 * The assumption is that it would not hurt to call
+		 * vzquota_inode_drop() more than once, but it must
+		 * be called at least once after S_NOQUOTA is set.
+		 */
+		inode_qmblk_lock(inode->i_sb);
+		inode->i_flags |= S_NOQUOTA;
+		inode_qmblk_unlock(inode->i_sb);
+	} else {
+		loff_t bytes = inode_get_bytes(inode);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid[MAXQUOTAS];
+#endif
+
+		inode->i_flags |= S_NOQUOTA;
+
+		vzquota_decr_space(&qmblk->dq_stat, bytes, 0);
+		vzquota_decr_inodes(&qmblk->dq_stat, 1);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+			if (qugid[cnt] == NULL || qugid[cnt] == VZ_QUOTA_UGBAD)
+				continue;
+			vzquota_decr_space(&qugid[cnt]->qugid_stat, bytes, 0);
+			vzquota_decr_inodes(&qugid[cnt]->qugid_stat, 1);
+			__vzquota_get_ugid(qugid[cnt]);
+		}
+#endif
+
+		vzquota_data_unlock(inode, &data);
+
+		vzquota_mark_dirty(qmblk, qugid);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (qugid[cnt] != NULL && qugid[cnt] != VZ_QUOTA_UGBAD)
+				vzquota_put_ugid(qmblk, qugid[cnt]);
+		}
+#endif
+	}
+	vzquota_inode_drop_call(inode);
+}
+
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+
+/*
+ * helper function for quota_transfer
+ * check that we can add inode to this quota_id
+ */
+static int vzquota_transfer_check(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid[],
+		unsigned int type, __u64 size)
+{
+	if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK ||
+	    vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK)
+		return -1;
+	return 0;
+}
+
+int vzquota_transfer_usage(struct inode *inode, int mask,
+		struct vz_quota_ilink *qlnk, struct vz_quota_ugid **dirty)
+{
+	struct vz_quota_ugid *qugid_old;
+	__u64 space, cur_space, rsv_space;
+	int i;
+	cur_space = inode_get_bytes(inode);
+	rsv_space = inode_get_rsv_space(inode);
+	space = cur_space + rsv_space;
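+	/*
+	 * Two-phase transfer: first verify that every target ugid can take
+	 * the whole usage, then move it.  A failure during the check phase
+	 * thus leaves all counters untouched.
+	 */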
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (!(mask & (1 << i)))
+			continue;
+		/*
+		 * Do not permit chowning a file if its owner does not have
+		 * a ugid record. This might happen if we somehow exceeded
+		 * the UID/GID limit (e.g. set uglimit lower than the number
+		 * of users).
+		 */
+		if (INODE_QLNK(inode)->qugid[i] == VZ_QUOTA_UGBAD)
+			return -1;
+		if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space))
+			return -1;
+	}
+
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (!(mask & (1 << i)))
+			continue;
+		qugid_old = INODE_QLNK(inode)->qugid[i];
+		vzquota_free_space(&qugid_old->qugid_stat, cur_space);
+		vzquota_free_rsv_space(&qugid_old->qugid_stat, rsv_space);
+		vzquota_decr_inodes(&qugid_old->qugid_stat, 1);
+		vzquota_add_space(&qlnk->qugid[i]->qugid_stat, cur_space);
+		vzquota_rsv_space(&qlnk->qugid[i]->qugid_stat, rsv_space);
+		vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1);
+
+		if (dirty) {
+			dirty[i] = __vzquota_get_ugid(qugid_old);
+			dirty[i + MAXQUOTAS] = __vzquota_get_ugid(qlnk->qugid[i]);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Transfer the inode between different user/group quotas.
+ */
+static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
+{
+	return vzquota_inode_transfer_call(inode, iattr) ?
+		NO_QUOTA : QUOTA_OK;
+}
+
+static qsize_t *vzquota_get_reserved_space(struct inode *inode)
+{
+	return inode->i_sb->s_dquot.dq_op_orig->get_reserved_space(inode);
+}
+
+static void vzquota_swap_inode(struct inode *inode, struct inode *tmpl)
+{
+	vzquota_inode_swap_call(inode, tmpl);
+}
+
+
+#else /* CONFIG_VZ_QUOTA_UGID */
+
+static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
+{
+	return QUOTA_OK;
+}
+
+static void vzquota_swap_inode(struct inode *inode, struct inode *tmpl)
+{
+}
+#endif
+
+/*
+ * Called under the following semaphores:
+ *	old_d->d_inode->i_sb->s_vfs_rename_sem
+ *	old_d->d_inode->i_sem
+ *	new_d->d_inode->i_sem
+ * [not verified  --SAW]
+ */
+static int vzquota_rename(struct inode *inode,
+		struct inode *old_dir, struct inode *new_dir)
+{
+	return vzquota_rename_check(inode, old_dir, new_dir) ?
+		NO_QUOTA : QUOTA_OK;
+}
+
+static unsigned int vzquota_qmblk_id(struct inode *inode)
+{
+	unsigned int ret;
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+
+	/*
+	 * FIXME - vzquota_inode_data takes qmblk lock here
+	 *         which is not actually needed
+	 */
+
+	qmblk = vzquota_inode_data(inode, &data);
+	if (qmblk == NULL)
+		return 0;
+	if (qmblk == VZ_QUOTA_BAD)
+		return (unsigned int)-1;
+
+	ret = qmblk->dq_id;
+	vzquota_data_unlock(inode, &data);
+
+	return ret;
+}
+
+extern void vzquota_shutdown_super(struct super_block *sb);
+
+/*
+ * Structure of superblock diskquota operations.
+ */
+struct dquot_operations vz_quota_operations = {
+	.initialize	= vzquota_initialize,
+	.drop		= vzquota_drop,
+	.alloc_space	= vzquota_alloc_space,
+	.alloc_inode	= vzquota_alloc_inode,
+	.free_space     = vzquota_release_space,
+	.free_inode	= vzquota_free_inode,
+	.transfer	= vzquota_transfer,
+	.rename		= vzquota_rename,
+
+	.swap_inode	= vzquota_swap_inode,
+	.shutdown	= vzquota_shutdown_super,
+	.orphan_cookie	= vzquota_qmblk_id,
+};
+
+struct dquot_operations vz_quota_operations_rsv = {
+	.initialize	= vzquota_initialize,
+	.drop		= vzquota_drop,
+	.alloc_space	= vzquota_alloc_space,
+	.reserve_space  = vzquota_reserve_space,
+	.claim_space    = vzquota_claim_reserved_space,
+	.release_rsv    = vzquota_release_reserved_space,
+	.alloc_inode	= vzquota_alloc_inode,
+	.free_space     = vzquota_release_space,
+	.free_inode	= vzquota_free_inode,
+	.transfer	= vzquota_transfer,
+	.rename		= vzquota_rename,
+	.get_reserved_space = vzquota_get_reserved_space,
+
+	.swap_inode	= vzquota_swap_inode,
+	.shutdown	= vzquota_shutdown_super,
+	.orphan_cookie	= vzquota_qmblk_id,
+};
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_tree.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_tree.c
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_tree.c	2015-01-21 12:02:53.118963690 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_tree.c	2015-01-21 12:02:53.118963690 +0300
@@ -0,0 +1,286 @@
+/*
+ *
+ * Copyright (C) 2005  SWsoft
+ * All rights reserved.
+ * 
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo quota tree implementation
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/vzdq_tree.h>
+
+struct quotatree_tree *quotatree_alloc(void)
+{
+	int l;
+	struct quotatree_tree *tree;
+
+	tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL);
+	if (tree == NULL)
+		goto out;
+
+	for (l = 0; l < QUOTATREE_DEPTH; l++) {
+		INIT_LIST_HEAD(&tree->levels[l].usedlh);
+		INIT_LIST_HEAD(&tree->levels[l].freelh);
+		tree->levels[l].freenum = 0;
+	}
+	tree->root = NULL;
+	tree->leaf_num = 0;
+out:
+	return tree;
+}
+
+static struct quotatree_node *
+quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level,
+		struct quotatree_find_state *st)
+{
+	void **block;
+	struct quotatree_node *parent;
+	int l, index;
+
+	parent = NULL;
+	block = (void **)&tree->root;
+	l = 0;
+	while (l < level && *block != NULL) {
+		index = (id >>  QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
+		parent = *block;
+		block = parent->blocks + index;
+		l++;
+	}
+	if (st != NULL) {
+		st->block = block;
+		st->level = l;
+	}
+
+	return parent;
+}
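+
+/*
+ * Illustrative sketch (compiled out): how an id decomposes into one
+ * index per level - exactly the descent quotatree_follow() performs
+ * above.  The helper and the printk are examples only, not used by
+ * the quota code.
+ */
+#if 0
+static void quotatree_index_example(quotaid_t id)
+{
+	int l;
+
+	for (l = 0; l < QUOTATREE_DEPTH; l++)
+		printk(KERN_DEBUG "level %d slot %u\n", l,
+			(unsigned int)((id >> QUOTATREE_BSHIFT(l)) &
+					QUOTATREE_BMASK));
+}
+#endif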
+
+void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
+		struct quotatree_find_state *st)
+{
+	quotatree_follow(tree, id, QUOTATREE_DEPTH, st);
+	if (st->level == QUOTATREE_DEPTH)
+		return *st->block;
+	else
+		return NULL;
+}
+
+void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index)
+{
+	int i, count;
+	struct quotatree_node *p;
+	void *leaf;
+
+	if (QTREE_LEAFNUM(tree) <= index)
+		return NULL;
+
+	count = 0;
+	list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
+		for (i = 0; i < QUOTATREE_BSIZE; i++) {
+			leaf = p->blocks[i];
+			if (leaf == NULL)
+				continue;
+			if (count == index)
+				return leaf;
+			count++;
+		}
+	}
+	return NULL;
+}
+
+/* returns the data leaf (vz_quota_ugid) that follows an _existing_
+ * ugid (@id) in the tree... */
+void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id)
+{
+	int off;
+	struct quotatree_node *parent, *p;
+	struct list_head *lh;
+
+	/* get the parent referring to the correct last-level quota tree node */
+	parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL);
+	if (!parent)
+		return NULL;
+
+	off = (id & QUOTATREE_BMASK) + 1;	/* next ugid */
+	lh = &parent->list;
+	do {
+		p = list_entry(lh, struct quotatree_node, list);
+		for ( ; off < QUOTATREE_BSIZE; off++)
+			if (p->blocks[off])
+				return p->blocks[off];
+		off = 0;
+		lh = lh->next;
+	} while (lh != &QTREE_LEAFLVL(tree)->usedlh);
+
+	return NULL;
+}
+
+int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
+		struct quotatree_find_state *st, void *data)
+{
+	struct quotatree_node *p;
+	int l, index;
+
+	while (st->level < QUOTATREE_DEPTH) {
+		l = st->level;
+		if (!list_empty(&tree->levels[l].freelh)) {
+			p = list_entry(tree->levels[l].freelh.next,
+					struct quotatree_node, list);
+			list_del(&p->list);
+		} else {
+			p = kmalloc(sizeof(struct quotatree_node), GFP_NOFS | __GFP_NOFAIL);
+			if (p == NULL)
+				return -ENOMEM;
+			/* save the block number within level l;
+			 * it is used for quota file generation */
+			p->num = tree->levels[l].freenum++;
+		}
+		list_add(&p->list, &tree->levels[l].usedlh);
+		memset(p->blocks, 0, sizeof(p->blocks));
+		*st->block = p;
+
+		index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
+		st->block = p->blocks + index;
+		st->level++;
+	}
+	tree->leaf_num++;
+	*st->block = data;
+
+	return 0;
+}
+
+static struct quotatree_node *
+quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id,
+		int level)
+{
+	struct quotatree_node *parent;
+	struct quotatree_find_state st;
+
+	parent = quotatree_follow(tree, id, level, &st);
+	if (st.level == QUOTATREE_DEPTH)
+		tree->leaf_num--;
+	*st.block = NULL;
+	return parent;
+}
+
+void quotatree_remove(struct quotatree_tree *tree, quotaid_t id)
+{
+	struct quotatree_node *p;
+	int level, i;
+
+	p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH);
+	for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) {
+		for (i = 0; i < QUOTATREE_BSIZE; i++)
+			if (p->blocks[i] != NULL)
+				return;
+		list_move(&p->list, &tree->levels[level].freelh);
+		p = quotatree_remove_ptr(tree, id, level);
+	}
+}
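+
+/*
+ * Note on quotatree_remove() above: emptied interior nodes are not
+ * kfree()d but parked on the per-level free list, where
+ * quotatree_insert() will reuse them; only levels down to
+ * QUOTATREE_CDEPTH are collapsed.
+ */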
+
+#if 0
+static void quotatree_walk(struct quotatree_tree *tree,
+		struct quotatree_node *node_start,
+		quotaid_t id_start,
+		int level_start, int level_end,
+		int (*callback)(struct quotatree_tree *,
+				quotaid_t id,
+				int level,
+				void *ptr,
+				void *data),
+		void *data)
+{
+	struct quotatree_node *p;
+	int l, shift, index;
+	quotaid_t id;
+	struct quotatree_find_state st;
+
+	p = node_start;
+	l = level_start;
+	shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
+	id = id_start;
+	index = 0;
+
+	/*
+	 * Invariants:
+	 * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
+	 * id & ((1 << shift) - 1) == 0
+	 * p is l-level node corresponding to id
+	 */
+	do {
+		if (!p)
+			break;
+
+		if (l < level_end) {
+			for (; index < QUOTATREE_BSIZE; index++)
+				if (p->blocks[index] != NULL)
+					break;
+			if (index < QUOTATREE_BSIZE) {
+				/* descend */
+				p = p->blocks[index];
+				l++;
+				shift -= QUOTAID_BBITS;
+				id += (quotaid_t)index << shift;
+				index = 0;
+				continue;
+			}
+		}
+
+		if ((*callback)(tree, id, l, p, data))
+			break;
+
+		/* ascend and to the next node */
+		p = quotatree_follow(tree, id, l, &st);
+
+		index = ((id >> shift) & QUOTATREE_BMASK) + 1;
+		l--;
+		shift += QUOTAID_BBITS;
+		id &= ~(((quotaid_t)1 << shift) - 1);
+	} while (l >= level_start);
+}
+#endif
+
+static void free_list(struct list_head *node_list)
+{
+	struct quotatree_node *p, *tmp;
+
+	list_for_each_entry_safe(p, tmp, node_list, list) {
+		list_del(&p->list);
+		kfree(p);
+	}
+}
+
+static inline void quotatree_free_nodes(struct quotatree_tree *tree)
+{
+	int i;
+
+	for (i = 0; i < QUOTATREE_DEPTH; i++) {
+		free_list(&tree->levels[i].usedlh);
+		free_list(&tree->levels[i].freelh);
+	}
+}
+
+static void quotatree_free_leafs(struct quotatree_tree *tree,
+		void (*dtor)(void *))
+{
+	int i;
+	struct quotatree_node *p;
+
+	list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
+		for (i = 0; i < QUOTATREE_BSIZE; i++) {
+			if (p->blocks[i] == NULL)
+				continue;
+
+			dtor(p->blocks[i]);
+		}
+	}
+}
+
+void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *))
+{
+	quotatree_free_leafs(tree, dtor);
+	quotatree_free_nodes(tree);
+	kfree(tree);
+}
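+
+/*
+ * Lifecycle sketch (compiled out): the find-or-insert pattern the
+ * callers in vzdq_ugid.c use.  The id 42 and the 16-byte payload are
+ * arbitrary example values.
+ */
+#if 0
+static void quotatree_lifecycle_example(void)
+{
+	struct quotatree_tree *tree;
+	struct quotatree_find_state st;
+	void *data;
+
+	tree = quotatree_alloc();
+	if (tree == NULL)
+		return;
+
+	if (quotatree_find(tree, 42, &st) == NULL) {
+		data = kmalloc(16, GFP_KERNEL);
+		/* @st remembers where the search stopped, so the insert
+		 * does not repeat the descent */
+		if (data != NULL && quotatree_insert(tree, 42, &st, data))
+			kfree(data);
+	}
+	/* frees every leaf through the dtor, then the nodes */
+	quotatree_free(tree, kfree);
+}
+#endif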
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_ugid.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_ugid.c
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdq_ugid.c	2015-01-21 12:02:53.119963664 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdq_ugid.c	2015-01-21 12:02:53.409955965 +0300
@@ -0,0 +1,1445 @@
+/*
+ * Copyright (C) 2002 SWsoft
+ * All rights reserved.
+ * 
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo UID/GID disk quota implementation
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/smp_lock.h>
+#include <linux/rcupdate.h>
+#include <asm/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/quota.h>
+#include "../quotaio_v2.h"
+#include <linux/virtinfo.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/mnt_namespace.h>
+#include <linux/vmalloc.h>
+#include <linux/quotaops.h>
+
+#include <linux/vzctl.h>
+#include <linux/vzctl_quota.h>
+#include <linux/vzquota.h>
+
+/*
+ * XXX
+ * maybe something is needed for sb->s_dquot->info[]?
+ */
+
+#define USRQUOTA_MASK		(1 << USRQUOTA)
+#define GRPQUOTA_MASK		(1 << GRPQUOTA)
+#define QTYPE2MASK(type)	(1 << (type))
+
+static struct kmem_cache *vz_quota_ugid_cachep;
+
+inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid)
+{
+	if (qugid != VZ_QUOTA_UGBAD)
+		atomic_inc(&qugid->qugid_count);
+	return qugid;
+}
+
+/* we don't limit users with zero limits */
+static inline int vzquota_fake_stat(struct dq_kstat *stat)
+{
+	return stat->bhardlimit == 0 && stat->bsoftlimit == 0 &&
+		stat->ihardlimit == 0 && stat->isoftlimit == 0;
+}
+
+/* callback function for quotatree_free() */
+static inline void vzquota_free_qugid(void *ptr)
+{
+	struct vz_quota_ugid *qugid = (struct vz_quota_ugid *) ptr;
+	if (qugid && qugid->qugid_stat.breserved) {
+		printk(KERN_WARNING
+		       "VZQUOTA: quota %u still has %lld blocks reserved\n",
+			qugid->qugid_id, qugid->qugid_stat.breserved);
+		dump_stack();
+	}
+	kmem_cache_free(vz_quota_ugid_cachep, ptr);
+}
+
+/*
+ * destroy the ugid if it has zero refcount, limits and usage;
+ * must be called under qmblk->dq_mutex
+ */
+void vzquota_put_ugid(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid)
+{
+	if (qugid == VZ_QUOTA_UGBAD)
+		return;
+	qmblk_data_read_lock(qmblk);
+	if (atomic_dec_and_test(&qugid->qugid_count) &&
+	    (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 &&
+	    vzquota_fake_stat(&qugid->qugid_stat) &&
+	    qugid->qugid_stat.bcurrent == 0 &&
+	    qugid->qugid_stat.icurrent == 0) {
+		quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type),
+				qugid->qugid_id);
+		qmblk->dq_ugid_count--;
+		vzquota_free_qugid(qugid);
+	}
+	qmblk_data_read_unlock(qmblk);
+}
+
+/*
+ * Get a ugid block by its index, as if the blocks formed an array.
+ * In reality there is no array - only the chain of tree leaves.
+ * Returns NULL if the index is out of range.
+ * The qmblk semaphore is required to protect the tree.
+ */
+static inline struct vz_quota_ugid *
+vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type)
+{
+	return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index);
+}
+
+/*
+ * get the next element from the ugid "virtual array";
+ * the ugid must be in the current array, and the array may not be
+ * changed between two accesses (guaranteed by the "stopped" quota
+ * state and the quota semaphore);
+ * the qmblk semaphore is required to protect the tree
+ */
+static inline struct vz_quota_ugid *
+vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid)
+{
+	return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type),
+			qugid->qugid_id);
+}
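+
+/*
+ * Usage sketch (compiled out): visit every ugid leaf once, in on-list
+ * order, the way do_quota_ugid_getstat() below combines
+ * vzquota_get_byindex() and vzquota_get_next().  The caller must hold
+ * qmblk->dq_mutex, as described above.
+ */
+#if 0
+static void vzquota_ugid_walk_example(struct vz_quota_master *qmblk,
+		int type)
+{
+	struct vz_quota_ugid *qugid;
+
+	for (qugid = vzquota_get_byindex(qmblk, 0, type);
+	     qugid != NULL;
+	     qugid = vzquota_get_next(qmblk, qugid))
+		printk(KERN_DEBUG "ugid %u\n", qugid->qugid_id);
+}
+#endif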
+
+/*
+ * requires dq_mutex
+ */
+struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
+			unsigned int quota_id, int type, int flags)
+{
+	struct vz_quota_ugid *qugid;
+	struct quotatree_tree *tree;
+	struct quotatree_find_state st;
+
+	tree = QUGID_TREE(qmblk, type);
+	qugid = quotatree_find(tree, quota_id, &st);
+	if (qugid)
+		goto success;
+
+	/* caller does not want alloc */
+	if (flags & VZDQUG_FIND_DONT_ALLOC)
+		goto fail;
+
+	if (flags & VZDQUG_FIND_FAKE)
+		goto doit;
+
+	/* check limit */
+	if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max)
+		goto fail;
+
+	/* see comment at VZDQUG_FIXED_SET define */
+	if (qmblk->dq_flags & VZDQUG_FIXED_SET)
+		goto fail;
+
+doit:
+	/* alloc new structure */
+	qugid = kmem_cache_alloc(vz_quota_ugid_cachep,
+			GFP_NOFS | __GFP_NOFAIL);
+	if (qugid == NULL)
+		goto fail;
+
+	/* initialize new structure */
+	qugid->qugid_id = quota_id;
+	memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat));
+	qugid->qugid_type = type;
+	atomic_set(&qugid->qugid_count, 0);
+
+	/* insert in tree */
+	if (quotatree_insert(tree, quota_id, &st, qugid) < 0)
+		goto fail_insert;
+	qmblk->dq_ugid_count++;
+
+success:
+	vzquota_get_ugid(qugid);
+	return qugid;
+
+fail_insert:
+	vzquota_free_qugid(qugid);
+fail:
+	return VZ_QUOTA_UGBAD;
+}
+
+/*
+ * takes dq_mutex, may schedule
+ */
+struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
+			unsigned int quota_id, int type, int flags)
+{
+	struct vz_quota_ugid *qugid;
+
+	mutex_lock(&qmblk->dq_mutex);
+	qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags);
+	mutex_unlock(&qmblk->dq_mutex);
+
+	return qugid;
+}
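+
+/*
+ * Usage sketch (compiled out): a read-only lookup that must not create
+ * a record, mirroring vz_get_dqblk() below; VZ_QUOTA_UGBAD is the only
+ * error indication, and the caller serializes via vz_quota_mutex.
+ */
+#if 0
+static void vzquota_ugid_lookup_example(struct vz_quota_master *qmblk,
+		qid_t id)
+{
+	struct vz_quota_ugid *qugid;
+
+	qugid = vzquota_find_ugid(qmblk, id, USRQUOTA,
+			VZDQUG_FIND_DONT_ALLOC);
+	if (qugid == VZ_QUOTA_UGBAD)
+		return;
+	qmblk_data_read_lock(qmblk);
+	/* ... read qugid->qugid_stat fields here ... */
+	qmblk_data_read_unlock(qmblk);
+	vzquota_put_ugid(qmblk, qugid);
+}
+#endif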
+
+/*
+ * destroy all ugid records on given quota master
+ */
+void vzquota_kill_ugid(struct vz_quota_master *qmblk)
+{
+	BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) ||
+		(qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL));
+
+	if (qmblk->dq_uid_tree != NULL) {
+		quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid);
+		quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid);
+	}
+}
+
+
+/* ----------------------------------------------------------------------
+ * Management interface to ugid quota for (super)users.
+ * --------------------------------------------------------------------- */
+
+static int vzquota_initialize2(struct inode *inode, int type)
+{
+	return QUOTA_OK;
+}
+
+static int vzquota_drop2(struct inode *inode)
+{
+	return QUOTA_OK;
+}
+
+static int vzquota_alloc_space2(struct inode *inode,
+			     qsize_t number, int prealloc)
+{
+	inode_add_bytes(inode, number);
+	return QUOTA_OK;
+}
+
+static int vzquota_reserve_space2(struct inode *inode,
+			     qsize_t number, int prealloc)
+{
+	inode_add_rsv_space(inode, number);
+	return QUOTA_OK;
+}
+
+static int vzquota_claim_reserved_space2(struct inode *inode, qsize_t number)
+{
+	inode_claim_rsv_space(inode, number);
+	return QUOTA_OK;
+}
+
+static int vzquota_alloc_inode2(const struct inode *inode, qsize_t number)
+{
+	return QUOTA_OK;
+}
+
+static int vzquota_free_space2(struct inode *inode, qsize_t number)
+{
+	inode_sub_bytes(inode, number);
+	return QUOTA_OK;
+}
+static void vzquota_release_reserved_space2(struct inode *inode, qsize_t num)
+{
+	inode_sub_rsv_space(inode, num);
+}
+
+static int vzquota_free_inode2(const struct inode *inode, qsize_t number)
+{
+	return QUOTA_OK;
+}
+
+static int vzquota_transfer2(struct inode *inode, struct iattr *iattr)
+{
+	return QUOTA_OK;
+}
+
+static qsize_t *vzquota_get_reserved_space2(struct inode *inode)
+{
+	return inode->i_sb->s_dquot.dq_op_orig->get_reserved_space(inode);
+}
+
+struct dquot_operations vz_quota_operations2 = {
+	.initialize	= vzquota_initialize2,
+	.drop		= vzquota_drop2,
+	.alloc_space	= vzquota_alloc_space2,
+	.alloc_inode	= vzquota_alloc_inode2,
+	.free_space	= vzquota_free_space2,
+	.free_inode	= vzquota_free_inode2,
+	.transfer	= vzquota_transfer2,
+};
+
+
+struct dquot_operations vz_quota_operations2_rsv = {
+	.initialize	= vzquota_initialize2,
+	.drop		= vzquota_drop2,
+	.alloc_space	= vzquota_alloc_space2,
+	.alloc_inode	= vzquota_alloc_inode2,
+	.reserve_space  = vzquota_reserve_space2,
+	.claim_space    = vzquota_claim_reserved_space2,
+	.release_rsv    = vzquota_release_reserved_space2,
+	.get_reserved_space = vzquota_get_reserved_space2,
+	.free_space	= vzquota_free_space2,
+	.free_inode	= vzquota_free_inode2,
+	.transfer	= vzquota_transfer2,
+};
+
+
+asmlinkage long sys_unlink(const char __user * pathname);
+asmlinkage long sys_rename(const char __user * oldname,
+	       const char __user * newname);
+asmlinkage long sys_symlink(const char __user * oldname,
+	       const char __user * newname);
+
+/* called under sb->s_umount semaphore */
+static int vz_restore_symlink(struct super_block *sb, char *path, int type)
+{
+	mm_segment_t oldfs;
+	char *newpath;
+	char dest[64];
+	const char *names[] = {
+		[USRQUOTA] "aquota.user",
+		[GRPQUOTA] "aquota.group"
+	};
+	int err;
+
+	newpath = kmalloc(strlen(path) + sizeof(".new"), GFP_KERNEL);
+	if (newpath == NULL)
+		return -ENOMEM;
+
+	strcpy(newpath, path);
+	strcat(newpath, ".new");
+
+	sprintf(dest, "/proc/vz/vzaquota/%08x/%s",
+			new_encode_dev(sb->s_dev), names[type]);
+
+	/*
+	 * Lockdep will learn an unneeded dependency during unlink(2):
+	 *	->s_umount => ->i_mutex/1 => ->i_mutex
+	 * The reverse dependency is
+	 *	open_namei() => ->i_mutex => lookup_hash() => __lookup_hash()
+	 *	=> ->lookup() \eq vzdq_aquotq_lookup() => find_qmblk_by_dev()
+	 *	=> user_get_super() => ->s_umount
+	 *
+	 * However, the first set of ->i_mutex'es belongs to /, the second
+	 * to /proc .
+	 * The right fix is to get rid of vz_restore_symlink(), of course.
+	 */
+	up_read(&sb->s_umount);
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_unlink(newpath);
+	if (err < 0 && err != -ENOENT)
+		goto out_restore;
+	err = sys_symlink(dest, newpath);
+	if (err < 0)
+		goto out_restore;
+	err = sys_rename(newpath, path);
+out_restore:
+	set_fs(oldfs);
+
+	down_read(&sb->s_umount);
+	/* umounted meanwhile? */
+	if (err == 0 && !sb->s_root)
+		err = -ENODEV;
+
+	kfree(newpath);
+	return err;
+}
+
+/* called under sb->s_umount semaphore */
+static int vz_quota_on(struct super_block *sb, int type,
+		int format_id, char *path, int remount)
+{
+	struct vz_quota_master *qmblk;
+	struct super_block *real_sb;
+	int mask2;
+	int err;
+
+	if (remount)
+		return 0;
+
+	qmblk = vzquota_find_qmblk(sb);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	err = vz_restore_symlink(sb, path, type);
+	if (err < 0)
+		goto out_put;
+
+	mutex_lock(&vz_quota_mutex);
+	mask2 = 0;
+
+	err = -EIO;
+	if (!sb->s_op->get_quota_root)
+		goto out_sem;
+	real_sb = sb->s_op->get_quota_root(sb)->i_sb;
+	if (!IS_VZ_QUOTA(real_sb))
+		goto out_sem;
+	if (real_sb->s_dquot.dq_op_orig->reserve_space)
+		sb->dq_op = &vz_quota_operations2_rsv;
+	else
+		sb->dq_op = &vz_quota_operations2;
+
+	sb->s_qcop = &vz_quotactl_operations;
+	if (type == USRQUOTA)
+		mask2 = VZDQ_USRQUOTA;
+	if (type == GRPQUOTA)
+		mask2 = VZDQ_GRPQUOTA;
+
+	err = -EBUSY;
+	if (qmblk->dq_flags & mask2)
+		goto out_sem;
+
+	err = 0;
+	qmblk->dq_flags |= mask2;
+	sb->s_dquot.flags |= dquot_state_flag(
+			DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, type);
+
+out_sem:
+	mutex_unlock(&vz_quota_mutex);
+out_put:
+	qmblk_put(qmblk);
+out:
+	return err;
+}
+
+static int vz_quota_off(struct super_block *sb, int type, int remount)
+{
+	struct vz_quota_master *qmblk;
+	int mask2;
+	int err;
+
+	if (remount)
+		return 0;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	mask2 = 0;
+	if (type == USRQUOTA)
+		mask2 = VZDQ_USRQUOTA;
+	if (type == GRPQUOTA)
+		mask2 = VZDQ_GRPQUOTA;
+	err = -EINVAL;
+	if (!(qmblk->dq_flags & mask2))
+		goto out;
+
+	qmblk->dq_flags &= ~mask2;
+	err = 0;
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+
+static int vz_quota_sync(struct super_block *sb, int type)
+{
+	return 0;	/* vz quota is always uptodate */
+}
+
+static int vz_get_dqblk(struct super_block *sb, int type,
+		qid_t id, struct if_dqblk *di)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid *ugid;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	err = 0;
+	ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC);
+	if (ugid != VZ_QUOTA_UGBAD) {
+		qmblk_data_read_lock(qmblk);
+		di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10;
+		di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10;
+		di->dqb_curspace = ugid->qugid_stat.bcurrent;
+		di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit;
+		di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit;
+		di->dqb_curinodes = ugid->qugid_stat.icurrent;
+		di->dqb_btime = ugid->qugid_stat.btime;
+		di->dqb_itime = ugid->qugid_stat.itime;
+		qmblk_data_read_unlock(qmblk);
+		di->dqb_valid = QIF_ALL;
+		vzquota_put_ugid(qmblk, ugid);
+	} else {
+		memset(di, 0, sizeof(*di));
+		di->dqb_valid = QIF_ALL;
+	}
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+
+/* must be called under vz_quota_mutex */
+static int __vz_set_dqblk(struct vz_quota_master *qmblk,
+		int type, qid_t id, struct if_dqblk *di)
+{
+	struct vz_quota_ugid *ugid;
+
+	ugid = vzquota_find_ugid(qmblk, id, type, 0);
+	if (ugid == VZ_QUOTA_UGBAD)
+		return -ESRCH;
+
+	qmblk_data_write_lock(qmblk);
+	/*
+	 * Subtle compatibility breakage.
+	 *
+	 * Some old non-vz kernel quota didn't start grace period
+	 * if the new soft limit happens to be below the usage.
+	 * Non-vz kernel quota in 2.4.20 starts the grace period
+	 * (if it hasn't been started).
+	 * Current non-vz kernel performs even more complicated
+	 * manipulations...
+	 *
+	 * Also, current non-vz kernels have inconsistency related to 
+	 * the grace time start.  In regular operations the grace period
+	 * is started if the usage is greater than the soft limit (and,
+	 * strangely, is cancelled if the usage is less).
+	 * However, set_dqblk starts the grace period if the usage is greater
+	 * or equal to the soft limit.
+	 *
+	 * Here we try to mimic the behavior of the current non-vz kernel.
+	 */
+	if (di->dqb_valid & QIF_BLIMITS) {
+		ugid->qugid_stat.bhardlimit =
+			(__u64)di->dqb_bhardlimit << 10;
+		ugid->qugid_stat.bsoftlimit =
+			(__u64)di->dqb_bsoftlimit << 10;
+		if (di->dqb_bsoftlimit == 0 ||
+		    ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit)
+			ugid->qugid_stat.btime = 0;
+		else if (!(di->dqb_valid & QIF_BTIME))
+			ugid->qugid_stat.btime = CURRENT_TIME_SECONDS
+				+ qmblk->dq_ugid_info[type].bexpire;
+		else
+			ugid->qugid_stat.btime = di->dqb_btime;
+	}
+	if (di->dqb_valid & QIF_ILIMITS) {
+		ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit;
+		ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit;
+		if (di->dqb_isoftlimit == 0 ||
+		    ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit)
+			ugid->qugid_stat.itime = 0;
+		else if (!(di->dqb_valid & QIF_ITIME))
+			ugid->qugid_stat.itime = CURRENT_TIME_SECONDS
+				+ qmblk->dq_ugid_info[type].iexpire;
+		else
+			ugid->qugid_stat.itime = di->dqb_itime;
+	}
+	qmblk_data_write_unlock(qmblk);
+	vzquota_put_ugid(qmblk, ugid);
+
+	return 0;
+}
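+
+/*
+ * Worked example of the grace rule above: with dqb_bsoftlimit = 100
+ * (i.e. 102400 bytes after the << 10) and bcurrent = 102400, usage
+ * equals the soft limit, so the grace period is started (btime = now +
+ * bexpire unless the caller passed QIF_BTIME); with bcurrent = 102399
+ * the grace period is cancelled (btime = 0).
+ */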
+
+static int vz_set_dqblk(struct super_block *sb, int type,
+		qid_t id, struct if_dqblk *di)
+{
+	struct vz_quota_master *qmblk;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+	err = __vz_set_dqblk(qmblk, type, id, di);
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+
+static int vz_get_dqinfo(struct super_block *sb, int type,
+		struct if_dqinfo *ii)
+{
+	struct vz_quota_master *qmblk;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	err = 0;
+	ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire;
+	ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire;
+	ii->dqi_flags = 0;
+	ii->dqi_valid = IIF_ALL;
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+
+/* must be called under vz_quota_mutex */
+static int __vz_set_dqinfo(struct vz_quota_master *qmblk,
+		int type, struct if_dqinfo *ii)
+{
+	if (ii->dqi_valid & IIF_FLAGS)
+		if (ii->dqi_flags & DQF_MASK)
+			return -EINVAL;
+
+	if (ii->dqi_valid & IIF_BGRACE)
+		qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace;
+	if (ii->dqi_valid & IIF_IGRACE)
+		qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace;
+	return 0;
+}
+
+static int vz_set_dqinfo(struct super_block *sb, int type,
+		struct if_dqinfo *ii)
+{
+	struct vz_quota_master *qmblk;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+	err = __vz_set_dqinfo(qmblk, type, ii);
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+
+#ifdef CONFIG_QUOTA_COMPAT
+
+#define Q_GETQUOTI_SIZE 1024
+
+#define UGID2DQBLK(dst, src)						\
+	do {								\
+		(dst)->dqb_ihardlimit = (src)->qugid_stat.ihardlimit;	\
+		(dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit;	\
+		(dst)->dqb_curinodes = (src)->qugid_stat.icurrent;	\
+		/* in 1K blocks */					\
+		(dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \
+		/* in 1K blocks */					\
+		(dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \
+		/* in bytes, 64 bit */					\
+		(dst)->dqb_curspace = (src)->qugid_stat.bcurrent;	\
+		(dst)->dqb_btime = (src)->qugid_stat.btime;		\
+		(dst)->dqb_itime = (src)->qugid_stat.itime;		\
+	} while (0)
+
+static int vz_get_quoti(struct super_block *sb, int type, qid_t idx,
+		struct v2_disk_dqblk __user *dqblk)
+{
+	struct vz_quota_master *qmblk;
+	struct v2r0_disk_dqblk *data, *kbuf;
+	struct vz_quota_ugid *ugid;
+	int count;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	err = -ENOMEM;
+	kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf));
+	if (!kbuf)
+		goto out;
+
+	mutex_lock(&vz_quota_mutex);
+	mutex_lock(&qmblk->dq_mutex);
+	for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0;
+		ugid != NULL && count < Q_GETQUOTI_SIZE;
+		count++)
+	{
+		data = kbuf + count;
+		qmblk_data_read_lock(qmblk);
+		UGID2DQBLK(data, ugid);
+		qmblk_data_read_unlock(qmblk);
+		data->dqb_id = ugid->qugid_id;
+
+		/* Find next entry */
+		ugid = vzquota_get_next(qmblk, ugid);
+		BUG_ON(ugid != NULL && ugid->qugid_type != type);
+	}
+	mutex_unlock(&qmblk->dq_mutex);
+	mutex_unlock(&vz_quota_mutex);
+
+	err = count;
+	if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf)))
+		err = -EFAULT;
+
+	vfree(kbuf);
+out:
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+
+	return err;
+}
+
+#endif
+
+struct quotactl_ops vz_quotactl_operations = {
+	.quota_on	= vz_quota_on,
+	.quota_off	= vz_quota_off,
+	.quota_sync	= vz_quota_sync,
+	.get_info	= vz_get_dqinfo,
+	.set_info	= vz_set_dqinfo,
+	.get_dqblk	= vz_get_dqblk,
+	.set_dqblk	= vz_set_dqblk,
+#ifdef CONFIG_QUOTA_COMPAT
+	.get_quoti	= vz_get_quoti,
+#endif
+};
+
+int vzquota_read_uginfo(struct vz_quota_master *qmblk, struct inode *ino)
+{
+	struct super_block *sb = ino->i_sb;
+	size_t size;
+	struct vz_quota_uginfo_img i;
+
+	size = sb->s_op->quota_read_ino(sb, ino,
+			(char *)&i, sizeof(i), VZQUOTA_UGINFO_OFF);
+	if (size != sizeof(i))
+		return -EIO;
+
+	qmblk->dq_ugid_max = le32_to_cpu(i.ugid_max);
+	qmblk->dq_flags = le32_to_cpu(i.user_flags) & VZDQF_USER_MASK;
+	qmblk->dq_ugid_info[USRQUOTA].iexpire = le64_to_cpu(i.uid_iexpire);
+	qmblk->dq_ugid_info[USRQUOTA].bexpire = le64_to_cpu(i.uid_bexpire);
+	/* XXX: GRPQUOTA grace times are read from the same uid_* fields;
+	 * the image apparently keeps a single pair of expiration times */
+	qmblk->dq_ugid_info[GRPQUOTA].iexpire = le64_to_cpu(i.uid_iexpire);
+	qmblk->dq_ugid_info[GRPQUOTA].bexpire = le64_to_cpu(i.uid_bexpire);
+
+	return 0;
+}
+
+static int vzquota_read_ugid_block(struct inode *ino, unsigned itemn, char *buf)
+{
+	struct super_block *sb = ino->i_sb;
+	size_t size;
+
+	size = sb->s_op->quota_read_ino(sb, ino, buf,
+			VZQUOTA_UGID_ITEM_SIZE,
+			VZQUOTA_UGID_OFF + (itemn << VZQUOTA_UGID_ITEM_BITS));
+	return (size == VZQUOTA_UGID_ITEM_SIZE) ? 0 : -EIO;
+}
+
+static int vzquota_load_ugid_block(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid_stat_img *img, int id, int type)
+{
+	u32 flags;
+	struct vz_quota_ugid *ugid;
+
+	flags = le32_to_cpu(img->flags);
+	if (!(flags & VZQUOTA_UGID_PRESENT))
+		return 0;
+
+	ugid = vzquota_find_ugid(qmblk, id, type, 0);
+	if (ugid == VZ_QUOTA_UGBAD)
+		return -ENOMEM;
+
+	ugid->qugid_stat.bhardlimit = le64_to_cpu(img->bhardlimit);
+	ugid->qugid_stat.bsoftlimit = le64_to_cpu(img->bsoftlimit);
+	ugid->qugid_stat.bcurrent = le64_to_cpu(img->bcurrent);
+	ugid->qugid_stat.ihardlimit = le32_to_cpu(img->ihardlimit);
+	ugid->qugid_stat.isoftlimit = le32_to_cpu(img->isoftlimit);
+	ugid->qugid_stat.icurrent = le32_to_cpu(img->icurrent);
+	ugid->qugid_stat.btime = le64_to_cpu(img->btime);
+	ugid->qugid_stat.itime = le64_to_cpu(img->itime);
+
+	vzquota_put_ugid(qmblk, ugid);
+
+	return 0;
+}
+
+int vzquota_read_ugid(struct vz_quota_master *qmblk, struct inode *ino)
+{
+	unsigned nr_items, i;
+	int err;
+	char *buf;
+
+	BUILD_BUG_ON(sizeof(struct vz_quota_ugid_stat_img) > VZQUOTA_UGID_ITEM_SIZE);
+	BUILD_BUG_ON(VZQUOTA_UGID_SIZE < VZQUOTA_MAX_UGID * 2 * VZQUOTA_UGID_ITEM_SIZE);
+	BUG_ON(ino->i_blkbits < VZQUOTA_UGID_ITEM_BITS);
+
+	err = -ENODATA;
+	if (ino->i_size != VZQUOTA_UGID_OFF + VZQUOTA_UGID_SIZE)
+		goto out;
+
+	err = vzquota_read_uginfo(qmblk, ino);
+	if (err)
+		goto out;
+
+	err = 0;
+	if (qmblk->dq_ugid_max == 0)
+		goto out;
+
+	qmblk->dq_flags |= VZDQUG_ON | VZDQ_USRQUOTA | VZDQ_GRPQUOTA;
+
+	err = -ENOMEM;
+	buf = kmalloc(VZQUOTA_UGID_ITEM_SIZE, GFP_KERNEL);
+	if (buf == NULL)
+		goto out;
+
+	nr_items = 1 << (VZQUOTA_UGID_BITS - VZQUOTA_UGID_ITEM_BITS);
+
+	for (i = 0; i < nr_items; i++) {
+		err = vzquota_read_ugid_block(ino, i, buf);
+		if (err)
+			break;
+
+		err = vzquota_load_ugid_block(qmblk,
+				(struct vz_quota_ugid_stat_img *)buf,
+				i >> 1, i & 1);
+		if (err)
+			break;
+	}
+
+	kfree(buf);
+out:
+	return err;
+}
+
+void vzquota_uginfo_dump(struct vz_quota_master *qmblk,
+		struct vz_quota_uginfo_img *img)
+{
+	img->uid_iexpire = cpu_to_le64(qmblk->dq_ugid_info[USRQUOTA].iexpire);
+	img->uid_bexpire = cpu_to_le64(qmblk->dq_ugid_info[USRQUOTA].bexpire);
+	/* XXX: the two stores below overwrite the USRQUOTA values above,
+	 * matching the single-pair layout vzquota_read_uginfo() reads back */
+	img->uid_iexpire = cpu_to_le64(qmblk->dq_ugid_info[GRPQUOTA].iexpire);
+	img->uid_bexpire = cpu_to_le64(qmblk->dq_ugid_info[GRPQUOTA].bexpire);
+	img->user_flags = cpu_to_le32(qmblk->dq_flags & VZDQF_USER_MASK);
+	img->ugid_max = cpu_to_le32(qmblk->dq_ugid_max);
+}
+
+void vzquota_ugid_dump(struct vz_quota_ugid *ugid,
+		struct vz_quota_ugid_stat_img *img)
+{
+	img->flags = cpu_to_le32(VZQUOTA_UGID_PRESENT);
+	img->bhardlimit = cpu_to_le64(ugid->qugid_stat.bhardlimit);
+	img->bsoftlimit = cpu_to_le64(ugid->qugid_stat.bsoftlimit);
+	img->bcurrent = cpu_to_le64(ugid->qugid_stat.bcurrent);
+	img->ihardlimit = cpu_to_le32(ugid->qugid_stat.ihardlimit);
+	img->isoftlimit = cpu_to_le32(ugid->qugid_stat.isoftlimit);
+	img->icurrent = cpu_to_le32(ugid->qugid_stat.icurrent);
+	img->btime = cpu_to_le64(ugid->qugid_stat.btime);
+	img->itime = cpu_to_le64(ugid->qugid_stat.itime);
+}
+
+int vzquota_uginfo_write(struct inode *ino, struct vz_quota_uginfo_img *img)
+{
+	struct super_block *sb = ino->i_sb;
+	size_t size;
+
+	size = sb->s_op->quota_write_ino(sb, ino,
+			(char *)img, sizeof(*img), VZQUOTA_UGINFO_OFF);
+	return (size == sizeof(*img)) ? 0 : -EIO;
+}
+
+int vzquota_ugid_write(struct inode *ino, struct vz_quota_ugid_stat_img *img,
+		int id, int type)
+{
+	struct super_block *sb = ino->i_sb;
+	int itemn;
+	size_t size;
+
+	itemn = (id << 1 | type);
+	size = sb->s_op->quota_write_ino(sb, ino, (char *)img,
+			VZQUOTA_UGID_ITEM_SIZE,
+			VZQUOTA_UGID_OFF + (itemn << VZQUOTA_UGID_ITEM_BITS));
+
+	return (size == VZQUOTA_UGID_ITEM_SIZE) ? 0 : -EIO;
+}
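+
+/*
+ * On-disk layout implied by the item arithmetic above: uid and gid
+ * records interleave, itemn = (id << 1) | type, so item 0 is uid 0,
+ * item 1 is gid 0, item 2 is uid 1, and so on; vzquota_read_ugid()
+ * recovers (id, type) as (itemn >> 1, itemn & 1).
+ */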
+
+/* ----------------------------------------------------------------------
+ * Management interface for host system admins.
+ * --------------------------------------------------------------------- */
+
+static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size,
+		struct vz_quota_iface __user *u_ugid_buf, int compat)
+{
+	struct vz_quota_master *qmblk;
+	int ret;
+
+	mutex_lock(&vz_quota_mutex);
+
+	ret = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	ret = -EBUSY;
+	if (qmblk->dq_state != VZDQ_STARTING)
+		goto out; /* working quota doesn't accept new ugids */
+
+	/* start to add ugids */
+	for (ret = 0; ret < ugid_size; ret++) {
+		struct vz_quota_iface qif;
+		struct vz_quota_ugid *ugid;
+
+		if (!compat) {
+			if (copy_from_user(&qif, u_ugid_buf, sizeof(qif)))
+				break;
+			u_ugid_buf++; /* next user buffer */
+		} else {
+#ifdef CONFIG_COMPAT
+			struct compat_vz_quota_iface oqif;
+			if (copy_from_user(&oqif, u_ugid_buf,
+							sizeof(oqif)))
+				break;
+			qif.qi_id = oqif.qi_id;
+			qif.qi_type = oqif.qi_type;
+			compat_dqstat2dqstat(&oqif.qi_stat, &qif.qi_stat);
+			u_ugid_buf = (struct vz_quota_iface __user *)
+					(((void *)u_ugid_buf) + sizeof(oqif));
+#endif
+		}
+
+		if (qif.qi_type >= MAXQUOTAS)
+			break; /* bad quota type - this is the only check */
+
+		ugid = vzquota_find_ugid(qmblk,
+				qif.qi_id, qif.qi_type, 0);
+		if (ugid == VZ_QUOTA_UGBAD) {
+			qmblk->dq_flags |= VZDQUG_FIXED_SET;
+			break; /* limit reached */
+		}
+
+		/* update usage/limits
+		 * we can copy the data without the lock, because the data
+		 * cannot be modified in VZDQ_STARTING state */
+		user_dqstat2dqstat(&qif.qi_stat, &ugid->qugid_stat);
+		vzquota_put_ugid(qmblk, ugid);
+	}
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return ret;
+}
+
+static int quota_ugid_setgrace(unsigned int quota_id,
+		struct dq_info __user u_dq_info[], int compat)
+{
+	struct vz_quota_master *qmblk;
+	struct dq_info udq_info[MAXQUOTAS];
+	struct dq_kinfo *target;
+	int err, type;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EBUSY;
+	if (qmblk->dq_state != VZDQ_STARTING)
+		goto out; /* working quota doesn't accept changing options */
+
+	err = -EFAULT;
+	if (!compat) {
+		if (copy_from_user(udq_info, u_dq_info, sizeof(udq_info)))
+			goto out;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_dq_info odqi[MAXQUOTAS];
+		if (copy_from_user(odqi, u_dq_info, sizeof(odqi)))
+			goto out;
+		for (type = 0; type < MAXQUOTAS; type++)
+			compat_dqinfo2dqinfo(&odqi[type], &udq_info[type]);
+#endif
+	}
+	err = 0;
+
+	/* update in qmblk */
+	for (type = 0; type < MAXQUOTAS; type++) {
+		target = &qmblk->dq_ugid_info[type];
+		target->bexpire = udq_info[type].bexpire;
+		target->iexpire = udq_info[type].iexpire;
+	}
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size,
+		struct vz_quota_iface *ugid_buf)
+{
+	int type, count;
+	struct vz_quota_ugid *ugid;
+
+	if (QTREE_LEAFNUM(qmblk->dq_uid_tree) +
+	    QTREE_LEAFNUM(qmblk->dq_gid_tree) <= index)
+		return 0;
+
+	count = 0;
+
+	type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? USRQUOTA : GRPQUOTA;
+	if (type == GRPQUOTA)
+		index -= QTREE_LEAFNUM(qmblk->dq_uid_tree);
+
+	/* loop through ugid and then qgid quota */
+repeat:
+	for (ugid = vzquota_get_byindex(qmblk, index, type);
+		ugid != NULL && count < size;
+		ugid = vzquota_get_next(qmblk, ugid), count++)
+	{
+		struct vz_quota_iface qif;
+		/* form interface buffer and send in to user-level */
+		qmblk_data_read_lock(qmblk);
+		dqstat2user_dqstat(&ugid->qugid_stat, &qif.qi_stat);
+		qmblk_data_read_unlock(qmblk);
+		qif.qi_id = ugid->qugid_id;
+		qif.qi_type = ugid->qugid_type;
+		memcpy(ugid_buf, &qif, sizeof(qif));
+		ugid_buf++; /* next portion of user buffer */
+	}
+
+	if (type == USRQUOTA && count < size) {
+		type = GRPQUOTA;
+		index = 0;
+		goto repeat;
+	}
+
+	return count;
+}
+
+static int quota_ugid_getstat(unsigned int quota_id,
+		int index, int size, struct vz_quota_iface __user *u_ugid_buf,
+		int compat)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_iface *k_ugid_buf;
+	int err;
+
+	if (index < 0 || size < 0)
+		return -EINVAL;
+
+	if (size > INT_MAX / sizeof(struct vz_quota_iface))
+		return -EINVAL;
+
+	k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface));
+	if (k_ugid_buf == NULL)
+		return -ENOMEM;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	mutex_lock(&qmblk->dq_mutex);
+	err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf);
+	mutex_unlock(&qmblk->dq_mutex);
+	if (err < 0)
+		goto out;
+
+	if (!compat) {
+		if (copy_to_user(u_ugid_buf, k_ugid_buf,
+					err * sizeof(struct vz_quota_iface)))
+			err = -EFAULT;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_vz_quota_iface oqif;
+		int i;
+		for (i = 0; i < err; i++) {
+			oqif.qi_id = k_ugid_buf[i].qi_id;
+			oqif.qi_type = k_ugid_buf[i].qi_type;
+			dqstat2compat_dqstat(&k_ugid_buf[i].qi_stat,
+					  &oqif.qi_stat);
+			if (copy_to_user(u_ugid_buf, &oqif, sizeof(oqif)))
+				err = -EFAULT;
+			u_ugid_buf = (struct vz_quota_iface __user *)
+					(((void *)u_ugid_buf) + sizeof(oqif));
+		}
+#endif
+	}
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	vfree(k_ugid_buf);
+	return err;
+}
+
+static int quota_ugid_getgrace(unsigned int quota_id,
+		struct dq_info __user u_dq_info[], int compat)
+{
+	struct vz_quota_master *qmblk;
+	struct dq_info dq_info[MAXQUOTAS];
+	struct dq_kinfo *target;
+	int err, type;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = 0;
+	/* update from qmblk */
+	for (type = 0; type < MAXQUOTAS; type++) {
+		target = &qmblk->dq_ugid_info[type];
+		dq_info[type].bexpire = target->bexpire;
+		dq_info[type].iexpire = target->iexpire;
+		dq_info[type].flags = target->flags;
+	}
+
+	if (!compat) {
+		if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info)))
+			err = -EFAULT;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_dq_info odqi[MAXQUOTAS];
+		for (type = 0; type < MAXQUOTAS; type++)
+			dqinfo2compat_dqinfo(&dq_info[type], &odqi[type]);
+		if (copy_to_user(u_dq_info, odqi, sizeof(odqi)))
+			err = -EFAULT;
+#endif
+	}
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+static int quota_ugid_getconfig(unsigned int quota_id, 
+		struct vz_quota_ugid_stat __user *info)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid_stat kinfo;
+	int err;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+	
+	err = 0;
+	kinfo.limit = qmblk->dq_ugid_max;
+	kinfo.count = qmblk->dq_ugid_count;
+	kinfo.flags = qmblk->dq_flags;
+	if (qmblk->qfile == NULL)
+		kinfo.flags &= ~VZDQF_USER_MASK;
+
+	if (copy_to_user(info, &kinfo, sizeof(kinfo)))
+		err = -EFAULT;
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+static int quota_ugid_setconfig(unsigned int quota_id,
+		struct vz_quota_ugid_stat __user *info)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid_stat kinfo;
+	int err;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EFAULT;
+	if (copy_from_user(&kinfo, info, sizeof(kinfo)))
+		goto out;
+
+	err = 0;
+	qmblk->dq_ugid_max = kinfo.limit;
+	if (qmblk->qfile != NULL) {
+		if (kinfo.flags & ~VZDQF_USER_MASK) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		qmblk->dq_flags = (qmblk->dq_flags & ~VZDQF_USER_MASK) |
+					(kinfo.flags & VZDQF_USER_MASK);
+	} else if (qmblk->dq_state == VZDQ_STARTING) {
+		if (kinfo.flags & VZDQF_USER_MASK) {
+			printk(KERN_WARNING "VZDQ: API misuse!\n");
+			err = -EINVAL;
+			goto out;
+		}
+
+		qmblk->dq_flags = kinfo.flags;
+		if (qmblk->dq_flags & VZDQUG_ON)
+			qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA;
+	}
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+static int quota_ugid_setlimit(unsigned int quota_id,
+		struct vz_quota_ugid_setlimit __user *u_lim)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid_setlimit lim;
+	int err;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ESRCH;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EFAULT;
+	if (copy_from_user(&lim, u_lim, sizeof(lim)))
+		goto out;
+
+	err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb);
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+static int quota_ugid_setinfo(unsigned int quota_id,
+		struct vz_quota_ugid_setinfo __user *u_info)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid_setinfo info;
+	int err;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ESRCH;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EFAULT;
+	if (copy_from_user(&info, u_info, sizeof(info)))
+		goto out;
+
+	err = __vz_set_dqinfo(qmblk, info.type, &info.dqi);
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+/*
+ * This is a system call to maintain UGID quotas.
+ * Note that this call is allowed to run ONLY from VE0.
+ */
+long do_vzquotaugidctl(int cmd, unsigned int quota_id,
+		unsigned int ugid_index, unsigned int ugid_size,
+		void *addr, int compat)
+{
+	int ret;
+
+	ret = -EPERM;
+	/* access allowed only from root of VE0 */
+	if (!capable(CAP_SYS_RESOURCE) ||
+	    !capable(CAP_SYS_ADMIN))
+		goto out;
+
+	switch (cmd) {
+		case VZ_DQ_UGID_GETSTAT:
+			ret = quota_ugid_getstat(quota_id,
+					ugid_index, ugid_size,
+				       	(struct vz_quota_iface __user *)addr,
+					compat);
+			break;
+		case VZ_DQ_UGID_ADDSTAT:
+			ret = quota_ugid_addstat(quota_id, ugid_size,
+					(struct vz_quota_iface __user *) addr,
+					compat);
+			break;
+		case VZ_DQ_UGID_GETGRACE:
+			ret = quota_ugid_getgrace(quota_id,
+					(struct dq_info __user *)addr, compat);
+			break;
+		case VZ_DQ_UGID_SETGRACE:
+			ret = quota_ugid_setgrace(quota_id,
+					(struct dq_info __user *)addr, compat);
+			break;
+		case VZ_DQ_UGID_GETCONFIG:
+			ret = quota_ugid_getconfig(quota_id,
+					(struct vz_quota_ugid_stat __user *)
+								addr);
+			break;
+		case VZ_DQ_UGID_SETCONFIG:
+			ret = quota_ugid_setconfig(quota_id,
+					(struct vz_quota_ugid_stat __user *)
+								addr);
+			break;
+		case VZ_DQ_UGID_SETLIMIT:
+			ret = quota_ugid_setlimit(quota_id,
+					(struct vz_quota_ugid_setlimit __user *)
+								addr);
+			break;
+		case VZ_DQ_UGID_SETINFO:
+			ret = quota_ugid_setinfo(quota_id,
+					(struct vz_quota_ugid_setinfo __user *)
+								addr);
+			break;
+		default:
+			ret = -EINVAL;
+			goto out;
+	}
+out:
+	return ret;
+}
+
+static void ugid_quota_on_sb(struct super_block *sb)
+{
+	struct super_block *real_sb;
+	struct vz_quota_master *qmblk;
+
+	if (!sb->s_op->get_quota_root)
+		return;
+
+	real_sb = sb->s_op->get_quota_root(sb)->i_sb;
+	if (!IS_VZ_QUOTA(real_sb))
+		return;
+
+	sb->dq_op = &vz_quota_operations2;
+	sb->s_qcop = &vz_quotactl_operations;
+	INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+	INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+	sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
+	sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
+
+	qmblk = vzquota_find_qmblk(sb);
+	if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD))
+		return;
+	mutex_lock(&vz_quota_mutex);
+	if (qmblk->dq_flags & VZDQ_USRQUOTA)
+		sb->s_dquot.flags |= dquot_state_flag(DQUOT_USAGE_ENABLED |
+				DQUOT_LIMITS_ENABLED, USRQUOTA);
+	if (qmblk->dq_flags & VZDQ_GRPQUOTA)
+		sb->s_dquot.flags |= dquot_state_flag(DQUOT_USAGE_ENABLED |
+				DQUOT_LIMITS_ENABLED, GRPQUOTA);
+	mutex_unlock(&vz_quota_mutex);
+	qmblk_put(qmblk);
+}
+
+static void ugid_quota_off_sb(struct super_block *sb)
+{
+	/* can't turn quota off on a mounted super block */
+	BUG_ON(sb->s_root != NULL);
+}
+
+static int ugid_notifier_call(struct vnotifier_block *self,
+		unsigned long n, void *data, int old_ret)
+{
+	struct virt_info_quota *viq;
+
+	viq = (struct virt_info_quota *)data;
+
+	switch (n) {
+	case VIRTINFO_QUOTA_ON:
+		ugid_quota_on_sb(viq->super);
+		break;
+	case VIRTINFO_QUOTA_OFF:
+		ugid_quota_off_sb(viq->super);
+		break;
+	case VIRTINFO_QUOTA_GETSTAT:
+		break;
+	default:
+		return old_ret;
+	}
+	return NOTIFY_OK;
+}
+
+static struct vnotifier_block ugid_notifier_block = {
+	.notifier_call = ugid_notifier_call,
+};
+
+/* ----------------------------------------------------------------------
+ * Init/exit.
+ * --------------------------------------------------------------------- */
+
+int vzquota_ugid_init(void)
+{
+	int err;
+
+	vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid",
+				      sizeof(struct vz_quota_ugid),
+				      0, SLAB_HWCACHE_ALIGN, NULL);
+	if (vz_quota_ugid_cachep == NULL)
+		goto err_slab;
+
+	err = register_quota_format(&vz_quota_empty_v2_format);
+	if (err)
+		goto err_reg;
+
+	virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block);
+	return 0;
+
+err_reg:
+	kmem_cache_destroy(vz_quota_ugid_cachep);
+	return err;
+
+err_slab:
+	printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n");
+	return -ENOMEM;
+}
+
+void vzquota_ugid_release(void)
+{
+	virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block);
+	unregister_quota_format(&vz_quota_empty_v2_format);
+
+	kmem_cache_destroy(vz_quota_ugid_cachep);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdquot.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdquot.c
--- linux-2.6.32-504.3.3.el6.orig/fs/quota/vzdquota/vzdquot.c	2015-01-21 12:02:53.120963637 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/quota/vzdquota/vzdquot.c	2015-01-21 12:02:53.401956177 +0300
@@ -0,0 +1,2133 @@
+/*
+ * Copyright (C) 2001, 2002, 2004, 2005  SWsoft
+ * All rights reserved.
+ * 
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains the core of Virtuozzo disk quota implementation:
+ * maintenance of VZDQ information in inodes,
+ * external interfaces,
+ * module entry.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/quota.h>
+#include <linux/rcupdate.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+#include <linux/vzctl.h>
+#include <linux/vzctl_quota.h>
+#include <linux/vzquota.h>
+#include <linux/virtinfo.h>
+#include <linux/vzdq_tree.h>
+#include <linux/mount.h>
+#include <linux/quotaops.h>
+
+/* ----------------------------------------------------------------------
+ *
+ * Locking
+ *
+ * ---------------------------------------------------------------------- */
+
+/*
+ * Serializes on/off and all other do_vzquotactl operations.
+ * Protects qmblk hash.
+ */
+struct mutex vz_quota_mutex;
+
+/*
+ * Data access locks
+ *  inode_qmblk
+ *	protects qmblk pointers in all inodes and qlnk content in general
+ *	(but not qmblk content);
+ *	also protects related qmblk invalidation procedures;
+ *	can't be per-inode because of vzquota_dtree_qmblk complications
+ *	and problems with serialization with quota_on,
+ *	but can be per-superblock;
+ *  qmblk_data
+ *	protects qmblk fields (such as current usage)
+ *  quota_data
+ *	protects charge/uncharge operations, thus, implies
+ *	qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock
+ *	(to protect ugid pointers).
+ *
+ * Lock order:
+ *  inode_qmblk_lock -> dcache_lock
+ *  inode_qmblk_lock -> qmblk_data
+ */
+static DEFINE_SPINLOCK(vzdq_qmblk_lock);
+
+inline void inode_qmblk_lock(struct super_block *sb)
+{
+	spin_lock(&vzdq_qmblk_lock);
+}
+
+inline void inode_qmblk_unlock(struct super_block *sb)
+{
+	spin_unlock(&vzdq_qmblk_lock);
+}
+
+inline void qmblk_data_read_lock(struct vz_quota_master *qmblk)
+{
+	spin_lock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk)
+{
+	spin_unlock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_write_lock(struct vz_quota_master *qmblk)
+{
+	spin_lock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk)
+{
+	spin_unlock(&qmblk->dq_data_lock);
+}
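+
+/*
+ * Locking sketch (compiled out): the only legal nesting per the order
+ * described above - inode_qmblk first, then qmblk_data.
+ */
+#if 0
+static void vzdq_lock_order_example(struct super_block *sb,
+		struct vz_quota_master *qmblk)
+{
+	inode_qmblk_lock(sb);		/* qlnk pointers stable */
+	qmblk_data_read_lock(qmblk);	/* usage counters stable */
+	/* ... charge/uncharge-style access ... */
+	qmblk_data_read_unlock(qmblk);
+	inode_qmblk_unlock(sb);
+}
+#endif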
+
+struct quota_format_type vz_quota_empty_v2_format = {
+	.qf_fmt_id	= QFMT_VFS_V0,
+	.qf_ops		= NULL,
+	.qf_owner	= THIS_MODULE,
+};
+
+/* ----------------------------------------------------------------------
+ *
+ * Master hash table handling.
+ *
+ * Not SMP safe; serialized by vz_quota_mutex within quota syscalls
+ *
+ * --------------------------------------------------------------------- */
+
+static struct kmem_cache *vzquota_cachep;
+
+/*
+ * Hash function.
+ */
+#define QHASH_BITS		6
+#define	VZ_QUOTA_HASH_SIZE	(1 << QHASH_BITS)
+#define QHASH_MASK		(VZ_QUOTA_HASH_SIZE - 1)
+
+struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE];
+int vzquota_hash_size = VZ_QUOTA_HASH_SIZE;
+
+static inline int vzquota_hash_func(unsigned int qid)
+{
+	return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK);
+}
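+
+/*
+ * E.g. with QHASH_BITS = 6: quota_id 100 (0b1100100) folds to
+ * ((100 >> 6) ^ 100) & 63 = (1 ^ 100) & 63 = 101 & 63 = 37.
+ */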
+
+/**
+ * vzquota_alloc_master - alloc and instantiate master quota record
+ *
+ * Returns:
+ *	pointer to newly created record if SUCCESS
+ *	-ENOMEM if out of memory
+ *	-EEXIST if a record with the given quota_id already exists
+ */
+struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id,
+		struct vz_quota_kstat *qstat)
+{
+	int err;
+	struct vz_quota_master *qmblk;
+
+	err = -EEXIST;
+	if (vzquota_find_master(quota_id) != NULL)
+		goto out;
+
+	err = -ENOMEM;
+	qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL);
+	if (qmblk == NULL)
+		goto out;
+#ifdef CONFIG_VZ_QUOTA_UGID
+	qmblk->dq_uid_tree = quotatree_alloc();
+	if (!qmblk->dq_uid_tree)
+		goto out_free;
+
+	qmblk->dq_gid_tree = quotatree_alloc();
+	if (!qmblk->dq_gid_tree)
+		goto out_free_tree;
+#endif
+
+	qmblk->dq_state = VZDQ_STARTING;
+	mutex_init(&qmblk->dq_mutex);
+	mutex_init(&qmblk->dq_write_lock);
+	spin_lock_init(&qmblk->dq_data_lock);
+
+	qmblk->dq_id = quota_id;
+	qmblk->dq_stat = qstat->dq_stat;
+	qmblk->dq_info = qstat->dq_info;
+	qmblk->dq_root_path.dentry = NULL;
+	qmblk->dq_root_path.mnt = NULL;
+	qmblk->dq_sb = NULL;
+	qmblk->dq_ugid_count = 0;
+	qmblk->dq_ugid_max = 0;
+	qmblk->dq_flags = 0;
+	qmblk->qfile = NULL;
+	qmblk->dq_snap = NULL;
+	memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info));
+	INIT_LIST_HEAD(&qmblk->dq_ilink_list);
+
+	atomic_set(&qmblk->dq_count, 1);
+
+	/* insert in hash chain */
+	list_add(&qmblk->dq_hash,
+		&vzquota_hash_table[vzquota_hash_func(quota_id)]);
+
+	/* success */
+	return qmblk;
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+out_free_tree:
+	quotatree_free(qmblk->dq_uid_tree, NULL);
+out_free:
+	kmem_cache_free(vzquota_cachep, qmblk);
+#endif
+out:
+	return ERR_PTR(err);
+}
+
+static struct vz_quota_master *vzquota_alloc_fake(void)
+{
+	struct vz_quota_master *qmblk;
+
+	qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL);
+	if (qmblk == NULL)
+		return NULL;
+	memset(qmblk, 0, sizeof(*qmblk));
+	qmblk->dq_state = VZDQ_STOPING;
+	qmblk->dq_flags = VZDQ_NOQUOT;
+	spin_lock_init(&qmblk->dq_data_lock);
+	INIT_LIST_HEAD(&qmblk->dq_ilink_list);
+	atomic_set(&qmblk->dq_count, 1);
+	return qmblk;
+}
+
+/**
+ * vzquota_find_master - find master record with given id
+ *
+ * Returns qmblk without touching its refcounter.
+ * Called under vz_quota_mutex.
+ */
+struct vz_quota_master *vzquota_find_master(unsigned int quota_id)
+{
+	int i;
+	struct vz_quota_master *qp;
+
+	i = vzquota_hash_func(quota_id);
+	list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) {
+		if (qp->dq_id == quota_id)
+			return qp;
+	}
+	return NULL;
+}
+
+/**
+ * vzquota_free_master - release resources taken by qmblk, freeing memory
+ *
+ * qmblk is assumed to be already taken out from the hash.
+ * Should be called outside vz_quota_mutex.
+ */
+void vzquota_free_master(struct vz_quota_master *qmblk)
+{
+#ifdef CONFIG_VZ_QUOTA_UGID
+	vzquota_kill_ugid(qmblk);
+#endif
+	BUG_ON(!list_empty(&qmblk->dq_ilink_list));
+	kmem_cache_free(vzquota_cachep, qmblk);
+}
+
+
+static inline int vzquota_cur_qmblk_check(void)
+{
+	return current->magic == VZDQ_CUR_MAGIC;
+}
+
+static inline struct inode *vzquota_cur_qmblk_fetch(void)
+{
+	return current->ino;
+}
+
+static inline struct vz_quota_master *vzquota_cur_qmblk_orphan_cleanup(void)
+{
+	struct task_struct *tsk;
+	struct vz_quota_master *qmblk;
+
+	tsk = current;
+	if (tsk->magic != VZDQ_CUR_CLEANUP)
+		return NULL;
+
+	qmblk = (struct vz_quota_master *)current->ino;
+	BUG_ON(qmblk->dq_state != VZDQ_ORPHAN_CLEANUP);
+	return qmblk;
+}
+
+void vzquota_cur_qmblk_orphan_set(struct vz_quota_master *qmblk)
+{
+	struct task_struct *tsk;
+
+	tsk = current;
+	if (qmblk) {
+		tsk->magic = VZDQ_CUR_CLEANUP;
+		tsk->ino = (struct inode *)qmblk;
+	} else {
+		tsk->magic = 0;
+		tsk->ino = NULL;
+	}
+}
+
+#if 0
+static inline void vzquota_cur_qmblk_reset(void)
+{
+	current->magic = 0;
+}
+#endif
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Superblock quota operations
+ *
+ * --------------------------------------------------------------------- */
+
+/*
+ * Kernel structure abuse.
+ * We use the files[0] pointer as an int variable:
+ * a reference counter of how many quota blocks use this superblock.
+ * files[1] is used for the generations structure, which helps us track
+ * when a traversal of dentries is really required.
+ */
+#define __VZ_QUOTA_NOQUOTA(sb)		sb->s_dquot.vzdq_master
+#define __VZ_QUOTA_TSTAMP(sb)		((struct timeval *)\
+						&sb->s_dquot.dqio_mutex)
+
+#if defined(VZ_QUOTA_UNLOAD)
+
+#define __VZ_QUOTA_SBREF(sb)		sb->s_dquot.vzdq_count
+
+/**
+ * quota_get_super - account for a new quota tree under the superblock
+ *
+ * One superblock can have multiple directory subtrees with different VZ
+ * quotas.  We keep a counter of such subtrees and set VZ quota operations or
+ * reset the default ones.
+ *
+ * Called under vz_quota_mutex (from quota_on).
+ */
+int vzquota_get_super(struct super_block *sb)
+{
+	if (!IS_VZ_QUOTA(sb)) {
+		down(&sb->s_dquot.dqonoff_sem);
+		if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) {
+			up(&sb->s_dquot.dqonoff_sem);
+			return -EEXIST;
+		}
+		sb->s_dquot.dq_op_orig = sb->dq_op;
+		if (sb->s_dquot.dq_op_orig->reserve_space)
+			sb->dq_op = &vz_quota_operations_rsv;
+		else
+			sb->dq_op = &vz_quota_operations;
+		/* XXX this may race with sys_quotactl */
+#ifdef CONFIG_VZ_QUOTA_UGID
+		sb->s_dquot.qcop_orig = sb->s_qcop;
+		sb->s_qcop = &vz_quotactl_operations;
+#else
+		sb->s_qcop = NULL;
+#endif
+		do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
+		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+
+		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+		sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
+		sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
+		/*
+		 * To get quotaops.h to call us we need to mark the
+		 * superblock as having quota.  These flags mark the moment
+		 * when our dq_op starts to be called.
+		 *
+		 * The ordering of dq_op and s_dquot.flags assignment
+		 * needs to be enforced, but other CPUs do not do rmb()
+		 * between s_dquot.flags and dq_op accesses.
+		 */
+		wmb(); synchronize_sched();
+		sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
+		__module_get(THIS_MODULE);
+		up(&sb->s_dquot.dqonoff_sem);
+	}
+	/* protected by vz_quota_mutex */
+	__VZ_QUOTA_SBREF(sb)++;
+	return 0;
+}
+
+/**
+ * quota_put_super - release superblock when one quota tree goes away
+ *
+ * Called under vz_quota_mutex.
+ */
+void vzquota_put_super(struct super_block *sb)
+{
+	int count;
+
+	count = --__VZ_QUOTA_SBREF(sb);
+	if (count == 0) {
+		down(&sb->s_dquot.dqonoff_sem);
+		sb->s_dquot.flags = 0;
+		wmb(); synchronize_sched();
+		sema_init(&sb->s_dquot.dqio_sem, 1);
+
+		sb->s_qcop = sb->s_dquot.qcop_orig;
+		sb->dq_op = sb->s_dquot.dq_op_orig;
+		inode_qmblk_lock(sb);
+		quota_gen_put(SB_QGEN(sb));
+		SB_QGEN(sb) = NULL;
+		/* release qlnk's without qmblk */
+		remove_inode_quota_links_list(&non_vzquota_inodes_lh,
+				sb, NULL);
+		/*
+		 * Races with quota initialization:
+		 * after this inode_qmblk_unlock all inode's generations are
+		 * invalidated, quota_inode_qmblk checks superblock operations.
+		 */
+		inode_qmblk_unlock(sb);
+		/*
+		 * Module refcounting: in theory, this is the best place
+		 * to call module_put(THIS_MODULE).
+		 * In reality, it can't be done because we can't be sure that
+		 * other CPUs do not enter our code segment through dq_op
+		 * cached long time ago.  Quotaops interface isn't supposed to
+		 * go into modules currently (that is, into unloadable
+		 * modules).  By omitting module_put, our module isn't
+		 * unloadable.
+		 */
+		up(&sb->s_dquot.dqonoff_sem);
+	}
+}
+
+#else
+
+/**
+ * vzquota_shutdown_super - callback on umount
+ */
+void vzquota_shutdown_super(struct super_block *sb)
+{
+	struct vz_quota_master *qmblk;
+
+	qmblk = __VZ_QUOTA_NOQUOTA(sb);
+	__VZ_QUOTA_NOQUOTA(sb) = NULL;
+	if (qmblk != NULL)
+		qmblk_put(qmblk);
+}
+
+/**
+ * vzquota_get_super - account for a new quota tree under the superblock
+ *
+ * One superblock can have multiple directory subtrees with different VZ
+ * quotas.
+ *
+ * Called under vz_quota_mutex (from vzquota_on).
+ */
+int vzquota_get_super(struct super_block *sb)
+{
+	struct vz_quota_master *qnew;
+	int err;
+
+	mutex_lock(&sb->s_dquot.dqonoff_mutex);
+	err = -EEXIST;
+	if (sb_any_quota_loaded(sb) && !IS_VZ_QUOTA(sb))
+		goto out_up;
+
+	/*
+	 * This allocation code should be under the sb->dq_op check below, but
+	 * it doesn't really matter...
+	 */
+	if (__VZ_QUOTA_NOQUOTA(sb) == NULL) {
+		qnew = vzquota_alloc_fake();
+		if (qnew == NULL)
+			goto out_up;
+		__VZ_QUOTA_NOQUOTA(sb) = qnew;
+	}
+
+	if (!IS_VZ_QUOTA(sb)) {
+		sb->s_dquot.dq_op_orig = sb->dq_op;
+		if (sb->s_dquot.dq_op_orig->reserve_space)
+			sb->dq_op = &vz_quota_operations_rsv;
+		else
+			sb->dq_op = &vz_quota_operations;
+#ifdef CONFIG_VZ_QUOTA_UGID
+		sb->s_dquot.qcop_orig = sb->s_qcop;
+		sb->s_qcop = &vz_quotactl_operations;
+#else
+		sb->s_qcop = NULL;
+#endif
+		do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
+
+		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+		/* these 2 list heads are checked in sync_dquots() */
+		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+		sb->s_dquot.info[USRQUOTA].dqi_format =
+						&vz_quota_empty_v2_format;
+		sb->s_dquot.info[GRPQUOTA].dqi_format =
+						&vz_quota_empty_v2_format;
+
+		/*
+		 * To get quotaops.h to call us we need to mark the
+		 * superblock as having quota.  These flags mark the moment
+		 * when our dq_op starts to be called.
+		 *
+		 * The ordering of dq_op and s_dquot.flags assignment
+		 * needs to be enforced, but other CPUs do not do rmb()
+		 * between s_dquot.flags and dq_op accesses.
+		 */
+		wmb(); synchronize_sched();
+		sb->s_dquot.flags =
+			dquot_state_flag(DQUOT_USAGE_ENABLED |
+					DQUOT_LIMITS_ENABLED,
+					USRQUOTA) |
+			dquot_state_flag(DQUOT_USAGE_ENABLED |
+					DQUOT_LIMITS_ENABLED,
+					GRPQUOTA);
+	}
+	err = 0;
+
+out_up:
+	mutex_unlock(&sb->s_dquot.dqonoff_mutex);
+	return err;
+}
+
+/**
+ * vzquota_put_super - one quota tree less on this superblock
+ *
+ * Called under vz_quota_mutex.
+ */
+void vzquota_put_super(struct super_block *sb)
+{
+	/*
+	 * Even if this put is the last one,
+	 * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop
+	 * won't be called and the remaining qmblk references won't be put.
+	 */
+}
+
+#endif
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Helpers for inode -> qmblk link maintenance
+ *
+ * --------------------------------------------------------------------- */
+
+#define __VZ_QUOTA_EMPTY		((void *)0xbdbdbdbd)
+#define VZ_QUOTA_IS_NOQUOTA(qm, sb)	((qm)->dq_flags & VZDQ_NOQUOT)
+#define VZ_QUOTA_EMPTY_IOPS		(&vfs_empty_iops)
+extern struct inode_operations vfs_empty_iops;
+
+static int VZ_QUOTA_IS_ACTUAL(struct inode *inode)
+{
+	struct vz_quota_master *qmblk;
+
+	qmblk = INODE_QLNK(inode)->qmblk;
+	if (qmblk == VZ_QUOTA_BAD)
+		return 1;
+	if (qmblk == __VZ_QUOTA_EMPTY)
+		return 0;
+	if (qmblk->dq_flags & VZDQ_NOACT)
+		/* not actual (invalidated) qmblk */
+		return 0;
+	return 1;
+}
+
+static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk)
+{
+	return qlnk->qmblk == __VZ_QUOTA_EMPTY;
+}
+
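+/*
+ * Each qlnk keeps its last two origin codes as a tiny two-entry history:
+ * set_qlnk_origin() shifts origin[1] into origin[0].  The pair is printed
+ * as "orig {%u, %u}" in the VZDQ error messages below, which helps
+ * reconstruct how a link reached its current state.
+ */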
+static inline void set_qlnk_origin(struct vz_quota_ilink *qlnk,
+		unsigned char origin)
+{
+	qlnk->origin[0] = qlnk->origin[1];
+	qlnk->origin[1] = origin;
+}
+
+static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk)
+{
+	qlnk->qmblk = __VZ_QUOTA_EMPTY;
+	set_qlnk_origin(qlnk, VZ_QUOTAO_SETE);
+}
+
+void vzquota_qlnk_init(struct vz_quota_ilink *qlnk)
+{
+	memset(qlnk, 0, sizeof(*qlnk));
+	INIT_LIST_HEAD(&qlnk->list);
+	vzquota_qlnk_set_empty(qlnk);
+	set_qlnk_origin(qlnk, VZ_QUOTAO_INIT);
+}
+
+void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk)
+{
+	might_sleep();
+	if (vzquota_qlnk_is_empty(qlnk))
+		return;
+#if defined(CONFIG_VZ_QUOTA_UGID)
+	if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) {
+		struct vz_quota_master *qmblk;
+		struct vz_quota_ugid *quid, *qgid;
+		qmblk = qlnk->qmblk;
+		quid = qlnk->qugid[USRQUOTA];
+		qgid = qlnk->qugid[GRPQUOTA];
+		if (quid != NULL || qgid != NULL) {
+			mutex_lock(&qmblk->dq_mutex);
+			if (qgid != NULL)
+				vzquota_put_ugid(qmblk, qgid);
+			if (quid != NULL)
+				vzquota_put_ugid(qmblk, quid);
+			mutex_unlock(&qmblk->dq_mutex);
+		}
+	}
+#endif
+	if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qlnk->qmblk);
+	set_qlnk_origin(qlnk, VZ_QUOTAO_DESTR);
+}
+
+/**
+ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents
+ * @qlt: temporary
+ * @qli: inode's
+ *
+ * Locking is provided by the caller (depending on the context).
+ * After swap, @qli is inserted into the corresponding dq_ilink_list,
+ * @qlt list is reinitialized.
+ */
+static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt,
+		struct vz_quota_ilink *qli)
+{
+	struct vz_quota_master *qb;
+	struct vz_quota_ugid *qu;
+	int i;
+
+	qb = qlt->qmblk;
+	qlt->qmblk = qli->qmblk;
+	qli->qmblk = qb;
+	list_del_init(&qli->list);
+	if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD)
+		list_add(&qli->list, &qb->dq_ilink_list);
+	INIT_LIST_HEAD(&qlt->list);
+	set_qlnk_origin(qli, VZ_QUOTAO_SWAP);
+
+	for (i = 0; i < MAXQUOTAS; i++) {
+		qu = qlt->qugid[i];
+		qlt->qugid[i] = qli->qugid[i];
+		qli->qugid[i] = qu;
+	}
+}
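+
+/*
+ * Typical pattern (see vzquota_inode_drop() below): swap the inode's qlnk
+ * with an empty on-stack one while holding the locks, then destroy the
+ * on-stack copy (which may sleep) after the locks are dropped.
+ */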
+
+/**
+ * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks
+ *
+ * Called under dcache_lock and inode_qmblk locks.
+ * Returns 1 if locks were dropped inside, 0 if atomic.
+ */
+static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk,
+		struct inode *inode)
+{
+	if (vzquota_qlnk_is_empty(qlnk))
+		return 0;
+	if (qlnk->qmblk == VZ_QUOTA_BAD) {
+		vzquota_qlnk_set_empty(qlnk);
+		set_qlnk_origin(qlnk, VZ_QUOTAO_RE_LOCK);
+		return 0;
+	}
+	spin_unlock(&dcache_lock);
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(qlnk);
+	vzquota_qlnk_init(qlnk);
+	inode_qmblk_lock(inode->i_sb);
+	spin_lock(&dcache_lock);
+	return 1;
+}
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+/**
+ * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content
+ *
+ * Similar to vzquota_qlnk_reinit_locked, called under different locks.
+ */
+static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk,
+		struct inode *inode,
+		struct vz_quota_master *qmblk)
+{
+	if (vzquota_qlnk_is_empty(qlnk))
+		return 0;
+	/* may be optimized if qlnk->qugid all NULLs */
+	qmblk_data_write_unlock(qmblk);
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(qlnk);
+	vzquota_qlnk_init(qlnk);
+	inode_qmblk_lock(inode->i_sb);
+	qmblk_data_write_lock(qmblk);
+	return 1;
+}
+#endif
+
+/**
+ * vzquota_qlnk_fill - fill vz_quota_ilink content
+ * @qlnk: vz_quota_ilink to fill
+ * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid)
+ * @qmblk: qmblk to which this @qlnk will belong
+ *
+ * Called under dcache_lock and inode_qmblk locks.
+ * Returns 1 if locks were dropped inside, 0 if atomic.
+ * @qlnk is expected to be empty.
+ */
+static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk,
+		struct inode *inode,
+		struct vz_quota_master *qmblk)
+{
+	if (qmblk != VZ_QUOTA_BAD)
+		qmblk_get(qmblk);
+	qlnk->qmblk = qmblk;
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+	if (qmblk != VZ_QUOTA_BAD &&
+	    !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
+	    (qmblk->dq_flags & VZDQUG_ON)) {
+		struct vz_quota_ugid *quid, *qgid;
+
+		spin_unlock(&dcache_lock);
+		inode_qmblk_unlock(inode->i_sb);
+
+		mutex_lock(&qmblk->dq_mutex);
+		quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0);
+		qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0);
+		mutex_unlock(&qmblk->dq_mutex);
+
+		inode_qmblk_lock(inode->i_sb);
+		spin_lock(&dcache_lock);
+		qlnk->qugid[USRQUOTA] = quid;
+		qlnk->qugid[GRPQUOTA] = qgid;
+		return 1;
+	}
+#endif
+
+	return 0;
+}
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+/**
+ * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid
+ *
+ * This function is a helper for vzquota_transfer and differs from
+ * vzquota_qlnk_fill only in its locking.
+ */
+static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk,
+		struct inode *inode,
+		struct iattr *iattr,
+		int mask,
+		struct vz_quota_master *qmblk)
+{
+	qmblk_get(qmblk);
+	qlnk->qmblk = qmblk;
+
+	if (mask) {
+		struct vz_quota_ugid *quid, *qgid;
+
+		quid = qgid = NULL; /* to make gcc happy */
+		if (!(mask & (1 << USRQUOTA)))
+			quid = vzquota_get_ugid(INODE_QLNK(inode)->
+							qugid[USRQUOTA]);
+		if (!(mask & (1 << GRPQUOTA)))
+			qgid = vzquota_get_ugid(INODE_QLNK(inode)->
+							qugid[GRPQUOTA]);
+
+		qmblk_data_write_unlock(qmblk);
+		inode_qmblk_unlock(inode->i_sb);
+
+		mutex_lock(&qmblk->dq_mutex);
+		if (mask & (1 << USRQUOTA))
+			quid = __vzquota_find_ugid(qmblk, iattr->ia_uid,
+					USRQUOTA, 0);
+		if (mask & (1 << GRPQUOTA))
+			qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid,
+					GRPQUOTA, 0);
+		mutex_unlock(&qmblk->dq_mutex);
+
+		inode_qmblk_lock(inode->i_sb);
+		qmblk_data_write_lock(qmblk);
+		qlnk->qugid[USRQUOTA] = quid;
+		qlnk->qugid[GRPQUOTA] = qgid;
+		return 1;
+	}
+
+	return 0;
+}
+#endif
+
+/**
+ * __vzquota_inode_init - make sure inode's qlnk is initialized
+ *
+ * May be called even if qlnk is already initialized; it detects this
+ * situation itself (i_dquot[USRQUOTA] doubles as the "initialized" marker
+ * and is set to a non-NULL sentinel).
+ * Called under inode_qmblk_lock.
+ */
+static void __vzquota_inode_init(struct inode *inode, unsigned char origin)
+{
+	if (inode->i_dquot[USRQUOTA] == NULL) {
+		vzquota_qlnk_init(INODE_QLNK(inode));
+		inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NULL;
+	}
+	set_qlnk_origin(INODE_QLNK(inode), origin);
+}
+
+/**
+ * vzquota_inode_drop - destroy VZ quota information in the inode
+ *
+ * Inode must not be externally accessible or dirty.
+ */
+static void vzquota_inode_drop(struct inode *inode)
+{
+	struct vz_quota_ilink qlnk;
+
+	vzquota_qlnk_init(&qlnk);
+	inode_qmblk_lock(inode->i_sb);
+	vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode));
+	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DRCAL);
+	inode->i_dquot[USRQUOTA] = NULL;
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(&qlnk);
+}
+
+/**
+ * vzquota_inode_qmblk_set - initialize inode's qlnk
+ * @inode: inode to be initialized
+ * @qmblk: quota master block to which this inode should belong (may be BAD)
+ * @qlnk: placeholder to store data to resolve locking issues
+ *
+ * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise.
+ * Called under dcache_lock and inode_qmblk locks.
+ * @qlnk will be destroyed in the caller chain.
+ *
+ * It is not mandatory to restart parent checks, since quota on/off currently
+ * shrinks the dentry tree and checks that there are no outside references.
+ * But if at some point that shrink is removed, restarts will be required.
+ * Additionally, the restarts prevent inconsistencies if the dentry tree
+ * changes (the inode is moved).  This is not a big deal, but anyway...
+ */
+static int vzquota_inode_qmblk_set(struct inode *inode,
+		struct vz_quota_master *qmblk,
+		struct vz_quota_ilink *qlnk)
+{
+	if (qmblk == NULL) {
+		printk(KERN_ERR "VZDQ: NULL in set, orig {%u, %u}, "
+				"dev %s, inode %lu, fs %s\n",
+				INODE_QLNK(inode)->origin[0],
+				INODE_QLNK(inode)->origin[1],
+				inode->i_sb->s_id, inode->i_ino,
+				inode->i_sb->s_type->name);
+		printk(KERN_ERR "current %d (%s), VE %d\n",
+				current->pid, current->comm,
+				VEID(get_exec_env()));
+		dump_stack();
+		qmblk = VZ_QUOTA_BAD;
+	}
+	while (1) {
+		if (vzquota_qlnk_is_empty(qlnk) &&
+		    vzquota_qlnk_fill(qlnk, inode, qmblk))
+			return 1;
+		if (qlnk->qmblk == qmblk)
+			break;
+		if (vzquota_qlnk_reinit_locked(qlnk, inode))
+			return 1;
+	}
+	vzquota_qlnk_swap(qlnk, INODE_QLNK(inode));
+	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_QSET);
+	return 0;
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * vzquota_inode_qmblk (inode -> qmblk lookup) parts
+ *
+ * --------------------------------------------------------------------- */
+
+static char *vzquota_check_parent(struct inode *parent, struct inode *inode)
+{
+	char *msg;
+
+	msg = "uninitialized parent";
+	if (vzquota_qlnk_is_empty(INODE_QLNK(parent)))
+		goto out;
+	msg = "parent not in tree";
+	if (list_empty(&parent->i_dentry))
+		goto out;
+	msg = "parent has 0 refcount";
+	if (!atomic_read(&parent->i_count))
+		goto out;
+	msg = "parent has different sb";
+	if (parent->i_sb != inode->i_sb)
+		goto out;
+
+	msg = NULL;
+out:
+	return msg;
+}
+
+static int vzquota_dparents_check_attach(struct inode *inode)
+{
+	if (!list_empty(&inode->i_dentry))
+		return 0;
+	printk(KERN_ERR "VZDQ: no parent for "
+			"dev %s, inode %lu, fs %s\n",
+			inode->i_sb->s_id,
+			inode->i_ino,
+			inode->i_sb->s_type->name);
+	return -1;
+}
+
+static struct inode *vzquota_dparents_check_actual(struct inode *inode)
+{
+	struct dentry *de;
+
+	list_for_each_entry(de, &inode->i_dentry, d_alias) {
+		if (de->d_parent == de) /* detached dentry, perhaps */
+			continue;
+		/* first access to parent, make sure its qlnk initialized */
+		__vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT);
+		if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode))
+			return de->d_parent->d_inode;
+	}
+	return NULL;
+}
+
+static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode)
+{
+	struct dentry *de;
+	struct vz_quota_master *qmblk;
+	char *msg = "";
+
+	qmblk = NULL;
+	list_for_each_entry(de, &inode->i_dentry, d_alias) {
+		if (de->d_parent == de) /* detached dentry, perhaps */
+			continue;
+		if (qmblk == NULL) {
+			qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk;
+			continue;
+		}
+		if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) {
+			printk(KERN_WARNING "VZDQ: multiple quotas for "
+					"dev %s, inode %lu, fs %s\n",
+					inode->i_sb->s_id,
+					inode->i_ino,
+					inode->i_sb->s_type->name);
+			qmblk = VZ_QUOTA_BAD;
+			break;
+		}
+	}
+
+	if (qmblk != NULL)
+		goto out;
+
+	if (vzquota_cur_qmblk_check()) {
+		struct inode *parent;
+
+		parent = vzquota_cur_qmblk_fetch();
+
+		msg = vzquota_check_parent(parent, inode);
+		if (msg != NULL)
+			goto fail;
+
+		msg = "parent not actual";
+		if (!VZ_QUOTA_IS_ACTUAL(parent))
+			goto fail;
+
+		qmblk = INODE_QLNK(parent)->qmblk;
+		goto out;
+	}
+fail:
+	printk(KERN_WARNING "VZDQ: not attached to tree, "
+			"dev %s, inode %lu, fs %s. %s\n",
+			inode->i_sb->s_id,
+			inode->i_ino,
+			inode->i_sb->s_type->name, msg);
+	qmblk = VZ_QUOTA_BAD;
+out:
+	return qmblk;
+}
+
+/* NFS root is disconnected dentry. */
+
+static int is_nfs_root(struct inode * inode)
+{
+	struct dentry *de;
+
+	if (inode->i_sb->s_magic != 0x6969)
+		return 0;
+
+	if (list_empty(&inode->i_dentry))
+		return 0;
+
+	list_for_each_entry(de, &inode->i_dentry, d_alias) {
+		if (de->d_parent != de)
+			return 0;
+		if (d_unhashed(de))
+			return 0;
+		if (!(de->d_flags & DCACHE_DISCONNECTED))
+			return 0;
+	}
+	return 1;
+}
+
+static void vzquota_dbranch_actualize(struct inode *inode,
+		struct inode *refinode)
+{
+	struct inode *pinode;
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ilink qlnk;
+
+	vzquota_qlnk_init(&qlnk);
+
+start:
+	if (inode == inode->i_sb->s_root->d_inode || is_nfs_root(inode)) {
+		/* filesystem root */
+		atomic_inc(&inode->i_count);
+		do {
+			qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
+		} while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk));
+		goto out;
+	}
+
+	if (!vzquota_dparents_check_attach(inode)) {
+		pinode = vzquota_dparents_check_actual(inode);
+		if (pinode != NULL) {
+			inode = pinode;
+			goto start;
+		}
+	}
+
+	atomic_inc(&inode->i_count);
+	while (1) {
+		if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */
+			break;
+		/*
+		 * Need to check parents again if we have slept inside
+		 * vzquota_inode_qmblk_set() in the loop.
+		 * If the state of parents is different, just return and repeat
+		 * the actualizing process again from the inode passed to
+		 * vzquota_inode_qmblk_recalc().
+		 */
+		if (!vzquota_dparents_check_attach(inode)) {
+			if (vzquota_dparents_check_actual(inode) != NULL)
+				break;
+			qmblk = vzquota_dparents_check_same(inode);
+		} else
+			qmblk = VZ_QUOTA_BAD;
+		if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)) {
+			/* success */
+			set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ACT);
+			break;
+		}
+	}
+
+out:
+	spin_unlock(&dcache_lock);
+	inode_qmblk_unlock(refinode->i_sb);
+	vzquota_qlnk_destroy(&qlnk);
+	iput(inode);
+	inode_qmblk_lock(refinode->i_sb);
+	spin_lock(&dcache_lock);
+}
+
+static void vzquota_dtree_qmblk_recalc(struct inode *inode,
+		struct vz_quota_ilink *qlnk)
+{
+	struct inode *pinode;
+	struct vz_quota_master *qmblk;
+
+	if (inode == inode->i_sb->s_root->d_inode || is_nfs_root(inode)) {
+		/* filesystem root */
+		do {
+			qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
+		} while (vzquota_inode_qmblk_set(inode, qmblk, qlnk));
+		return;
+	}
+
+start:
+	if (VZ_QUOTA_IS_ACTUAL(inode))
+		return;
+	/*
+	 * Here qmblk is (re-)initialized for all ancestors.
+	 * This is not a very efficient procedure, but it guarantees that
+	 * the quota tree is consistent (that is, the inode doesn't have two
+	 * ancestors with different qmblk).
+	 */
+	if (!vzquota_dparents_check_attach(inode)) {
+		pinode = vzquota_dparents_check_actual(inode);
+		if (pinode != NULL) {
+			vzquota_dbranch_actualize(pinode, inode);
+			goto start;
+		}
+		qmblk = vzquota_dparents_check_same(inode);
+	} else
+		qmblk = VZ_QUOTA_BAD;
+
+	if (vzquota_inode_qmblk_set(inode, qmblk, qlnk))
+		goto start;
+	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DTREE);
+}
+
+static void vzquota_det_qmblk_recalc(struct inode *inode,
+		struct vz_quota_ilink *qlnk)
+{
+	struct inode *parent;
+	struct vz_quota_master *qmblk;
+	char *msg;
+	int cnt;
+	time_t timeout;
+
+	cnt = 0;
+	parent = NULL;
+start:
+	/*
+	 * The qmblk of a detached inode shouldn't be considered stale
+	 * ("not actual").  Such inodes are not in any dentry tree, so
+	 * quota on/off shouldn't affect them.
+	 */
+	if (!vzquota_qlnk_is_empty(INODE_QLNK(inode)))
+		return;
+
+	qmblk = vzquota_cur_qmblk_orphan_cleanup();
+	if (qmblk)
+		goto set;
+
+	timeout = 3;
+	qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
+	/*
+	 * Scenario:
+	 *	open
+	 *	unlink
+	 *	quotaon
+	 *	generic_delete_inode
+	 *
+	 * This is the first time vzquota sees the inode.  The inode is
+	 * outside vzquota's area of interest, otherwise quotaon would have
+	 * gotten -EBUSY due to shrink_dcache_parent().
+	 * The inode is almost completely destroyed, so don't intervene.
+	 *
+	 * dev@:
+	 * However, there is a small race here...
+	 * dput() first removes the dentry from all the lists, so
+	 * shrink_dcache_parent() can succeed while dentry_iput() is not
+	 * done yet.
+	 */
+	if (inode->i_state & I_FREEING)
+		goto set;
+
+	msg = "detached inode not in creation";
+	if (inode->i_op != VZ_QUOTA_EMPTY_IOPS)
+		goto fail;
+	qmblk = VZ_QUOTA_BAD;
+	msg = "unexpected creation context";
+	if (!vzquota_cur_qmblk_check())
+		goto fail;
+	timeout = 0;
+	parent = vzquota_cur_qmblk_fetch();
+	msg = vzquota_check_parent(parent, inode);
+	if (msg != NULL)
+		goto fail;
+
+	if (!VZ_QUOTA_IS_ACTUAL(parent)) {
+		vzquota_dbranch_actualize(parent, inode);
+		goto start;
+	}
+
+	qmblk = INODE_QLNK(parent)->qmblk;
+set:
+	if (vzquota_inode_qmblk_set(inode, qmblk, qlnk))
+		goto start;
+	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DET);
+	return;
+
+fail:
+	{
+		struct timeval tv, tvo;
+		do_gettimeofday(&tv);
+		memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo));
+		tv.tv_sec -= tvo.tv_sec;
+		if (tv.tv_usec < tvo.tv_usec) {
+			tv.tv_sec--;
+			tv.tv_usec += USEC_PER_SEC - tvo.tv_usec;
+		} else
+			tv.tv_usec -= tvo.tv_usec;
+		if (tv.tv_sec < timeout)
+			goto set;
+		printk(KERN_ERR "VZDQ: %s, orig {%u, %u},"
+			" dev %s, inode %lu, fs %s\n",
+			msg,
+			INODE_QLNK(inode)->origin[0],
+			INODE_QLNK(inode)->origin[1],
+			inode->i_sb->s_id, inode->i_ino,
+			inode->i_sb->s_type->name);
+		printk(KERN_ERR "i_count %u, ", atomic_read(&inode->i_count));
+		printk(KERN_ERR "i_mode %o, ", inode->i_mode);
+		printk(KERN_ERR "i_state %lx, ", inode->i_state);
+		printk(KERN_ERR "i_flags %x\n", inode->i_flags);
+		printk(KERN_ERR "i_op %p, vfs_empty_iops %p, "
+				"i_fop %p, i_mapping %p\n",
+				inode->i_op, &vfs_empty_iops,
+				inode->i_fop, inode->i_mapping);
+		if (!cnt++) {
+			printk(KERN_ERR "current %d (%s), VE %d,"
+				" time %ld.%06ld\n",
+				current->pid, current->comm,
+				VEID(get_exec_env()),
+				tv.tv_sec, (long)tv.tv_usec);
+			dump_stack();
+		}
+		if (parent != NULL)
+			printk(KERN_ERR "VZDQ: parent of %lu is %lu\n",
+				inode->i_ino, parent->i_ino);
+	}
+	goto set;
+}
+
+static void vzquota_inode_qmblk_recalc(struct inode *inode,
+		struct vz_quota_ilink *qlnk)
+{
+	spin_lock(&dcache_lock);
+	if (!list_empty(&inode->i_dentry))
+		vzquota_dtree_qmblk_recalc(inode, qlnk);
+	else
+		vzquota_det_qmblk_recalc(inode, qlnk);
+	spin_unlock(&dcache_lock);
+}
+
+/**
+ * vzquota_inode_qmblk - obtain inode's qmblk
+ *
+ * Returns the qmblk with a reference taken (the caller must drop it with
+ * qmblk_put()), %NULL if the inode is not under VZ quota, or
+ * %VZ_QUOTA_BAD.
+ *
+ * FIXME: This function should be removed when the vzquota_find_qmblk /
+ * get_quota_root / vzquota_dstat code is cleaned up.
+ */
+struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ilink qlnk;
+
+	might_sleep();
+
+	if (!IS_VZ_QUOTA(inode->i_sb))
+		return NULL;
+#if defined(VZ_QUOTA_UNLOAD)
+#error Make sure qmblk does not disappear
+#endif
+
+	vzquota_qlnk_init(&qlnk);
+	inode_qmblk_lock(inode->i_sb);
+	__vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
+
+	if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
+	    !VZ_QUOTA_IS_ACTUAL(inode))
+		vzquota_inode_qmblk_recalc(inode, &qlnk);
+
+	qmblk = INODE_QLNK(inode)->qmblk;
+	if (qmblk != VZ_QUOTA_BAD) {
+		if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb))
+			qmblk_get(qmblk);
+		else
+			qmblk = NULL;
+	}
+
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(&qlnk);
+	return qmblk;
+}
+
+/**
+ * vzquota_find_qmblk - helper to emulate quota on virtual filesystems
+ *
+ * This function finds a quota master block corresponding to the root of
+ * a virtual filesystem.
+ * Returns a quota master block with reference taken, or %NULL if not under
+ * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation
+ * operations will fail).
+ *
+ * Note: this function uses vzquota_inode_qmblk().
+ * The latter is a rather confusing function: it returns qmblk that used to be
+ * on the inode some time ago (without guarantee that it still has any
+ * relations to the inode).  So, vzquota_find_qmblk() leaves it up to the
+ * caller to think whether the inode could have changed its qmblk and what to
+ * do in that case.
+ * Currently, the callers appear to not care :(
+ */
+struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb)
+{
+	struct inode *qrinode;
+	struct vz_quota_master *qmblk;
+
+	qmblk = NULL;
+	qrinode = NULL;
+	if (sb->s_op->get_quota_root != NULL)
+		qrinode = sb->s_op->get_quota_root(sb);
+	if (qrinode != NULL)
+		qmblk = vzquota_inode_qmblk(qrinode);
+	return qmblk;
+}
+
+/* ----------------------------------------------------------------------
+ *
+ * Calls from quota operations
+ *
+ * --------------------------------------------------------------------- */
+
+/**
+ * vzquota_inode_init_call - call from DQUOT_INIT
+ */
+void vzquota_inode_init_call(struct inode *inode)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+
+	/* initializes inode's quota inside */
+	qmblk = vzquota_inode_data(inode, &data);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		vzquota_data_unlock(inode, &data);
+
+	/*
+	 * The check is needed for repeated new_inode() calls from a single
+	 * ext3 call like create or mkdir in case of -ENOSPC.
+	 */
+	spin_lock(&dcache_lock);
+	if (!list_empty(&inode->i_dentry))
+		vzquota_cur_qmblk_set(inode);
+	spin_unlock(&dcache_lock);
+}
+
+void vzquota_inode_swap_call(struct inode *inode, struct inode *tmpl)
+{
+	struct vz_quota_master *qmblk;
+
+	__vzquota_inode_init(inode, VZ_QUOTAO_INIT);
+
+	might_sleep();
+
+	inode_qmblk_lock(tmpl->i_sb);
+	if (unlikely(tmpl->i_flags & S_NOQUOTA)) {
+		inode_qmblk_unlock(tmpl->i_sb);
+		return;
+	}
+	__vzquota_inode_init(tmpl, VZ_QUOTAO_INICAL);
+
+	qmblk = INODE_QLNK(tmpl)->qmblk;
+	if (qmblk != VZ_QUOTA_BAD) {
+		void * uq;
+		list_del_init(&INODE_QLNK(tmpl)->list);
+		vzquota_qlnk_swap(INODE_QLNK(tmpl), INODE_QLNK(inode));
+		uq = inode->i_dquot[USRQUOTA];
+		inode->i_dquot[USRQUOTA] = tmpl->i_dquot[USRQUOTA];
+		tmpl->i_dquot[USRQUOTA] = uq;
+		tmpl->i_flags |= S_NOQUOTA;
+		inode_qmblk_unlock(inode->i_sb);
+
+		vzquota_inode_drop(tmpl);
+	} else {
+		inode_qmblk_unlock(tmpl->i_sb);
+	}
+}
+
+
+/**
+ * vzquota_inode_drop_call - call from DQUOT_DROP
+ */
+void vzquota_inode_drop_call(struct inode *inode)
+{
+	vzquota_inode_drop(inode);
+}
+
+/**
+ * vzquota_inode_data - initialize (if necessary) and lock inode quota ptrs
+ * @inode: the inode
+ * @data: storage space
+ *
+ * Returns %NULL, %VZ_QUOTA_BAD, or an actualized qmblk.
+ * On return, if qmblk is neither %NULL nor %VZ_QUOTA_BAD:
+ *   the qmblk in the inode's qlnk is the same as the one returned,
+ *   the ugid pointers inside the inode's qlnk are valid, and
+ *   some locks are taken (and should be released by vzquota_data_unlock).
+ * If qmblk is %NULL or %VZ_QUOTA_BAD, no locks are taken.
+ */
+struct vz_quota_master *vzquota_inode_data(struct inode *inode,
+		struct vz_quota_datast *data)
+{
+	struct vz_quota_master *qmblk;
+
+	might_sleep();
+
+	vzquota_qlnk_init(&data->qlnk);
+	inode_qmblk_lock(inode->i_sb);
+	if (unlikely(inode->i_flags & S_NOQUOTA)) {
+		inode_qmblk_unlock(inode->i_sb);
+		return NULL;
+	}
+	__vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
+
+	if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
+	    !VZ_QUOTA_IS_ACTUAL(inode))
+		vzquota_inode_qmblk_recalc(inode, &data->qlnk);
+
+	qmblk = INODE_QLNK(inode)->qmblk;
+	if (qmblk != VZ_QUOTA_BAD) {
+		if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) {
+			/*
+			 * Note that in the current implementation,
+			 * inode_qmblk_lock can theoretically be dropped here.
+			 * This place is serialized with quota_off because
+			 * quota_off fails when there are extra dentry
+			 * references and syncs inodes before removing quota
+			 * information from them.
+			 * However, quota usage information should stop being
+			 * updated immediately after vzquota_off.
+			 */
+			qmblk_data_write_lock(qmblk);
+		} else {
+			inode_qmblk_unlock(inode->i_sb);
+			vzquota_qlnk_destroy(&data->qlnk);
+			qmblk = NULL;
+		}
+	} else {
+		inode_qmblk_unlock(inode->i_sb);
+	}
+	return qmblk;
+}
+
+void vzquota_data_unlock(struct inode *inode,
+		struct vz_quota_datast *data)
+{
+	qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk);
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(&data->qlnk);
+}
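+
+/*
+ * Typical call pattern (a sketch, mirroring vzquota_inode_init_call()
+ * above):
+ *
+ *	struct vz_quota_datast data;
+ *	struct vz_quota_master *qmblk;
+ *
+ *	qmblk = vzquota_inode_data(inode, &data);
+ *	if (qmblk == VZ_QUOTA_BAD)
+ *		return NO_QUOTA;
+ *	if (qmblk != NULL) {
+ *		... update qmblk->dq_stat under the data lock ...
+ *		vzquota_data_unlock(inode, &data);
+ *	}
+ */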
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+static void vzquota_handle_dirty_ugids(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid **dirty)
+{
+	int i;
+
+	if (qmblk->qfile != NULL)
+		__vzquota_mark_dirty_ugids(qmblk, dirty);
+
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (dirty[i] == NULL)
+			continue;
+
+		vzquota_put_ugid(qmblk, dirty[i]);
+		vzquota_put_ugid(qmblk, dirty[i + MAXQUOTAS]);
+	}
+}
+
+/**
+ * vzquota_inode_transfer_call - call from vzquota_transfer
+ */
+int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+	struct vz_quota_ilink qlnew;
+	struct vz_quota_ugid *dirty_ugids[MAXQUOTAS * 2];
+	int mask;
+	int ret;
+
+	might_sleep();
+	vzquota_qlnk_init(&qlnew);
+	memset(dirty_ugids, 0, sizeof(dirty_ugids));
+start:
+	qmblk = vzquota_inode_data(inode, &data);
+	ret = NO_QUOTA;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out_destr;
+	ret = QUOTA_OK;
+	if (qmblk == NULL)
+		goto out_destr;
+	qmblk_get(qmblk);
+
+	ret = QUOTA_OK;
+	if (!(qmblk->dq_flags & VZDQUG_ON))
+		/* no ugid quotas */
+		goto out_unlock;
+
+	mask = 0;
+	if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid)
+		mask |= 1 << USRQUOTA;
+	if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid)
+		mask |= 1 << GRPQUOTA;
+	while (1) {
+		if (vzquota_qlnk_is_empty(&qlnew) &&
+		    vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk))
+			break;
+		if (qlnew.qmblk == INODE_QLNK(inode)->qmblk &&
+		    qlnew.qmblk == qmblk)
+			goto finish;
+		if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk))
+			break;
+	}
+
+	/* prepare for restart */
+	vzquota_data_unlock(inode, &data);
+	qmblk_put(qmblk);
+	goto start;
+
+finish:
+	/* all references obtained successfully */
+	ret = vzquota_transfer_usage(inode, mask, &qlnew, dirty_ugids);
+	if (!ret) {
+		vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode));
+		set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_TRANS);
+	}
+out_unlock:
+	vzquota_data_unlock(inode, &data);
+	vzquota_handle_dirty_ugids(qmblk, dirty_ugids);
+	qmblk_put(qmblk);
+out_destr:
+	vzquota_qlnk_destroy(&qlnew);
+	return ret;
+}
+#endif
+
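+/**
+ * vzquota_rename_check - check whether a rename crosses a quota boundary
+ *
+ * Returns 0 if the rename is allowed: the inode and the new directory are
+ * under the same qmblk, or the quota root itself is being moved between
+ * directories that are not under quota.  Returns -1 otherwise (including
+ * for cross-superblock renames).
+ */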
+int vzquota_rename_check(struct inode *inode,
+		struct inode *old_dir, struct inode *new_dir)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ilink qlnk1, qlnk2, qlnk3;
+	int c, ret;
+
+	if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb)
+		return -1;
+
+	might_sleep();
+
+	vzquota_qlnk_init(&qlnk1);
+	vzquota_qlnk_init(&qlnk2);
+	vzquota_qlnk_init(&qlnk3);
+	inode_qmblk_lock(inode->i_sb);
+	__vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
+	__vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL);
+	__vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL);
+
+	do {
+		c = 0;
+		if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
+		    !VZ_QUOTA_IS_ACTUAL(inode)) {
+			vzquota_inode_qmblk_recalc(inode, &qlnk1);
+			c++;
+		}
+		if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) ||
+		    !VZ_QUOTA_IS_ACTUAL(new_dir)) {
+			vzquota_inode_qmblk_recalc(new_dir, &qlnk2);
+			c++;
+		}
+	} while (c);
+
+	ret = 0;
+	qmblk = INODE_QLNK(inode)->qmblk;
+	if (qmblk != INODE_QLNK(new_dir)->qmblk) {
+		ret = -1;
+		while (vzquota_qlnk_is_empty(INODE_QLNK(old_dir)) ||
+		       !VZ_QUOTA_IS_ACTUAL(old_dir))
+			vzquota_inode_qmblk_recalc(old_dir, &qlnk3);
+		if (qmblk != VZ_QUOTA_BAD &&
+		    !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
+		    qmblk->dq_root_path.dentry->d_inode == inode &&
+		    VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk,
+					inode->i_sb) &&
+		    VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk,
+					inode->i_sb))
+			/* quota root rename is allowed */
+			ret = 0;
+	}
+
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(&qlnk3);
+	vzquota_qlnk_destroy(&qlnk2);
+	vzquota_qlnk_destroy(&qlnk1);
+	return ret;
+}
+
+/*
+ * Scan the parent's subdirectories and collect the names/paths of busy
+ * dentries.
+ * @parent: parent dentry
+ * @buf: buffer to store the '\n'-separated paths
+ */
+static void vzdquota_read_busy_dentries(struct path *parent,
+		char *buf, int buflen)
+{
+	struct dentry *this_parent = parent->dentry;
+	struct list_head *next;
+	char *res, *end, *start;
+	struct path root, path;
+	int len;
+
+	if (!buf || buflen <= 0)
+		return;
+
+	path.mnt = parent->mnt;
+	/* From d_path() ... */
+	get_fs_root(current->fs, &root);
+
+	spin_lock(&dcache_lock);
+
+	end = buf + buflen;
+	start = buf;
+repeat:
+	next = this_parent->d_subdirs.next;
+resume:
+	while (next != &this_parent->d_subdirs) {
+		struct list_head *tmp = next;
+		struct dentry *dentry;
+		int subdirs;
+
+		dentry = list_entry(tmp, struct dentry, d_u.d_child);
+		next = tmp->next;
+		subdirs = !list_empty(&dentry->d_subdirs);
+
+		if (atomic_read(&dentry->d_count) && !subdirs) {
+			if (!buflen)
+				goto out;
+			/*
+			 * Note: __d_path will store filename at the
+			 * end of buf.
+			 */
+			path.dentry = dentry;
+			res = __d_path(&path, &root, buf, buflen);
+			/* Exit if name is too long */
+			if (IS_ERR(res))
+				goto out;
+
+			/*
+			 * Move the string obtained by __d_path to just
+			 * after the last dentry path already stored in buf.
+			 */
+			len = end - res;
+			BUG_ON(len <= 0);
+
+			memmove(buf, res, len);
+
+			/* Trick: replace \0 by \n */
+			if (buf != start)
+				*(char *)(buf - 1) = '\n';
+
+			buf += len;
+			buflen -= len;
+		}
+
+		/*
+		 * Descend a level if the d_subdirs list is non-empty.
+		 */
+		if (subdirs) {
+			this_parent = dentry;
+			goto repeat;
+		}
+	}
+	/*
+	 * All done at this level ... ascend and resume the search.
+	 */
+	if (this_parent != parent->dentry) {
+		next = this_parent->d_u.d_child.next;
+		this_parent = this_parent->d_parent;
+		goto resume;
+	}
+out:
+	/* From d_path() ... */
+	spin_unlock(&dcache_lock);
+	path_put(&root);
+}
+
+/* ----------------------------------------------------------------------
+ *
+ * qmblk-related parts of on/off operations
+ *
+ * --------------------------------------------------------------------- */
+
+/**
+ * vzquota_check_dtree - check dentry tree if quota on/off is allowed
+ *
+ * This function doesn't allow quota to be turned on/off if some dentries in
+ * the tree have external references.
+ * In addition to technical reasons, it enforces user-space correctness:
+ * current usage (taken from or reported to the user space) can be meaningful
+ * and accurate only if the tree is not being modified.
+ * Side effect: additional vfsmount structures referencing the tree (bind
+ * mounts of tree nodes to some other places) are not allowed at on/off time.
+ *
+ * In case the vzquota_off ioctl fails, the paths of the busy dentries are
+ * stored into @buf (if one is passed).
+ */
+int vzquota_check_dtree(struct vz_quota_master *qmblk, int off,
+						char *buf, int buflen)
+{
+	struct dentry *dentry;
+	int err, count;
+
+	err = -EBUSY;
+	dentry = qmblk->dq_root_path.dentry;
+
+	if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root)
+		goto unhashed;
+
+	/* attempt to shrink */
+	if (!list_empty(&dentry->d_subdirs)) {
+		spin_unlock(&dcache_lock);
+		inode_qmblk_unlock(dentry->d_sb);
+		shrink_dcache_parent(dentry);
+		inode_qmblk_lock(dentry->d_sb);
+		spin_lock(&dcache_lock);
+		if (!list_empty(&dentry->d_subdirs)) {
+			spin_unlock(&dcache_lock);
+			vzdquota_read_busy_dentries(&qmblk->dq_root_path,
+								buf, buflen);
+			spin_lock(&dcache_lock);
+			goto out;
+		}
+
+		count = 1;
+		if (dentry == dentry->d_sb->s_root)
+			count += 2;	/* sb and mnt refs */
+		if (atomic_read(&dentry->d_count) < count) {
+			printk(KERN_ERR "%s: too small count %d vs %d.\n",
+					__FUNCTION__,
+					atomic_read(&dentry->d_count), count);
+			goto out;
+		}
+		if (atomic_read(&dentry->d_count) > count)
+			goto out;
+	}
+
+	err = 0;
+out:
+	return err;
+
+unhashed:
+	/*
+	 * Quota root is removed.
+	 * Allow to turn quota off, but not on.
+	 */
+	if (off)
+		err = 0;
+	goto out;
+}
+
+int vzquota_on_qmblk(struct super_block *sb, struct inode *inode,
+		struct vz_quota_master *qmblk, char __user *ubuf)
+{
+	struct vz_quota_ilink qlnk;
+	struct vz_quota_master *qold, *qnew;
+	int err;
+	char *buf;
+
+	buf = (ubuf != NULL) ? (char *)__get_free_page(GFP_KERNEL) : NULL;
+
+	might_sleep();
+
+	qold = NULL;
+	qnew = vzquota_alloc_fake();
+	if (qnew == NULL) {
+		free_page((unsigned long)buf);
+		return -ENOMEM;
+	}
+
+	vzquota_qlnk_init(&qlnk);
+	inode_qmblk_lock(sb);
+	__vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
+
+	spin_lock(&dcache_lock);
+	while (1) {
+		err = vzquota_check_dtree(qmblk, 0, buf, PAGE_SIZE);
+		if (err)
+			break;
+		if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk))
+			break;
+	}
+	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ON);
+	spin_unlock(&dcache_lock);
+
+	if (!err) {
+		qold = __VZ_QUOTA_NOQUOTA(sb);
+		qold->dq_flags |= VZDQ_NOACT;
+		__VZ_QUOTA_NOQUOTA(sb) = qnew;
+	} else
+		qold = qnew;
+
+	inode_qmblk_unlock(sb);
+	vzquota_qlnk_destroy(&qlnk);
+	if (qold != NULL)
+		qmblk_put(qold);
+
+	if (buf) {
+		/* the busy-path listing is advisory: ignore copy failures */
+		if (copy_to_user(ubuf, buf, PAGE_SIZE))
+			;
+		free_page((unsigned long)buf);
+	}
+	return err;
+}
+
+int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk,
+						char __user *ubuf, int force)
+{
+	int ret;
+	char *buf;
+
+	buf = (ubuf != NULL) ? (char *)__get_free_page(GFP_KERNEL) : NULL;
+
+	ret = 0;
+	inode_qmblk_lock(sb);
+
+	spin_lock(&dcache_lock);
+	if (vzquota_check_dtree(qmblk, 1, buf, PAGE_SIZE) && !force)
+		ret = -EBUSY;
+	spin_unlock(&dcache_lock);
+
+	if (!ret)
+		qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT;
+	inode_qmblk_unlock(sb);
+
+	if (buf) {
+		/* the busy-path listing is advisory: ignore copy failures */
+		if (copy_to_user(ubuf, buf, PAGE_SIZE))
+			;
+		free_page((unsigned long)buf);
+	}
+	return ret;
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * External interfaces
+ *
+ * ---------------------------------------------------------------------*/
+
+static int vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case VZCTL_QUOTA_NEW_CTL: {
+		struct vzctl_quotactl qb;
+
+		err = -EFAULT;
+		if (copy_from_user(&qb, (void __user *)arg, sizeof(qb)))
+			break;
+		err = do_vzquotactl(qb.cmd, qb.quota_id,
+				qb.qstat, qb.ve_root, 0);
+		break;
+	}
+#ifdef CONFIG_VZ_QUOTA_UGID
+	case VZCTL_QUOTA_UGID_CTL: {
+		struct vzctl_quotaugidctl qub;
+
+		err = -EFAULT;
+		if (copy_from_user(&qub, (void __user *)arg, sizeof(qub)))
+			break;
+		err = do_vzquotaugidctl(qub.cmd, qub.quota_id,
+				qub.ugid_index, qub.ugid_size, qub.addr, 0);
+		break;
+	}
+#endif
+	default:
+		err = -ENOTTY;
+	}
+	return err;
+}
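+
+/*
+ * Userspace reaches this handler via an ioctl on the vzctl device (a
+ * sketch; the exact device path and quota sub-command values live in the
+ * vzctl/vzquota headers):
+ *
+ *	struct vzctl_quotactl qb = {
+ *		.cmd      = <quota sub-command>,
+ *		.quota_id = id,
+ *		.qstat    = &qstat,
+ *		.ve_root  = ve_root_path,
+ *	};
+ *	ioctl(vzctl_fd, VZCTL_QUOTA_NEW_CTL, &qb);
+ */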
+
+#ifdef CONFIG_COMPAT
+static int compat_vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case VZCTL_COMPAT_QUOTA_CTL: {
+		struct compat_vzctl_quotactl cs;
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+		err = do_vzquotactl(cs.cmd, cs.quota_id,
+				compat_ptr(cs.qstat),
+				compat_ptr(cs.ve_root), 1);
+		break;
+	}
+#ifdef CONFIG_VZ_QUOTA_UGID
+	case VZCTL_COMPAT_QUOTA_UGID_CTL: {
+		struct compat_vzctl_quotaugidctl cs;
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+
+		err = do_vzquotaugidctl(cs.cmd, cs.quota_id, cs.ugid_index,
+				cs.ugid_size, compat_ptr(cs.addr), 1);
+		break;
+	}
+#endif
+	default:
+		err = -ENOIOCTLCMD;
+	}
+	return err;
+}
+#endif
+
+static struct vzioctlinfo vzdqcalls = {
+	.type		= VZDQCTLTYPE,
+	.ioctl		= vzquota_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_vzquota_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+/**
+ * vzquota_dstat - get quota usage info for virtual superblock
+ */
+static int vzquota_dstat(struct inode *inode, struct dq_kstat *qstat)
+{
+	struct vz_quota_master *qmblk;
+
+	qmblk = vzquota_inode_qmblk(inode);
+	if (qmblk == NULL)
+		return -ENOENT;
+	if (qmblk == VZ_QUOTA_BAD) {
+		memset(qstat, 0, sizeof(*qstat));
+		return 0;
+	}
+
+	qmblk_data_read_lock(qmblk);
+	memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat));
+	qmblk_data_read_unlock(qmblk);
+	qmblk_put(qmblk);
+	return 0;
+}
+
+int
+vzquota_snap_init(struct super_block *vsuper, void *vzs, struct path *path)
+{
+	int err;
+	struct vz_quota_master *qmblk;
+
+	qmblk = vzquota_find_qmblk(vsuper);
+	if (qmblk == NULL)
+		return -ENOENT;
+	if (qmblk == VZ_QUOTA_BAD)
+		return -ENOENT;
+
+	err = -EBUSY;
+	qmblk_data_write_lock(qmblk);
+	if (!qmblk->dq_snap && qmblk->dq_root_path.mnt &&
+			qmblk->dq_root_path.dentry &&
+			qmblk->dq_root_path.mnt->mnt_sb->s_bdev) {
+		qmblk->dq_snap = vzs;
+		*path = qmblk->dq_root_path;
+		path_get(path);
+		err = 0;
+	}
+	qmblk_data_write_unlock(qmblk);
+
+	qmblk_put(qmblk);
+	return err;
+}
+EXPORT_SYMBOL(vzquota_snap_init);
+
+int vzquota_snap_stop(struct super_block *super, void *vzs)
+{
+	int err;
+	struct vz_quota_master *qmblk;
+
+	qmblk = vzquota_find_qmblk(super);
+	if (qmblk == NULL)
+		return -ENOENT;
+	if (qmblk == VZ_QUOTA_BAD)
+		return -ENOENT;
+
+	err = -ENOENT;
+	qmblk_data_write_lock(qmblk);
+	if (qmblk->dq_snap == vzs) {
+		err = 0;
+		qmblk->dq_snap = NULL;
+	}
+	qmblk_data_write_unlock(qmblk);
+
+	qmblk_put(qmblk);
+	return err;
+}
+EXPORT_SYMBOL(vzquota_snap_stop);
+
+/* ----------------------------------------------------------------------
+ *
+ * Init/exit helpers
+ *
+ * ---------------------------------------------------------------------*/
+
+static int vzquota_cache_init(void)
+{
+	int i;
+
+	vzquota_cachep = kmem_cache_create("vz_quota_master",
+					 sizeof(struct vz_quota_master),
+					 0, SLAB_HWCACHE_ALIGN, NULL);
+	if (vzquota_cachep == NULL) {
+		printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n");
+		goto nomem2;
+	}
+	for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&vzquota_hash_table[i]);
+
+	return 0;
+
+nomem2:
+	return -ENOMEM;
+}
+
+static void vzquota_cache_release(void)
+{
+	int i;
+
+	/* sanity check */
+	for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++)
+		if (!list_empty(&vzquota_hash_table[i]))
+			BUG();
+
+	/* release caches */
+	kmem_cache_destroy(vzquota_cachep);
+	vzquota_cachep = NULL;
+}
+
+static int quota_notifier_call(struct vnotifier_block *self,
+		unsigned long n, void *data, int err)
+{
+	struct virt_info_quota *viq;
+	struct super_block *sb;
+
+	viq = (struct virt_info_quota *)data;
+	switch (n) {
+	case VIRTINFO_QUOTA_ON:
+		err = NOTIFY_BAD;
+		if (!try_module_get(THIS_MODULE))
+			break;
+		sb = viq->super;
+		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+		err = NOTIFY_OK;
+		break;
+	case VIRTINFO_QUOTA_OFF:
+		module_put(THIS_MODULE);
+		err = NOTIFY_OK;
+		break;
+	case VIRTINFO_QUOTA_GETSTAT:
+		err = NOTIFY_BAD;
+		if (vzquota_dstat(viq->inode, viq->qstat))
+			break;
+		err = NOTIFY_OK;
+		break;
+	case VIRTINFO_QUOTA_DISABLE:
+		err = NOTIFY_OK;
+		vzquota_inode_off((struct inode *)data);
+		break;
+	case VIRTINFO_ORPHAN_CLEAN: {
+		struct virt_info_orphan *vi = (struct virt_info_orphan *)data;
+
+		if (vzquota_on_cookie(vi->super, vi->cookie))
+			err = NOTIFY_BAD;
+		else
+			err = NOTIFY_OK;
+		break;
+	}
+	case VIRTINFO_ORPHAN_DONE: {
+		struct virt_info_orphan *vi = (struct virt_info_orphan *)data;
+
+		vzquota_off_cookies(vi->super);
+		break;
+	}
+	}
+	return err;
+}
+
+struct vnotifier_block quota_notifier_block = {
+	.notifier_call = quota_notifier_call,
+	.priority = INT_MAX,
+};
+
+/* ----------------------------------------------------------------------
+ *
+ * Init/exit procedures
+ *
+ * ---------------------------------------------------------------------*/
+
+static int __init vzquota_init(void)
+{
+	int err;
+
+	if ((err = vzquota_cache_init()) != 0)
+		goto out_cache;
+
+	if ((err = vzquota_proc_init()) != 0)
+		goto out_proc;
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+	if ((err = vzquota_ugid_init()) != 0)
+		goto out_ugid;
+#endif
+
+	mutex_init(&vz_quota_mutex);
+	vzioctl_register(&vzdqcalls);
+	virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block);
+#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS)
+	vzaquota_init();
+#endif
+
+	return 0;
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+out_ugid:
+	vzquota_proc_release();
+#endif
+out_proc:
+	vzquota_cache_release();
+out_cache:
+	return err;
+}
+
+#if defined(VZ_QUOTA_UNLOAD)
+static void __exit vzquota_release(void)
+{
+	virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block);
+	vzioctl_unregister(&vzdqcalls);
+#ifdef CONFIG_VZ_QUOTA_UGID
+#ifdef CONFIG_PROC_FS
+	vzaquota_fini();
+#endif
+	vzquota_ugid_release();
+#endif
+	vzquota_proc_release();
+	vzquota_cache_release();
+}
+#endif
+
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Virtuozzo Disk Quota");
+MODULE_LICENSE("GPL v2");
+
+module_init(vzquota_init)
+#if defined(VZ_QUOTA_UNLOAD)
+module_exit(vzquota_release)
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ramfs/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/fs/ramfs/Makefile
--- linux-2.6.32-504.3.3.el6.orig/fs/ramfs/Makefile	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ramfs/Makefile	2015-01-21 12:02:52.624976804 +0300
@@ -7,3 +7,4 @@ obj-y += ramfs.o
 file-mmu-y := file-nommu.o
 file-mmu-$(CONFIG_MMU) := file-mmu.o
 ramfs-objs += inode.o $(file-mmu-y)
+ramfs-$(CONFIG_PRAMFS) += persistent.o
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ramfs/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ramfs/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ramfs/inode.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ramfs/inode.c	2015-01-21 12:02:52.784972555 +0300
@@ -163,10 +163,6 @@ static const struct super_operations ram
 	.show_options	= generic_show_options,
 };
 
-struct ramfs_mount_opts {
-	umode_t mode;
-};
-
 enum {
 	Opt_mode,
 	Opt_err
@@ -177,10 +173,6 @@ static const match_table_t tokens = {
 	{Opt_err, NULL}
 };
 
-struct ramfs_fs_info {
-	struct ramfs_mount_opts mount_opts;
-};
-
 static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
 {
 	substring_t args[MAX_OPT_ARGS];
@@ -213,7 +205,7 @@ static int ramfs_parse_options(char *dat
 	return 0;
 }
 
-static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
+int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 {
 	struct ramfs_fs_info *fsi;
 	struct inode *inode = NULL;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ramfs/persistent.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ramfs/persistent.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ramfs/persistent.c	2015-01-21 12:02:52.624976804 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ramfs/persistent.c	2015-01-21 12:02:52.901969449 +0300
@@ -0,0 +1,805 @@
+#include <linux/dcache.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/mmgang.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/parser.h>
+#include <linux/pram.h>
+#include <linux/ramfs.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+static void pram_fs_msg(struct super_block *sb, const char *prefix,
+			const char *fmt, ...)
+{
+	va_list ap;
+	struct ramfs_fs_info *fsi = sb->s_fs_info;
+
+	va_start(ap, fmt);
+	if (fsi && fsi->pram_name[0])
+		printk("%sPRAMFS (node=%s): ", prefix, fsi->pram_name);
+	else
+		printk("%sPRAMFS: ", prefix);
+	vprintk(fmt, ap);
+	printk("\n");
+	va_end(ap);
+}
+
+static int save_str(const char *str, int len, struct pram_stream *stream)
+{
+	__u32 __len = len;
+
+	if (pram_write(stream, &__len, 4) != 4 ||
+	    pram_write(stream, str, len) != len)
+		return -EIO;
+	return 0;
+}
+
+static int load_str(char *buf, int buflen, struct pram_stream *stream)
+{
+	__u32 __len;
+	int len;
+
+	if (pram_read(stream, &__len, 4) != 4)
+		return -EIO;
+	len = __len;
+	if (len > buflen)
+		return -ENAMETOOLONG;
+	if (pram_read(stream, buf, len) != len)
+		return -EIO;
+	return len;
+}
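+
+/*
+ * Strings are stored in the stream as a native-endian __u32 length
+ * followed by the raw, unterminated bytes:
+ *
+ *	| len (4 bytes) | str[0] ... str[len - 1] |
+ *
+ * load_str() returns the string length on success or a negative error.
+ */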
+
+static int save_mapping_pages(struct address_space *mapping,
+			      struct pram_stream *meta_stream,
+			      struct pram_stream *data_stream)
+{
+	struct pagevec pvec;
+	pgoff_t next = 0;
+	__u64 __offset;
+	int err = 0;
+
+	pagevec_init(&pvec, 0);
+	while (!err && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		int i;
+
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			unsigned long pfn;
+			pgoff_t offset;
+
+			lock_page(page);
+			if (unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			offset = page->index;
+			if (offset > next)
+				next = offset;
+			next++;
+
+			__offset = offset;
+			if (pram_write(meta_stream, &__offset, 8) != 8 ||
+			    pram_push_page(data_stream, page, &pfn) != 0) {
+				unlock_page(page);
+				err = -EIO;
+				break;
+			}
+
+			remove_from_page_cache(page);
+			page_cache_release(page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+#define OFFSET_END_MARK ((__u64)~0ULL)
+	__offset = OFFSET_END_MARK;
+	if (pram_write(meta_stream, &__offset, 8) != 8)
+		err = -EIO;
+
+	return err;
+}
+
+static int load_mapping_pages(struct address_space *mapping,
+			      struct pram_stream *meta_stream,
+			      struct pram_stream *data_stream)
+{
+	int err = 0;
+
+	for ( ; ; ) {
+		struct page *page;
+		__u64 __offset;
+		pgoff_t offset;
+
+		if (pram_read(meta_stream, &__offset, 8) != 8) {
+			err = -EIO;
+			break;
+		}
+		if (__offset == OFFSET_END_MARK)
+			break;
+
+		page = pram_pop_page(data_stream);
+		if (IS_ERR_OR_NULL(page)) {
+			err = -EIO;
+			break;
+		}
+
+		offset = __offset;
+		if (!pram_page_dirty(page)) {
+			err = add_to_page_cache_lru(page, mapping, offset,
+						    GFP_KERNEL);
+		} else {
+			/* page already accounted and in lru */
+			__set_page_locked(page);
+			err = add_to_page_cache_nogang(page, mapping, offset,
+						       GFP_KERNEL);
+			if (err)
+				__clear_page_locked(page);
+		}
+		if (err) {
+			put_page(page);
+			break;
+		}
+
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		unlock_page(page);
+		put_page(page);
+	}
+
+	return err;
+}
+
+static int save_symlink_value(struct dentry *dentry,
+			      struct pram_stream *meta_stream)
+{
+	mm_segment_t oldfs;
+	int len;
+	char *buf;
+	int err;
+
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
+		return -ENOMEM;
+
+	BUG_ON(!dentry->d_inode->i_op->readlink);
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	err = len = dentry->d_inode->i_op->readlink(dentry, buf, PAGE_SIZE);
+	set_fs(oldfs);
+	if (len >= 0)
+		err = save_str(buf, len, meta_stream);
+
+	free_page((unsigned long)buf);
+	return err;
+}
+
+static inline int load_make_symlink(struct dentry *parent,
+				    struct dentry *dentry,
+				    struct pram_stream *meta_stream)
+{
+	int len;
+	char *buf;
+	int err;
+
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
+		return -ENOMEM;
+
+	err = len = load_str(buf, PAGE_SIZE - 1, meta_stream);
+	if (len >= 0) {
+		buf[len] = '\0';
+		err = vfs_symlink(parent->d_inode, dentry, buf);
+	}
+
+	free_page((unsigned long)buf);
+	return err;
+}
+
+struct file_header {
+	__u32	mode;
+	__u32	uid;
+	__u32	gid;
+	__u32	dev;
+	__u32	atime;
+	__u32	mtime;
+	__u32	ctime;
+	__u64	size;
+};
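+
+/*
+ * Layout of one file in the meta stream (see save_file() below):
+ *
+ *	struct file_header
+ *	name			(save_str format)
+ *	symlink target		(save_str format, S_ISLNK only)
+ *	page offsets, then
+ *	OFFSET_END_MARK		(S_ISREG only; the page contents go to
+ *				 the data stream)
+ */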
+
+static int save_file(struct dentry *dentry,
+		     struct pram_stream *meta_stream,
+		     struct pram_stream *data_stream)
+{
+	struct inode *inode = dentry->d_inode;
+	struct file_header hdr;
+	umode_t mode;
+	int err;
+
+	mode = inode->i_mode;
+	hdr.mode = mode;
+	hdr.uid = inode->i_uid;
+	hdr.gid = inode->i_gid;
+	hdr.dev = inode->i_rdev;
+	hdr.atime = inode->i_atime.tv_sec;
+	hdr.mtime = inode->i_mtime.tv_sec;
+	hdr.ctime = inode->i_ctime.tv_sec;
+	hdr.size = i_size_read(inode);
+
+	if (pram_write(meta_stream, &hdr, sizeof(hdr)) != sizeof(hdr))
+		return -EIO;
+
+	err = save_str(dentry->d_name.name, dentry->d_name.len, meta_stream);
+	if (err)
+		return err;
+
+	if (S_ISLNK(mode))
+		err = save_symlink_value(dentry, meta_stream);
+	else if (S_ISREG(mode))
+		err = save_mapping_pages(inode->i_mapping,
+					 meta_stream, data_stream);
+
+	return err;
+}
+
+static struct dentry *load_file(struct dentry *parent,
+				struct pram_stream *meta_stream,
+				struct pram_stream *data_stream)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file_header hdr;
+	umode_t mode;
+	int len;
+	char *buf;
+	int err;
+
+	if (pram_read(meta_stream, &hdr, sizeof(hdr)) != sizeof(hdr))
+		return ERR_PTR(-EIO);
+
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	err = len = load_str(buf, PAGE_SIZE, meta_stream);
+	if (len < 0)
+		goto out;
+
+	mutex_lock_nested(&parent->d_inode->i_mutex, I_MUTEX_PARENT);
+
+	dentry = lookup_one_len(buf, parent, len);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+
+	free_page((unsigned long)buf);
+	buf = NULL;
+
+	mode = hdr.mode;
+	if (S_ISLNK(mode))
+		err = load_make_symlink(parent, dentry, meta_stream);
+	else if (S_ISREG(mode))
+		err = vfs_create(parent->d_inode, dentry, mode, NULL);
+	else if (S_ISDIR(mode))
+		err = vfs_mkdir(parent->d_inode, dentry, mode);
+	else if (S_ISCHR(mode) || S_ISBLK(mode) ||
+		 S_ISFIFO(mode) || S_ISSOCK(mode))
+		err = vfs_mknod(parent->d_inode, dentry, mode, (dev_t)hdr.dev);
+	else
+		err = -EINVAL;
+	if (err)
+		goto out_dput;
+
+	inode = dentry->d_inode;
+
+	inode->i_mode = mode;
+	inode->i_uid = hdr.uid;
+	inode->i_gid = hdr.gid;
+	inode->i_atime.tv_sec = hdr.atime;
+	inode->i_atime.tv_nsec = 0;
+	inode->i_mtime.tv_sec = hdr.mtime;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_ctime.tv_sec = hdr.ctime;
+	inode->i_ctime.tv_nsec = 0;
+
+	if (S_ISREG(mode)) {
+		i_size_write(inode, hdr.size);
+		err = load_mapping_pages(inode->i_mapping,
+					 meta_stream, data_stream);
+		if (err)
+			goto out_dput;
+	}
+out_unlock:
+	mutex_unlock(&parent->d_inode->i_mutex);
+out:
+	if (buf)
+		free_page((unsigned long)buf);
+	if (err)
+		dentry = ERR_PTR(err);
+	return dentry;
+out_dput:
+	dput(dentry);
+	goto out_unlock;
+}
+
+static int save_link(struct dentry *dentry, struct dentry *target,
+		     struct pram_stream *stream)
+{
+	char *str;
+	void *buf;
+	int err = 0;
+
+	buf = (void *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
+		return -ENOMEM;
+
+	str = dentry_path(target, buf, PAGE_SIZE);
+	if (IS_ERR(str))
+		err = PTR_ERR(str);
+
+	if (!err)
+		err = save_str(str, strlen(str), stream);
+
+	free_page((unsigned long)buf);
+
+	if (!err)
+		err = save_str(dentry->d_name.name, dentry->d_name.len, stream);
+
+	return err;
+}
+
+static int load_link(struct dentry *parent, struct vfsmount *mnt,
+		     struct pram_stream *stream)
+{
+	struct dentry *dentry;
+	struct nameidata nd;
+	int len;
+	char *buf;
+	int err;
+
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
+		return -ENOMEM;
+
+	err = len = load_str(buf, PAGE_SIZE - 1, stream);
+	if (len < 0)
+		goto out;
+	buf[len] = '\0';
+
+	err = vfs_path_lookup(mnt->mnt_root, mnt, buf, 0, &nd);
+	if (err)
+		goto out;
+
+	err = len = load_str(buf, PAGE_SIZE, stream);
+	if (len < 0)
+		goto out_path_put;
+
+	mutex_lock_nested(&parent->d_inode->i_mutex, I_MUTEX_PARENT);
+
+	dentry = lookup_one_len(buf, parent, len);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+
+	free_page((unsigned long)buf);
+	buf = NULL;
+
+	err = vfs_link(nd.path.dentry, parent->d_inode, dentry);
+
+	dput(dentry);
+out_unlock:
+	mutex_unlock(&parent->d_inode->i_mutex);
+out_path_put:
+	path_put(&nd.path);
+out:
+	if (buf)
+		free_page((unsigned long)buf);
+	return err;
+}
+
+#define CONTENT_FILE	1
+#define CONTENT_LINK	2
+#define ENDOFDIR_MARK	3
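+
+/*
+ * The tree is dumped pre-order: every entry starts with a 2-byte tag.
+ * A directory's children follow its own CONTENT_FILE record and are
+ * terminated by ENDOFDIR_MARK (the root level simply ends with the
+ * stream).  A hard link is saved as a CONTENT_LINK record that refers
+ * to the path of the alias saved first.
+ */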
+
+static int save_tree(struct dentry *root,
+		     struct pram_stream *meta_stream,
+		     struct pram_stream *data_stream)
+{
+	struct dentry *dget_list = NULL;
+	struct dentry *dir, *dentry;
+	__u16 __content;
+	int err = 0;
+
+	dir = root;
+	dentry = NULL;
+next_dir:
+	spin_lock(&dcache_lock);
+	if (!dentry)
+		dentry = list_entry(&dir->d_subdirs, struct dentry, d_u.d_child);
+	list_for_each_entry_continue(dentry, &dir->d_subdirs, d_u.d_child) {
+		struct inode *inode;
+		int content;
+
+		if (d_unhashed(dentry) || !dentry->d_inode)
+			continue;
+
+		dget(dentry);
+		spin_unlock(&dcache_lock);
+
+		BUG_ON(dentry->d_fsdata);
+		dentry->d_fsdata = dget_list;
+		dget_list = dentry;
+
+		inode = dentry->d_inode;
+		if (inode->i_private) {
+			BUG_ON(S_ISDIR(inode->i_mode));
+			content = CONTENT_LINK;
+		} else {
+			content = CONTENT_FILE;
+		}
+
+		__content = content;
+		if (pram_write(meta_stream, &__content, 2) != 2) {
+			err = -EIO;
+			goto out;
+		}
+
+		switch (content) {
+		case CONTENT_FILE:
+			err = save_file(dentry, meta_stream, data_stream);
+			if (!err && S_ISDIR(inode->i_mode)) {
+				dir = dentry;
+				dentry = NULL;
+				goto next_dir;
+			}
+			inode->i_private = dentry;
+			break;
+		case CONTENT_LINK:
+			err = save_link(dentry, inode->i_private, meta_stream);
+			break;
+		}
+		if (err)
+			goto out;
+		spin_lock(&dcache_lock);
+	}
+	spin_unlock(&dcache_lock);
+out:
+	if (!err && dir != root) {
+		__content = ENDOFDIR_MARK;
+		if (pram_write(meta_stream, &__content, 2) != 2) {
+			err = -EIO;
+		} else {
+			dentry = dir;
+			dir = dir->d_parent;
+			goto next_dir;
+		}
+	}
+
+	while (dget_list) {
+		dentry = dget_list;
+		dget_list = dentry->d_fsdata;
+		dentry->d_inode->i_private = dentry->d_fsdata = NULL;
+		dput(dentry);
+	}
+	return err;
+}
+
+static int load_tree(struct vfsmount *mnt,
+		     struct pram_stream *meta_stream,
+		     struct pram_stream *data_stream)
+{
+	struct dentry *root = mnt->mnt_root;
+	struct dentry *dir, *dentry;
+	__u16 __content;
+	int content;
+	ssize_t ret;
+	int err = 0;
+
+	dir = root;
+next:
+	ret = pram_read(meta_stream, &__content, 2);
+	if (!ret)
+		goto out;
+	if (ret != 2) {
+		err = -EIO;
+		goto out;
+	}
+
+	content = __content;
+	switch (content) {
+	case CONTENT_FILE:
+		dentry = load_file(dir, meta_stream, data_stream);
+		if (IS_ERR(dentry))
+			err = PTR_ERR(dentry);
+		else if (S_ISDIR(dentry->d_inode->i_mode))
+			dir = dentry;
+		else
+			dput(dentry);
+		break;
+	case CONTENT_LINK:
+		err = load_link(dir, mnt, meta_stream);
+		break;
+	case ENDOFDIR_MARK:
+		if (dir != root) {
+			dentry = dir;
+			dir = dir->d_parent;
+			dput(dentry);
+			goto next;
+		}
+	default:
+		err = -EIO;
+		break;
+	}
+	if (!err)
+		goto next;
+out:
+	if (dir != root && !err)
+		err = -EIO;
+	while (dir != root) {
+		dentry = dir;
+		dir = dir->d_parent;
+		dput(dentry);
+	}
+	return err;
+}
+
+static inline const char *pram_fs_node_basename(struct super_block *sb,
+						char *buf, size_t size)
+{
+	struct ramfs_fs_info *fsi = sb->s_fs_info;
+
+	if (fsi && fsi->pram_name[0])
+		snprintf(buf, size, "pram.%s.", fsi->pram_name);
+	else
+		snprintf(buf, size, "pram.");
+	return buf;
+}
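+
+/*
+ * open_streams() below appends "meta"/"data" to this base name, so the
+ * PRAM nodes end up named "pram.<name>.meta" and "pram.<name>.data"
+ * (just "pram.meta"/"pram.data" when no node name is set).
+ */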
+
+/*
+ * Meta and data streams must be opened and closed atomically; otherwise we
+ * could end up with a data storage without a corresponding meta storage,
+ * which would lead to open_streams() failures.
+ */
+static DEFINE_MUTEX(streams_mutex);
+
+static int open_streams(struct super_block *sb, int mode,
+			struct pram_stream *meta_stream,
+			struct pram_stream *data_stream)
+{
+	char *buf;
+	size_t basename_len;
+	int err = -ENOMEM;
+
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
+		goto out;
+
+	pram_fs_node_basename(sb, buf, PAGE_SIZE);
+	basename_len = strlen(buf);
+
+	mutex_lock(&streams_mutex);
+
+	strlcat(buf, "meta", PAGE_SIZE);
+	err = pram_open(buf, mode, meta_stream);
+	if (err)
+		goto out_unlock;
+
+	buf[basename_len] = '\0';
+	strlcat(buf, "data", PAGE_SIZE);
+	err = pram_open(buf, mode, data_stream);
+	if (err)
+		goto out_close_meta;
+
+	mutex_unlock(&streams_mutex);
+	free_page((unsigned long)buf);
+	return 0;
+
+out_close_meta:
+	pram_close(meta_stream, -1);
+out_unlock:
+	mutex_unlock(&streams_mutex);
+	free_page((unsigned long)buf);
+out:
+	return err;
+}
+
+static inline void close_streams(struct pram_stream *meta_stream,
+				 struct pram_stream *data_stream, int err)
+{
+	mutex_lock(&streams_mutex);
+	pram_close(meta_stream, err);
+	pram_close(data_stream, err);
+	mutex_unlock(&streams_mutex);
+}
+
+static void save_pram_fs(struct super_block *sb)
+{
+	struct pram_stream meta_stream, data_stream;
+	int err;
+
+	err = open_streams(sb, PRAM_WRITE, &meta_stream, &data_stream);
+	if (err)
+		goto out;
+
+	err = save_tree(sb->s_root, &meta_stream, &data_stream);
+	close_streams(&meta_stream, &data_stream, err);
+out:
+	if (err)
+		pram_fs_msg(sb, KERN_ERR, "Failed to save FS tree: %d", err);
+}
+
+static int load_pram_fs(struct super_block *sb, struct vfsmount *mnt)
+{
+	struct pram_stream meta_stream, data_stream;
+	int err;
+
+	err = open_streams(sb, PRAM_READ, &meta_stream, &data_stream);
+	if (err)
+		goto out;
+
+	err = load_tree(mnt, &meta_stream, &data_stream);
+	close_streams(&meta_stream, &data_stream, 0);
+out:
+	if (err)
+		pram_fs_msg(sb, KERN_ERR, "Failed to load FS tree: %d", err);
+	else
+		pram_fs_msg(sb, KERN_INFO, "loaded");
+	return err;
+}
+
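+/*
+ * Discard a previously saved image by opening and then closing the streams
+ * without loading them; -ENOENT just means there was nothing to discard.
+ */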
+static int destroy_pram_fs(struct super_block *sb)
+{
+	struct pram_stream meta_stream, data_stream;
+	int err;
+
+	err = open_streams(sb, PRAM_READ, &meta_stream, &data_stream);
+	if (!err) {
+		close_streams(&meta_stream, &data_stream, 0);
+		pram_fs_msg(sb, KERN_INFO, "discarded");
+	}
+	if (err == -ENOENT)
+		err = 0;
+	if (err)
+		pram_fs_msg(sb, KERN_ERR,
+			    "Failed to destroy PRAM node: %d", err);
+	return err;
+}
+
+enum {
+	Opt_noload,
+	Opt_pram_name,
+	Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_noload, "noload"},
+	{Opt_pram_name, "pram_name=%s"},
+	{Opt_err, NULL}
+};
+
+static int parse_options(char *options, int *load,
+			 char *name, size_t name_size)
+{
+	substring_t args[MAX_OPT_ARGS];
+	int token;
+	char *p;
+
+	*load = 1;
+	memset(name, 0, name_size);
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_noload:
+			*load = 0;
+			break;
+		case Opt_pram_name:
+			if (match_strlcpy(name, &args[0],
+					  name_size) >= name_size)
+				return -EINVAL;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static int pram_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int err;
+	char *options;
+	struct ramfs_fs_info *fsi;
+
+	err = -ENOMEM;
+	options = kstrdup(data, GFP_KERNEL);
+	if (!options && data)
+		goto out;
+
+	err = ramfs_fill_super(sb, data, silent);
+	if (err)
+		goto out_free_opts;
+
+	fsi = sb->s_fs_info;
+	BUG_ON(!fsi);
+
+	fsi->pram_save = 0;
+	err = parse_options(options, &fsi->pram_load,
+			    fsi->pram_name, PRAM_FS_NAME_MAX);
+out_free_opts:
+	kfree(options);
+out:
+	return err;
+}
+
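+/*
+ * After mounting, either populate the new instance from the saved image or,
+ * with the "noload" option, discard that image.  Saving on unmount is only
+ * enabled if this step succeeds.
+ */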
+static int pram_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	int err;
+	struct super_block *sb;
+	struct ramfs_fs_info *fsi;
+
+	err = get_sb_nodev(fs_type, flags, data, pram_fill_super, mnt);
+	if (err)
+		return err;
+
+	sb = mnt->mnt_sb;
+	fsi = sb->s_fs_info;
+	BUG_ON(!fsi);
+
+	err = fsi->pram_load ? load_pram_fs(sb, mnt) : destroy_pram_fs(sb);
+	if (err) {
+		dput(sb->s_root);
+		deactivate_locked_super(sb);
+		return err;
+	}
+
+	fsi->pram_save = 1;
+	return 0;
+}
+
+static void pram_kill_sb(struct super_block *sb)
+{
+	struct ramfs_fs_info *fsi = sb->s_fs_info;
+
+	if (fsi && fsi->pram_save)
+		save_pram_fs(sb);
+	kfree(sb->s_fs_info);
+	kill_litter_super(sb);
+}
+
+static struct file_system_type pram_fs_type = {
+	.name		= "pram",
+	.get_sb		= pram_get_sb,
+	.kill_sb	= pram_kill_sb,
+};
+
+static int __init init_pram_fs(void)
+{
+	return register_filesystem(&pram_fs_type);
+}
+module_init(init_pram_fs);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/read_write.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/read_write.c
--- linux-2.6.32-504.3.3.el6.orig/fs/read_write.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/read_write.c	2015-01-21 12:02:43.343223213 +0300
@@ -21,6 +21,8 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
+#include <bc/beancounter.h>
+
 const struct file_operations generic_ro_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -369,23 +371,28 @@ EXPORT_SYMBOL(vfs_read);
 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 {
 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
-	struct kiocb kiocb;
+	struct kiocb *kiocb;
 	ssize_t ret;
 
-	init_sync_kiocb(&kiocb, filp);
-	kiocb.ki_pos = *ppos;
-	kiocb.ki_left = len;
+	kiocb = kzalloc(sizeof(struct kiocb), GFP_KERNEL);
+	if (!kiocb)
+		return -ENOMEM;
+
+	init_sync_kiocb(kiocb, filp);
+	kiocb->ki_pos = *ppos;
+	kiocb->ki_left = len;
 
 	for (;;) {
-		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
+		ret = filp->f_op->aio_write(kiocb, &iov, 1, kiocb->ki_pos);
 		if (ret != -EIOCBRETRY)
 			break;
-		wait_on_retry_sync_kiocb(&kiocb);
+		wait_on_retry_sync_kiocb(kiocb);
 	}
 
 	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&kiocb);
-	*ppos = kiocb.ki_pos;
+		ret = wait_on_sync_kiocb(kiocb);
+	*ppos = kiocb->ki_pos;
+	kfree(kiocb);
 	return ret;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/reiserfs/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/reiserfs/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/reiserfs/inode.c	2014-12-12 23:28:53.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/reiserfs/inode.c	2015-01-21 12:02:52.022992783 +0300
@@ -1578,8 +1578,13 @@ int reiserfs_encode_fh(struct dentry *de
 	struct inode *inode = dentry->d_inode;
 	int maxlen = *lenp;
 
-	if (maxlen < 3)
+	if (need_parent && (maxlen < 5)) {
+		*lenp = 5;
 		return 255;
+	} else if (maxlen < 3) {
+		*lenp = 3;
+		return 255;
+	}
 
 	data[0] = inode->i_ino;
 	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/reiserfs/namei.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/reiserfs/namei.c
--- linux-2.6.32-504.3.3.el6.orig/fs/reiserfs/namei.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/reiserfs/namei.c	2015-01-21 12:02:53.103964088 +0300
@@ -826,6 +826,9 @@ static int reiserfs_rmdir(struct inode *
 	INITIALIZE_PATH(path);
 	struct reiserfs_dir_entry de;
 
+	inode = dentry->d_inode;
+	vfs_dq_init(inode);
+
 	/* we will be doing 2 balancings and update 2 stat data, we change quotas
 	 * of the owner of the directory and of the owner of the parent directory.
 	 * The quota structure is possibly deleted only on last iput => outside
@@ -850,8 +853,6 @@ static int reiserfs_rmdir(struct inode *
 		goto end_rmdir;
 	}
 
-	inode = dentry->d_inode;
-
 	reiserfs_update_inode_transaction(inode);
 	reiserfs_update_inode_transaction(dir);
 
@@ -915,6 +916,7 @@ static int reiserfs_unlink(struct inode 
 	unsigned long savelink;
 
 	inode = dentry->d_inode;
+	vfs_dq_init(inode);
 
 	/* in this transaction we can be doing at max two balancings and update
 	 * two stat datas, we change quotas of the owner of the directory and of
@@ -1108,10 +1110,6 @@ static int reiserfs_link(struct dentry *
 		reiserfs_write_unlock(dir->i_sb);
 		return -EMLINK;
 	}
-	if (inode->i_nlink == 0) {
-		reiserfs_write_unlock(dir->i_sb);
-		return -ENOENT;
-	}
 
 	/* inc before scheduling so reiserfs_unlink knows we are here */
 	inc_nlink(inode);
@@ -1228,6 +1226,8 @@ static int reiserfs_rename(struct inode 
 
 	old_inode = old_dentry->d_inode;
 	new_dentry_inode = new_dentry->d_inode;
+	if (new_dentry_inode)
+		vfs_dq_init(new_dentry_inode);
 
 	// make sure, that oldname still exists and points to an object we
 	// are going to rename
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/select.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/select.c
--- linux-2.6.32-504.3.3.el6.orig/fs/select.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/select.c	2015-01-21 12:02:57.966835006 +0300
@@ -30,6 +30,7 @@
 
 #include <asm/uaccess.h>
 
+#include <bc/kmem.h>
 
 /*
  * Estimate expected accuracy in ns from a timeval.
@@ -580,7 +581,8 @@ int core_sys_select(int n, fd_set __user
 	if (size > sizeof(stack_fds) / 6) {
 		/* Not enough space in on-stack array; must use kmalloc */
 		ret = -ENOMEM;
-		bits = kmalloc(6 * size, GFP_KERNEL);
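+		/* charge allocations larger than one page to the user beancounter */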
+		bits = kmalloc(6 * size, size > PAGE_SIZE / 6 ?
+				GFP_KERNEL_UBC : GFP_KERNEL);
 		if (!bits)
 			goto out_nofds;
 	}
@@ -894,7 +896,7 @@ int do_sys_poll(struct pollfd __user *uf
 
 		len = min(todo, POLLFD_PER_PAGE);
 		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
-		walk = walk->next = kmalloc(size, GFP_KERNEL);
+		walk = walk->next = kmalloc(size, GFP_KERNEL_UBC);
 		if (!walk) {
 			err = -ENOMEM;
 			goto out_fds;
@@ -926,7 +928,7 @@ out_fds:
 	return err;
 }
 
-static long do_restart_poll(struct restart_block *restart_block)
+long do_restart_poll(struct restart_block *restart_block)
 {
 	struct pollfd __user *ufds = restart_block->poll.ufds;
 	int nfds = restart_block->poll.nfds;
@@ -947,6 +949,7 @@ static long do_restart_poll(struct resta
 	}
 	return ret;
 }
+EXPORT_SYMBOL(do_restart_poll);
 
 SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 		int, timeout_msecs)
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/seq_file.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/seq_file.c
--- linux-2.6.32-504.3.3.el6.orig/fs/seq_file.c	2014-12-12 23:29:20.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/seq_file.c	2015-01-21 12:02:51.431008498 +0300
@@ -32,7 +32,7 @@ int seq_open(struct file *file, const st
 	struct seq_file *p = file->private_data;
 
 	if (!p) {
-		p = kmalloc(sizeof(*p), GFP_KERNEL);
+		p = kmalloc(sizeof(*p), GFP_KERNEL_UBC);
 		if (!p)
 			return -ENOMEM;
 		file->private_data = p;
@@ -76,7 +76,7 @@ static int traverse(struct seq_file *m, 
 		return 0;
 	}
 	if (!m->buf) {
-		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC);
 		if (!m->buf)
 			return -ENOMEM;
 	}
@@ -116,7 +116,7 @@ static int traverse(struct seq_file *m, 
 Eoverflow:
 	m->op->stop(m, p);
 	kfree(m->buf);
-	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC);
 	return !m->buf ? -ENOMEM : -EAGAIN;
 }
 
@@ -169,7 +169,7 @@ ssize_t seq_read(struct file *file, char
 	m->version = file->f_version;
 	/* grab buffer if we didn't have one */
 	if (!m->buf) {
-		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC);
 		if (!m->buf)
 			goto Enomem;
 	}
@@ -210,7 +210,7 @@ ssize_t seq_read(struct file *file, char
 			goto Fill;
 		m->op->stop(m, p);
 		kfree(m->buf);
-		m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+		m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC);
 		if (!m->buf)
 			goto Enomem;
 		m->count = 0;
@@ -445,6 +445,8 @@ int seq_path(struct seq_file *m, struct 
 
 	if (size) {
 		char *p = d_path(path, buf, size);
+		if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG)
+			return 0;
 		if (!IS_ERR(p)) {
 			char *end = mangle_path(buf, p, esc);
 			if (end)
@@ -510,6 +512,7 @@ int seq_dentry(struct seq_file *m, struc
 
 	return res;
 }
+EXPORT_SYMBOL_GPL(seq_dentry);
 
 int seq_bitmap(struct seq_file *m, const unsigned long *bits,
 				   unsigned int nr_bits)
@@ -561,7 +564,7 @@ static void single_stop(struct seq_file 
 int single_open(struct file *file, int (*show)(struct seq_file *, void *),
 		void *data)
 {
-	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
+	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_UBC);
 	int res = -ENOMEM;
 
 	if (op) {
@@ -605,7 +608,7 @@ void *__seq_open_private(struct file *f,
 	void *private;
 	struct seq_file *seq;
 
-	private = kzalloc(psize, GFP_KERNEL);
+	private = kzalloc(psize, GFP_KERNEL_UBC);
 	if (private == NULL)
 		goto out;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/signalfd.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/signalfd.c
--- linux-2.6.32-504.3.3.el6.orig/fs/signalfd.c	2014-12-12 23:28:59.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/signalfd.c	2015-01-21 12:02:47.957100721 +0300
@@ -28,10 +28,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/signalfd.h>
 #include <linux/syscalls.h>
-
-struct signalfd_ctx {
-	sigset_t sigmask;
-};
+#include <linux/module.h>
 
 static int signalfd_release(struct inode *inode, struct file *file)
 {
@@ -209,17 +206,17 @@ static ssize_t signalfd_read(struct file
 	return total ? total: ret;
 }
 
-static const struct file_operations signalfd_fops = {
+const struct file_operations signalfd_fops = {
 	.release	= signalfd_release,
 	.poll		= signalfd_poll,
 	.read		= signalfd_read,
 };
+EXPORT_SYMBOL(signalfd_fops);
 
 SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
 		size_t, sizemask, int, flags)
 {
 	sigset_t sigmask;
-	struct signalfd_ctx *ctx;
 
 	/* Check the SFD_* constants for consistency.  */
 	BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
@@ -234,12 +231,19 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sig
 	sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
 	signotset(&sigmask);
 
+	return do_signalfd(ufd, &sigmask, flags);
+}
+
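+/*
+ * Split out of sys_signalfd4() so that in-kernel users can create or update
+ * a signalfd as well (hence the EXPORT_SYMBOL_GPL below).
+ */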
+long do_signalfd(int ufd, sigset_t *sigmask, int flags)
+{
+	struct signalfd_ctx *ctx;
+
 	if (ufd == -1) {
 		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 		if (!ctx)
 			return -ENOMEM;
 
-		ctx->sigmask = sigmask;
+		ctx->sigmask = *sigmask;
 
 		/*
 		 * When we call this, the initialization must be complete, since
@@ -259,7 +263,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sig
 			return -EINVAL;
 		}
 		spin_lock_irq(&current->sighand->siglock);
-		ctx->sigmask = sigmask;
+		ctx->sigmask = *sigmask;
 		spin_unlock_irq(&current->sighand->siglock);
 
 		wake_up(&current->sighand->signalfd_wqh);
@@ -268,6 +272,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sig
 
 	return ufd;
 }
+EXPORT_SYMBOL_GPL(do_signalfd);
 
 SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
 		size_t, sizemask)
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/simfs.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/simfs.c
--- linux-2.6.32-504.3.3.el6.orig/fs/simfs.c	2015-01-21 12:02:51.805998544 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/simfs.c	2015-01-21 12:02:51.832997826 +0300
@@ -0,0 +1,390 @@
+/*
+ *  fs/simfs.c
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <linux/vzquota.h>
+#include <linux/statfs.h>
+#include <linux/virtinfo.h>
+#include <linux/genhd.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/exportfs.h>
+#include <linux/seq_file.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+
+#include <asm/unistd.h>
+#include <asm/uaccess.h>
+
+#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb
+
+static struct super_operations sim_super_ops;
+
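+/*
+ * Override the statfs results with the per-VE disk quota limits, so that the
+ * quota soft limits show up as the total/free space of the simulated
+ * filesystem.
+ */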
+static void quota_get_stat(struct inode *ino, struct kstatfs *buf)
+{
+	int err;
+	struct dq_kstat qstat;
+	struct virt_info_quota q;
+	long free_file, adj_file;
+	s64 blk, free_blk, adj_blk;
+	int bsize_bits;
+
+	q.inode = ino;
+	q.qstat = &qstat;
+	err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q);
+	if (err != NOTIFY_OK)
+		return;
+
+	bsize_bits = ffs(buf->f_bsize) - 1;
+	
+	if (qstat.bsoftlimit > qstat.bcurrent)
+		free_blk = (qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits;
+	else
+		free_blk = 0;
+	/*
+	 * In the regular case, we always set buf->f_bfree and buf->f_blocks to
+	 * the values reported by quota.  In case of real disk space shortage,
+	 * we adjust the values.  We want this adjustment to look as if the
+	 * total disk space were reduced, not as if the usage were increased.
+	 *    -- SAW
+	 */
+	adj_blk = 0;
+	if (buf->f_bfree < free_blk)
+		adj_blk = free_blk - buf->f_bfree;
+	buf->f_bfree = free_blk - adj_blk;
+
+	if (free_blk < buf->f_bavail)
+		buf->f_bavail = free_blk;
+
+	blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk;
+	buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk;
+
+	free_file = 0;
+	if (qstat.icurrent < qstat.isoftlimit)
+		free_file = qstat.isoftlimit - qstat.icurrent;
+
+	if (buf->f_type == REISERFS_SUPER_MAGIC)
+		/*
+		 * reiserfs doesn't initialize f_ffree and f_files values of
+		 * kstatfs because it doesn't have an inode limit.
+		 */
+		buf->f_ffree = free_file;
+	adj_file = 0;
+	if (buf->f_ffree < free_file)
+		adj_file = free_file - buf->f_ffree;
+	buf->f_ffree = free_file - adj_file;
+	buf->f_files = qstat.isoftlimit - adj_file;
+}
+
+static int sim_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int err;
+
+	err = statfs_by_dentry(dentry, buf);
+	if (err)
+		return err;
+
+	quota_get_stat(dentry->d_inode, buf);
+	return 0;
+}
+
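+/*
+ * Freeze protection is taken on the underlying (lower) superblock; in
+ * addition, the task tracks its nesting depth in trans_count and marks
+ * itself PF_FSTRANS while inside a transaction.
+ */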
+static int sim_start_write(struct super_block *sb, int level, bool wait)
+{
+	struct super_block *root_sb = SIMFS_GET_LOWER_FS_SB(sb);
+
+	if (!__sb_start_write(root_sb, level, wait))
+		return 0;
+
+	if (!(current->trans_count++)) {
+		current->transaction_info = sb;
+		current->flags |= PF_FSTRANS;
+	} else
+		WARN_ONCE((current->transaction_info != sb), "Broken fs-transaction");
+	return 1;
+}
+
+static void sim_end_write(struct super_block *sb, int level)
+{
+	struct super_block *root_sb = SIMFS_GET_LOWER_FS_SB(sb);
+
+	WARN_ONCE((current->transaction_info != sb), "Broken fs-transaction");
+
+	if (!(--current->trans_count)) {
+		current->flags &= ~PF_FSTRANS;
+		current->transaction_info = NULL;
+	}
+	__sb_end_write(root_sb, level);
+}
+
+#ifdef CONFIG_QUOTA
+static struct inode *sim_quota_root(struct super_block *sb)
+{
+	return sb->s_root->d_inode;
+}
+#endif
+
+/*
+ * NOTE: We need to set up the s_bdev field on the super block, since
+ * sys_quotactl() does lookup_bdev() and get_super(), which compare
+ * sb->s_bdev.  So this is a must if we want unmodified sys_quotactl()
+ * to work correctly on /dev/simfs inside a VE.
+ */
+static int sim_init_blkdev(struct super_block *sb)
+{
+	static struct hd_struct fake_hd;
+	struct block_device *blkdev;
+
+	blkdev = bdget(sb->s_dev);
+	if (blkdev == NULL)
+		return -ENOMEM;
+
+	blkdev->bd_part = &fake_hd;	/* required for bdev_read_only() */
+	sb->s_bdev = blkdev;
+
+	return 0;
+}
+
+static void sim_free_blkdev(struct super_block *sb)
+{
+	if (sb->s_bdev) {
+		/* set bd_part back to NULL */
+		sb->s_bdev->bd_part = NULL;
+		bdput(sb->s_bdev);
+	}
+}
+
+static void sim_quota_init(struct super_block *sb)
+{
+	struct virt_info_quota viq;
+
+	viq.super = sb;
+	virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq);
+}
+
+static void sim_quota_free(struct super_block *sb)
+{
+	struct virt_info_quota viq;
+
+	viq.super = sb;
+	virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq);
+}
+
+static void sim_show_type(struct seq_file *m, struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+	if (vzquota_fake_fstype(current))
+		seq_escape(m, VZQUOTA_FAKE_FSTYPE, " \t\n\\");
+	else
+#endif
+		seq_escape(m, sb->s_type->name, " \t\n\\");
+}
+
+static int sim_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+#ifdef CONFIG_QUOTA
+	if (sb_has_quota_loaded(mnt->mnt_sb, USRQUOTA))
+		seq_puts(m, ",usrquota");
+	if (sb_has_quota_loaded(mnt->mnt_sb, GRPQUOTA))
+		seq_puts(m, ",grpquota");
+#endif
+	return 0;
+}
+
+static struct super_operations sim_super_ops = {
+#ifdef CONFIG_QUOTA
+	.show_type	= &sim_show_type,
+	.show_options	= &sim_show_options,
+	.get_quota_root	= &sim_quota_root,
+#endif
+	.statfs = sim_statfs,
+	.start_write	= &sim_start_write,
+	.end_write	= &sim_end_write,
+};
+
+#if defined(CONFIG_EXPORTFS) || defined(CONFIG_EXPORTFS_MODULE)
+
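+/*
+ * Forward an export operation to the underlying filesystem.  These macros
+ * open with declarations and end with a return statement, so each must form
+ * the entire body of its wrapper.
+ */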
+#define SIM_CALL_LOWER(method, sb, args...)		\
+	struct super_block *lsb;			\
+	const struct export_operations *lop;		\
+							\
+	lsb = SIMFS_GET_LOWER_FS_SB(sb);		\
+	lop = lsb->s_export_op;				\
+	return lop->method(lsb, ## args)
+
+#define SIM_CALL_DENTRY(method, dentry, args...)	\
+	struct super_block *lsb;			\
+	const struct export_operations *lop;		\
+							\
+	lsb = (dentry)->d_sb;				\
+	lop = lsb->s_export_op;				\
+	return lop->method(dentry, ## args)
+
+static int sim_encode_fh(struct dentry *de, __u32 *fh, int *max_len,
+			int connectable)
+{
+	SIM_CALL_DENTRY(encode_fh, de, fh, max_len, connectable);
+}
+
+static struct dentry * sim_fh_to_dentry(struct super_block *sb, struct fid *fid,
+			int fh_len, int fh_type)
+{
+	SIM_CALL_LOWER(fh_to_dentry, sb, fid, fh_len, fh_type);
+}
+
+static struct dentry * sim_fh_to_parent(struct super_block *sb, struct fid *fid,
+			int fh_len, int fh_type)
+{
+	SIM_CALL_LOWER(fh_to_parent, sb, fid, fh_len, fh_type);
+}
+
+static int sim_get_name(struct dentry *parent, char *name,
+			struct dentry *child)
+{
+	SIM_CALL_DENTRY(get_name, parent, name, child);
+}
+
+static struct dentry * sim_get_parent(struct dentry *child)
+{
+	SIM_CALL_DENTRY(get_parent, child);
+}
+
+static int sim_init_export_op(struct super_block *sb, struct super_block *rsb)
+{
+	struct export_operations *op;
+
+	if (rsb->s_export_op == NULL)
+		return 0;
+
+	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (op == NULL)
+		return -ENOMEM;
+
+	if (rsb->s_export_op->encode_fh)
+		op->encode_fh = sim_encode_fh;
+	if (rsb->s_export_op->fh_to_dentry)
+		op->fh_to_dentry = sim_fh_to_dentry;
+	if (rsb->s_export_op->fh_to_parent)
+		op->fh_to_parent = sim_fh_to_parent;
+	if (rsb->s_export_op->get_name)
+		op->get_name = sim_get_name;
+	if (rsb->s_export_op->get_parent)
+		op->get_parent = sim_get_parent;
+
+	sb->s_export_op = op;
+	return 0;
+}
+
+static void sim_free_export_op(struct super_block *sb)
+{
+	kfree(sb->s_export_op);
+}
+#else
+static int sim_init_export_op(struct super_block *sb, struct super_block *rsb)
+{
+	return 0;
+}
+static void sim_free_export_op(struct super_block *sb)
+{
+}
+#endif
+
+static int sim_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct nameidata *nd = data;
+	int err;
+
+	err = sim_init_export_op(s, nd->path.dentry->d_sb);
+	if (err)
+		goto out;
+
+	err = sim_init_blkdev(s);
+	if (err)
+		goto out;
+
+	err = 0;
+	s->s_fs_info = mntget(nd->path.mnt);
+	s->s_root = dget(nd->path.dentry);
+	s->s_op = &sim_super_ops;
+
+	sim_quota_init(s);
+out:
+	return err;
+}
+
+static int sim_get_sb(struct file_system_type *type, int flags,
+		const char *dev_name, void *opt, struct vfsmount *mnt)
+{
+	int err;
+	struct nameidata nd;
+
+	err = -EINVAL;
+	if (opt == NULL)
+		goto out;
+
+	err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+	if (err)
+		goto out;
+
+	err = get_sb_nodev(type, flags, &nd, sim_fill_super, mnt);
+
+	path_put(&nd.path);
+out:
+	return err;
+}
+
+static void sim_kill_sb(struct super_block *sb)
+{
+	dput(sb->s_root);
+	sb->s_root = NULL;
+	mntput((struct vfsmount *)(sb->s_fs_info));
+	sim_free_export_op(sb);
+
+	sim_quota_free(sb);
+	sim_free_blkdev(sb);
+
+	kill_anon_super(sb);
+}
+
+static struct file_system_type sim_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "simfs",
+	.get_sb		= sim_get_sb,
+	.kill_sb	= sim_kill_sb,
+	.fs_flags	= FS_MANGLE_PROC | FS_HAS_NEW_FREEZE,
+};
+
+static int __init init_simfs(void)
+{
+	int err;
+
+	err = register_filesystem(&sim_fs_type);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void __exit exit_simfs(void)
+{
+	unregister_filesystem(&sim_fs_type);
+}
+
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System");
+MODULE_LICENSE("GPL v2");
+
+module_init(init_simfs);
+module_exit(exit_simfs);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/smbfs/sock.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/smbfs/sock.c
--- linux-2.6.32-504.3.3.el6.orig/fs/smbfs/sock.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/smbfs/sock.c	2015-01-21 12:02:41.443273658 +0300
@@ -99,6 +99,7 @@ smb_close_socket(struct smb_sb_info *ser
 
 		VERBOSE("closing socket %p\n", sock);
 		sock->sk->sk_data_ready = server->data_ready;
+		sock->sk->sk_user_data = NULL;
 		server->sock_file = NULL;
 		fput(file);
 	}
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/splice.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/splice.c
--- linux-2.6.32-504.3.3.el6.orig/fs/splice.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/splice.c	2015-01-21 12:02:57.854837977 +0300
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/virtinfo.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -100,6 +101,7 @@ static int page_cache_pipe_buf_confirm(s
 	int err;
 
 	if (!PageUptodate(page)) {
+		virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
 		lock_page(page);
 
 		/*
@@ -290,12 +292,23 @@ __generic_file_splice_read(struct file *
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
 
+	check_pagecache_limits(mapping, mapping_gfp_mask(mapping));
+
 	/*
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
 	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
 	index += spd.nr_pages;
 
+	while (spd.nr_pages < nr_pages && mapping->host->i_peer_file) {
+		page = pick_peer_page(mapping->host, &in->f_ra, index,
+				      req_pages - spd.nr_pages);
+		if (!page)
+			break;
+		pages[spd.nr_pages++] = page;
+		index++;
+	}
+
 	/*
 	 * If find_get_pages_contig() returned fewer pages than we needed,
 	 * readahead/allocate the rest and fill in the holes.
@@ -370,12 +383,17 @@ __generic_file_splice_read(struct file *
 			 * for an in-flight io page
 			 */
 			if (flags & SPLICE_F_NONBLOCK) {
-				if (!trylock_page(page)) {
+				if ((virtinfo_notifier_call(VITYPE_IO,
+						VIRTINFO_IO_CONGESTION, NULL) &
+							NOTIFY_FAIL) ||
+						!trylock_page(page)) {
 					error = -EAGAIN;
 					break;
 				}
-			} else
+			} else {
+				virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
 				lock_page(page);
+			}
 
 			/*
 			 * Page was truncated, or invalidated by the
@@ -968,7 +986,7 @@ generic_file_splice_write(struct pipe_in
 		if (ret <= 0)
 			break;
 
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		mutex_lock(&inode->i_mutex);
 		ret = file_remove_suid(out);
 		if (!ret) {
 			file_update_time(out);
@@ -1272,6 +1290,7 @@ long do_splice_direct(struct file *in, l
 
 	return ret;
 }
+EXPORT_SYMBOL(do_splice_direct);
 
 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			       struct pipe_inode_info *opipe,
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/stat.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/stat.c
--- linux-2.6.32-504.3.3.el6.orig/fs/stat.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/stat.c	2015-01-21 12:02:42.107256027 +0300
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pagemap.h>
+#include <linux/mount.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -47,10 +48,14 @@ int vfs_getattr(struct vfsmount *mnt, st
 		return retval;
 
 	if (inode->i_op->getattr)
-		return inode->i_op->getattr(mnt, dentry, stat);
+		retval = inode->i_op->getattr(mnt, dentry, stat);
+	else
+		generic_fillattr(inode, stat);
 
-	generic_fillattr(inode, stat);
-	return 0;
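+	/*
+	 * Report the device of the vfsmount's superblock: for stacked
+	 * filesystems such as simfs it differs from the inode's superblock.
+	 */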
+	if (!retval)
+		stat->dev = mnt->mnt_sb->s_dev;
+
+	return retval;
 }
 
 EXPORT_SYMBOL(vfs_getattr);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/statfs.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/statfs.c
--- linux-2.6.32-504.3.3.el6.orig/fs/statfs.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/statfs.c	2015-01-21 12:02:43.596216496 +0300
@@ -7,6 +7,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/uaccess.h>
+#include <linux/ve_proto.h>
 
 static int flags_by_mnt(int mnt_flags)
 {
@@ -45,28 +46,38 @@ static int calculate_f_flags(struct vfsm
 		flags_by_sb(mnt->mnt_sb->s_flags);
 }
 
-int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+static int statfs_by_sb(struct super_block *sb, struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval;
 
-	if (!dentry->d_sb->s_op->statfs)
+	if (!sb->s_op->statfs)
 		return -ENOSYS;
 
 	memset(buf, 0, sizeof(*buf));
-	retval = security_sb_statfs(dentry);
-	if (retval)
-		return retval;
-	retval = dentry->d_sb->s_op->statfs(dentry, buf);
+	retval = sb->s_op->statfs(dentry, buf);
 	if (retval == 0 && buf->f_frsize == 0)
 		buf->f_frsize = buf->f_bsize;
 	return retval;
 }
 
+int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+{
+	int retval;
+
+	retval = security_sb_statfs(dentry);
+	if (!retval)
+		retval = statfs_by_sb(dentry->d_sb, dentry, buf);
+	return retval;
+}
+EXPORT_SYMBOL(statfs_by_dentry);
+
 int vfs_statfs(struct path *path, struct kstatfs *buf)
 {
 	int error;
 
-	error = statfs_by_dentry(path->dentry, buf);
+	error = security_sb_statfs(path->dentry);
+	if (!error)
+		error = statfs_by_sb(path->mnt->mnt_sb, path->dentry, buf);
 	if (!error)
 		buf->f_flags = calculate_f_flags(path->mnt);
 	return error;
@@ -218,8 +229,14 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, st
 	struct ustat tmp;
 	struct kstatfs sbuf;
 	int err;
+	dev_t kdev;
+
+	kdev = new_decode_dev(dev);
+	err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ);
+	if (err)
+		return err;
 
-	s = user_get_super(new_decode_dev(dev));
+	s = user_get_super(kdev);
 	if (!s)
 		return -EINVAL;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/super.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/super.c
--- linux-2.6.32-504.3.3.el6.orig/fs/super.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/super.c	2015-01-21 12:02:58.302826088 +0300
@@ -37,6 +37,7 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
+#include <linux/ve_proto.h>
 #include <asm/uaccess.h>
 #include <linux/lockdep.h>
 #include "internal.h"
@@ -48,7 +49,9 @@ static char *sb_writers_name[SB_FREEZE_L
 };
 
 LIST_HEAD(super_blocks);
+EXPORT_SYMBOL(super_blocks);
 DEFINE_SPINLOCK(sb_lock);
+EXPORT_SYMBOL(sb_lock);
 
 static int init_sb_writers(struct super_block *s, struct file_system_type *type)
 {
@@ -64,7 +67,7 @@ static int init_sb_writers(struct super_
 		if (err < 0)
 			goto err_out;
 		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
-				 &type->s_writers_key[i], 0);
+				 &type->proto->s_writers_key[i], 0);
 	}
 	init_waitqueue_head(&s->s_writers.wait);
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
@@ -109,22 +112,41 @@ static struct super_block *alloc_super(s
 			s = NULL;
 			goto out;
 		}
+#ifdef CONFIG_SMP
+		s->s_files = alloc_percpu(struct list_head);
+		if (!s->s_files) {
+			security_sb_free(s);
+			kfree(s);
+			s = NULL;
+			goto out;
+		} else {
+			int i;
+
+			for_each_possible_cpu(i)
+				INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+		}
+#else
 		INIT_LIST_HEAD(&s->s_files);
+#endif
 		if (init_sb_writers(s, type))
 			goto err_out;
+
+		s->s_bdi = &default_backing_dev_info;
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
 		INIT_LIST_HEAD(&s->s_dentry_lru);
 		init_rwsem(&s->s_umount);
 		mutex_init(&s->s_lock);
-		lockdep_set_class(&s->s_umount, &type->s_umount_key);
+		lockdep_set_class(&s->s_umount,
+				&type->proto->s_umount_key);
 		/*
 		 * The locking rules for s_lock are up to the
 		 * filesystem. For example ext3fs has different
 		 * lock ordering than usbfs:
 		 */
-		lockdep_set_class(&s->s_lock, &type->s_lock_key);
+		lockdep_set_class(&s->s_lock,
+				&type->proto->s_lock_key);
 		/*
 		 * sget() can have s_umount recursion.
 		 *
@@ -144,6 +166,8 @@ static struct super_block *alloc_super(s
 		s->s_count = 1;
 		atomic_set(&s->s_active, 1);
 		mutex_init(&s->s_vfs_rename_mutex);
+		lockdep_set_class(&s->s_vfs_rename_mutex,
+				&type->proto->s_rename_mutex_key);
 		mutex_init(&s->s_dquot.dqio_mutex);
 		mutex_init(&s->s_dquot.dqonoff_mutex);
 		init_rwsem(&s->s_dquot.dqptr_sem);
@@ -176,6 +200,9 @@ err_out:
  */
 static inline void destroy_super(struct super_block *s)
 {
+#ifdef CONFIG_SMP
+	free_percpu(s->s_files);
+#endif
 	security_sb_free(s);
 	destroy_sb_writers(s);
 	kfree(s->s_subtype);
@@ -220,6 +247,7 @@ int __put_super_and_need_restart(struct 
 	BUG_ON(sb->s_count == 0);
 	return 0;
 }
+EXPORT_SYMBOL(__put_super_and_need_restart);
 
 /**
  *	put_super	-	drop a temporary reference to superblock
@@ -234,7 +262,7 @@ void put_super(struct super_block *sb)
 	__put_super(sb);
 	spin_unlock(&sb_lock);
 }
-
+EXPORT_SYMBOL(put_super);
 
 /**
  *	deactivate_super	-	drop an active reference to superblock
@@ -249,8 +277,9 @@ void deactivate_super(struct super_block
 {
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
-		vfs_dq_off(s, 0);
 		down_write(&s->s_umount);
+		if (!(s->s_type->fs_flags & FS_HANDLE_QUOTA))
+			vfs_dq_off(s, 0);
 		fs->kill_sb(s);
 		put_filesystem(fs);
 		put_super(s);
@@ -274,7 +303,8 @@ void deactivate_locked_super(struct supe
 {
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
-		vfs_dq_off(s, 0);
+		if (!(s->s_type->fs_flags & FS_HANDLE_QUOTA))
+			vfs_dq_off(s, 0);
 		fs->kill_sb(s);
 		put_filesystem(fs);
 		put_super(s);
@@ -361,11 +391,13 @@ void generic_shutdown_super(struct super
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb, true);
 
+		if (sb->dq_op && sb->dq_op->shutdown)
+			sb->dq_op->shutdown(sb);
 		if (sop->put_super)
 			sop->put_super(sb);
 
 		/* Forget any remaining inodes */
-		if (invalidate_inodes(sb, true)) {
+		if (invalidate_inodes_check(sb, true, 1)) {
 			printk("VFS: Busy inodes after unmount of %s. "
 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
 			   sb->s_id);
@@ -612,6 +644,7 @@ rescan:
 	spin_unlock(&sb_lock);
 	return NULL;
 }
+EXPORT_SYMBOL(user_get_super);
 
 /*
  * This is an internal function, please use sb_end_{write,pagefault,intwrite}
@@ -632,6 +665,9 @@ void __sb_end_write(struct super_block *
 	if (waitqueue_active(&sb->s_writers.wait))
 		wake_up(&sb->s_writers.wait);
 	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+
+	if (sb->s_op->end_write)
+		sb->s_op->end_write(sb, level);
 }
 EXPORT_SYMBOL(__sb_end_write);
 
@@ -646,7 +682,7 @@ EXPORT_SYMBOL(__sb_end_write);
  * already hold a freeze protection for a higher freeze level.
  */
 static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
-				unsigned long ip)
+				int force_write, unsigned long ip)
 {
 	int i;
 
@@ -656,6 +692,12 @@ static void acquire_freeze_lock(struct s
 				trylock = true;
 				break;
 			}
+		if (!trylock && force_write && debug_locks) {
+			if (lock_is_held(&sb->s_writers.lock_map[level-1]))
+				trylock = true;
+			else
+				WARN(1, "Unprotected force-write");
+		}
 	}
 	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
 }
@@ -667,19 +709,29 @@ static void acquire_freeze_lock(struct s
  */
 int __sb_start_write(struct super_block *sb, int level, bool wait)
 {
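+	/*
+	 * A task already inside a filesystem transaction (PF_FSTRANS) must not
+	 * block on a frozen superblock, or it could deadlock the freezer.
+	 */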
+	int force_write = wait && (current->flags & PF_FSTRANS);
+
 	/* Out of tree modules don't use this mechanism */
 	if (unlikely(!sb_has_new_freeze(sb)))
 		return 1;
 retry:
+	if (sb->s_op->start_write && !sb->s_op->start_write(sb, level, wait))
+		return 0;
+
+	/* CAP_FS_FREEZE is applicable only if the task may block */
 	if (unlikely(sb->s_writers.frozen >= level)) {
-		if (!wait)
+		if (!wait) {
+			if (sb->s_op->end_write)
+				sb->s_op->end_write(sb, level);
 			return 0;
-		wait_event(sb->s_writers.wait_unfrozen,
-			   sb->s_writers.frozen < level);
+		}
+		if (!force_write)
+			wait_event(sb->s_writers.wait_unfrozen,
+				   sb->s_writers.frozen < level);
 	}
 
 #ifdef CONFIG_LOCKDEP
-	acquire_freeze_lock(sb, level, !wait, _RET_IP_);
+	acquire_freeze_lock(sb, level, !wait, force_write, _RET_IP_);
 #endif
 	percpu_counter_inc(&sb->s_writers.counter[level-1]);
 	/*
@@ -687,7 +739,7 @@ retry:
 	 * freeze_super() first sets frozen and then checks the counter.
 	 */
 	smp_mb();
-	if (unlikely(sb->s_writers.frozen >= level)) {
+	if (unlikely(sb->s_writers.frozen >= level && !force_write)) {
 		__sb_end_write(sb, level);
 		goto retry;
 	}
@@ -787,8 +839,13 @@ int do_remount_sb(struct super_block *sb
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		if (retval)
+		if (retval) {
+			/* Remount failed, fallback quota to original state */
+			if (remount_ro &&
+			    !(sb->s_type->fs_flags & FS_HANDLE_QUOTA))
+				vfs_dq_quota_on_remount(sb);
 			return retval;
+		}
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 	if (remount_rw && !(sb->s_type->fs_flags & FS_HANDLE_QUOTA))
@@ -852,6 +909,13 @@ static DEFINE_IDA(unnamed_dev_ida);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
 static int unnamed_dev_start = 0; /* don't bother trying below it */
 
+/* for compatibility with coreutils still unaware of new minor sizes */
+int unnamed_dev_majors[] = {
+	0, 144, 145, 146, 242, 243, 244, 245,
+	246, 247, 248, 249, 250, 251, 252, 253
+};
+EXPORT_SYMBOL(unnamed_dev_majors);
+
 int set_anon_super(struct super_block *s, void *data)
 {
 	int dev;
@@ -871,7 +935,7 @@ int set_anon_super(struct super_block *s
 	else if (error)
 		return -EAGAIN;
 
-	if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
+	if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) {
 		spin_lock(&unnamed_dev_lock);
 		ida_remove(&unnamed_dev_ida, dev);
 		if (unnamed_dev_start > dev)
@@ -879,7 +943,8 @@ int set_anon_super(struct super_block *s
 		spin_unlock(&unnamed_dev_lock);
 		return -EMFILE;
 	}
-	s->s_dev = MKDEV(0, dev & MINORMASK);
+	s->s_dev = make_unnamed_dev(dev);
+	s->s_bdi = &noop_backing_dev_info;
 	return 0;
 }
 
@@ -887,8 +952,9 @@ EXPORT_SYMBOL(set_anon_super);
 
 void kill_anon_super(struct super_block *sb)
 {
-	int slot = MINOR(sb->s_dev);
+	int slot;
 
+	slot = unnamed_dev_idx(sb->s_dev);
 	generic_shutdown_super(sb);
 	spin_lock(&unnamed_dev_lock);
 	ida_remove(&unnamed_dev_ida, slot);
@@ -1009,12 +1075,27 @@ int get_sb_bdev(struct file_system_type 
 		close_bdev_exclusive(bdev, mode);
 	} else {
 		char b[BDEVNAME_SIZE];
-
+#ifdef CONFIG_VE
+		void *data_orig = data;
+		struct ve_struct *ve = get_exec_env();
+
+		if (!ve_is_super(ve)) {
+			error = ve_devmnt_process(ve, bdev->bd_dev, &data, 0);
+			if (error) {
+				deactivate_locked_super(s);
+				goto error;
+			}
+		}
+#endif
 		s->s_flags = flags;
 		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+#ifdef CONFIG_VE
+		if (data_orig != data)
+			free_page((unsigned long)data);
+#endif
 		if (error) {
 			deactivate_locked_super(s);
 			goto error;
@@ -1138,6 +1219,8 @@ vfs_kern_mount(struct file_system_type *
 	if (error < 0)
 		goto out_free_secdata;
 	BUG_ON(!mnt->mnt_sb);
+	WARN_ON(!mnt->mnt_sb->s_bdi);
+	WARN_ON(mnt->mnt_sb->s_bdi == &default_backing_dev_info);
 	mnt->mnt_sb->s_flags |= MS_BORN;
 
  	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sync.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/sync.c
--- linux-2.6.32-504.3.3.el6.orig/fs/sync.c	2014-12-12 23:29:30.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sync.c	2015-01-21 12:02:48.857076828 +0300
@@ -11,11 +11,18 @@
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
 #include <linux/linkage.h>
+#include <linux/pid_namespace.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/backing-dev.h>
 #include "internal.h"
 
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
@@ -26,34 +33,29 @@
  * wait == 1 case since in that case write_inode() functions do
  * sync_dirty_buffer() and thus effectively write one block at a time.
  */
-static int __sync_filesystem(struct super_block *sb, int wait)
+int __sync_filesystem(struct super_block *sb,
+		struct user_beancounter *ub, int wait)
 {
-	/*
-	 * This should be safe, as we require bdi backing to actually
-	 * write out data in the first place
-	 */
-	if (!sb->s_bdi)
-		return 0;
-
 	/* Avoid doing twice syncing and cache pruning for quota sync */
 	if (!wait) {
 		writeout_quota_sb(sb, -1);
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb_ub(sb, ub);
 	} else {
 		sync_quota_sb(sb, -1);
-		sync_inodes_sb(sb);
+		sync_inodes_sb_ub(sb, ub);
 	}
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
 }
+EXPORT_SYMBOL(__sync_filesystem);
 
 /*
  * Write out and wait upon all dirty data associated with this
  * superblock.  Filesystem data as well as the underlying block
  * device.  Takes the superblock lock.
  */
-int sync_filesystem(struct super_block *sb)
+static int sync_filesystem_ub(struct super_block *sb, struct user_beancounter *ub)
 {
 	int ret;
 
@@ -69,13 +71,106 @@ int sync_filesystem(struct super_block *
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	ret = __sync_filesystem(sb, 0);
+	ret = __sync_filesystem(sb, ub, 0);
 	if (ret < 0)
 		return ret;
-	return __sync_filesystem(sb, 1);
+	return __sync_filesystem(sb, ub, 1);
+}
+
+int sync_filesystem(struct super_block *sb)
+{
+	return sync_filesystem_ub(sb, NULL);
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
+struct sync_sb {
+	struct list_head list;
+	struct super_block *sb;
+};
+
+static void sync_release_filesystems(struct list_head *sync_list)
+{
+	struct sync_sb *ss, *tmp;
+
+	list_for_each_entry_safe(ss, tmp, sync_list, list) {
+		list_del(&ss->list);
+		put_super(ss->sb);
+		kfree(ss);
+	}
+}
+
+static int sync_filesystem_collected(struct list_head *sync_list, struct super_block *sb)
+{
+	struct sync_sb *ss;
+
+	list_for_each_entry(ss, sync_list, list)
+		if (ss->sb == sb)
+			return 1;
+	return 0;
+}
+
+static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync_list)
+{
+	struct vfsmount *root = ve->root_path.mnt;
+	struct vfsmount *mnt;
+	struct sync_sb *ss;
+	int ret = 0;
+
+	BUG_ON(!list_empty(sync_list));
+
+	down_read(&namespace_sem);
+	for (mnt = root; mnt; mnt = next_mnt(mnt, root)) {
+		if (sync_filesystem_collected(sync_list, mnt->mnt_sb))
+			continue;
+
+		ss = kmalloc(sizeof(*ss), GFP_KERNEL);
+		if (ss == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
+		ss->sb = mnt->mnt_sb;
+		/*
+		 * We hold the mount point and thus can be sure that the
+		 * superblock is alive, which means we can safely increase
+		 * its usage counter.
+		 */
+		spin_lock(&sb_lock);
+		ss->sb->s_count++;
+		spin_unlock(&sb_lock);
+		list_add_tail(&ss->list, sync_list);
+	}
+	up_read(&namespace_sem);
+	return ret;
+}
+
+static void sync_filesystems_ve(struct ve_struct *ve, struct user_beancounter *ub, int wait)
+{
+	struct super_block *sb;
+	LIST_HEAD(sync_list);
+	struct sync_sb *ss;
+
+	mutex_lock(&ve->sync_mutex);		/* Could be down_interruptible */
+
+	/*
+	 * We don't need to care about allocation failure here; there is no
+	 * reason to skip the sync on such an error.  Instead, sync whatever
+	 * we have collected so far.
+	 */
+	sync_collect_filesystems(ve, &sync_list);
+
+	list_for_each_entry(ss, &sync_list, list) {
+		sb = ss->sb;
+		down_read(&sb->s_umount);
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
+			__sync_filesystem(sb, ub, wait);
+		up_read(&sb->s_umount);
+	}
+
+	sync_release_filesystems(&sync_list);
+
+	mutex_unlock(&ve->sync_mutex);
+}
+
 /*
  * Sync all the data for all the filesystems (called by sys_sync() and
  * emergency sync)
@@ -90,7 +185,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
  * flags again, which will cause process A to resync everything.  Fix that with
  * a local mutex.
  */
-static void sync_filesystems(int wait)
+static void sync_filesystems_ve0(struct user_beancounter *ub, int wait)
 {
 	struct super_block *sb;
 	static DEFINE_MUTEX(mutex);
@@ -110,7 +205,7 @@ restart:
 
 		down_read(&sb->s_umount);
 		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
-			__sync_filesystem(sb, wait);
+			__sync_filesystem(sb, ub, wait);
 		up_read(&sb->s_umount);
 
 		/* restart only when sb is no longer on the list */
@@ -122,17 +217,76 @@ restart:
 	mutex_unlock(&mutex);
 }
 
+static void sync_filesystems(struct user_beancounter *ub, int wait)
+{
+	if (!ub || (ub == get_ub0()))
+		sync_filesystems_ve0(ub, wait);
+	else
+		sync_filesystems_ve(get_exec_env(), ub, wait);
+}
+
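+/*
+ * fsync_enable: 2 - follow the VE0 (host) setting, any other non-zero
+ * value - filtered sync, 0 - sync disabled for this VE.
+ */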
+static int __ve_fsync_behavior(struct ve_struct *ve)
+{
+	if (ve->fsync_enable == 2)
+		return get_ve0()->fsync_enable;
+	else if (ve->fsync_enable)
+		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
+	else
+		return 0;
+}
+
+int ve_fsync_behavior(void)
+{
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (ve_is_super(ve))
+		return FSYNC_ALWAYS;
+	else
+		return __ve_fsync_behavior(ve);
+}
+
 /*
  * sync everything.  Start out by waking pdflush, because that writes back
  * all queues in parallel.
  */
 SYSCALL_DEFINE0(sync)
 {
-	wakeup_flusher_threads(0);
-	sync_filesystems(0);
-	sync_filesystems(1);
-	if (unlikely(laptop_mode))
+	struct user_beancounter *ub, *sync_ub = NULL;
+	struct ve_struct *ve;
+
+	ub = get_exec_ub();
+	ve = get_exec_env();
+	ub_percpu_inc(ub, sync);
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever, as the network is down
+		 *  - no useful job is performed, as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (current == get_env_init(ve))
+			goto skip;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto skip;
+
+		if (fsb == FSYNC_FILTERED)
+			sync_ub = get_io_ub();
+	}
+
+	wakeup_flusher_threads(sync_ub, 0);
+	sync_filesystems(sync_ub, 0);
+	sync_filesystems(sync_ub, 1);
+	if (unlikely(laptop_mode) && !sync_ub)
 		laptop_sync_completion();
+skip:
+	ub_percpu_inc(ub, sync_done);
 	return 0;
 }
 
@@ -142,8 +296,8 @@ static void do_sync_work(struct work_str
 	 * Sync twice to reduce the possibility we skipped some inodes / pages
 	 * because they were temporarily locked
 	 */
-	sync_filesystems(0);
-	sync_filesystems(0);
+	sync_filesystems(NULL, 0);
+	sync_filesystems(NULL, 0);
 	printk("Emergency Sync complete\n");
 	kfree(work);
 }
@@ -193,19 +347,52 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 {
 	struct file *file;
 	struct super_block *sb;
-	int ret;
+	int ret = 0;
 	int fput_needed;
+	struct user_beancounter *ub, *sync_ub = NULL;
+	struct ve_struct *ve;
+
+	ub = get_exec_ub();
+	ve = get_exec_env();
+	ub_percpu_inc(ub, sync);
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever, as the network is down
+		 *  - no useful job is performed, as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (current == get_env_init(ve))
+			goto skip;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto skip;
+
+		if (fsb == FSYNC_FILTERED)
+			sync_ub = get_io_ub();
+	}
 
 	file = fget_light(fd, &fput_needed);
-	if (!file)
-		return -EBADF;
+	if (!file) {
+		ret = -EBADF;
+		goto skip;
+	}
+
 	sb = file->f_dentry->d_sb;
 
 	down_read(&sb->s_umount);
-	ret = sync_filesystem(sb);
+	if (sb->s_root)
+		ret = sync_filesystem_ub(sb, sync_ub);
 	up_read(&sb->s_umount);
 
 	fput_light(file, fput_needed);
+skip:
+	ub_percpu_inc(ub, sync_done);
 	return ret;
 }
 
@@ -231,6 +418,7 @@ int vfs_fsync_range(struct file *file, s
 	const struct file_operations *fop;
 	struct address_space *mapping;
 	int err, ret;
+	struct user_beancounter *ub;
 
 	/*
 	 * Get mapping and operations from the file in case we have
@@ -250,6 +438,12 @@ int vfs_fsync_range(struct file *file, s
 		goto out;
 	}
 
+	ub = get_exec_ub();
+	if (datasync)
+		ub_percpu_inc(ub, fdsync);
+	else
+		ub_percpu_inc(ub, fsync);
+
 	ret = filemap_write_and_wait_range(mapping, start, end);
 
 	/*
@@ -262,6 +456,10 @@ int vfs_fsync_range(struct file *file, s
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
 
+	if (datasync)
+		ub_percpu_inc(ub, fdsync_done);
+	else
+		ub_percpu_inc(ub, fsync_done);
 out:
 	return ret;
 }
@@ -291,9 +489,14 @@ static int do_fsync(unsigned int fd, int
 	struct file *file;
 	int ret = -EBADF;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		return 0;
+
 	file = fget(fd);
 	if (file) {
+		sb_start_write(file->f_mapping->host->i_sb);
 		ret = vfs_fsync(file, file->f_path.dentry, datasync);
+		sb_end_write(file->f_mapping->host->i_sb);
 		fput(file);
 	}
 	return ret;
@@ -427,8 +630,9 @@ SYSCALL_DEFINE(sync_file_range)(int fd, 
 	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
 			!S_ISLNK(i_mode))
 		goto out_put;
-
+	sb_start_write(file->f_mapping->host->i_sb);
 	ret = do_sync_mapping_range(file->f_mapping, offset, endbyte, flags);
+	sb_end_write(file->f_mapping->host->i_sb);
 out_put:
 	fput_light(file, fput_needed);
 out:
@@ -468,12 +672,16 @@ int do_sync_mapping_range(struct address
 			  loff_t endbyte, unsigned int flags)
 {
 	int ret;
+	struct user_beancounter *ub;
 
 	if (!mapping) {
 		ret = -EINVAL;
-		goto out;
+		goto out_noacct;
 	}
 
+	ub = get_exec_ub();
+	ub_percpu_inc(ub, frsync);
+
 	ret = 0;
 	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
 		ret = wait_on_page_writeback_range(mapping,
@@ -496,6 +704,8 @@ int do_sync_mapping_range(struct address
 					endbyte >> PAGE_CACHE_SHIFT);
 	}
 out:
+	ub_percpu_inc(ub, frsync_done);
+out_noacct:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(do_sync_mapping_range);
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/Makefile
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/Makefile	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/Makefile	2015-01-21 12:02:44.364196107 +0300
@@ -3,4 +3,4 @@
 #
 
 obj-y		:= inode.o file.o dir.o symlink.o mount.o bin.o \
-		   group.o
+		   group.o dirlink.o
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/bin.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/bin.c
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/bin.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/bin.c	2015-01-21 12:02:43.669214558 +0300
@@ -398,6 +398,9 @@ static int open(struct inode * inode, st
 	struct bin_buffer *bb = NULL;
 	int error;
 
+	if (!ve_sysfs_alowed())
+		return 0;
+
 	/* binary file operations requires both @sd and its parent */
 	if (!sysfs_get_active_two(attr_sd))
 		return -ENODEV;
@@ -485,6 +488,9 @@ void unmap_bin_file(struct sysfs_dirent 
 
 int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
 {
+	if (!ve_sysfs_alowed())
+		return 0;
+
 	BUG_ON(!kobj || !attr);
 
 	/* RHEL specific
@@ -509,6 +515,8 @@ int sysfs_create_bin_file(struct kobject
 
 void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
 {
+	if (!ve_sysfs_alowed())
+		return;
 	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/dir.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/dir.c
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/dir.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/dir.c	2015-01-21 12:02:44.494192655 +0300
@@ -159,7 +159,7 @@ struct dentry *sysfs_get_dentry(struct s
  *	RETURNS:
  *	Pointer to @sd on success, NULL on failure.
  */
-static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 {
 	if (unlikely(!sd))
 		return NULL;
@@ -188,7 +188,7 @@ static struct sysfs_dirent *sysfs_get_ac
  *	Put an active reference to @sd.  This function is noop if @sd
  *	is NULL.
  */
-static void sysfs_put_active(struct sysfs_dirent *sd)
+void sysfs_put_active(struct sysfs_dirent *sd)
 {
 	int v;
 
@@ -305,6 +305,9 @@ void release_sysfs_dirent(struct sysfs_d
 
 	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
 		sysfs_put(sd->s_symlink.target_sd);
+	else if (sysfs_type(sd) == SYSFS_DIR_LINK)
+		sysfs_put(sd->s_dir_link.target_sd);
+
 	if (sysfs_type(sd) & SYSFS_COPY_NAME)
 		kfree(sd->s_name);
 	if (sd->s_iattr && sd->s_iattr->ia_secdata)
@@ -327,8 +330,19 @@ static void sysfs_d_iput(struct dentry *
 	iput(inode);
 }
 
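+/*
+ * Invalidate cached dentries whose backing sysfs_dirent has been removed,
+ * so that stale entries do not linger in the dcache.
+ */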
+static int sysfs_d_revalidate(struct dentry * d, struct nameidata *nd)
+{
+	struct sysfs_dirent *sd = d->d_fsdata;
+	if (sd->s_flags & SYSFS_FLAG_REMOVED) {
+		d_drop(d);
+		return 0;
+	}
+	return 1;
+}
+
 static const struct dentry_operations sysfs_dentry_ops = {
 	.d_iput		= sysfs_d_iput,
+	.d_revalidate	= sysfs_d_revalidate,
 };
 
 struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
@@ -560,6 +574,9 @@ static void sysfs_drop_dentry(struct sys
 	struct inode *inode;
 	struct dentry *dentry;
 
+	if (!ve_sysfs_alowed())
+		return;
+
 	inode = ilookup(sysfs_sb, sd->s_ino);
 	if (!inode)
 		return;
@@ -743,12 +760,15 @@ int sysfs_create_dir(struct kobject * ko
 	struct sysfs_dirent *parent_sd, *sd;
 	int error = 0;
 
+	if (!ve_sysfs_alowed())
+		return 0;
+
 	BUG_ON(!kobj);
 
 	if (kobj->parent)
 		parent_sd = kobj->parent->sd;
 	else
-		parent_sd = &sysfs_root;
+		parent_sd = ve_sysfs_root;
 
 	error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
 	if (!error)
@@ -756,11 +776,10 @@ int sysfs_create_dir(struct kobject * ko
 	return error;
 }
 
-static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
+struct dentry * __sysfs_lookup_at(struct sysfs_dirent *parent_sd, struct dentry *dentry,
 				struct nameidata *nd)
 {
 	struct dentry *ret = NULL;
-	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
 	struct sysfs_dirent *sd;
 	struct inode *inode;
 
@@ -792,10 +811,19 @@ static struct dentry * sysfs_lookup(stru
 	return ret;
 }
 
+static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
+				struct nameidata *nd)
+{
+	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+	return __sysfs_lookup_at(parent_sd, dentry, nd);
+}
+
 const struct inode_operations sysfs_dir_inode_operations = {
 	.lookup		= sysfs_lookup,
 	.setattr	= sysfs_setattr,
 	.setxattr	= sysfs_setxattr,
+	.getattr	= sysfs_getattr,
 };
 
 static void remove_dir(struct sysfs_dirent *sd)
@@ -848,6 +876,9 @@ void sysfs_remove_dir(struct kobject * k
 {
 	struct sysfs_dirent *sd = kobj->sd;
 
+	if (!ve_sysfs_alowed())
+		return;
+
 	spin_lock(&sysfs_assoc_lock);
 	kobj->sd = NULL;
 	spin_unlock(&sysfs_assoc_lock);
@@ -863,6 +894,9 @@ int sysfs_rename_dir(struct kobject * ko
 	const char *dup_name = NULL;
 	int error;
 
+	if (!ve_sysfs_alowed())
+		return 0;
+
 	mutex_lock(&sysfs_rename_mutex);
 
 	error = 0;
@@ -930,7 +964,7 @@ int sysfs_move_dir(struct kobject *kobj,
 	mutex_lock(&sysfs_rename_mutex);
 	BUG_ON(!sd->s_parent);
 	new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
-		new_parent_kobj->sd : &sysfs_root;
+		new_parent_kobj->sd : ve_sysfs_root;
 
 	error = 0;
 	if (sd->s_parent == new_parent_sd)
@@ -1051,17 +1085,19 @@ static struct sysfs_dirent *sysfs_dir_ne
 	return pos;
 }
 
-static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int __sysfs_readdir_at(struct sysfs_dirent *parent_sd, struct file * filp,
+		void * dirent, filldir_t filldir)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
 	struct sysfs_dirent *pos = filp->private_data;
 	ino_t ino;
+	loff_t off;
 
 	if (filp->f_pos == 0) {
 		ino = parent_sd->s_ino;
 		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
 			filp->f_pos++;
+		else
+			return 0;
 	}
 	if (filp->f_pos == 1) {
 		if (parent_sd->s_parent)
@@ -1070,8 +1106,11 @@ static int sysfs_readdir(struct file * f
 			ino = parent_sd->s_ino;
 		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
 			filp->f_pos++;
+		else
+			return 0;
 	}
 	mutex_lock(&sysfs_mutex);
+	off = filp->f_pos;
 	for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
 	     pos;
 	     pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) {
@@ -1083,23 +1122,34 @@ static int sysfs_readdir(struct file * f
 		len = strlen(name);
 		ino = pos->s_ino;
 		type = dt_type(pos);
-		filp->f_pos = ino;
+		off = filp->f_pos = ino;
 		filp->private_data = sysfs_get(pos);
 
 		mutex_unlock(&sysfs_mutex);
-		ret = filldir(dirent, name, len, filp->f_pos, ino, type);
+		ret = filldir(dirent, name, len, off, ino, type);
 		mutex_lock(&sysfs_mutex);
 		if (ret < 0)
 			break;
 	}
 	mutex_unlock(&sysfs_mutex);
-	if ((filp->f_pos > 1) && !pos) { /* EOF */
-		filp->f_pos = INT_MAX;
+
+	/* don't reference the last entry once its refcount has been dropped */
+	if (!pos) {
 		filp->private_data = NULL;
+
+		/* EOF, unless f_pos was meanwhile reset to 0 or 1 via the read/write path */
+		if (off == filp->f_pos && off > 1)
+			filp->f_pos = INT_MAX;
 	}
 	return 0;
 }
 
+static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
+	return __sysfs_readdir_at(parent_sd, filp, dirent, filldir);
+}
 
 const struct file_operations sysfs_dir_operations = {
 	.read		= generic_read_dir,
@@ -1107,3 +1157,14 @@ const struct file_operations sysfs_dir_o
 	.release	= sysfs_dir_release,
 	.llseek		= generic_file_llseek,
 };
+
+int init_ve_sysfs_root(struct ve_struct *ve)
+{
+	ve->_sysfs_root = sysfs_new_dirent("",
+			S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, SYSFS_DIR);
+	if (ve->_sysfs_root == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(init_ve_sysfs_root);
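
The block above gives every container (VE) its own sysfs root dirent in
place of the single global sysfs_root. A minimal sketch of how a container
start-up path might wire it up; only init_ve_sysfs_root() itself comes from
this patch, the surrounding function is an assumption:

    /* Hypothetical caller on the container start path. */
    static int ve_start_sysfs_sketch(struct ve_struct *ve)
    {
    	int err;

    	err = init_ve_sysfs_root(ve);	/* allocates ve->_sysfs_root */
    	if (err)
    		return err;		/* -ENOMEM from sysfs_new_dirent() */

    	/*
    	 * Until the VE mounts sysfs, ve->sysfs_sb stays NULL, so
    	 * ve_sysfs_alowed() is false and sysfs_create_*() calls made
    	 * from inside this VE are skipped (see the gating hunks above).
    	 */
    	return 0;
    }
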
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/dirlink.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/dirlink.c
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/dirlink.c	2015-01-21 12:02:44.364196107 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/dirlink.c	2015-01-21 12:02:44.367196028 +0300
@@ -0,0 +1,72 @@
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include "sysfs.h"
+
+static struct dentry * sysfs_dirlink_lookup(struct inode *dir, struct dentry *dentry,
+				struct nameidata *nd)
+{
+	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+	return __sysfs_lookup_at(parent_sd->s_dir_link.target_sd, dentry, nd);
+}
+
+const struct inode_operations sysfs_dirlink_inode_operations = {
+	.lookup		= sysfs_dirlink_lookup,
+};
+
+static int sysfs_dirlink_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
+	return __sysfs_readdir_at(parent_sd->s_dir_link.target_sd, filp, dirent, filldir);
+}
+
+const struct file_operations sysfs_dirlink_operations = {
+	.read		= generic_read_dir,
+	.readdir	= sysfs_dirlink_readdir,
+	.llseek		= generic_file_llseek,
+};
+
+struct sysfs_dirent *sysfs_create_dirlink(struct sysfs_dirent *parent_sd,
+		struct kobject *target)
+{
+	struct sysfs_dirent *sd, *tgt;
+	struct sysfs_addrm_cxt acxt;
+	int rc;
+
+	tgt = target->sd;
+	if (tgt == NULL)
+		return ERR_PTR(-EINVAL);
+	if (!S_ISDIR(tgt->s_mode))
+		return ERR_PTR(-ENOTDIR);
+
+	sd = sysfs_new_dirent(tgt->s_name, tgt->s_mode, SYSFS_DIR_LINK);
+	if (sd == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	sd->s_dir_link.target_sd = sysfs_get(tgt);
+
+	sysfs_addrm_start(&acxt, parent_sd);
+	rc = sysfs_add_one(&acxt, sd);
+	sysfs_addrm_finish(&acxt);
+
+	if (rc) {
+		sysfs_put(tgt);
+		sysfs_put(sd);
+		sd = ERR_PTR(rc);
+	}
+
+	return sd;
+}
+EXPORT_SYMBOL(sysfs_create_dirlink);
+
+void sysfs_remove_dirlink(struct sysfs_dirent *sd)
+{
+	struct sysfs_addrm_cxt acxt;
+
+	sysfs_addrm_start(&acxt, sd->s_parent);
+	sysfs_remove_one(&acxt, sd);
+	sysfs_addrm_finish(&acxt);
+}
+EXPORT_SYMBOL(sysfs_remove_dirlink);
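
The new dirlink node type behaves like a bind mount inside sysfs: lookup
and readdir on the link are redirected to the target directory's
sysfs_dirent. A hedged usage sketch; the parent dirent and target kobject
are placeholders, only the two exported functions come from dirlink.c:

    static struct sysfs_dirent *alias_sd;

    static int expose_alias(struct sysfs_dirent *parent_sd,
    			struct kobject *target)
    {
    	alias_sd = sysfs_create_dirlink(parent_sd, target);
    	if (IS_ERR(alias_sd))
    		return PTR_ERR(alias_sd); /* -EINVAL, -ENOTDIR or -ENOMEM */
    	return 0;
    }

    static void remove_alias(void)
    {
    	sysfs_remove_dirlink(alias_sd);
    }
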
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/file.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/file.c
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/file.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/file.c	2015-01-21 12:02:43.669214558 +0300
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode 
 	char *p;
 
 	p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
-	if (p)
+	if (!IS_ERR(p))
 		memmove(last_sysfs_file, p, strlen(p) + 1);
 
 	/* need attr_sd for attr and ops, its parent for kobj */
@@ -536,6 +536,8 @@ int sysfs_add_file(struct sysfs_dirent *
 
 int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
 {
+	if (!ve_sysfs_alowed())
+		return 0;
 	BUG_ON(!kobj || !attr);
 
 	/* RHEL specific
@@ -656,6 +658,8 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
 void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 {
+	if (!ve_sysfs_alowed())
+		return;
 	sysfs_hash_and_remove(kobj->sd, attr->name);
 }
 
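
The one-line file.c fix matters because d_path() never returns NULL: it
returns a pointer into the supplied buffer on success and an ERR_PTR()
code (e.g. -ENAMETOOLONG) on failure, so the old "if (p)" check was always
true. The corrected pattern, as a self-contained sketch:

    static void log_open_sketch(struct file *file)
    {
    	char buf[256];
    	char *p = d_path(&file->f_path, buf, sizeof(buf));

    	if (!IS_ERR(p))
    		pr_debug("opened %s\n", p);	/* p points inside buf */
    	else
    		pr_debug("d_path: %ld\n", PTR_ERR(p));
    }
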
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/group.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/group.c
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/group.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/group.c	2015-01-21 12:02:43.669214558 +0300
@@ -62,6 +62,8 @@ static int internal_create_group(struct 
 	struct sysfs_dirent *sd;
 	int error;
 
+	if (!ve_sysfs_alowed())
+		return 0;
 	BUG_ON(!kobj);
 
 	/* RHEL specific
@@ -141,6 +143,9 @@ void sysfs_remove_group(struct kobject *
 	struct sysfs_dirent *dir_sd = kobj->sd;
 	struct sysfs_dirent *sd;
 
+	if (!ve_sysfs_alowed())
+		return;
+
 	if (grp->name) {
 		sd = sysfs_get_dirent(dir_sd, grp->name);
 		if (!sd) {
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/inode.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/inode.c
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/inode.c	2014-12-12 23:29:09.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/inode.c	2015-01-21 12:02:44.364196107 +0300
@@ -22,8 +22,6 @@
 #include <linux/security.h>
 #include "sysfs.h"
 
-extern struct super_block * sysfs_sb;
-
 static const struct address_space_operations sysfs_aops = {
 	.readpage	= simple_readpage,
 	.write_begin	= simple_write_begin,
@@ -39,6 +37,7 @@ static struct backing_dev_info sysfs_bac
 static const struct inode_operations sysfs_inode_operations ={
 	.setattr	= sysfs_setattr,
 	.setxattr	= sysfs_setxattr,
+	.getattr	= sysfs_getattr,
 };
 
 int __init sysfs_inode_init(void)
@@ -161,6 +160,23 @@ out:
 	return error;
 }
 
+static void sysfs_inode_refresh_nlink(struct inode *inode, struct sysfs_dirent *sd)
+{
+	if (sysfs_type(sd) == SYSFS_DIR)
+		inode->i_nlink = sd->s_dir.subdirs + 2;
+}
+
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		struct kstat *stat)
+{
+	struct sysfs_dirent *sd = dentry->d_fsdata;
+	struct inode *inode = dentry->d_inode;
+
+	sysfs_inode_refresh_nlink(inode, sd);
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
 static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 {
 	inode->i_mode = mode;
@@ -219,7 +235,11 @@ static void sysfs_init_inode(struct sysf
 	case SYSFS_DIR:
 		inode->i_op = &sysfs_dir_inode_operations;
 		inode->i_fop = &sysfs_dir_operations;
-		inode->i_nlink = sd->s_dir.subdirs + 2;
+		break;
+	case SYSFS_DIR_LINK:
+		inode->i_op = &sysfs_dirlink_inode_operations;
+		inode->i_fop = &sysfs_dirlink_operations;
+		inode->i_nlink = 2; /* exact value is meaningless for dir links */
 		break;
 	case SYSFS_KOBJ_ATTR:
 		inode->i_size = PAGE_SIZE;
@@ -236,7 +256,7 @@ static void sysfs_init_inode(struct sysf
 	default:
 		BUG();
 	}
-
+	sysfs_inode_refresh_nlink(inode, sd);
 	unlock_new_inode(inode);
 }
 
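
Moving the nlink computation out of sysfs_init_inode() and into
sysfs_getattr() means a long-cached sysfs inode now reports the current
subdirectory count rather than the count at instantiation time. From
userspace the usual directory convention holds at every stat(); a small
illustration (the path is arbitrary):

    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
    	struct stat st;

    	if (stat("/sys/class", &st) == 0)	/* nlink = subdirs + 2 */
    		printf("subdirs = %ld\n", (long)st.st_nlink - 2);
    	return 0;
    }
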
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/mount.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/mount.c
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/mount.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/mount.c	2015-01-21 12:02:44.611189548 +0300
@@ -22,10 +22,25 @@
 #include "sysfs.h"
 
 
-static struct vfsmount *sysfs_mount;
+#ifndef CONFIG_VE
+struct vfsmount *sysfs_mount;
 struct super_block * sysfs_sb = NULL;
+#endif
+
 struct kmem_cache *sysfs_dir_cachep;
 
+#ifdef CONFIG_SYSFS_DEPRECATED_DYN
+unsigned _sysfs_deprecated __read_mostly;
+EXPORT_SYMBOL(_sysfs_deprecated);
+
+static int __init sysfs_init_deprecated(char *str)
+{
+	_sysfs_deprecated = simple_strtoul(str, NULL, 10);
+	return 1;
+}
+early_param("sysfs.deprecated", sysfs_init_deprecated);
+#endif
+
 static const struct super_operations sysfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
@@ -40,6 +55,13 @@ struct sysfs_dirent sysfs_root = {
 	.s_ino		= 1,
 };
 
+static void init_ve0_sysfs_root(void)
+{
+#ifdef CONFIG_VE
+	get_ve0()->_sysfs_root = &sysfs_root;
+#endif
+}
+
 static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *inode;
@@ -54,7 +76,7 @@ static int sysfs_fill_super(struct super
 
 	/* get root inode, initialize and unlock it */
 	mutex_lock(&sysfs_mutex);
-	inode = sysfs_get_inode(&sysfs_root);
+	inode = sysfs_get_inode(ve_sysfs_root);
 	mutex_unlock(&sysfs_mutex);
 	if (!inode) {
 		pr_debug("sysfs: could not get root inode\n");
@@ -68,7 +90,7 @@ static int sysfs_fill_super(struct super
 		iput(inode);
 		return -ENOMEM;
 	}
-	root->d_fsdata = &sysfs_root;
+	root->d_fsdata = ve_sysfs_root;
 	sb->s_root = root;
 	return 0;
 }
@@ -79,16 +101,19 @@ static int sysfs_get_sb(struct file_syst
 	return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
 }
 
-static struct file_system_type sysfs_fs_type = {
+struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.get_sb		= sysfs_get_sb,
 	.kill_sb	= kill_anon_super,
 };
 
+EXPORT_SYMBOL(sysfs_fs_type);
+
 int __init sysfs_init(void)
 {
 	int err = -ENOMEM;
 
+	init_ve0_sysfs_root();
 	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
 					      sizeof(struct sysfs_dirent),
 					      0, 0, NULL);
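
With CONFIG_SYSFS_DEPRECATED_DYN the deprecated sysfs layout becomes a
boot-time switch: early_param() runs the parser before initcalls, so
"sysfs.deprecated=1" on the kernel command line has _sysfs_deprecated
settled well before sysfs_init(). A sketch of a consumer; the helper below
is illustrative, the hunk itself only adds the flag and its parser:

    /* Hypothetical consumer: pick the legacy device layout at runtime. */
    static bool want_legacy_layout(void)
    {
    	return _sysfs_deprecated != 0;	/* from "sysfs.deprecated=N" */
    }
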
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/symlink.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/symlink.c
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/symlink.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/symlink.c	2015-01-21 12:02:43.670214531 +0300
@@ -29,10 +29,13 @@ static int sysfs_do_create_link(struct k
 	struct sysfs_addrm_cxt acxt;
 	int error;
 
+	if (!ve_sysfs_alowed())
+		return 0;
+
 	BUG_ON(!name);
 
 	if (!kobj)
-		parent_sd = &sysfs_root;
+		parent_sd = ve_sysfs_root;
 	else
 		parent_sd = kobj->sd;
 
@@ -115,8 +118,11 @@ void sysfs_remove_link(struct kobject * 
 {
 	struct sysfs_dirent *parent_sd = NULL;
 
+	if (!ve_sysfs_alowed())
+		return;
+
 	if (!kobj)
-		parent_sd = &sysfs_root;
+		parent_sd = ve_sysfs_root;
 	else
 		parent_sd = kobj->sd;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/sysfs/sysfs.h linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/sysfs.h
--- linux-2.6.32-504.3.3.el6.orig/fs/sysfs/sysfs.h	2014-12-12 23:29:09.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/sysfs/sysfs.h	2015-01-21 12:02:44.365196081 +0300
@@ -11,95 +11,18 @@
 #include <linux/fs.h>
 #include <linux/rbtree.h>
 
-struct sysfs_open_dirent;
-
-/* type-specific structures for sysfs_dirent->s_* union members */
-struct sysfs_elem_dir {
-	struct kobject		*kobj;
-#ifdef __GENKSYMS__
-	struct sysfs_dirent	*children;
-#endif
-
-#ifndef __GENKSYMS__
-	unsigned long		subdirs;
-
-	struct rb_root		inode_tree;
-	struct rb_root		name_tree;
-#endif
-};
-
-struct sysfs_elem_symlink {
-	struct sysfs_dirent	*target_sd;
-};
-
-struct sysfs_elem_attr {
-	struct attribute	*attr;
-	struct sysfs_open_dirent *open;
-};
-
-struct sysfs_elem_bin_attr {
-	struct bin_attribute	*bin_attr;
-	struct hlist_head	buffers;
-};
-
-struct sysfs_inode_attrs {
-	struct iattr	ia_iattr;
-	void		*ia_secdata;
-	u32		ia_secdata_len;
-};
-
-/*
- * sysfs_dirent - the building block of sysfs hierarchy.  Each and
- * every sysfs node is represented by single sysfs_dirent.
- *
- * As long as s_count reference is held, the sysfs_dirent itself is
- * accessible.  Dereferencing s_elem or any other outer entity
- * requires s_active reference.
- */
-struct sysfs_dirent {
-	atomic_t		s_count;
-	atomic_t		s_active;
-	struct sysfs_dirent	*s_parent;
-#ifdef __GENKSYMS__
-	struct sysfs_dirent	*s_sibling;
-#endif
-	const char		*s_name;
-
-#ifndef __GENKSYMS__
-	struct rb_node		inode_node;
-	struct rb_node		name_node;
-
-	union {
-		struct completion	*completion;
-		struct sysfs_dirent	*removed_list;
-	} u;
+#ifndef CONFIG_VE
+extern struct vfsmount *sysfs_mount;
+extern struct super_block *sysfs_sb;
+#define ve_sysfs_alowed()	1
+#else
+#include <linux/sched.h>
+#include <linux/ve.h>
+#define sysfs_mount		(get_exec_env()->sysfs_mnt)
+#define sysfs_sb		(get_exec_env()->sysfs_sb)
+#define ve_sysfs_alowed()	(sysfs_sb != NULL)
 #endif
 
-	union {
-		struct sysfs_elem_dir		s_dir;
-		struct sysfs_elem_symlink	s_symlink;
-		struct sysfs_elem_attr		s_attr;
-		struct sysfs_elem_bin_attr	s_bin_attr;
-	};
-
-	unsigned int		s_flags;
-	ino_t			s_ino;
-	umode_t			s_mode;
-	struct sysfs_inode_attrs *s_iattr;
-};
-
-#define SD_DEACTIVATED_BIAS		INT_MIN
-
-#define SYSFS_TYPE_MASK			0x00ff
-#define SYSFS_DIR			0x0001
-#define SYSFS_KOBJ_ATTR			0x0002
-#define SYSFS_KOBJ_BIN_ATTR		0x0004
-#define SYSFS_KOBJ_LINK			0x0008
-#define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
-
-#define SYSFS_FLAG_MASK			~SYSFS_TYPE_MASK
-#define SYSFS_FLAG_REMOVED		0x0200
-
 static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
 {
 	return sd->s_flags & SYSFS_TYPE_MASK;
@@ -118,8 +41,12 @@ struct sysfs_addrm_cxt {
 /*
  * mount.c
  */
+#ifdef CONFIG_VE
+#define ve_sysfs_root	(get_exec_env()->_sysfs_root)
+#else
 extern struct sysfs_dirent sysfs_root;
-extern struct super_block *sysfs_sb;
+#define ve_sysfs_root	(&sysfs_root)
+#endif
 extern struct kmem_cache *sysfs_dir_cachep;
 
 /*
@@ -132,6 +59,9 @@ extern spinlock_t sysfs_assoc_lock;
 extern const struct file_operations sysfs_dir_operations;
 extern const struct inode_operations sysfs_dir_inode_operations;
 
+extern const struct file_operations sysfs_dirlink_operations;
+extern const struct inode_operations sysfs_dirlink_inode_operations;
+
 struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
 struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
 void sysfs_put_active_two(struct sysfs_dirent *sd);
@@ -171,6 +101,11 @@ static inline void __sysfs_put(struct sy
 }
 #define sysfs_put(sd) __sysfs_put(sd)
 
+struct dentry * __sysfs_lookup_at(struct sysfs_dirent *parent_sd, struct dentry *dentry,
+		struct nameidata *nd);
+int __sysfs_readdir_at(struct sysfs_dirent *parent_sd, struct file * filp,
+		void * dirent, filldir_t filldir);
+
 /*
  * inode.c
  */
@@ -179,6 +114,8 @@ void sysfs_delete_inode(struct inode *in
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		size_t size, int flags);
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		struct kstat *stat);
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
 int sysfs_inode_init(void);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/timerfd.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/timerfd.c
--- linux-2.6.32-504.3.3.el6.orig/fs/timerfd.c	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/timerfd.c	2015-01-21 12:02:48.073097641 +0300
@@ -22,19 +22,7 @@
 #include <linux/timerfd.h>
 #include <linux/syscalls.h>
 #include <linux/rcupdate.h>
-
-struct timerfd_ctx {
-	struct hrtimer tmr;
-	ktime_t tintv;
-	ktime_t moffs;
-	wait_queue_head_t wqh;
-	u64 ticks;
-	int expired;
-	int clockid;
-	struct rcu_head rcu;
-	struct list_head clist;
-	bool might_cancel;
-};
+#include <linux/module.h>
 
 static LIST_HEAD(cancel_list);
 static DEFINE_SPINLOCK(cancel_lock);
@@ -116,13 +104,14 @@ static void timerfd_setup_cancel(struct 
 	}
 }
 
-static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
+ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
 	ktime_t remaining;
 
 	remaining = hrtimer_expires_remaining(&ctx->tmr);
 	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
+EXPORT_SYMBOL(timerfd_get_remaining);
 
 static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
 			  const struct itimerspec *ktmr)
@@ -247,11 +236,12 @@ static ssize_t timerfd_read(struct file 
 	return res;
 }
 
-static const struct file_operations timerfd_fops = {
+const struct file_operations timerfd_fops = {
 	.release	= timerfd_release,
 	.poll		= timerfd_poll,
 	.read		= timerfd_read,
 };
+EXPORT_SYMBOL(timerfd_fops);
 
 static struct file *timerfd_fget(int fd)
 {
@@ -298,6 +288,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clo
 
 	return ufd;
 }
+EXPORT_SYMBOL(sys_timerfd_create);
 
 SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 		const struct itimerspec __user *, utmr,
@@ -350,6 +341,9 @@ SYSCALL_DEFINE4(timerfd_settime, int, uf
 	/*
 	 * Re-program the timer to the new value ...
 	 */
+	if ((flags & TFD_TIMER_ABSTIME) &&
+	    (ktmr.it_value.tv_sec || ktmr.it_value.tv_nsec))
+		monotonic_ve_to_abs(ctx->clockid, &ktmr.it_value);
 	ret = timerfd_setup(ctx, flags, &ktmr);
 
 	spin_unlock_irq(&ctx->wqh.lock);
@@ -359,6 +353,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, uf
 
 	return ret;
 }
+EXPORT_SYMBOL(sys_timerfd_settime);
 
 SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 {
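
The settime hunk converts an absolute CLOCK_MONOTONIC expiry from the
container's notion of monotonic time to the host's before arming the
timer, since a VE's monotonic clock is offset by its start time.
monotonic_ve_to_abs() is only referenced by this patch, not defined in
this excerpt, so the body below is an assumption (the start_timespec
field name is hypothetical):

    static void monotonic_ve_to_abs_sketch(int clockid, struct timespec *ts)
    {
    	struct ve_struct *ve = get_exec_env();

    	if (clockid == CLOCK_MONOTONIC)	/* hypothetical offset field */
    		*ts = timespec_add_safe(*ts, ve->start_timespec);
    }
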
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/ubifs/dir.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/ubifs/dir.c
--- linux-2.6.32-504.3.3.el6.orig/fs/ubifs/dir.c	2014-12-12 23:28:53.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/ubifs/dir.c	2015-01-21 12:02:52.022992783 +0300
@@ -529,24 +529,6 @@ static int ubifs_link(struct dentry *old
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 *
-	 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
-	 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
-	 * lock 'dirA->i_mutex', so this is possible. Both of the functions
-	 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
-	 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
-	 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
-	 * to the list of orphans. After this, 'vfs_link()' will link
-	 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
-	 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
-	 * to the list of orphans.
-	 */
-	 if (inode->i_nlink == 0)
-		 return -ENOENT;
-
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/udf/namei.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/udf/namei.c
--- linux-2.6.32-504.3.3.el6.orig/fs/udf/namei.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/udf/namei.c	2015-01-21 12:02:52.022992783 +0300
@@ -1325,8 +1325,13 @@ static int udf_encode_fh(struct dentry *
 	struct fid *fid = (struct fid *)fh;
 	int type = FILEID_UDF_WITHOUT_PARENT;
 
-	if (len < 3 || (connectable && len < 5))
+	if (connectable && (len < 5)) {
+		*lenp = 5;
+		return 255;
+	} else if (len < 3) {
+		*lenp = 3;
 		return 255;
+	}
 
 	*lenp = 3;
 	fid->udf.block = location.logicalBlockNum;
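
This fix (and the matching one in fs/xfs below) implements the exportfs
contract for undersized file-handle buffers: report the required length
through the length pointer and return 255 so the caller can retry with a
bigger buffer. The general pattern, condensed into a sketch with
illustrative fileid type values:

    static int encode_fh_sketch(struct dentry *de, __u32 *fh, int *max_len,
    			int connectable)
    {
    	int len = connectable ? 5 : 3;	/* in 32-bit words */

    	if (*max_len < len) {
    		*max_len = len;		/* tell the caller what it needs */
    		return 255;		/* "buffer too small" */
    	}
    	*max_len = len;
    	/* ... fill fh[0..len-1] with file (and parent) identity ... */
    	return connectable ? 2 : 1;	/* illustrative fileid types */
    }
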
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/udf/super.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/udf/super.c
--- linux-2.6.32-504.3.3.el6.orig/fs/udf/super.c	2014-12-12 23:29:22.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/udf/super.c	2015-01-21 12:02:42.264251860 +0300
@@ -2135,6 +2135,8 @@ static void udf_put_super(struct super_b
 
 	sbi = UDF_SB(sb);
 
+	vfs_dq_off(sb, 0);
+
 	lock_kernel();
 
 	if (sbi->s_vat_inode)
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/utimes.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/utimes.c
--- linux-2.6.32-504.3.3.el6.orig/fs/utimes.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/utimes.c	2015-01-21 12:02:42.102256162 +0300
@@ -40,6 +40,20 @@ SYSCALL_DEFINE2(utime, char __user *, fi
 
 #endif
 
+SYSCALL_DEFINE2(lutime, char __user *, filename, struct utimbuf __user *, times)
+{
+	struct timespec tv[2];
+
+	if (times) {
+		if (get_user(tv[0].tv_sec, &times->actime) ||
+		    get_user(tv[1].tv_sec, &times->modtime))
+			return -EFAULT;
+		tv[0].tv_nsec = 0;
+		tv[1].tv_nsec = 0;
+	}
+	return do_utimes(AT_FDCWD, filename, times ? tv : NULL, AT_SYMLINK_NOFOLLOW);
+}
+
 static bool nsec_valid(long nsec)
 {
 	if (nsec == UTIME_OMIT || nsec == UTIME_NOW)
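
The new lutime() mirrors utime() but passes AT_SYMLINK_NOFOLLOW, so it
stamps a symlink itself instead of its target. There is no libc wrapper;
a userspace invocation sketch, where __NR_lutime stands in for whatever
number the patched arch syscall tables assign:

    #include <sys/syscall.h>
    #include <unistd.h>
    #include <utime.h>

    static int lutime(const char *path, const struct utimbuf *times)
    {
    	return syscall(__NR_lutime, path, times);
    }

    /* usage:
     *	struct utimbuf t = { .actime = 0, .modtime = 0 };
     *	lutime("/tmp/some-symlink", &t);
     */
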
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/xattr.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/xattr.c
--- linux-2.6.32-504.3.3.el6.orig/fs/xattr.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/xattr.c	2015-01-21 12:02:57.996834210 +0300
@@ -115,6 +115,15 @@ vfs_setxattr(struct dentry *dentry, cons
 	struct inode *inode = dentry->d_inode;
 	int error;
 
+#if defined(CONFIG_VE) && defined(CONFIG_SYSCTL)
+	if (!ve_is_super(get_exec_env())) {
+		if (ve_xattr_policy == VE_XATTR_POLICY_IGNORE)
+			return 0;
+		else if (ve_xattr_policy == VE_XATTR_POLICY_REJECT)
+			return -EPERM;
+	}
+#endif
+
 	error = xattr_permission(inode, name, MAY_WRITE);
 	if (error)
 		return error;
@@ -130,7 +139,7 @@ out:
 	mutex_unlock(&inode->i_mutex);
 	return error;
 }
-EXPORT_SYMBOL_GPL(vfs_setxattr);
+EXPORT_SYMBOL(vfs_setxattr);
 
 ssize_t
 xattr_getsecurity(struct inode *inode, const char *name, void *value,
@@ -193,7 +202,7 @@ nolsm:
 
 	return error;
 }
-EXPORT_SYMBOL_GPL(vfs_getxattr);
+EXPORT_SYMBOL(vfs_getxattr);
 
 ssize_t
 vfs_listxattr(struct dentry *d, char *list, size_t size)
@@ -213,7 +222,7 @@ vfs_listxattr(struct dentry *d, char *li
 	}
 	return error;
 }
-EXPORT_SYMBOL_GPL(vfs_listxattr);
+EXPORT_SYMBOL(vfs_listxattr);
 
 int
 vfs_removexattr(struct dentry *dentry, const char *name)
@@ -240,7 +249,7 @@ vfs_removexattr(struct dentry *dentry, c
 		fsnotify_xattr(dentry);
 	return error;
 }
-EXPORT_SYMBOL_GPL(vfs_removexattr);
+EXPORT_SYMBOL(vfs_removexattr);
 
 
 /*
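
Inside a container (a non-super VE), vfs_setxattr() now consults
ve_xattr_policy: VE_XATTR_POLICY_IGNORE makes the call report success
without storing anything, and VE_XATTR_POLICY_REJECT makes it fail with
-EPERM. Observable behaviour from inside a VE, assuming those two
policies (the sysctl selecting the policy is outside this excerpt):

    #include <sys/xattr.h>
    #include <errno.h>

    static int probe_xattr_policy(const char *path)
    {
    	if (setxattr(path, "user.test", "1", 1, 0) == -1 && errno == EPERM)
    		return 1;				/* REJECT */
    	return getxattr(path, "user.test", NULL, 0) >= 0
    		? 0					/* actually stored */
    		: 2;					/* IGNORE */
    }
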
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/xfs/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/fs/xfs/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/fs/xfs/Kconfig	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/xfs/Kconfig	2015-01-21 12:02:41.363275782 +0300
@@ -2,6 +2,7 @@ config XFS_FS
 	tristate "XFS filesystem support"
 	depends on BLOCK
 	select EXPORTFS
+	select LIST_SORT
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
diff -upr linux-2.6.32-504.3.3.el6.orig/fs/xfs/linux-2.6/xfs_export.c linux-2.6.32-504.3.3.el6-042stab103_6/fs/xfs/linux-2.6/xfs_export.c
--- linux-2.6.32-504.3.3.el6.orig/fs/xfs/linux-2.6/xfs_export.c	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/fs/xfs/linux-2.6/xfs_export.c	2015-01-21 12:02:52.023992756 +0300
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
 	 * seven combinations work.  The real answer is "don't use v2".
 	 */
 	len = xfs_fileid_length(fileid_type);
-	if (*max_len < len)
+	if (*max_len < len) {
+		*max_len = len;
 		return 255;
+	}
 	*max_len = len;
 
 	switch (fileid_type) {
diff -upr linux-2.6.32-504.3.3.el6.orig/include/asm-generic/ioctls.h linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/ioctls.h
--- linux-2.6.32-504.3.3.el6.orig/include/asm-generic/ioctls.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/ioctls.h	2015-01-21 12:02:41.479272702 +0300
@@ -86,6 +86,8 @@
 #define TIOCMIWAIT	0x545C	/* wait for a change on serial input line(s) */
 #define TIOCGICOUNT	0x545D	/* read serial port inline interrupt counts */
 
+#define TIOSAK		_IO('T', 0x66)	/* "Secure Attention Key" */
+
 /*
  * some architectures define FIOQSIZE as 0x545E, which is used for
  * TIOCGHAYESESP on others
diff -upr linux-2.6.32-504.3.3.el6.orig/include/asm-generic/mman-common.h linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/mman-common.h
--- linux-2.6.32-504.3.3.el6.orig/include/asm-generic/mman-common.h	2014-12-12 23:29:27.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/mman-common.h	2015-01-21 12:02:41.739265799 +0300
@@ -47,6 +47,8 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_DEACTIVATE	32		/* deactivate page */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/asm-generic/mman.h linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/mman.h
--- linux-2.6.32-504.3.3.el6.orig/include/asm-generic/mman.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/mman.h	2015-01-21 12:02:50.151042476 +0300
@@ -12,6 +12,8 @@
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
+#define MAP_EXECPRIO	0x80000		/* soft ubc charge */
+#define MAP_CPT		0x100000	/* mmap from checkpoint-restore */
 
 #define MCL_CURRENT	1		/* lock all current mappings */
 #define MCL_FUTURE	2		/* lock all future mappings */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/asm-generic/tlb.h linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/tlb.h
--- linux-2.6.32-504.3.3.el6.orig/include/asm-generic/tlb.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/tlb.h	2015-01-21 12:02:41.725266170 +0300
@@ -41,6 +41,7 @@ struct mmu_gather {
 	unsigned int		nr;	/* set to ~0U means fast mode */
 	unsigned int		need_flush;/* Really unmapped some ptes? */
 	unsigned int		fullmm; /* non-zero means full mm flush */
+	unsigned int		ptes_freed;
 	struct page *		pages[FREE_PTE_NR];
 };
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/asm-generic/unistd.h linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/unistd.h
--- linux-2.6.32-504.3.3.el6.orig/include/asm-generic/unistd.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/asm-generic/unistd.h	2015-01-21 12:02:52.023992756 +0300
@@ -647,9 +647,9 @@ __SYSCALL(__NR_fanotify_init, sys_ni_sys
 #define __NR_fanotify_mark 263
 __SYSCALL(__NR_fanotify_mark, sys_ni_syscall)
 #define __NR_name_to_handle_at         264
-__SYSCALL(__NR_name_to_handle_at, sys_ni_syscall)
+__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 #define __NR_open_by_handle_at         265
-__SYSCALL(__NR_open_by_handle_at, sys_ni_syscall)
+__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
 #define __NR_clock_adjtime 266
 __SYSCALL(__NR_clock_adjtime, sys_ni_syscall)
 #define __NR_syncfs 267
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/beancounter.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/beancounter.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/beancounter.h	2015-01-21 12:02:43.388222019 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/beancounter.h	2015-01-21 12:02:58.781813374 +0300
@@ -0,0 +1,648 @@
+/*
+ *  include/bc/beancounter.h
+ *
+ *  Copyright (C) 1999-2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ *  Andrey Savochkin	saw@sw-soft.com
+ *
+ */
+
+#ifndef _LINUX_BEANCOUNTER_H
+#define _LINUX_BEANCOUNTER_H
+
+/*
+ * This magic is used to distinguish the user beancounter and the pages
+ * beancounter in struct page. page_ub and page_bc are placed in a union,
+ * and the MAGIC ensures that we don't use a pbc as a ubc in ub_page_uncharge().
+ */
+#define UB_MAGIC		0x62756275
+
+/*
+ *	Resource list.
+ */
+
+#define UB_KMEMSIZE	0	/* Unswappable kernel memory size including
+				 * struct task, page directories, etc.
+				 */
+#define UB_LOCKEDPAGES	1	/* Mlock()ed pages. */
+#define UB_PRIVVMPAGES	2	/* Total number of pages, counting potentially
+				 * private pages as private and used.
+				 */
+#define UB_SHMPAGES	3	/* IPC SHM segment size. */
+#define UB_DUMMY	4	/* Dummy resource (compatibility) */
+#define UB_NUMPROC	5	/* Number of processes. */
+#define UB_PHYSPAGES	6	/* All resident pages, for swapout guarantee. */
+#define UB_VMGUARPAGES	7	/* Guarantee for virtual memory allocation.
+				 * Only barrier is used, see __vm_enough_memory()
+				 */
+#define UB_OOMGUARPAGES	8	/* Guarantees against OOM kill.
+				 * Only barrier is used, see ub_current_overdraft()
+				 */
+#define UB_NUMTCPSOCK	9	/* Number of TCP sockets. */
+#define UB_NUMFLOCK	10	/* Number of file locks. */
+#define UB_NUMPTY	11	/* Number of PTYs. */
+#define UB_NUMSIGINFO	12	/* Number of siginfos. */
+#define UB_TCPSNDBUF	13	/* Total size of tcp send buffers. */
+#define UB_TCPRCVBUF	14	/* Total size of tcp receive buffers. */
+#define UB_OTHERSOCKBUF	15	/* Total size of other socket
+				 * send buffers (all buffers for PF_UNIX).
+				 */
+#define UB_DGRAMRCVBUF	16	/* Total size of other socket
+				 * receive buffers.
+				 */
+#define UB_NUMOTHERSOCK	17	/* Number of other sockets. */
+#define UB_DCACHESIZE	18	/* Size of busy dentry/inode cache. */
+#define UB_NUMFILE	19	/* Number of open files. */
+
+#define UB_SHADOWPAGES	20	/* "dummy" */
+
+#define UB_RESOURCES_COMPAT	24
+
+/* Add new resources here */
+
+#define UB_NUMXTENT	23
+#define UB_SWAPPAGES	24
+#define UB_RESOURCES	25
+
+struct ubparm {
+	/* 
+	 * A barrier above which resource allocations fail gracefully.
+	 * If the amount of consumed memory is over the barrier, further sbrk()
+	 * or mmap() calls fail, but the existing processes are not killed.
+	 */
+	unsigned long	barrier;
+	/* hard resource limit */
+	unsigned long	limit;
+	/* consumed resources */
+	unsigned long	held;
+	/* maximum amount of consumed resources through the last period */
+	unsigned long	maxheld;
+	/* minimum amount of consumed resources through the last period */
+	unsigned long	minheld;
+	/* count of failed charges */
+	unsigned long	failcnt;
+	/* maximum percpu resource precharge */
+	int		max_precharge;
+};
+
+/*
+ * Kernel internal part.
+ */
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/percpu_counter.h>
+#include <linux/oom.h>
+#include <bc/debug.h>
+#include <bc/decl.h>
+#include <asm/atomic.h>
+
+/*
+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
+ */
+#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
+
+
+/*
+ *	Resource management structures
+ * Serialization issues:
+ *   beancounter list management is protected via ub_hash_lock
+ *   task pointers are set only for current task and only once
+ *   refcount is managed atomically
+ *   value and limit comparison and change are protected by per-ub spinlock
+ */
+
+struct task_beancounter;
+struct sock_beancounter;
+
+struct page_private {
+	unsigned long		ubp_tmpfs_respages;
+};
+
+struct sock_private {
+	unsigned long		ubp_rmem_thres;
+	unsigned long		ubp_wmem_pressure;
+	unsigned long		ubp_maxadvmss;
+	unsigned long		ubp_rmem_pressure;
+	int			ubp_tw_count;
+#define UB_RMEM_EXPAND          0
+#define UB_RMEM_KEEP            1
+#define UB_RMEM_SHRINK          2
+	struct list_head	ubp_other_socks;
+	struct list_head	ubp_tcp_socks;
+	struct percpu_counter	ubp_orphan_count;
+};
+
+struct ub_percpu_struct {
+	int dirty_pages;
+	int writeback_pages;
+	int wb_requests;
+	int wb_sectors;
+
+	unsigned long fuse_requests;
+	unsigned long fuse_bytes;
+
+	unsigned long swapin;
+	unsigned long swapout;
+
+	unsigned long vswapin;
+	unsigned long vswapout;
+
+#ifdef CONFIG_BC_IO_ACCOUNTING
+	unsigned long async_write_complete;
+	unsigned long async_write_canceled;
+	unsigned long long sync_write_bytes;
+	unsigned long long sync_read_bytes;
+#endif
+#ifdef CONFIG_BC_DEBUG_KMEM
+	long	pages_charged;
+	long	vmalloc_charged;
+#endif
+	unsigned long	sync;
+	unsigned long	sync_done;
+
+	unsigned long	fsync;
+	unsigned long	fsync_done;
+
+	unsigned long	fdsync;
+	unsigned long	fdsync_done;
+
+	unsigned long	frsync;
+	unsigned long	frsync_done;
+
+	/* percpu resource precharge */
+	int	precharge[UB_RESOURCES];
+
+	int pincount;
+};
+
+struct user_beancounter
+{
+	unsigned long		ub_magic;
+	atomic_t		ub_refcount;
+	struct list_head	ub_list;
+	struct hlist_node	ub_hash;
+
+	union {
+		struct list_head ub_leaked_list;
+		struct rcu_head rcu;
+		struct work_struct work;
+		struct delayed_work dwork;
+	};
+
+	spinlock_t		ub_lock;
+	uid_t			ub_uid;
+
+	unsigned long		ub_flags;
+
+	struct ratelimit_state	ub_ratelimit;
+
+	struct page_private	ppriv;
+#define ub_tmpfs_respages	ppriv.ubp_tmpfs_respages
+	unsigned long		ub_hugetlb_pages;
+	struct sock_private	spriv;
+#define ub_rmem_thres		spriv.ubp_rmem_thres
+#define ub_maxadvmss		spriv.ubp_maxadvmss
+#define ub_rmem_pressure	spriv.ubp_rmem_pressure
+#define ub_wmem_pressure	spriv.ubp_wmem_pressure
+#define ub_tcp_sk_list		spriv.ubp_tcp_socks
+#define ub_other_sk_list	spriv.ubp_other_socks
+#define ub_orphan_count		spriv.ubp_orphan_count
+#define ub_tw_count		spriv.ubp_tw_count
+
+	atomic_long_t		dirty_pages;
+	atomic_long_t		writeback_pages;
+	atomic_long_t		wb_requests;
+	atomic_long_t		wb_sectors;
+
+	unsigned long		ub_swapentries; /* under swap_lock */
+
+#ifdef CONFIG_BC_RSS_ACCOUNTING
+	struct gang_set		gang_set;
+#endif
+
+	/* reclaim rate-limit */
+	spinlock_t		rl_lock;
+	unsigned		rl_step;	/* ns per page */
+	ktime_t			rl_wall;	/* wall time */
+
+	struct cgroup		*ub_cgroup;
+
+	void			*private_data2;
+
+	struct list_head	ub_dentry_lru;
+	struct list_head	ub_dentry_top;
+	int			ub_dentry_unused;
+	int			ub_dentry_batch;
+	unsigned long		ub_dentry_pruned;
+
+	/* resources statistic and settings */
+	struct ubparm		ub_parms[UB_RESOURCES];
+	/* resources statistic for last interval */
+	struct ubparm		*ub_store;
+
+	struct ub_percpu_struct	*ub_percpu;
+	struct oom_control	oom_ctrl;
+
+	struct rb_node		dc_node;
+	unsigned int		dc_time;
+	unsigned int		dc_shrink_ts;
+	unsigned int		ub_dcache_threshold;
+};
+
+enum ub_flags {
+	UB_DIRTY_EXCEEDED,
+	UB_OOM_NOPROC,
+	UB_PAGECACHE_ISOLATION,
+	UB_UNDERFLOW,
+};
+
+extern int ub_count;
+extern struct oom_control global_oom_ctrl;
+
+enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE };
+
+#define UB_TEST	0x100
+#define UB_SEV_FLAGS	UB_TEST
+
+static inline int ub_barrier_hit(struct user_beancounter *ub, int resource)
+{
+	return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier;
+}
+
+static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource)
+{
+	return (ub->ub_parms[resource].held > 
+		((ub->ub_parms[resource].barrier) >> 1));
+}
+
+static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource)
+{
+	struct ubparm *p;
+	p = ub->ub_parms + resource;
+	return p->held <= (p->barrier >> 3);
+}
+
+static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource)
+{
+	struct ubparm *p;
+	p = ub->ub_parms + resource;
+	return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024;
+}
+
+static inline unsigned long ub_resource_bound(struct user_beancounter *ub,
+		int resource, enum ub_severity strict)
+{
+	switch (strict) {
+		case UB_HARD:
+			return ub->ub_parms[resource].barrier;
+		case UB_SOFT:
+			return ub->ub_parms[resource].limit;
+		case UB_FORCE:
+			return UB_MAXVALUE;
+		default:
+			{
+				extern int no_such_severity(void);
+				return no_such_severity();
+			}
+	}
+}
+
+static inline unsigned long ub_resource_excess(struct user_beancounter *ub,
+		int resource, enum ub_severity strict)
+{
+	unsigned long held, bound;
+
+	held = ub->ub_parms[resource].held;
+	bound = ub_resource_bound(ub, resource, strict);
+	if (likely(held < bound))
+		return bound - held;
+	return 0;
+}
+
+#ifndef CONFIG_BEANCOUNTERS
+
+#define ub_percpu(ub, cpu)		(NULL)
+#define __ub_percpu_sum(ub, field)	(0)
+#define ub_percpu_sum(ub, field)	(0)
+#define ub_percpu_add(ub, f, v)	do { } while (0)
+#define ub_percpu_sub(ub, f, v)	do { } while (0)
+#define ub_percpu_inc(ub, f)	do { } while (0)
+#define ub_percpu_dec(ub, f)	do { } while (0)
+
+#define mm_ub(mm)	(NULL)
+
+extern inline struct user_beancounter *get_beancounter_byuid
+		(uid_t uid, int create) { return NULL; }
+extern inline struct user_beancounter *get_beancounter
+		(struct user_beancounter *ub) { return NULL; }
+extern inline void put_beancounter(struct user_beancounter *ub) { }
+
+static inline void ub_init_late(void) { };
+static inline void ub_init_early(void) { };
+
+static inline int charge_beancounter(struct user_beancounter *ub,
+			int resource, unsigned long val,
+			enum ub_severity strict) { return 0; }
+static inline void uncharge_beancounter(struct user_beancounter *ub,
+			int resource, unsigned long val) { }
+
+static inline void ub_reclaim_rate_limit(struct user_beancounter *ub,
+					 int wait, unsigned count) { }
+
+#else /* CONFIG_BEANCOUNTERS */
+
+extern struct list_head ub_list_head;
+extern struct list_head ub_leaked_list;
+
+#define for_each_beancounter(__ubp) \
+	list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list)
+
+#define ub_percpu(ub, cpu) (per_cpu_ptr((ub)->ub_percpu, (cpu)))
+
+#define __ub_percpu_sum(ub, field)	({			\
+		struct user_beancounter *__ub = (ub);		\
+		typeof(ub_percpu(__ub, 0)->field) __sum = 0;	\
+		int __cpu;					\
+		for_each_possible_cpu(__cpu)			\
+			__sum += ub_percpu(__ub, __cpu)->field;	\
+		__sum;						\
+	})
+
+#define ub_percpu_sum(ub, field)	({			\
+		long __sum = __ub_percpu_sum(ub, field);	\
+		(__sum < 0) ? 0 : __sum;			\
+	})
+
+#define ub_percpu_add(ub, field, v)		do {			\
+		per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v);	\
+		put_cpu();						\
+	} while (0)
+#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1)
+
+#define ub_percpu_sub(ub, field, v)		do {			\
+		per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v);	\
+		put_cpu();						\
+	} while (0)
+#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1)
+
+#define mm_ub(mm)	((mm)->mm_ub)
+/*
+ *  Charge/uncharge operations
+ */
+
+extern int __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict);
+
+extern void __uncharge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val);
+
+extern void uncharge_warn(struct user_beancounter *ub, const char *resource,
+		unsigned long val, unsigned long held);
+
+extern long ub_oomguarpages_left(struct user_beancounter *ub);
+extern void ub_update_resources_locked(struct user_beancounter *ub);
+extern void ub_update_resources(struct user_beancounter *ub);
+
+extern const char *ub_rnames[];
+/*
+ *	Put a beancounter reference
+ */
+
+extern void release_beancounter(struct user_beancounter *ub);
+
+static inline void put_beancounter_longterm(struct user_beancounter *ub)
+{
+	if (unlikely(ub == NULL))
+		return;
+
+	if (atomic_dec_and_test(&ub->ub_refcount))
+		release_beancounter(ub);
+}
+
+static inline void __put_beancounter(struct user_beancounter *ub)
+{
+	if (atomic_dec_and_test(&ub->ub_refcount))
+		release_beancounter(ub);
+}
+
+static inline void put_beancounter(struct user_beancounter *ub)
+{
+	if (unlikely(ub == NULL))
+		return;
+
+	__put_beancounter(ub);
+}
+
+/*
+ *	Create a new beancounter reference
+ */
+extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create);
+
+static inline
+struct user_beancounter *get_beancounter_longterm(struct user_beancounter *ub)
+{
+	if (unlikely(ub == NULL))
+		return NULL;
+
+	atomic_inc(&ub->ub_refcount);
+	return ub;
+}
+
+static inline 
+struct user_beancounter *get_beancounter(struct user_beancounter *ub)
+{
+	if (unlikely(ub == NULL))
+		return NULL;
+
+	atomic_inc(&ub->ub_refcount);
+	return ub;
+}
+
+static inline 
+struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub)
+{
+	return atomic_inc_not_zero(&ub->ub_refcount) ? ub : NULL;
+}
+
+extern void ub_init_late(void);
+extern void ub_init_early(void);
+
+#define UB_STAT_BATCH	64
+
+static inline void __ub_stat_add(atomic_long_t *stat, int *pcpu, long val)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcpu = per_cpu_ptr(pcpu, smp_processor_id());
+	if (*pcpu + val <= UB_STAT_BATCH)
+		*pcpu += val;
+	else {
+		atomic_long_add(*pcpu + val, stat);
+		*pcpu = 0;
+	}
+	local_irq_restore(flags);
+}
+
+static inline void __ub_stat_sub(atomic_long_t *stat, int *pcpu, long val)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcpu = per_cpu_ptr(pcpu, smp_processor_id());
+	if (*pcpu - val >= -UB_STAT_BATCH)
+		*pcpu -= val;
+	else {
+		atomic_long_add(*pcpu - val, stat);
+		*pcpu = 0;
+	}
+	local_irq_restore(flags);
+}
+
+static inline void __ub_stat_flush_pcpu(atomic_long_t *stat, int *pcpu)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcpu = per_cpu_ptr(pcpu, smp_processor_id());
+	atomic_long_add(*pcpu, stat);
+	*pcpu = 0;
+	local_irq_restore(flags);
+}
+
+#define ub_stat_add(ub, name, val)	__ub_stat_add(&(ub)->name, &(ub)->ub_percpu->name, val)
+#define ub_stat_sub(ub, name, val)	__ub_stat_sub(&(ub)->name, &(ub)->ub_percpu->name, val)
+#define ub_stat_inc(ub, name)		ub_stat_add(ub, name, 1)
+#define ub_stat_dec(ub, name)		ub_stat_sub(ub, name, 1)
+#define ub_stat_mod(ub, name, val)	atomic_long_add(val, &(ub)->name)
+#define __ub_stat_get(ub, name)		atomic_long_read(&(ub)->name)
+#define __ub_stat_get_exact(ub, name)	(__ub_stat_get(ub, name) + __ub_percpu_sum(ub, name))
+#define ub_stat_get(ub, name)		max(0l, atomic_long_read(&(ub)->name))
+#define ub_stat_get_exact(ub, name)	max(0l, __ub_stat_get(ub, name) + __ub_percpu_sum(ub, name))
+#define ub_stat_flush_pcpu(ub, name)	__ub_stat_flush_pcpu(&(ub)->name, &(ub)->ub_percpu->name)
+
+int ubstat_alloc_store(struct user_beancounter *ub);
+
+/*
+ *	Resource charging
+ * Change user's account and compare against limits
+ */
+
+static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource)
+{
+	if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
+		ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
+	if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
+		ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
+}
+
+int charge_beancounter(struct user_beancounter *ub, int resource,
+		unsigned long val, enum ub_severity strict);
+void uncharge_beancounter(struct user_beancounter *ub, int resource,
+		unsigned long val);
+
+extern int ub_resource_precharge[UB_RESOURCES];
+void init_beancounter_precharge(struct user_beancounter *ub, int resource);
+
+static inline int __try_charge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu, int resource, unsigned long val)
+{
+	BUG_ON(ub->ub_parms[resource].max_precharge < 0);
+
+	if (likely(ub_pcpu->precharge[resource] >= val)) {
+		ub_pcpu->precharge[resource] -= val;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+static inline int __try_uncharge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu, int resource, unsigned long val)
+{
+	BUG_ON(ub->ub_parms[resource].max_precharge < 0);
+
+	if (likely(ub_pcpu->precharge[resource] + val <=
+				ub->ub_parms[resource].max_precharge)) {
+		ub_pcpu->precharge[resource] += val;
+		return 0;
+	}
+
+	return -E2BIG;
+}
+
+int __charge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val, enum ub_severity strict);
+
+void __uncharge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val);
+
+static inline int charge_beancounter_fast(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long flags;
+	int retval = 0;
+
+	if (val > UB_MAXVALUE)
+		return -EINVAL;
+
+	local_irq_save(flags);
+	ub_pcpu = ub_percpu(ub, smp_processor_id());
+	if (__try_charge_beancounter_percpu(ub, ub_pcpu, resource, val))
+		retval = __charge_beancounter_percpu(ub, ub_pcpu, resource,
+							val, strict);
+	local_irq_restore(flags);
+
+	return retval;
+}
+
+static inline void uncharge_beancounter_fast(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ub_pcpu = ub_percpu(ub, smp_processor_id());
+	if (__try_uncharge_beancounter_percpu(ub, ub_pcpu, resource, val))
+		__uncharge_beancounter_percpu(ub, ub_pcpu, resource, val);
+	local_irq_restore(flags);
+}
+
+unsigned long __get_beancounter_usage_percpu(struct user_beancounter *ub,
+		int resource);
+unsigned long get_beancounter_usage_percpu(struct user_beancounter *ub,
+		int resource);
+
+int precharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val);
+void ub_precharge_snapshot(struct user_beancounter *ub, int *precharge);
+
+void ub_reclaim_rate_limit(struct user_beancounter *ub, int wait, unsigned count);
+
+#define UB_IOPRIO_MIN 0
+#define UB_IOPRIO_MAX 8
+#ifdef CONFIG_BC_IO_PRIORITY
+extern int ub_set_ioprio(int id, int ioprio);
+#else
+static inline int ub_set_ioprio(int veid, int ioprio) { return -EINVAL; }
+#endif
+
+extern void ub_init_ioprio(struct user_beancounter *ub);
+extern void ub_fini_ioprio(struct user_beancounter *ub);
+
+#endif /* CONFIG_BEANCOUNTERS */
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_BEANCOUNTER_H */
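
The charge fast path above avoids taking ub_lock by keeping a per-CPU
precharge pool: charge_beancounter_fast() first tries to satisfy the
request from this CPU's precharge[] and only falls back to the locked
__charge_beancounter_percpu() when the pool is short; uncharging refills
the pool up to max_precharge before touching shared state. A stand-alone
simulation of that logic with the kernel types stripped; the slow-path
refill policy shown (take half a pool) is an assumption, since
__charge_beancounter_percpu() is not part of this excerpt:

    #include <stdio.h>

    #define MAX_PRECHARGE	64

    static long held;	/* shared counter ("ub_parms[res].held") */
    static long pool;	/* this CPU's pool ("precharge[res]")    */

    static void charge(long val)
    {
    	if (pool >= val) {			/* fast path: no shared state */
    		pool -= val;
    		return;
    	}
    	held += val + MAX_PRECHARGE / 2;	/* slow path + assumed refill */
    	pool += MAX_PRECHARGE / 2;
    }

    static void uncharge(long val)
    {
    	if (pool + val <= MAX_PRECHARGE) {	/* fast path */
    		pool += val;
    		return;
    	}
    	held -= val;				/* slow path */
    }

    int main(void)
    {
    	charge(10); charge(10); uncharge(10);
    	printf("held=%ld pool=%ld\n", held, pool);
    	return 0;
    }
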
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/dcache.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/dcache.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/dcache.h	2015-01-21 12:02:43.156228178 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/dcache.h	2015-01-21 12:02:58.798812923 +0300
@@ -0,0 +1,28 @@
+#ifndef __UB_DCACHE_H__
+#define __UB_DCACHE_H__
+
+#include <bc/decl.h>
+
+extern unsigned int ub_dcache_thres_ratio;
+extern unsigned int ub_dcache_time_thresh;
+extern unsigned int ub_dcache_lru_popup;
+extern unsigned int ub_dcache_no_vzfs_cache;
+
+UB_DECLARE_FUNC(int, ub_dcache_charge(struct user_beancounter *ub, int name_len))
+UB_DECLARE_VOID_FUNC(ub_dcache_uncharge(struct user_beancounter *ub, int name_len))
+UB_DECLARE_VOID_FUNC(ub_dcache_set_owner(struct dentry *d, struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_dcache_change_owner(struct dentry *dentry, struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_dcache_clear_owner(struct dentry *dentry))
+UB_DECLARE_VOID_FUNC(ub_dcache_unuse(struct user_beancounter *ub))
+UB_DECLARE_FUNC(int, ub_dcache_reclaim(struct user_beancounter *ub, unsigned long, unsigned long))
+UB_DECLARE_FUNC(int, ub_dcache_shrink(struct user_beancounter *ub, unsigned long size, gfp_t gfp_mask))
+UB_DECLARE_FUNC(unsigned long, ub_dcache_get_size(struct dentry *dentry))
+
+extern unsigned int dcache_update_time(void);
+
+bool ub_dcache_shrinkable(gfp_t gfp_mask);
+struct user_beancounter *ub_dcache_next(void);
+void ub_dcache_insert(struct user_beancounter *ub, unsigned int time);
+void ub_update_threshold(void);
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/debug.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/debug.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/debug.h	2015-01-21 12:02:43.388222019 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/debug.h	2015-01-21 12:02:43.388222019 +0300
@@ -0,0 +1,103 @@
+/*
+ *  include/bc/debug.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_DEBUG_H_
+#define __BC_DEBUG_H_
+
+/*
+ * general debugging
+ */
+
+#define UBD_ALLOC	0x1
+#define UBD_CHARGE	0x2
+#define UBD_LIMIT	0x4
+#define UBD_TRACE	0x8
+
+/*
+ * ub_net debugging
+ */
+
+#define UBD_NET_SOCKET	0x10
+#define UBD_NET_SLEEP	0x20
+#define UBD_NET_SEND	0x40
+#define UBD_NET_RECV	0x80
+
+/*
+ * Main routines
+ */
+
+#define UB_DEBUG (0)
+#define DEBUG_RESOURCE (0ULL)
+
+#define ub_dbg_cond(__cond, __str, args...)				\
+	do { 								\
+		if ((__cond) != 0)					\
+			printk(__str, ##args);				\
+	} while(0)
+
+#define ub_debug(__section, __str, args...) 				\
+	ub_dbg_cond(UB_DEBUG & (__section), __str, ##args)
+
+#define ub_debug_resource(__resource, __str, args...)			\
+	ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && 				\
+			(DEBUG_RESOURCE & (1 << (__resource))), 	\
+			__str, ##args)
+
+#if UB_DEBUG & UBD_TRACE
+#define ub_debug_trace(__cond, __b, __r)				\
+		do {							\
+			static DEFINE_RATELIMIT_STATE(rl, __r, __b);	\
+			if ((__cond) != 0 && __ratelimit(&rl))		\
+				dump_stack(); 				\
+		} while(0)
+#else
+#define ub_debug_trace(__cond, __burst, __rate)
+#endif
+
+#ifdef CONFIG_BC_DEBUG_KMEM
+#include <linux/list.h>
+
+struct user_beancounter;
+struct ub_cache_counter {
+	struct list_head ulist;
+	struct ub_cache_counter *next;
+	struct user_beancounter *ub;
+	struct kmem_cache *cachep;
+	unsigned long counter;
+};
+
+extern spinlock_t cc_lock;
+extern void init_cache_counters(void);
+extern void ub_free_counters(struct user_beancounter *);
+extern void ub_kmemcache_free(struct kmem_cache *cachep);
+
+struct vm_struct;
+#define inc_vmalloc_charged(vm, flags)	do {				\
+		if (flags & __GFP_UBC)					\
+			ub_percpu_add(get_exec_ub(), vmalloc_charged,	\
+					vm->nr_pages);			\
+	} while (0)
+#define dec_vmalloc_charged(vm)		do {				\
+		struct user_beancounter *ub;				\
+		ub = page_kmem_ub(vm->pages[0]);			\
+		if (ub != NULL)						\
+			ub_percpu_sub(ub, vmalloc_charged,		\
+					vm->nr_pages);			\
+	} while (0)
+#else
+#define init_cache_counters()		do { } while (0)
+#define inc_vmalloc_charged(vm, f)	do { } while (0)
+#define dec_vmalloc_charged(vm)		do { } while (0)
+
+#define ub_free_counters(ub)		do { } while (0)
+#define ub_kmemcache_free(cachep)	do { } while (0)
+#endif
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/decl.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/decl.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/decl.h	2015-01-21 12:02:43.388222019 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/decl.h	2015-01-21 12:02:43.388222019 +0300
@@ -0,0 +1,41 @@
+/*
+ *  include/bc/decl.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_DECL_H_
+#define __BC_DECL_H_
+
+#ifdef __KERNEL__
+
+/*
+ * Naming convention:
+ * ub_<section|object>_<operation>
+ */
+
+#ifdef CONFIG_BEANCOUNTERS
+
+#define UB_DECLARE_FUNC(ret_type, decl)	extern ret_type decl;
+#define UB_DECLARE_VOID_FUNC(decl)	extern void decl;
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define UB_DECLARE_FUNC(ret_type, decl)		\
+	static inline ret_type decl		\
+	{					\
+		return (ret_type)0;		\
+	}
+#define UB_DECLARE_VOID_FUNC(decl)		\
+	static inline void decl			\
+	{					\
+	}
+
+#endif /* CONFIG_BEANCOUNTERS */
+#endif
+
+#endif
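
decl.h lets the beancounter headers declare their whole API once: with
CONFIG_BEANCOUNTERS each UB_DECLARE_FUNC() is a plain extern declaration,
and without it the same line expands to a no-op static inline, so callers
compile unchanged either way. For example, the ub_file_charge()
declaration in bc/misc.h below expands to:

    #ifdef CONFIG_BEANCOUNTERS
    extern int ub_file_charge(struct file *f);
    #else
    static inline int ub_file_charge(struct file *f)
    {
    	return (int)0;	/* charging compiles away: always succeeds */
    }
    #endif
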
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/io_acct.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/io_acct.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/io_acct.h	2015-01-21 12:02:43.345223160 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/io_acct.h	2015-01-21 12:02:43.345223160 +0300
@@ -0,0 +1,123 @@
+/*
+ *  include/ub/io_acct.h
+ *
+ *  Copyright (C) 2006 SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ *  Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#ifndef __UB_IO_ACCT_H_
+#define __UB_IO_ACCT_H_
+
+#ifdef CONFIG_BC_IO_ACCOUNTING
+#include <bc/beancounter.h>
+#include <linux/virtinfo.h>
+
+extern int ub_dirty_radio;
+extern int ub_dirty_background_ratio;
+
+/*
+ * The IO ub is required in task context only, so if exec_ub is set
+ * to NULL it means the caller doesn't need to charge any
+ * resources. Nevertheless, IO activity must be accounted, so we
+ * account it to the current task's beancounter.
+ */
+
+static inline struct user_beancounter *get_io_ub(void)
+{
+	struct user_beancounter *ub;
+
+	ub = get_exec_ub();
+	if (unlikely(ub == NULL))
+		ub = get_task_ub(current);
+
+	return ub;
+}
+
+static inline void ub_io_account_read(size_t bytes)
+{
+	ub_percpu_add(get_io_ub(), sync_read_bytes, bytes);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_ACCOUNT, &bytes);
+}
+
+static inline void ub_io_account_write(size_t bytes)
+{
+	ub_percpu_add(get_io_ub(), sync_write_bytes, bytes);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_ACCOUNT, &bytes);
+}
+
+extern void ub_io_account_dirty(struct address_space *mapping);
+extern void ub_io_account_clean(struct address_space *mapping);
+extern void ub_io_account_cancel(struct address_space *mapping);
+extern void ub_io_writeback_inc(struct address_space *mapping);
+extern void ub_io_writeback_dec(struct address_space *mapping);
+
+#define ub_dirty_pages(ub)	ub_stat_get(ub, dirty_pages)
+
+extern int ub_dirty_limits(unsigned long *pbackground,
+			   long *pdirty, struct user_beancounter *ub);
+
+extern bool ub_should_skip_writeback(struct user_beancounter *ub,
+				     struct inode *inode);
+
+static inline void ub_writeback_io(unsigned long requests, unsigned long sectors)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	ub_stat_add(ub, wb_requests, requests);
+	ub_stat_add(ub, wb_sectors, sectors);
+}
+
+#else /* UBC_IO_ACCT */
+
+static inline void ub_io_account_read(size_t bytes)
+{
+}
+
+static inline void ub_io_account_write(size_t bytes)
+{
+}
+
+static inline void ub_io_account_dirty(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_account_clean(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_account_cancel(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_writeback_inc(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_writeback_dec(struct address_space *mapping)
+{
+}
+
+static inline unsigned long ub_dirty_pages(struct user_beancounter *ub)
+{
+	return 0;
+}
+
+static inline int ub_dirty_limits(unsigned long *pbackground,
+				  long *pdirty, struct user_beancounter *ub)
+{
+	return 0;
+}
+
+static inline bool ub_should_skip_writeback(struct user_beancounter *ub,
+				     struct inode *inode)
+{
+	return false;
+}
+
+#endif /* UBC_IO_ACCT */
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/kmem.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/kmem.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/kmem.h	2015-01-21 12:02:43.388222019 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/kmem.h	2015-01-21 12:02:43.388222019 +0300
@@ -0,0 +1,198 @@
+/*
+ *  include/bc/kmem.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __UB_SLAB_H_
+#define __UB_SLAB_H_
+
+#include <bc/beancounter.h>
+#include <bc/decl.h>
+
+/*
+ * UB_KMEMSIZE accounting
+ */
+
+#ifdef CONFIG_BC_DEBUG_ITEMS
+#define CHARGE_ORDER(__o)		(1 << (__o))
+#define CHARGE_SIZE(__s)		1
+#else
+#define CHARGE_ORDER(__o)		(PAGE_SIZE << (__o))
+#define CHARGE_SIZE(__s)		(__s)
+#endif
+
+struct mm_struct;
+struct page;
+struct kmem_cache;
+
+UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj))
+UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj))
+
+UB_DECLARE_FUNC(int, ub_slab_charge(struct kmem_cache *cachep,
+			void *objp, gfp_t flags))
+UB_DECLARE_VOID_FUNC(ub_slab_uncharge(struct kmem_cache *cachep, void *obj))
+
+static inline struct user_beancounter* page_kmem_ub(struct page *page)
+{
+	return page->kmem_ub;
+}
+
+static inline enum ub_severity ub_gfp_sev(gfp_t gfp_mask)
+{
+	return (gfp_mask & __GFP_SOFT_UBC) ? UB_SOFT : UB_HARD;
+}
+
+extern int __ub_kmem_charge(struct user_beancounter *ub,
+		unsigned long size, gfp_t gfp_mask);
+extern void __ub_kmem_uncharge(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		unsigned long size);
+
+static inline int ub_kmem_charge(struct user_beancounter *ub,
+		unsigned long size, gfp_t gfp_mask)
+{
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ub_pcpu = ub_percpu(ub, smp_processor_id());
+	if (__try_charge_beancounter_percpu(ub, ub_pcpu, UB_KMEMSIZE, size)) {
+		local_irq_restore(flags);
+		return __ub_kmem_charge(ub, size, gfp_mask);
+	}
+	local_irq_restore(flags);
+	return 0;
+}
+
+static inline void ub_kmem_uncharge(struct user_beancounter *ub,
+		unsigned long size)
+{
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ub_pcpu = ub_percpu(ub, smp_processor_id());
+	if (__try_uncharge_beancounter_percpu(ub, ub_pcpu, UB_KMEMSIZE, size))
+		__ub_kmem_uncharge(ub, ub_pcpu, size);
+	local_irq_restore(flags);
+}
+
+static inline int ub_page_charge(struct page *page, int order,
+		struct user_beancounter *ub, gfp_t gfp_mask)
+{
+	if (ub_kmem_charge(ub, CHARGE_ORDER(order), gfp_mask))
+		return -ENOMEM;
+
+	BUG_ON(page->kmem_ub != NULL);
+	page->kmem_ub = get_beancounter(ub);
+	return 0;
+}
+
+static inline void ub_page_uncharge(struct page *page, int order)
+{
+	struct user_beancounter *ub = page->kmem_ub;
+
+	if (likely(ub == NULL))
+		return;
+
+	page->kmem_ub = NULL;
+	BUG_ON(ub->ub_magic != UB_MAGIC);
+	ub_kmem_uncharge(ub, CHARGE_ORDER(order));
+	put_beancounter(ub);
+}
+
+static inline int ub_page_table_get_one(struct mm_struct *mm)
+{
+	if (mm->page_table_precharge)
+		return 0;
+	if (ub_kmem_charge(mm->mm_ub, PAGE_SIZE,
+				GFP_KERNEL | __GFP_SOFT_UBC))
+		return -ENOMEM;
+	return 1;
+}
+
+static inline void ub_page_table_put_one(struct mm_struct *mm, int one)
+{
+	if (one)
+		ub_kmem_uncharge(mm->mm_ub, PAGE_SIZE);
+}
+
+static inline int ub_page_table_charge(struct mm_struct *mm, int one)
+{
+	if (one)
+		return 0;
+	if (unlikely(mm->page_table_precharge == 0))
+		return ub_kmem_charge(mm->mm_ub, PAGE_SIZE,
+				GFP_ATOMIC | __GFP_SOFT_UBC);
+	mm->page_table_precharge--;
+	return 0;
+}
+
+static inline void ub_page_table_uncharge(struct mm_struct *mm)
+{
+	mm->page_table_precharge++;
+}
+
+static inline int ub_page_table_precharge(struct mm_struct *mm, long precharge)
+{
+	if (ub_kmem_charge(mm->mm_ub, precharge << PAGE_SHIFT,
+				GFP_KERNEL | __GFP_SOFT_UBC))
+		return -ENOMEM;
+	mm->page_table_precharge += precharge;
+	return 0;
+}
+
+static inline void ub_page_table_commit(struct mm_struct *mm)
+{
+	if (unlikely(mm->page_table_precharge)) {
+		ub_kmem_uncharge(mm->mm_ub,
+				mm->page_table_precharge << PAGE_SHIFT);
+		mm->page_table_precharge = 0;
+	}
+}
+
+static inline void *ub_kmem_alloc(struct user_beancounter *ub,
+		struct kmem_cache *cachep, gfp_t gfp_flags)
+{
+	void *objp;
+
+	if (ub_kmem_charge(ub, cachep->objuse, gfp_flags))
+		return NULL;
+
+	objp = kmem_cache_alloc(cachep, gfp_flags);
+
+	if (unlikely(objp == NULL))
+		ub_kmem_uncharge(ub, cachep->objuse);
+
+	return objp;
+}
+
+static inline void ub_kmem_free(struct user_beancounter *ub,
+		struct kmem_cache *cachep, void *objp)
+{
+	kmem_cache_free(cachep, objp);
+	ub_kmem_uncharge(ub, cachep->objuse);
+}
+
+#ifdef CONFIG_BEANCOUNTERS
+static inline int should_charge(unsigned long cflags, gfp_t flags)
+{
+	if (!(cflags & SLAB_UBC))
+		return 0;
+	if ((cflags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC))
+		return 0;
+	return 1;
+}
+
+#define should_uncharge(cflags)	should_charge(cflags, __GFP_UBC)
+#else
+#define should_charge(cflags, f)	0
+#define should_uncharge(cflags)		0
+#endif
+
+#endif /* __UB_SLAB_H_ */
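
The charge-then-allocate helpers above are meant to be used in matched pairs.
A minimal kernel-style sketch of a caller, assuming CONFIG_BEANCOUNTERS; the
cache and function names are illustrative and not part of this patch:

	#include <bc/kmem.h>

	/* hypothetical cache, for illustration only */
	static struct kmem_cache *my_cachep;

	static void *my_obj_alloc(struct user_beancounter *ub)
	{
		/* ub_kmem_alloc() charges cachep->objuse against UB_KMEMSIZE
		 * first and allocates only if the charge succeeded; on
		 * allocation failure it uncharges again, so the caller
		 * never leaks a charge */
		return ub_kmem_alloc(ub, my_cachep, GFP_KERNEL | __GFP_SOFT_UBC);
	}

	static void my_obj_free(struct user_beancounter *ub, void *objp)
	{
		/* must pair with the same ub and cache that were charged */
		ub_kmem_free(ub, my_cachep, objp);
	}
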
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/misc.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/misc.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/misc.h	2015-01-21 12:02:43.388222019 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/misc.h	2015-01-21 12:02:43.496219151 +0300
@@ -0,0 +1,43 @@
+/*
+ *  include/bc/misc.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_MISC_H_
+#define __BC_MISC_H_
+
+#include <bc/decl.h>
+
+struct tty_struct;
+struct file;
+struct file_lock;
+struct sigqueue;
+
+UB_DECLARE_FUNC(int, ub_file_charge(struct file *f))
+UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f))
+UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard))
+UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl))
+UB_DECLARE_FUNC(int, ub_task_charge(struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_task_get(struct user_beancounter *ub,
+			struct task_struct *task))
+UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task))
+UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty))
+UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty))
+
+#ifdef CONFIG_BEANCOUNTERS
+#define set_flock_charged(fl)	do { (fl)->fl_charged = 1; } while (0)
+#define unset_flock_charged(fl)	do {		\
+		WARN_ON((fl)->fl_charged == 0);	\
+		(fl)->fl_charged = 0;		\
+	} while (0)
+#else
+#define set_flock_charged(fl)	do { } while (0)
+#define unset_flock_charged(fl)	do { } while (0)
+#endif
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/net.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/net.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/net.h	2015-01-21 12:02:43.279224911 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/net.h	2015-01-21 12:02:47.885102631 +0300
@@ -0,0 +1,203 @@
+/*
+ *  include/bc/net.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_NET_H_
+#define __BC_NET_H_
+
+/*
+ * UB_NUMXXXSOCK, UB_XXXBUF accounting
+ */
+
+#include <bc/decl.h>
+#include <bc/sock.h>
+#include <bc/beancounter.h>
+
+#define bid2sid(__bufid) \
+	((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK)
+
+#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \
+			~(SMP_CACHE_BYTES-1)))
+#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE)
+
+static inline int ub_skb_alloc_bc(struct sk_buff *skb, gfp_t gfp_mask)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	memset(skb_bc(skb), 0, sizeof(struct skb_beancounter));
+#endif
+	return 0;
+}
+
+static inline void ub_skb_free_bc(struct sk_buff *skb)
+{
+}
+
+#define IS_TCP_SOCK(__family, __type) \
+		(((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM)
+
+/* number of sockets */
+UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type, int kern))
+UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk, int kern))
+UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk, int kern))
+UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk))
+
+/* management of queue for send space */
+UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, 
+			unsigned long size))
+UB_DECLARE_FUNC(int, ub_sock_snd_queue_add(struct sock *sk, int resource, 
+			unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk))
+
+/* send space */
+UB_DECLARE_FUNC(int, ub_sock_make_wreserv(struct sock *sk, int bufid,
+			unsigned long size))
+UB_DECLARE_FUNC(int, ub_sock_get_wreserv(struct sock *sk, int bufid,
+			unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_sock_ret_wreserv(struct sock *sk, int bufid,
+			unsigned long size, unsigned long ressize))
+UB_DECLARE_FUNC(int, ub_sock_tcp_chargesend(struct sock *sk,
+			struct sk_buff *skb, enum ub_severity strict))
+UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk))
+UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk))
+
+UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk))
+
+/* receive space */
+UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb))
+UB_DECLARE_FUNC(int, ub_sock_tcp_chargerecv(struct sock *sk,
+			struct sk_buff *skb, enum ub_severity strict))
+
+/* skb destructor */
+UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb))
+
+static inline int ub_sock_makewres_other(struct sock *sk, unsigned long size)
+{
+	return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size);
+}
+
+UB_DECLARE_FUNC(int, ub_sock_makewres_poll(struct sock *sk, unsigned long size))
+
+UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk,
+			unsigned long size))
+
+static inline int ub_sock_getwres_tcp(struct sock *sk, unsigned long size)
+{
+	return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size);
+}
+
+UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk,
+			unsigned long size, unsigned long ressize))
+
+static inline void ub_sock_retwres_tcp(struct sock *sk, unsigned long size,
+		unsigned long ressize)
+{
+	ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize);
+}
+
+static inline int ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz)
+{
+	return ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz);
+}
+
+static inline int ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz)
+{
+	return ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz);
+}
+
+static inline int ub_tcpsndbuf_charge(struct sock *sk,
+		struct sk_buff *skb)
+{
+	return ub_sock_tcp_chargesend(sk, skb, UB_HARD);
+}
+
+static inline int ub_tcpsndbuf_charge_forced(struct sock *sk,
+		struct sk_buff *skb)
+{
+	return ub_sock_tcp_chargesend(sk, skb, UB_FORCE);
+}
+
+static inline int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb)
+{
+	return ub_sock_tcp_chargerecv(sk, skb, UB_SOFT);
+}
+
+static inline int ub_tcprcvbuf_charge_forced(struct sock *sk,
+		struct sk_buff *skb)
+{
+	return ub_sock_tcp_chargerecv(sk, skb, UB_FORCE);
+}
+
+/* Charge size */
+static inline unsigned long skb_charge_datalen(unsigned long chargesize)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	unsigned long slabsize;
+
+	chargesize -= sizeof(struct sk_buff);
+	slabsize = rounddown_pow_of_two(chargesize);
+
+	return (slabsize - sizeof(struct skb_shared_info)) &
+		~(SMP_CACHE_BYTES-1);
+#else
+	return 0;
+#endif
+}
+
+static inline unsigned long skb_charge_size_gen(unsigned long size)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	unsigned long slabsize;
+
+	size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info);
+	slabsize = roundup_pow_of_two(size);
+
+	return slabsize + sizeof(struct sk_buff);
+#else
+	return 0;
+#endif
+
+}
+
+static inline unsigned long skb_charge_size_const(unsigned long size)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	unsigned int ret;
+	if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64)
+		ret = 64 + sizeof(struct sk_buff);
+	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128)
+		ret = 128 + sizeof(struct sk_buff);
+	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256)
+		ret = 256 + sizeof(struct sk_buff);
+	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512)
+		ret = 512 + sizeof(struct sk_buff);
+	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024)
+		ret = 1024 + sizeof(struct sk_buff);
+	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048)
+		ret = 2048 + sizeof(struct sk_buff);
+	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096)
+		ret = 4096 + sizeof(struct sk_buff);
+	else
+		ret = skb_charge_size_gen(size);
+	return ret;
+#else
+	return 0;
+#endif
+}
+
+
+#define skb_charge_size(__size)			\
+	(__builtin_constant_p(__size)	?	\
+	 skb_charge_size_const(__size)	:	\
+	 skb_charge_size_gen(__size))
+
+UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb))
+UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, 
+			struct sock *sk, unsigned long size, int res))
+
+#endif
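
The charge-size math above rounds the skb data area up to the slab object
size actually backing it and then adds the sk_buff head on top. A
self-contained user-space mimic of skb_charge_size_gen(), assuming
illustrative sizes for struct sk_buff and struct skb_shared_info (the real
values are arch- and config-dependent) and cache-line alignment for
SKB_DATA_ALIGN():

	#include <stdio.h>

	#define SKB_SIZE	256	/* assumed sizeof(struct sk_buff) */
	#define SHINFO_SIZE	320	/* assumed sizeof(struct skb_shared_info) */
	#define SMP_CACHE	64	/* assumed SMP_CACHE_BYTES */
	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

	static unsigned long roundup_pow_of_two(unsigned long x)
	{
		unsigned long r = 1;

		while (r < x)
			r <<= 1;
		return r;
	}

	/* mirrors skb_charge_size_gen(): align the data area, add the
	 * shared info, round the slab object up to a power of two, then
	 * account the sk_buff head */
	static unsigned long charge_size(unsigned long size)
	{
		size = ALIGN_UP(size, SMP_CACHE) + SHINFO_SIZE;
		return roundup_pow_of_two(size) + SKB_SIZE;
	}

	int main(void)
	{
		/* 1000 -> align 1024, +320 = 1344, round 2048, +256 = 2304 */
		printf("%lu -> %lu\n", 1000UL, charge_size(1000UL));
		return 0;
	}
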
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/oom_kill.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/oom_kill.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/oom_kill.h	2015-01-21 12:02:43.439220664 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/oom_kill.h	2015-01-21 12:02:43.470219841 +0300
@@ -0,0 +1,20 @@
+#include <bc/decl.h>
+#include <bc/task.h>
+#include <bc/beancounter.h>
+
+UB_DECLARE_FUNC(int, ub_oom_lock(struct oom_control *oom_ctrl, gfp_t gfp_mask))
+UB_DECLARE_FUNC(struct user_beancounter *, ub_oom_select_worst(void))
+UB_DECLARE_VOID_FUNC(ub_oom_unlock(struct oom_control *oom_ctrl))
+UB_DECLARE_VOID_FUNC(ub_oom_mm_dead(struct mm_struct *mm))
+UB_DECLARE_FUNC(long, ub_current_overdraft(struct user_beancounter *ub))
+UB_DECLARE_FUNC(int, ub_oom_task_skip(struct user_beancounter *ub,
+			struct task_struct *tsk))
+UB_DECLARE_FUNC(unsigned long, ub_oom_total_pages(struct user_beancounter *ub))
+UB_DECLARE_FUNC(int, out_of_memory_in_ub(struct user_beancounter *ub,
+					gfp_t gfp_mask))
+UB_DECLARE_VOID_FUNC(ub_oom_start(struct oom_control *oom_ctrl))
+UB_DECLARE_VOID_FUNC(ub_oom_mark_mm(struct mm_struct *mm,
+			struct oom_control *oom_ctrl))
+
+#ifdef CONFIG_BEANCOUNTERS
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/proc.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/proc.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/proc.h	2015-01-21 12:02:43.388222019 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/proc.h	2015-01-21 12:02:43.388222019 +0300
@@ -0,0 +1,40 @@
+/*
+ *  include/bc/proc.h
+ *
+ *  Copyright (C) 2006  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __UB_PROC_H_
+#define __UB_PROC_H_
+
+#include <linux/seq_file.h>
+
+struct bc_proc_entry {
+	char *name;
+	union {
+		int (*show)(struct seq_file *, void *);
+		struct file_operations *fops;
+	} u;
+	struct bc_proc_entry *next;
+	int cookie;
+};
+
+struct user_beancounter;
+
+void bc_register_proc_entry(struct bc_proc_entry *);
+void bc_register_proc_root_entry(struct bc_proc_entry *);
+
+static inline struct user_beancounter *seq_beancounter(struct seq_file *f)
+{
+	return (struct user_beancounter *)(f->private);
+}
+
+extern const char *bc_proc_lu_fmt;
+extern const char *bc_proc_lu_lfmt;
+extern const char *bc_proc_llu_fmt;
+extern const char *bc_proc_lu_lu_fmt;
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/sock.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/sock.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/sock.h	2015-01-21 12:02:43.279224911 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/sock.h	2015-01-21 12:02:43.279224911 +0300
@@ -0,0 +1,47 @@
+/*
+ *  include/bc/sock.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_SOCK_H_
+#define __BC_SOCK_H_
+
+#include <bc/task.h>
+
+struct sock;
+struct sk_buff;
+
+struct skb_beancounter {
+	struct user_beancounter *ub;
+	unsigned long charged:27, resource:5;
+};
+
+struct sock_beancounter {
+	struct user_beancounter *ub;
+	/*
+	 * poll_reserv accounts space already charged for future sends.
+	 * It is required to make poll agree with sendmsg.
+	 * Additionally, it makes real charges (which take the bc spinlock)
+	 * rarer in the send path, speeding networking up.
+	 * For TCP (only): changes are protected by the socket lock (not bc!)
+	 * For all protocols: may be read without serialization in poll.
+	 */
+	unsigned long           poll_reserv;
+	unsigned long		forw_space;
+	/* fields below are protected by bc spinlock */
+	unsigned long           ub_waitspc;     /* space we are waiting for */
+	unsigned long           ub_wcharged;
+	struct list_head        ub_sock_list;
+};
+
+#define sock_bc(__sk)		(&(__sk)->sk_bc)
+#define skb_bc(__skb)		(&(__skb)->skb_bc)
+#define skbc_sock(__skbc)	(container_of(__skbc, struct sock, sk_bc))
+#define sock_has_ubc(__sk)	(sock_bc(__sk)->ub != NULL)
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/sock_orphan.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/sock_orphan.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/sock_orphan.h	2015-01-21 12:02:43.280224885 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/sock_orphan.h	2015-01-21 12:02:43.279224911 +0300
@@ -0,0 +1,104 @@
+/*
+ *  include/bc/sock_orphan.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_SOCK_ORPHAN_H_
+#define __BC_SOCK_ORPHAN_H_
+
+#include <net/tcp.h>
+
+#include "bc/beancounter.h"
+#include "bc/net.h"
+
+
+static inline struct percpu_counter *__ub_get_orphan_count_ptr(struct sock *sk)
+{
+	if (sock_has_ubc(sk))
+		return &sock_bc(sk)->ub->ub_orphan_count;
+	return sk->sk_prot->orphan_count;
+}
+
+static inline void ub_inc_orphan_count(struct sock *sk)
+{
+	percpu_counter_inc(__ub_get_orphan_count_ptr(sk));
+}
+
+static inline void ub_dec_orphan_count(struct sock *sk)
+{
+	percpu_counter_dec(__ub_get_orphan_count_ptr(sk));
+}
+
+static inline int ub_get_orphan_count(struct sock *sk)
+{
+	return percpu_counter_sum_positive(__ub_get_orphan_count_ptr(sk));
+}
+
+extern int __ub_too_many_orphans(struct sock *sk, int count);
+static inline int ub_too_many_orphans(struct sock *sk, int count)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	if (__ub_too_many_orphans(sk, count))
+		return 1;
+#endif
+	return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans ||
+		(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+		 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]));
+}
+
+#include <bc/kmem.h>
+
+struct inet_timewait_sock;
+
+static inline void ub_timewait_mod(struct inet_timewait_sock *tw, int incdec)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ub;
+
+	ub = slab_ub(tw);
+	if (ub != NULL)
+		ub->ub_tw_count += incdec;
+#endif
+}
+
+static inline int __ub_timewait_check(struct sock *sk)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ub;
+	unsigned long mem_max, mem;
+	int tw_count;
+
+	ub = sock_bc(sk)->ub;
+	if (ub == NULL)
+		return 1;
+
+	tw_count = ub->ub_tw_count;
+	mem_max = sysctl_tcp_max_tw_kmem_fraction *
+		((ub->ub_parms[UB_KMEMSIZE].limit >> 10) + 1);
+	mem = kmem_cache_objuse(sk->sk_prot_creator->twsk_prot->twsk_slab);
+	mem *= tw_count;
+	return tw_count < sysctl_tcp_max_tw_buckets_ub && mem < mem_max;
+#else
+	return 1;
+#endif
+}
+
+#define ub_timewait_inc(tw, twdr) do {			\
+		if ((twdr)->ub_managed)			\
+			ub_timewait_mod(tw, 1);		\
+	} while (0)
+
+#define ub_timewait_dec(tw, twdr) do {			\
+		if ((twdr)->ub_managed)			\
+			ub_timewait_mod(tw, -1);	\
+	} while (0)
+
+#define ub_timewait_check(sk, twdr) ((!(twdr)->ub_managed) || \
+					__ub_timewait_check(sk))
+
+#endif
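
ub_too_many_orphans() layers the per-beancounter orphan limit on top of the
global ones. A kernel-style sketch of the call-site shape (the function name
and the reset decision are illustrative; the real users sit in the TCP close
paths):

	#include <bc/sock_orphan.h>

	static int orphan_socket(struct sock *sk)
	{
		ub_inc_orphan_count(sk);	/* per-UB counter when sk has a UB */
		if (ub_too_many_orphans(sk, ub_get_orphan_count(sk))) {
			/* either the per-UB check (__ub_too_many_orphans) or
			 * the global sysctl_tcp_max_orphans / TCP memory
			 * thresholds fired: undo and reset the connection */
			ub_dec_orphan_count(sk);
			return -1;
		}
		return 0;
	}
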
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/statd.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/statd.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/statd.h	2015-01-21 12:02:43.389221992 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/statd.h	2015-01-21 12:02:43.389221992 +0300
@@ -0,0 +1,70 @@
+/*
+ *  include/bc/statd.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_STATD_H_
+#define __BC_STATD_H_
+
+/* sys_ubstat commands list */
+#define UBSTAT_READ_ONE			0x010000
+#define UBSTAT_READ_ALL			0x020000
+#define UBSTAT_READ_FULL		0x030000
+#define UBSTAT_UBLIST			0x040000
+#define UBSTAT_UBPARMNUM		0x050000
+#define UBSTAT_GETTIME			0x060000
+
+#define UBSTAT_CMD(func)		((func) & 0xF0000)
+#define UBSTAT_PARMID(func)		((func) & 0x0FFFF)
+
+#define TIME_MAX_SEC		(LONG_MAX / HZ)
+#define TIME_MAX_JIF		(TIME_MAX_SEC * HZ)
+
+typedef unsigned long ubstattime_t;
+
+typedef struct {
+	ubstattime_t	start_time;
+	ubstattime_t	end_time;
+	ubstattime_t	cur_time;
+} ubgettime_t;
+
+typedef struct {
+	long		maxinterval;
+	int		signum;
+} ubnotifrq_t;
+
+typedef struct {
+	unsigned long	maxheld;
+	unsigned long	failcnt;
+} ubstatparm_t;
+
+typedef struct {
+	unsigned long	barrier;
+	unsigned long	limit;
+	unsigned long	held;
+	unsigned long	maxheld;
+	unsigned long	minheld;
+	unsigned long	failcnt;
+	unsigned long	__unused1;
+	unsigned long	__unused2;
+} ubstatparmf_t;
+
+typedef struct {
+	ubstattime_t	start_time;
+	ubstattime_t	end_time;
+	ubstatparmf_t	param[0];
+} ubstatfull_t;
+
+#ifdef __KERNEL__
+struct ub_stat_notify {
+	struct list_head	list;
+	struct task_struct	*task;
+	int			signum;
+};
+#endif
+#endif
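
The sys_ubstat command word packs the command into the high bits and a
parameter id into the low 16 bits. A small user-space illustration of the
encoding macros above (the parameter id 3 is an arbitrary example):

	#include <stdio.h>

	#define UBSTAT_READ_ONE		0x010000
	#define UBSTAT_CMD(func)	((func) & 0xF0000)
	#define UBSTAT_PARMID(func)	((func) & 0x0FFFF)

	int main(void)
	{
		unsigned long func = UBSTAT_READ_ONE | 3; /* read parameter #3 */

		/* prints: cmd=0x10000 parmid=3 */
		printf("cmd=%#lx parmid=%lu\n",
		       UBSTAT_CMD(func), UBSTAT_PARMID(func));
		return 0;
	}
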
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/task.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/task.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/task.h	2015-01-21 12:02:43.389221992 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/task.h	2015-01-21 12:02:43.389221992 +0300
@@ -0,0 +1,62 @@
+/*
+ *  include/bc/task.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_TASK_H_
+#define __BC_TASK_H_
+
+struct user_beancounter;
+
+
+#ifdef CONFIG_BEANCOUNTERS
+struct task_beancounter {
+	struct user_beancounter	*exec_ub;
+	struct user_beancounter *saved_ub;
+	struct user_beancounter	*task_ub;
+	unsigned long oom_generation;
+};
+
+extern int set_task_exec_ub(struct task_struct *, struct user_beancounter *);
+
+#define get_task_ub(__task)	((__task)->task_bc.task_ub)
+
+extern struct user_beancounter ub0;
+#define get_ub0()	(&ub0)
+
+#define ub_save_context(t)	do {				\
+		t->task_bc.saved_ub = t->task_bc.exec_ub;	\
+		t->task_bc.exec_ub = get_ub0();			\
+	} while (0)
+#define ub_restore_context(t)	do {				\
+		t->task_bc.exec_ub = t->task_bc.saved_ub;	\
+	} while (0)
+
+#define get_exec_ub()		(current->task_bc.exec_ub)
+#define set_exec_ub(__newub)		\
+({					\
+	struct user_beancounter *old;	\
+	struct task_beancounter *tbc;	\
+ 					\
+	tbc = &current->task_bc;	\
+	old = tbc->exec_ub;		\
+	tbc->exec_ub = __newub;		\
+	old;				\
+})
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define get_ub0()		(NULL)
+#define get_exec_ub()		(NULL)
+#define get_task_ub(task)	(NULL)
+#define set_exec_ub(__ub)	(NULL)
+#define ub_save_context(t)	do { } while (0)
+#define ub_restore_context(t)	do { } while (0)
+
+#endif /* CONFIG_BEANCOUNTERS */
+#endif /* __BC_TASK_H_ */
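
ub_save_context()/ub_restore_context() bracket work that must be accounted
to the host beancounter (ub0) rather than to the task's current exec_ub. A
kernel-style sketch; do_host_side_work() is a placeholder:

	#include <bc/task.h>
	#include <linux/sched.h>

	static void do_host_side_work(void);	/* placeholder */

	static void run_accounted_to_host(struct task_struct *t)
	{
		ub_save_context(t);	/* exec_ub temporarily becomes ub0 */
		do_host_side_work();
		ub_restore_context(t);	/* back to the saved exec_ub */
	}
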
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/tcp.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/tcp.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/tcp.h	2015-01-21 12:02:43.280224885 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/tcp.h	2015-01-21 12:02:43.280224885 +0300
@@ -0,0 +1,76 @@
+/*
+ *  include/bc/tcp.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_TCP_H_
+#define __BC_TCP_H_
+
+/*
+ * UB_NUMXXXSOCK, UB_XXXBUF accounting
+ */
+
+#include <bc/sock.h>
+#include <bc/beancounter.h>
+
+static inline void ub_tcp_update_maxadvmss(struct sock *sk)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	if (!sock_has_ubc(sk))
+		return;
+	if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss)
+		return;
+
+	sock_bc(sk)->ub->ub_maxadvmss =
+		skb_charge_size(MAX_HEADER + sizeof(struct iphdr)
+				+ sizeof(struct tcphdr)	+ tcp_sk(sk)->advmss);
+#endif
+}
+
+static inline int ub_tcp_rmem_allows_expand(struct sock *sk)
+{
+	if (tcp_memory_pressure)
+		return 0;
+#ifdef CONFIG_BEANCOUNTERS
+	if (sock_has_ubc(sk)) {
+		struct user_beancounter *ub;
+
+		ub = sock_bc(sk)->ub;
+		if (ub->ub_rmem_pressure == UB_RMEM_EXPAND)
+			return 1;
+		if (ub->ub_rmem_pressure == UB_RMEM_SHRINK)
+			return 0;
+		return sk->sk_rcvbuf <= ub->ub_rmem_thres;
+	}
+#endif
+	return 1;
+}
+
+static inline int ub_tcp_memory_pressure(struct sock *sk)
+{
+	if (tcp_memory_pressure)
+		return 1;
+#ifdef CONFIG_BEANCOUNTERS
+	if (sock_has_ubc(sk))
+		return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND;
+#endif
+	return 0;
+}
+
+static inline int ub_tcp_shrink_rcvbuf(struct sock *sk)
+{
+	if (tcp_memory_pressure)
+		return 1;
+#ifdef CONFIG_BEANCOUNTERS
+	if (sock_has_ubc(sk))
+		return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK;
+#endif
+	return 0;
+}
+
+#endif
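
The helpers above expose a per-UB receive-memory state machine
(UB_RMEM_EXPAND / UB_RMEM_SHRINK / neutral, with ub_rmem_thres as the
tie-breaker). A sketch of how a receive path might consult them;
grow_rcvbuf() and shrink_rcvbuf() are placeholders:

	#include <bc/tcp.h>

	static void grow_rcvbuf(struct sock *sk);	/* placeholder */
	static void shrink_rcvbuf(struct sock *sk);	/* placeholder */

	static void tune_rcvbuf(struct sock *sk)
	{
		if (ub_tcp_rmem_allows_expand(sk))
			grow_rcvbuf(sk);
		else if (ub_tcp_shrink_rcvbuf(sk))
			shrink_rcvbuf(sk);
		/* otherwise leave sk->sk_rcvbuf as it is */
	}
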
diff -upr linux-2.6.32-504.3.3.el6.orig/include/bc/vmpages.h linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/vmpages.h
--- linux-2.6.32-504.3.3.el6.orig/include/bc/vmpages.h	2015-01-21 12:02:43.389221992 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/bc/vmpages.h	2015-01-21 12:02:58.704815419 +0300
@@ -0,0 +1,184 @@
+/*
+ *  include/bc/vmpages.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __UB_PAGES_H_
+#define __UB_PAGES_H_
+
+#include <linux/linkage.h>
+#include <linux/sched.h>	/* for get_exec_ub() */
+#include <linux/mm.h>
+#include <bc/beancounter.h>
+#include <bc/decl.h>
+
+extern int glob_ve_meminfo;
+
+/*
+ * Check whether vma has private or copy-on-write mapping.
+ */
+#define VM_UB_PRIVATE(__flags, __file)					\
+		( ((__flags) & VM_WRITE) ?				\
+			(__file) == NULL || !((__flags) & VM_SHARED) :	\
+			0						\
+		)
+
+UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm,
+			unsigned long size,
+			unsigned vm_flags,
+			struct file *vm_file,
+			int strict))
+UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm,
+			unsigned long size,
+			unsigned vm_flags,
+			struct file *vm_file))
+
+struct shmem_inode_info;
+UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi))
+UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi,
+			unsigned long size))
+#define ub_tmpfs_respages_dec(shi)	ub_tmpfs_respages_sub(shi, 1)
+
+UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm,
+			unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm,
+			unsigned long size))
+UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi,
+			unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi,
+			unsigned long size))
+
+extern void __ub_update_oomguarpages(struct user_beancounter *ub);
+
+static inline int ub_swap_full(struct user_beancounter *ub)
+{
+	return (ub->ub_parms[UB_SWAPPAGES].held * 2 >
+			ub->ub_parms[UB_SWAPPAGES].limit);
+}
+
+
+struct swap_info_struct;
+
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+
+extern int ub_swap_init(struct swap_info_struct *si, pgoff_t num);
+extern void ub_swap_fini(struct swap_info_struct *si);
+extern void ub_swapentry_get(struct swap_info_struct *si, pgoff_t offset,
+			     struct user_beancounter *ub);
+extern void ub_swapentry_put(struct swap_info_struct *si, pgoff_t offset);
+extern void ub_swapentry_charge(struct swap_info_struct *si, pgoff_t offset);
+extern void ub_swapentry_uncharge(struct swap_info_struct *si, pgoff_t offset);
+extern void ub_swapentry_recharge(struct swap_info_struct *si, pgoff_t offset,
+				  struct user_beancounter *new_ub);
+
+#else /* CONFIG_BC_SWAP_ACCOUNTING */
+
+static inline int ub_swap_init(struct swap_info_struct *si, pgoff_t num)
+{
+	return 0;
+}
+static inline void ub_swap_fini(struct swap_info_struct *si) { }
+static inline void ub_swapentry_get(struct swap_info_struct *si, pgoff_t offset,
+			     struct user_beancounter *ub) { }
+static inline void ub_swapentry_put(struct swap_info_struct *si, pgoff_t offset) { }
+static inline void ub_swapentry_charge(struct swap_info_struct *si, pgoff_t offset) { }
+static inline void ub_swapentry_uncharge(struct swap_info_struct *si, pgoff_t offset) { }
+static inline void ub_swapentry_recharge(struct swap_info_struct *si, pgoff_t offset,
+					 struct user_beancounter *new_ub) { }
+
+#endif /* CONFIG_BC_SWAP_ACCOUNTING */
+
+
+#ifdef CONFIG_BC_RSS_ACCOUNTING
+
+int ub_hugetlb_charge(struct user_beancounter *ub, struct page *page);
+void ub_hugetlb_uncharge(struct page *page);
+
+int ub_try_to_free_pages(struct user_beancounter *ub, gfp_t gfp_mask);
+
+extern int __ub_phys_charge(struct user_beancounter *ub,
+		unsigned long pages, gfp_t gfp_mask);
+
+static inline int ub_phys_charge(struct user_beancounter *ub,
+		unsigned long pages, gfp_t gfp_mask)
+{
+	if (__try_charge_beancounter_percpu(ub, ub_percpu(ub, get_cpu()),
+				UB_PHYSPAGES, pages)) {
+		put_cpu();
+		return __ub_phys_charge(ub, pages, gfp_mask);
+	}
+	put_cpu();
+	return 0;
+}
+
+static inline void ub_phys_uncharge(struct user_beancounter *ub,
+		unsigned long pages)
+{
+	uncharge_beancounter_fast(ub, UB_PHYSPAGES, pages);
+}
+
+int __ub_check_ram_limits(struct user_beancounter *ub, gfp_t gfp_mask, int size);
+
+static inline int ub_check_ram_limits(struct user_beancounter *ub, gfp_t gfp_mask)
+{
+	if (likely(ub->ub_parms[UB_PHYSPAGES].limit == UB_MAXVALUE ||
+			!precharge_beancounter(ub, UB_PHYSPAGES, 1)))
+		return 0;
+
+	return __ub_check_ram_limits(ub, gfp_mask, 1);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+static inline int ub_precharge_hpage(struct mm_struct *mm)
+{
+	struct user_beancounter *ub = mm_ub(mm);
+
+	if (likely(ub->ub_parms[UB_PHYSPAGES].limit == UB_MAXVALUE ||
+	    !precharge_beancounter(ub, UB_PHYSPAGES, HPAGE_PMD_NR)))
+		return 0;
+
+	return __ub_check_ram_limits(ub, GFP_TRANSHUGE, HPAGE_PMD_NR);
+}
+
+#endif
+
+#else /* CONFIG_BC_RSS_ACCOUNTING */
+
+static inline int ub_try_to_free_pages(struct user_beancounter *ub, gfp_t gfp_mask)
+{
+	return -ENOSYS;
+}
+
+static inline int ub_phys_charge(struct user_beancounter *ub,
+		unsigned long pages, gfp_t gfp_mask)
+{
+	return charge_beancounter_fast(ub, UB_PHYSPAGES, pages, UB_FORCE);
+}
+
+static inline void ub_phys_uncharge(struct user_beancounter *ub,
+		unsigned long pages)
+{
+	uncharge_beancounter_fast(ub, UB_PHYSPAGES, pages);
+}
+
+static inline int ub_check_ram_limits(struct user_beancounter *ub, gfp_t gfp_mask)
+{
+	return 0;
+}
+
+static inline int ub_precharge_hpage(struct mm_struct *mm)
+{
+	return 0;
+}
+#endif /* CONFIG_BC_RSS_ACCOUNTING */
+
+void __show_ub_mem(struct user_beancounter *ub);
+void show_ub_mem(struct user_beancounter *ub);
+
+#endif /* __UB_PAGES_H_ */
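
VM_UB_PRIVATE() counts a mapping as chargeable "private" memory only when it
is writable and either anonymous or not shared. A compact user-space check
of the macro's truth table; the VM_WRITE/VM_SHARED bit values are
illustrative stand-ins:

	#include <stdio.h>

	#define VM_WRITE	0x2	/* illustrative bit values */
	#define VM_SHARED	0x8

	#define VM_UB_PRIVATE(flags, file)				\
		(((flags) & VM_WRITE) ?					\
			((file) == NULL || !((flags) & VM_SHARED)) : 0)

	int main(void)
	{
		void *f = (void *)1;	/* "has a backing file" stand-in */

		printf("%d\n", VM_UB_PRIVATE(VM_WRITE, NULL));          /* 1: anon, writable */
		printf("%d\n", VM_UB_PRIVATE(VM_WRITE, f));             /* 1: private file   */
		printf("%d\n", VM_UB_PRIVATE(VM_WRITE | VM_SHARED, f)); /* 0: shared file    */
		printf("%d\n", VM_UB_PRIVATE(0, NULL));                 /* 0: not writable   */
		return 0;
	}
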
diff -upr linux-2.6.32-504.3.3.el6.orig/include/crypto/sha.h linux-2.6.32-504.3.3.el6-042stab103_6/include/crypto/sha.h
--- linux-2.6.32-504.3.3.el6.orig/include/crypto/sha.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/crypto/sha.h	2015-01-21 12:02:52.161989093 +0300
@@ -82,4 +82,7 @@ struct sha512_state {
 	u8 buf[SHA512_BLOCK_SIZE];
 };
 
+extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len);
+
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/aio.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/aio.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/aio.h	2014-12-12 23:29:03.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/aio.h	2015-01-21 12:02:58.219828291 +0300
@@ -24,6 +24,7 @@ struct kioctx;
 #define KIOCB_C_COMPLETE	0x02
 
 #define KIOCB_SYNC_KEY		(~0U)
+#define KIOCB_KERNEL_KEY		(~1U)
 
 /* ki_flags bits */
 /*
@@ -99,6 +100,7 @@ struct kiocb {
 	union {
 		void __user		*user;
 		struct task_struct	*tsk;
+		void			(*complete)(u64 user_data, long res);
 	} ki_obj;
 
 	__u64			ki_user_data;	/* user's data for completion */
@@ -124,9 +126,11 @@ struct kiocb {
 	 * this is the underlying eventfd context to deliver events to.
 	 */
 	struct eventfd_ctx	*ki_eventfd;
+	struct iov_iter		*ki_iter;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
+#define is_kernel_kiocb(iocb)	((iocb)->ki_key == KIOCB_KERNEL_KEY)
 #define init_sync_kiocb(x, filp)			\
 	do {						\
 		struct task_struct *tsk = current;	\
@@ -181,6 +185,7 @@ struct kioctx {
 	atomic_t		users;
 	int			dead;
 	struct mm_struct	*mm;
+	struct ve_struct	*ve;
 
 	/* This needs improving */
 	unsigned long		user_id;
@@ -204,10 +209,16 @@ struct kioctx {
 	struct rcu_head		rcu_head;
 };
 
+#define AIO_MAX_NR_DEFAULT	0x10000
+
+extern struct kmem_cache	*kioctx_cachep;
+
 /* prototypes */
 extern unsigned aio_max_size;
 
 #ifdef CONFIG_AIO
+extern void aio_kick_handler(struct work_struct *);
+extern void wait_for_all_aios(struct kioctx *ctx);
 extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
 extern int aio_put_req(struct kiocb *iocb);
 extern void kick_iocb(struct kiocb *iocb);
@@ -216,7 +227,17 @@ struct mm_struct;
 extern void exit_aio(struct mm_struct *mm);
 extern long do_io_submit(aio_context_t ctx_id, long nr,
 			 struct iocb __user *__user *iocbpp, bool compat);
+struct kiocb *aio_kernel_alloc(gfp_t gfp);
+void aio_kernel_free(struct kiocb *iocb);
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+			  unsigned short op, struct iov_iter *iter, loff_t off);
+void aio_kernel_init_callback(struct kiocb *iocb,
+			      void (*complete)(u64 user_data, long res),
+			      u64 user_data);
+int aio_kernel_submit(struct kiocb *iocb);
 #else
+static inline void wait_for_all_aios(struct kioctx *ctx) { }
+static inline void aio_kick_handler(struct work_struct *work) { }
 static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
 static inline int aio_put_req(struct kiocb *iocb) { return 0; }
 static inline void kick_iocb(struct kiocb *iocb) { }
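
The new kernel-side AIO entry points above form an alloc / init-iter /
init-callback / submit sequence. A kernel-style sketch assuming an already
prepared struct iov_iter, and assuming the op argument takes the
IOCB_CMD_*_ITER opcodes added in the aio_abi.h hunk below; my_complete()
and the cookie are illustrative:

	#include <linux/aio.h>
	#include <linux/errno.h>
	#include <linux/gfp.h>

	static void my_complete(u64 user_data, long res)
	{
		/* illustrative completion callback */
	}

	static int submit_kernel_read(struct file *filp, struct iov_iter *iter,
				      loff_t off, u64 cookie)
	{
		struct kiocb *iocb = aio_kernel_alloc(GFP_KERNEL);

		if (!iocb)
			return -ENOMEM;
		aio_kernel_init_iter(iocb, filp, IOCB_CMD_READ_ITER, iter, off);
		aio_kernel_init_callback(iocb, my_complete, cookie);
		return aio_kernel_submit(iocb);	/* completes via my_complete() */
	}
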
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/aio_abi.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/aio_abi.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/aio_abi.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/aio_abi.h	2015-01-21 12:02:58.219828291 +0300
@@ -44,6 +44,8 @@ enum {
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
+	IOCB_CMD_READ_ITER = 9,
+	IOCB_CMD_WRITE_ITER = 10,
 };
 
 /*
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/anon_inodes.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/anon_inodes.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/anon_inodes.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/anon_inodes.h	2015-01-21 12:02:50.739026868 +0300
@@ -8,11 +8,17 @@
 #ifndef _LINUX_ANON_INODES_H
 #define _LINUX_ANON_INODES_H
 
+struct inode;
+extern struct inode *anon_inode_inode;
+
 struct file *anon_inode_getfile(const char *name,
 				const struct file_operations *fops,
 				void *priv, int flags);
 int anon_inode_getfd(const char *name, const struct file_operations *fops,
 		     void *priv, int flags);
+int __anon_inode_getfd(const char *name, const struct file_operations *fops,
+		       void *priv, int flags,
+		       const struct dentry_operations *dops);
 
 #endif /* _LINUX_ANON_INODES_H */
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/audit.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/audit.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/audit.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/audit.h	2015-01-21 12:02:45.874156019 +0300
@@ -639,6 +639,7 @@ extern int audit_signals;
 #define audit_inode_child(p,d,t) do { ; } while (0)
 #define audit_core_dumps(i) do { ; } while (0)
 #define auditsc_get_stamp(c,t,s) (0)
+#define audit_set_loginuid(t,l) (0)
 #define audit_get_loginuid(t) (-1)
 #define audit_get_sessionid(t) (-1)
 #define audit_log_task_context(b) do { ; } while (0)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/backing-dev.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/backing-dev.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/backing-dev.h	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/backing-dev.h	2015-01-21 12:02:55.243907283 +0300
@@ -65,9 +65,12 @@ struct backing_dev_info {
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
+	congested_fn *congested_fn2; /* use per-bdi waitq */
 	void *congested_data;	/* Pointer to aux data for congested func */
 	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
 	void *unplug_io_data;
+	int (*bd_full_fn) (struct backing_dev_info *, long long, int);
+	int bd_full; /* backing dev is full */
 
 	char *name;
 
@@ -79,6 +82,9 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
+	unsigned int min_dirty_pages;
+	unsigned int max_dirty_pages;
+
 	struct bdi_writeback wb;  /* default writeback info for this bdi */
 	spinlock_t wb_lock;	  /* protects update side of wb_list */
 	struct list_head wb_list; /* the flusher threads hanging off this bdi */
@@ -87,6 +93,8 @@ struct backing_dev_info {
 
 	struct device *dev;
 
+	wait_queue_head_t cong_waitq;	/* to wait on congestion */
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
@@ -101,7 +109,8 @@ int bdi_register(struct backing_dev_info
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
-void bdi_start_background_writeback(struct backing_dev_info *bdi);
+void bdi_start_background_writeback(struct backing_dev_info *bdi,
+		struct user_beancounter *);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);
@@ -195,6 +204,8 @@ static inline unsigned long bdi_stat_err
 
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned int min_dirty);
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned int max_dirty);
 
 /*
  * Flags in backing_dev_info::capability
@@ -247,6 +258,7 @@ int bdi_set_max_ratio(struct backing_dev
 #endif
 
 extern struct backing_dev_info default_backing_dev_info;
+extern struct backing_dev_info noop_backing_dev_info;
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);
 
 int writeback_in_progress(struct backing_dev_info *bdi);
@@ -274,6 +286,31 @@ static inline int bdi_rw_congested(struc
 				  (1 << BDI_async_congested));
 }
 
+/* congestion helpers for block-devices supporting per-bdi waitq */
+static inline int bdi_congested2(struct backing_dev_info *bdi, int bdi_bits)
+{
+	if (bdi->congested_fn2)
+		return bdi->congested_fn2(bdi->congested_data, bdi_bits);
+	return 0;
+}
+
+static inline int bdi_read_congested2(struct backing_dev_info *bdi)
+{
+	return bdi_congested2(bdi, 1 << BDI_sync_congested);
+}
+
+static inline int bdi_write_congested2(struct backing_dev_info *bdi)
+{
+	return bdi_congested2(bdi, 1 << BDI_async_congested);
+}
+
+static inline int bdi_rw_congested2(struct backing_dev_info *bdi)
+{
+	return bdi_congested2(bdi, (1 << BDI_sync_congested) |
+				  (1 << BDI_async_congested));
+}
+
+
 enum {
 	BLK_RW_ASYNC	= 0,
 	BLK_RW_SYNC	= 1,
@@ -326,6 +363,11 @@ static inline bool mapping_cap_account_d
 	return bdi_cap_account_dirty(mapping->backing_dev_info);
 }
 
+static inline bool mapping_cap_account_writeback(struct address_space *mapping)
+{
+	return bdi_cap_account_writeback(mapping->backing_dev_info);
+}
+
 static inline bool mapping_cap_swap_backed(struct address_space *mapping)
 {
 	return bdi_cap_swap_backed(mapping->backing_dev_info);
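
The congested_fn2/cong_waitq additions suggest that callers can sleep on the
per-bdi waitqueue until the device uncongests. A kernel-style sketch of that
shape (purely illustrative; this hunk only shows the declarations, not the
wakeup side):

	#include <linux/backing-dev.h>
	#include <linux/wait.h>

	static void wait_for_write_room(struct backing_dev_info *bdi)
	{
		/* sleep until the async (write) side reports uncongested */
		wait_event(bdi->cong_waitq, !bdi_write_congested2(bdi));
	}
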
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/bio.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/bio.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/bio.h	2014-12-12 23:29:30.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/bio.h	2015-01-21 12:02:58.222828211 +0300
@@ -51,6 +51,14 @@ struct bio_vec {
 	unsigned int	bv_offset;
 };
 
+static inline ssize_t bvec_length(const struct bio_vec *bvec, unsigned long nr)
+{
+	ssize_t bytes = 0;
+	while (nr--)
+		bytes += (bvec++)->bv_len;
+	return bytes;
+}
+
 struct bio_set;
 struct bio;
 struct bio_integrity_payload;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/bit_spinlock.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/bit_spinlock.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/bit_spinlock.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/bit_spinlock.h	2015-01-21 12:02:54.123937014 +0300
@@ -84,7 +84,7 @@ static inline int bit_spin_is_locked(int
 {
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 	return test_bit(bitnum, addr);
-#elif defined CONFIG_PREEMPT
+#elif defined CONFIG_PREEMPT_COUNT
 	return preempt_count();
 #else
 	return 1;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/blkdev.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/blkdev.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/blkdev.h	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/blkdev.h	2015-01-21 12:02:52.554978660 +0300
@@ -416,6 +416,7 @@ struct request_queue
 	unsigned int		flush_queue_delayed:1;
 	unsigned int		flush_pending_idx:1;
 	unsigned int		flush_running_idx:1;
+	atomic_t		flush_tag;
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
 	struct list_head	flush_data_in_flight;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/blkpg.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/blkpg.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/blkpg.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/blkpg.h	2015-01-21 12:02:42.898235027 +0300
@@ -40,6 +40,8 @@ struct blkpg_ioctl_arg {
 /* The subfunctions (for the op field) */
 #define BLKPG_ADD_PARTITION	1
 #define BLKPG_DEL_PARTITION	2
+#define BLKPG_RESIZE_PARTITION	3
+#define BLKPG_GET_PARTITION	4
 
 /* Sizes of name fields. Unused at present. */
 #define BLKPG_DEVNAMELTH	64
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/buffer_head.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/buffer_head.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/buffer_head.h	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/buffer_head.h	2015-01-21 12:02:58.259827230 +0300
@@ -48,6 +48,7 @@ struct page;
 struct buffer_head;
 struct address_space;
 typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
+typedef int (bh_submit_io_t)(int rw, struct buffer_head *bh, void *fsdata);
 
 /*
  * Historically, a buffer_head was used to map a single block
@@ -186,6 +187,7 @@ int sync_dirty_buffer(struct buffer_head
 int __sync_dirty_buffer(struct buffer_head *bh, int rw);
 void write_dirty_buffer(struct buffer_head *bh, int rw);
 int submit_bh(int, struct buffer_head *);
+int generic_submit_bh_handler(int rw, struct buffer_head * bh, void *fsdata);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
 int bh_uptodate_or_lock(struct buffer_head *bh);
@@ -202,6 +204,10 @@ int block_write_full_page(struct page *p
 				struct writeback_control *wbc);
 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 			struct writeback_control *wbc, bh_end_io_t *handler);
+int generic_block_write_full_page(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc,
+			bh_submit_io_t *submit,	bh_end_io_t *handler);
+
 int block_read_full_page(struct page*, get_block_t*);
 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 				unsigned long from);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/capability.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/capability.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/capability.h	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/capability.h	2015-01-21 12:02:43.970206566 +0300
@@ -197,12 +197,9 @@ struct cpu_vfs_cap_data {
 
 #define CAP_NET_BROADCAST    11
 
-/* Allow interface configuration */
 /* Allow administration of IP firewall, masquerading and accounting */
 /* Allow setting debug option on sockets */
 /* Allow modification of routing tables */
-/* Allow setting arbitrary process / process group ownership on
-   sockets */
 /* Allow binding to any address for transparent proxying */
 /* Allow setting TOS (type of service) */
 /* Allow setting promiscuous mode */
@@ -232,6 +229,7 @@ struct cpu_vfs_cap_data {
 #define CAP_SYS_MODULE       16
 
 /* Allow ioperm/iopl access */
+/* Allow O_DIRECT access */
 /* Allow sending USB messages to any device via /proc/bus/usb */
 
 #define CAP_SYS_RAWIO        17
@@ -250,24 +248,19 @@ struct cpu_vfs_cap_data {
 
 /* Allow configuration of the secure attention key */
 /* Allow administration of the random device */
-/* Allow examination and configuration of disk quotas */
 /* Allow configuring the kernel's syslog (printk behaviour) */
 /* Allow setting the domainname */
 /* Allow setting the hostname */
 /* Allow calling bdflush() */
-/* Allow mount() and umount(), setting up new smb connection */
+/* Allow setting up new smb connection */
 /* Allow some autofs root ioctls */
 /* Allow nfsservctl */
 /* Allow VM86_REQUEST_IRQ */
 /* Allow to read/write pci config on alpha */
 /* Allow irix_prctl on mips (setstacksize) */
 /* Allow flushing all cache on m68k (sys_cacheflush) */
-/* Allow removing semaphores */
-/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
-   and shared memory */
 /* Allow locking/unlocking of shared memory segment */
 /* Allow turning swap on/off */
-/* Allow forged pids on socket credentials passing */
 /* Allow setting readahead and flushing buffers on block devices */
 /* Allow setting geometry in floppy driver */
 /* Allow turning DMA on/off in xd driver */
@@ -340,6 +333,61 @@ struct cpu_vfs_cap_data {
 
 #define CAP_SETFCAP	     31
 
+#ifdef __KERNEL__
+/*
+ * Important note: VZ capabilities intersect with the CAP_AUDIT ones.
+ * This is for compatibility reasons and is harmless, since both VZ
+ * and Audit/SELinux caps are disabled in VPSs.
+ */
+
+/* Allow access to all information. Otherwise some structures are hidden to
+ * keep different Virtual Environments on the same node from interacting
+ * (NOW OBSOLETE)
+ */
+#define CAP_SETVEID	     29
+
+#define capable_setveid()	({			\
+		ve_is_super(get_exec_env()) &&		\
+			(capable(CAP_SYS_ADMIN) ||	\
+			 capable(CAP_VE_ADMIN));	\
+	})
+
+/*
+ * coincides with CAP_AUDIT_CONTROL but we don't care, since
+ * audit is disabled in Virtuozzo
+ */
+#define CAP_VE_ADMIN	     30
+
+#ifdef CONFIG_VE
+
+/* Replacement for CAP_NET_ADMIN:
+   delegated rights to the Virtual environment of its network administration.
+   For now the following rights have been delegated:
+
+   Allow setting arbitrary process / process group ownership on sockets
+   Allow interface configuration
+ */
+#define CAP_VE_NET_ADMIN     CAP_VE_ADMIN
+
+/* Replacement for CAP_SYS_ADMIN:
+   delegated rights to the Virtual environment of its administration.
+   For now the following rights have been delegated:
+ */
+/* Allow mount/umount/remount */
+/* Allow examination and configuration of disk quotas */
+/* Allow removing semaphores */
+/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
+   and shared memory */
+/* Allow locking/unlocking of shared memory segment */
+/* Allow forged pids on socket credentials passing */
+
+#define CAP_VE_SYS_ADMIN     CAP_VE_ADMIN
+#else
+#define CAP_VE_NET_ADMIN     CAP_NET_ADMIN
+#define CAP_VE_SYS_ADMIN     CAP_SYS_ADMIN
+#endif
+#endif
+
 /* Override MAC access.
    The base kernel enforces no MAC policy.
    An LSM may enforce a MAC policy, and if it does and it chooses
@@ -420,7 +468,16 @@ struct file;
 #define CAP_INIT_INH_SET    CAP_EMPTY_SET
 
 # define cap_clear(c)         do { (c) = __cap_empty_set; } while (0)
+#ifndef CONFIG_VE
 # define cap_set_full(c)      do { (c) = __cap_full_set; } while (0)
+#else
+# define cap_set_full(c)      do {			\
+		if (ve_is_super(get_exec_env()))	\
+			(c) = __cap_full_set;		\
+		else					\
+			(c) = get_exec_env()->ve_cap_bset;\
+	} while (0)
+#endif
 # define cap_set_init_eff(c)  do { (c) = __cap_init_eff_set; } while (0)
 
 #define cap_raise(c, flag)  ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag))
@@ -538,6 +595,10 @@ extern const kernel_cap_t __cap_empty_se
 extern const kernel_cap_t __cap_full_set;
 extern const kernel_cap_t __cap_init_eff_set;
 
+#include <linux/spinlock_types.h>
+
+extern spinlock_t task_capability_lock;
+
 /**
  * has_capability - Determine if a task has a superior capability available
  * @t: The task in question
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/cgroup.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cgroup.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/cgroup.h	2014-12-12 23:29:20.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cgroup.h	2015-01-21 12:02:51.049018638 +0300
@@ -39,6 +39,7 @@ extern void cgroup_exit(struct task_stru
 extern int cgroupstats_build(struct cgroupstats *stats,
 				struct dentry *dentry);
 
+extern struct file_system_type cgroup_fs_type;
 extern const struct file_operations proc_cgroup_operations;
 
 /* Define the enumeration of all cgroup subsystems */
@@ -147,6 +148,7 @@ enum {
 	 * A thread in rmdir() is wating for this cgroup.
 	 */
 	CGRP_WAIT_ON_RMDIR,
+	CGRP_SELF_DESTRUCTION,
 };
 
 /* which pidlist file are we talking about? */
@@ -206,6 +208,10 @@ struct cgroup {
 	struct cgroupfs_root *root;
 	struct cgroup *top_cgroup;
 
+	/* The path to use for release notifications. */
+	char *release_agent;
+	struct workqueue_struct *khelper_wq;
+
 	/*
 	 * List of cg_cgroup_links pointing at css_sets with
 	 * tasks in this cgroup. Protected by css_set_lock
@@ -401,8 +407,7 @@ struct cftype {
 struct cgroup_scanner {
 	struct cgroup *cg;
 	int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
-	void (*process_task)(struct task_struct *p,
-			struct cgroup_scanner *scan);
+	int (*process_task)(struct task_struct *p, struct cgroup_scanner *scan);
 	struct ptr_heap *heap;
 	void *data;
 };
@@ -605,6 +610,39 @@ unsigned short css_id(struct cgroup_subs
 unsigned short css_depth(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
+struct cgroup_sb_opts {
+	unsigned long subsys_bits;
+	unsigned long flags;
+	char *release_agent;
+	char *name;
+	/* User explicitly requested empty subsystem */
+	bool none;
+
+	struct cgroupfs_root *new_root;
+
+};
+
+enum cgroup_open_flags {
+	CGRP_CREAT	= 0x0001,	/* create if not found */
+	CGRP_EXCL	= 0x0002,	/* fail if already exist */
+	CGRP_WEAK	= 0x0004,	/* arm cgroup self-destruction */
+};
+
+struct vfsmount *cgroup_kernel_mount(struct cgroup_sb_opts *opts);
+struct cgroup *cgroup_get_root(struct vfsmount *mnt);
+struct cgroup *cgroup_kernel_open(struct cgroup *parent,
+		enum cgroup_open_flags flags, char *name);
+int cgroup_kernel_remove(struct cgroup *parent, char *name);
+int cgroup_kernel_attach(struct cgroup *cgrp, struct task_struct *tsk);
+void cgroup_kernel_close(struct cgroup *cgrp);
+int cpt_collect_cgroups(struct vfsmount *mnt,
+			int (*cb)(struct cgroup *cgrp, void *arg), void *arg);
+
+static inline void __cgroup_kernel_open(struct cgroup *cgrp)
+{
+	atomic_inc(&cgrp->count);
+}
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
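
The cgroup_kernel_* entry points give kernel code an open/attach/close
sequence for managing cgroups directly. A kernel-style sketch; the "ve1001"
name is illustrative and the NULL-on-failure convention is an assumption,
since this hunk only shows the declarations:

	#include <linux/cgroup.h>
	#include <linux/errno.h>

	static int attach_to_subgroup(struct cgroup *parent,
				      struct task_struct *tsk)
	{
		struct cgroup *cg;
		int err;

		/* create-or-open a child group; adding CGRP_WEAK would arm
		 * self-destruction once the group runs empty */
		cg = cgroup_kernel_open(parent, CGRP_CREAT, "ve1001");
		if (!cg)
			return -ENOENT;		/* failure convention assumed */

		err = cgroup_kernel_attach(cg, tsk);
		cgroup_kernel_close(cg);
		return err;
	}
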
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/compat.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/compat.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/compat.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/compat.h	2015-01-21 12:02:43.838210071 +0300
@@ -258,6 +258,7 @@ asmlinkage long compat_sys_settimeofday(
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
 extern int compat_printk(const char *fmt, ...);
+extern int ve_compat_printk(int dst, const char *fmt, ...);
 extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
 
 asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/completion.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/completion.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/completion.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/completion.h	2015-01-21 12:02:42.972233063 +0300
@@ -77,6 +77,7 @@ static inline void init_completion(struc
 }
 
 extern void wait_for_completion(struct completion *);
+extern void wait_for_completion_io(struct completion *);
 extern int wait_for_completion_interruptible(struct completion *x);
 extern int wait_for_completion_killable(struct completion *x);
 extern unsigned long wait_for_completion_timeout(struct completion *x,
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_context.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_context.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_context.h	2015-01-21 12:02:49.748053174 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_context.h	2015-01-21 12:02:49.777052406 +0300
@@ -0,0 +1,277 @@
+#include <linux/fs.h>
+#include <asm/uaccess.h>
+#include <bc/beancounter.h>
+
+#define	CPT_CTX_ERROR		-1
+#define	CPT_CTX_IDLE		0
+#define CPT_CTX_SUSPENDING	1
+#define	CPT_CTX_SUSPENDED	2
+#define CPT_CTX_DUMPING		3
+#define CPT_CTX_UNDUMPING	4
+#define CPT_CTX_UNDUMPED	5
+
+#define CPT_TID(tsk)   task_pid_nr(tsk), task_pid_vnr(tsk), (tsk)->comm
+#define CPT_FID		"%d,%d(%s)"
+
+enum {
+	CPT_DOBJ_VFSMOUNT_REF,
+	CPT_DOBJ_FILE,
+	CPT_DOBJ_MAX,
+};
+
+struct cpt_delayed_context {
+	int ve_id;
+	struct task_struct *dfs_daemon;
+	struct completion dfs_notify;
+	struct list_head object_array[CPT_DOBJ_MAX];
+};
+
+void destroy_delayed_context(struct cpt_delayed_context *);
+
+struct pram_stream;
+
+typedef struct cpt_context
+{
+	struct list_head ctx_list;
+	int	refcount;
+	int	ctx_state;
+	int	objcount;
+	int	sticky;
+	struct semaphore main_sem;
+
+	struct file *errorfile;
+	struct file *statusfile;
+	struct file *lockfile;
+	int lockfile_new;
+
+	int	errno;
+	char	*error_msg;
+	loff_t	err_offset;
+
+	struct file	*file;
+	char		*tmpbuf;
+	int		pagesize;
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+	int		iter_done;
+	struct rb_root	iter_rb_root;
+	struct user_beancounter *iter_ub;
+	int		iter_shm_start;
+	struct file	*pagein_file_in;
+	struct file	*pagein_file_out;
+#endif
+	loff_t		current_section;
+	loff_t		current_object;
+
+	loff_t		sections[CPT_SECT_MAX];
+
+	__u32		errormask;
+	__u32		write_error;
+
+	struct list_head object_array[CPT_OBJ_MAX];
+
+	void		(*write)(const void *addr, size_t count, struct cpt_context *ctx);
+	void		(*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos);
+	ssize_t		(*read)(void *addr, size_t count, struct cpt_context *ctx);
+	ssize_t		(*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos);
+	void		(*align)(struct cpt_context *ctx);
+	int		ve_id;
+	int		contextid;
+	struct timespec cpt_monotonic_time; /* Host monotonic time at the moment of cpt/rst
+					     * corresponding to start_time */
+	__u64		virt_jiffies64;	/* Virtual jiffies64; equals cpt_jiffies64
+					 * if the VE has not migrated. */
+	struct timespec	start_time;
+	struct timespec delta_time;
+	__s64		delta_nsec;
+	int		image_version;
+	__u16		image_arch;
+	__u64		iptables_mask;
+	__u64		features;
+
+#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9)
+#define CPT_ANONVMA_HSIZE (1<<CPT_ANONVMA_HBITS)
+	struct hlist_head *anonvmas;
+	int		tasks64;
+	__u32		src_cpu_flags;
+	__u32		dst_cpu_flags;
+	__u32		kernel_config_flags;
+
+	__u32		last_vpid;
+
+	struct filejob  *filejob_queue;
+
+	int		slm_count;
+
+	char		*vdso;
+
+	struct cpt_delayed_context *dctx;
+
+#ifdef CONFIG_BEANCOUNTERS
+	/* Store here ubc limits and barriers during undumping,
+	   and restore them before resuming */
+	struct ubparm	saved_ubc[UB_RESOURCES];
+#endif
+
+#define CPT_MAX_LINKDIRS	1
+	struct file	*linkdirs[CPT_MAX_LINKDIRS];
+	int		linkdirs_num;
+	unsigned int	linkcnt; /* for creating hardlinked files */
+	int	hardlinked_on;
+
+#ifdef CONFIG_PRAM
+	struct pram_stream *pram_stream;
+#endif
+
+	loff_t dumpsize;
+	loff_t maxdumpsize;
+} cpt_context_t;
+
+typedef struct {
+	int pid;
+	cpt_context_t *ctx;
+	struct completion done;
+} pagein_info_t;
+
+int pagein_info_printf(char *buf, cpt_context_t *ctx);
+
+#ifdef CONFIG_PRAM
+struct cpt_pram_ops {
+	int (*cpt_open)(cpt_context_t *ctx);
+	int (*cpt_dump)(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end,
+			struct cpt_context *ctx);
+	void (*cpt_close)(cpt_context_t *ctx, int err);
+	int (*rst_open)(cpt_context_t *ctx);
+	int (*rst_undump)(struct mm_struct *mm,
+			  unsigned long start, unsigned long end,
+			  loff_t pos, struct cpt_context *ctx);
+	void (*rst_close)(cpt_context_t *ctx);
+};
+extern struct cpt_pram_ops *cpt_pram_ops;
+
+int cpt_open_pram(cpt_context_t *ctx);
+void cpt_dump_pram(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct cpt_context *ctx);
+void cpt_close_pram(cpt_context_t *ctx, int err);
+int rst_open_pram(cpt_context_t *ctx);
+int rst_undump_pram(struct mm_struct *mm,
+		unsigned long start, unsigned long end,
+		loff_t pos, struct cpt_context *ctx);
+void rst_close_pram(cpt_context_t *ctx);
+#else
+static inline int cpt_open_pram(cpt_context_t *ctx) { return -ENOSYS; }
+static inline void cpt_dump_pram(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct cpt_context *ctx) { }
+static inline void cpt_close_pram(cpt_context_t *ctx, int err) { }
+static inline int rst_open_pram(cpt_context_t *ctx) { return 0; }
+static inline int rst_undump_pram(struct mm_struct *mm,
+		unsigned long start, unsigned long end,
+		loff_t pos, struct cpt_context *ctx) { return -ENOSYS; }
+static inline void rst_close_pram(cpt_context_t *ctx) { }
+#endif
+
+int cpt_open_dumpfile(struct cpt_context *);
+int cpt_close_dumpfile(struct cpt_context *);
+int rst_open_dumpfile(struct cpt_context *);
+void rst_close_dumpfile(struct cpt_context *);
+void cpt_context_init(struct cpt_context *);
+void rst_context_init(struct cpt_context *);
+void cpt_context_destroy(struct cpt_context *);
+
+void rst_report_error(int err, cpt_context_t *ctx);
+
+
+int cpt_major_hdr_out(struct cpt_context *ctx);
+int cpt_dump_tail(struct cpt_context *ctx);
+int cpt_close_section(struct cpt_context *ctx);
+int cpt_open_section(struct cpt_context *ctx, __u32 type);
+int cpt_close_object(struct cpt_context *ctx);
+int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx);
+int cpt_push_object(loff_t *saved, struct cpt_context *ctx);
+int cpt_pop_object(loff_t *saved, struct cpt_context *ctx);
+
+int rst_get_section(int type, struct cpt_context * ctx, loff_t *, loff_t *);
+__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx);
+__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx);
+void rst_put_name(__u8 *name, struct cpt_context *ctx);
+int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx);
+void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx);
+
+pid_t vpid_to_pid(pid_t);
+
+#define rst_get_object(type, pos, tmp, ctx) \
+ _rst_get_object((type), (pos), (tmp), sizeof(*(tmp)), (ctx))
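+
+/*
+ * Usage sketch (editor's note, not from the original source): the macro
+ * derives the read size from the destination type via sizeof(*(tmp)), so
+ * a typical restore path looks like
+ *
+ *	struct cpt_task_image ti;
+ *	int err = rst_get_object(CPT_OBJ_TASK, pos, &ti, ctx);
+ *	if (err)
+ *		return err;
+ */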
+
+extern int debug_level;
+
+#define cpt_printk(lvl, fmt, args...)	do {	\
+		if (lvl <= debug_level)		\
+			printk(fmt, ##args);	\
+	} while (0)
+
+#define dprintk(a...) cpt_printk(3, "CPT DBG: " a)
+#define dprintk_ctx(f, arg...) dprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg)
+
+#define wprintk(a...) cpt_printk(2, "CPT WRN: " a)
+#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg)
+
+#define iprintk(a...) cpt_printk(1, "CPT INF: " a)
+#define iprintk_ctx(f, arg...) iprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg)
+
+#define eprintk(a...) cpt_printk(1, "CPT ERR: " a)
+#define eprintk_ctx(f, arg...)						\
+do {									\
+	eprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg);			\
+	if (ctx->error_msg && ctx->err_offset < PAGE_SIZE)		\
+		ctx->err_offset += snprintf((char*)(ctx->error_msg +	\
+				ctx->err_offset),			\
+			       	PAGE_SIZE - ctx->err_offset,		\
+				"Error: " f, ##arg);			\
+} while(0)
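+
+/*
+ * Editor's sketch of the intended flow: eprintk_ctx() both logs the error
+ * and appends it to the per-context buffer, which cpt_flush_error() (below)
+ * later writes to the user-supplied error file:
+ *
+ *	eprintk_ctx("unsupported fs type %s\n", name);
+ *	...
+ *	cpt_flush_error(ctx);	(pushes accumulated text to ctx->errorfile)
+ */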
+
+#define CPT_TMPBUF_FREE 0x789adf12
+#define CPT_TMPBUF_BUSY 0xabcd9876
+
+static inline void *cpt_get_buf(cpt_context_t *ctx)
+{
+	void *buf = ctx->tmpbuf;
+
+	BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE);
+	*(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY;
+	return buf;
+}
+
+static inline void __cpt_release_buf(cpt_context_t *ctx)
+{
+	void *buf = ctx->tmpbuf;
+
+	*(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE;
+}
+
+static inline void cpt_release_buf(cpt_context_t *ctx)
+{
+	void *buf = ctx->tmpbuf;
+
+	BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY);
+	*(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE;
+}
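+
+/*
+ * Editor's note: the guard word in the last 4 bytes of the page catches
+ * nested or unbalanced use of the single per-context scratch page:
+ *
+ *	void *buf = cpt_get_buf(ctx);	(BUG_ON if already busy)
+ *	... use up to PAGE_SIZE - 4 bytes ...
+ *	cpt_release_buf(ctx);		(BUG_ON if not busy)
+ */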
+
+static inline void cpt_flush_error(cpt_context_t *ctx)
+{
+	mm_segment_t oldfs;
+
+	if (ctx->errorfile && ctx->error_msg && ctx->err_offset) {
+		if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) {
+			oldfs = get_fs();
+			set_fs(KERNEL_DS);
+			ctx->errorfile->f_op->write(ctx->errorfile,
+				ctx->error_msg, ctx->err_offset,
+				&ctx->errorfile->f_pos);
+			set_fs(oldfs);
+		}
+		ctx->error_msg[0] = 0;
+		ctx->err_offset = 0;
+	}
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_export.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_export.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_export.h	2015-01-21 12:02:48.234093367 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_export.h	2015-01-21 12:02:49.134069475 +0300
@@ -0,0 +1,41 @@
+/*
+ *
+ *  include/linux/cpt_export.h
+ *
+ *  Copyright (C) 2008  Parallels
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __CPT_EXPORTS_H__
+#define __CPT_EXPORTS_H__
+
+struct cpt_context;
+
+struct cpt_ops {
+	void (*write)(const void *addr, size_t count, struct cpt_context *ctx);
+	void (*push_object)(loff_t *, struct cpt_context *);
+	void (*pop_object)(loff_t *, struct cpt_context *);
+	loff_t (*lookup_object)(int type, void *p, struct cpt_context *ctx);
+
+};
+
+extern struct cpt_ops cpt_ops;
+
+struct rst_ops {
+	int (*get_object)(int type, loff_t pos, void *tmp,
+			int size, struct cpt_context *ctx);
+	struct file *(*rst_file)(loff_t pos, int fd, struct cpt_context *ctx);
+};
+
+extern struct rst_ops rst_ops;
+
+extern void (*vefs_track_notify_hook)(struct dentry *vdentry, int track_cow);
+extern void (*vefs_track_force_stop_hook)(struct super_block *super);
+extern struct dentry * (*vefs_replaced_dentry_hook)(struct dentry *de);
+extern int (*vefs_is_renamed_dentry_hook)(struct dentry *vde, struct dentry *pde);
+
+#endif
+
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_image.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_image.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_image.h	2015-01-21 12:02:48.235093340 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_image.h	2015-01-21 12:02:54.165935898 +0300
@@ -0,0 +1,2131 @@
+/*
+ *
+ *  include/linux/cpt_image.h
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __CPT_IMAGE_H_
+#define __CPT_IMAGE_H_ 1
+
+#include <linux/inetdevice.h>
+
+#define CPT_NULL (~0ULL)
+#define CPT_NOINDEX (~0U)
+
+/**
+ * WARNING!!! For "expandable" objects, _always_ use obj->cpt_hdrlen
+ * instead of sizeof() when restoring.
+ */
+#define cpt_object_has(obj, field)	((obj)->cpt_hdrlen >= \
+		offsetof(typeof(*(obj)), field) + sizeof((obj)->field))
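+
+/*
+ * Hedged example (editor's addition): images written by older kernels may
+ * carry a shorter header, so test before touching newer fields:
+ *
+ *	__u64 mnt = CPT_NULL;
+ *
+ *	if (cpt_object_has(fi, cpt_vfsmount))
+ *		mnt = fi->cpt_vfsmount;
+ *
+ * where fi is a struct cpt_file_image read from the image.
+ */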
+
+/*
+ * Image file layout.
+ *
+ * - major header
+ * - sections[]
+ *
+ *	Each section is:
+ *	- section header
+ *	- array of objects
+ *
+ * All data records are arch-independent and 64-bit aligned.
+ */
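+
+/*
+ * Editor's sketch (assuming cpt_next holds the file offset of the record
+ * that follows, as the restore helpers suggest), a section walk is
+ *
+ *	loff_t pos = first_section;
+ *	struct cpt_section_hdr h;
+ *
+ *	while (pos < tail) {
+ *		read h at pos;
+ *		handle objects in [pos + h.cpt_hdrlen, h.cpt_next);
+ *		pos = h.cpt_next;
+ *	}
+ */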
+
+enum _cpt_object_type
+{
+	CPT_OBJ_TASK			= 0,
+	CPT_OBJ_MM			= 1,
+	CPT_OBJ_FS			= 2,
+	CPT_OBJ_FILES			= 3,
+	CPT_OBJ_FILE			= 4,
+	CPT_OBJ_SIGHAND_STRUCT		= 5,
+	CPT_OBJ_SIGNAL_STRUCT		= 6,
+	CPT_OBJ_TTY			= 7,
+	CPT_OBJ_SOCKET			= 8,
+	CPT_OBJ_SYSVSEM_UNDO		= 9,
+	CPT_OBJ_NAMESPACE		= 10,
+	CPT_OBJ_SYSV_SHM		= 11,
+	CPT_OBJ_INODE			= 12,
+	CPT_OBJ_UBC			= 13,
+	CPT_OBJ_SLM_SGREG		= 14,
+	CPT_OBJ_SLM_REGOBJ		= 15,
+	CPT_OBJ_SLM_MM			= 16,
+	CPT_OBJ_VFSMOUNT_REF		= 17,
+	CPT_OBJ_CGROUP			= 18,
+	CPT_OBJ_CGROUPS			= 19,
+	CPT_OBJ_POSIX_TIMER_LIST	= 20,
+	CPT_OBJ_VFSMOUNT_MISSED_REF	= 21,
+	CPT_OBJ_MAX			= 22,
+	/* The objects above are stored in memory while checkpointing */
+
+	CPT_OBJ_VMA			= 1024,
+	CPT_OBJ_FILEDESC		= 1025,
+	CPT_OBJ_SIGHANDLER		= 1026,
+	CPT_OBJ_SIGINFO			= 1027,
+	CPT_OBJ_LASTSIGINFO		= 1028,
+	CPT_OBJ_SYSV_SEM		= 1029,
+	CPT_OBJ_SKB			= 1030,
+	CPT_OBJ_FLOCK			= 1031,
+	CPT_OBJ_OPENREQ			= 1032,
+	CPT_OBJ_VFSMOUNT		= 1033,
+	CPT_OBJ_TRAILER			= 1034,
+	CPT_OBJ_SYSVSEM_UNDO_REC	= 1035,
+	CPT_OBJ_NET_DEVICE		= 1036,
+	CPT_OBJ_NET_IFADDR		= 1037,
+	CPT_OBJ_NET_ROUTE		= 1038,
+	CPT_OBJ_NET_CONNTRACK		= 1039,
+	CPT_OBJ_NET_CONNTRACK_EXPECT	= 1040,
+	CPT_OBJ_AIO_CONTEXT		= 1041,
+	CPT_OBJ_VEINFO			= 1042,
+	CPT_OBJ_EPOLL			= 1043,
+	CPT_OBJ_EPOLL_FILE		= 1044,
+	CPT_OBJ_SKFILTER		= 1045,
+	CPT_OBJ_SIGALTSTACK		= 1046,
+	CPT_OBJ_SOCK_MCADDR		= 1047,
+	CPT_OBJ_BIND_MNT		= 1048,
+	CPT_OBJ_SYSVMSG			= 1049,
+	CPT_OBJ_SYSVMSG_MSG		= 1050,
+	CPT_OBJ_MM_AUXV			= 1051,
+	CPT_OBJ_NET_IDEV_CNF		= 1052,
+
+	CPT_OBJ_X86_REGS		= 4096,
+	CPT_OBJ_X86_64_REGS		= 4097,
+	CPT_OBJ_PAGES			= 4098,
+	CPT_OBJ_COPYPAGES		= 4099,
+	CPT_OBJ_REMAPPAGES		= 4100,
+	CPT_OBJ_LAZYPAGES		= 4101,
+	CPT_OBJ_NAME			= 4102,
+	CPT_OBJ_BITS			= 4103,
+	CPT_OBJ_REF			= 4104,
+	CPT_OBJ_ITERPAGES		= 4105,
+	CPT_OBJ_ITERYOUNGPAGES		= 4106,
+	CPT_OBJ_VSYSCALL		= 4107,
+	CPT_OBJ_IA64_REGS		= 4108,
+	CPT_OBJ_INOTIFY			= 4109,
+	CPT_OBJ_INOTIFY_WATCH		= 4110,
+	CPT_OBJ_INOTIFY_EVENT		= 4111,
+	CPT_OBJ_TASK_AUX		= 4112,
+	CPT_OBJ_NET_TUNTAP		= 4113,
+	CPT_OBJ_NET_HWADDR		= 4114,
+	CPT_OBJ_NET_VETH		= 4115,
+	CPT_OBJ_NET_STATS		= 4116,
+	CPT_OBJ_NET_IPIP_TUNNEL		= 4117,
+	CPT_OBJ_TIMERFD			= 4118,
+	CPT_OBJ_EVENTFD			= 4119,
+	CPT_OBJ_NET_BR			= 4120,
+	CPT_OBJ_NET_BR_DEV		= 4121,
+	CPT_OBJ_MOUNT_DATA		= 4122,
+	CPT_OBJ_POSIX_TIMER		= 4123,
+	CPT_OBJ_SOCK_PACKET		= 4124,
+	CPT_OBJ_SOCK_PACKET_MC		= 4125,
+
+	/* 2.6.27-specific */
+	CPT_OBJ_NET_TAP_FILTER = 0x01000000,
+};
+
+#define CPT_ALIGN(n) (((n)+7)&~7)
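+/* Editor's example: CPT_ALIGN rounds up to the next multiple of 8,
+ * e.g. CPT_ALIGN(13) == 16 and CPT_ALIGN(16) == 16. */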
+
+struct cpt_major_hdr
+{
+	__u8	cpt_signature[4];	/* Magic number */
+	__u16	cpt_hdrlen;		/* Length of this header */
+	__u16	cpt_image_version;	/* Format of this file */
+#define CPT_VERSION_MINOR(a)	((a) & 0xf)
+#define CPT_VERSION_MAJOR(a)	((a) & 0xff00)
+#define CPT_VERSION_8		0
+#define CPT_VERSION_9		0x100
+#define CPT_VERSION_9_1		0x101
+#define CPT_VERSION_9_2		0x102
+#define CPT_VERSION_16		0x200
+#define CPT_VERSION_18		0x300
+#define CPT_VERSION_18_1	0x301
+#define CPT_VERSION_18_2	0x302
+#define CPT_VERSION_18_3	0x303
+#define CPT_VERSION_18_4	0x304
+#define CPT_VERSION_20		0x400
+#define CPT_VERSION_24		0x500
+#define CPT_VERSION_26		0x600
+#define CPT_VERSION_27		0x700
+#define CPT_VERSION_27_3	0x703
+#define CPT_VERSION_32		0x800
+#define CPT_VERSION_32_1	0x801
+#define CPT_VERSION_32_2	0x802
+#define CPT_VERSION_32_3	0x803
+#define CPT_CURRENT_VERSION	CPT_VERSION_32_3
+	__u16	cpt_os_arch;		/* Architecture */
+#define CPT_OS_ARCH_I386	0
+#define CPT_OS_ARCH_EMT64	1
+#define CPT_OS_ARCH_IA64	2
+	__u16	__cpt_pad1;
+	__u32	cpt_ve_features;	/* VE features */
+	__u32	cpt_ve_features2;	/* VE features */
+	__u16	cpt_pagesize;		/* Page size used by OS */
+	__u16	cpt_hz;			/* HZ used by OS */
+	__u64	cpt_start_jiffies64;	/* Jiffies */
+	__u32	cpt_start_sec;		/* Seconds */
+	__u32	cpt_start_nsec;		/* Nanoseconds */
+	__u32	cpt_cpu_caps[4];	/* CPU capabilities */
+	__u32	cpt_kernel_config[4];	/* Kernel config */
+	__u64	cpt_iptables_mask;	/* Used netfilter modules */
+} __attribute__ ((aligned (8)));
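+
+/*
+ * Editor's example of the version macros above: for
+ * CPT_CURRENT_VERSION == 0x803, CPT_VERSION_MAJOR() yields 0x800 and
+ * CPT_VERSION_MINOR() yields 3, i.e. image format 32, minor revision 3.
+ */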
+
+#define CPT_SIGNATURE0 0x79
+#define CPT_SIGNATURE1 0x1c
+#define CPT_SIGNATURE2 0x01
+#define CPT_SIGNATURE3 0x63
+
+/* CPU capabilities */
+#define CPT_CPU_X86_CMOV	0
+#define CPT_CPU_X86_FXSR	1
+#define CPT_CPU_X86_SSE		2
+#define CPT_CPU_X86_SSE2	3
+#define CPT_CPU_X86_MMX		4
+#define CPT_CPU_X86_3DNOW	5
+#define CPT_CPU_X86_3DNOW2	6
+#define CPT_CPU_X86_SEP		7
+#define CPT_CPU_X86_EMT64	8
+#define CPT_CPU_X86_IA64	9
+#define CPT_CPU_X86_SYSCALL	10
+#define CPT_CPU_X86_SYSCALL32	11
+#define CPT_CPU_X86_SEP32	12
+#define CPT_CPU_X86_SSE4_1	13
+#define CPT_CPU_X86_SSE4_2	14
+#define CPT_CPU_X86_SSE4A	15
+
+/* Unsupported features */
+#define CPT_EXTERNAL_PROCESS	16
+#define CPT_NAMESPACES		17
+#define CPT_SCHEDULER_POLICY	18
+#define CPT_PTRACED_FROM_VE0	19
+#define CPT_UNSUPPORTED_FSTYPE	20
+#define CPT_BIND_MOUNT		21
+#define CPT_UNSUPPORTED_NETDEV	22
+#define CPT_UNSUPPORTED_MISC	23
+#define CPT_SLM_DMPRST		24
+
+/* CPU capabilities (cont) */
+#define CPT_CPU_X86_XSAVE	25
+#define CPT_CPU_X86_AVX		26
+#define CPT_CPU_X86_AESNI	27
+#define CPT_NO_IPV6		28
+#define CPT_CPU_X86_RDRAND	29
+
+/* This mask is used to determine whether a VE
+   has any unsupported features */
+#define CPT_UNSUPPORTED_MASK	0xc1fd0000UL
+
+#define CPT_KERNEL_CONFIG_PAE	0
+
+struct cpt_section_hdr
+{
+	__u64	cpt_next;
+	__u32	cpt_section;
+	__u16	cpt_hdrlen;
+	__u16	cpt_align;
+} __attribute__ ((aligned (8)));
+
+enum
+{
+	CPT_SECT_ERROR,			/* Error section, content is string */
+	CPT_SECT_VEINFO,
+	CPT_SECT_FILES,			/* Files. Content is array of file objects */
+	CPT_SECT_TASKS,
+	CPT_SECT_MM,
+	CPT_SECT_FILES_STRUCT,
+	CPT_SECT_FS,
+	CPT_SECT_SIGHAND_STRUCT,
+	CPT_SECT_TTY,
+	CPT_SECT_SOCKET,
+	CPT_SECT_NAMESPACE,
+	CPT_SECT_SYSVSEM_UNDO,
+	CPT_SECT_INODE,			/* Inodes with i->i_nlink==0 and
+					 * deleted dentries with inodes not
+					 * referenced by the dumped process.
+					 */
+	CPT_SECT_SYSV_SHM,
+	CPT_SECT_SYSV_SEM,
+	CPT_SECT_ORPHANS,
+	CPT_SECT_NET_DEVICE,
+	CPT_SECT_NET_IFADDR,
+	CPT_SECT_NET_ROUTE,
+	CPT_SECT_NET_IPTABLES,
+	CPT_SECT_NET_CONNTRACK,
+	CPT_SECT_NET_CONNTRACK_VE0,
+	CPT_SECT_UTSNAME,
+	CPT_SECT_TRAILER,
+	CPT_SECT_UBC,
+	CPT_SECT_SLM_SGREGS,
+	CPT_SECT_SLM_REGOBJS,
+/* Due to a silly mistake, we cannot index sections beyond this value */
+#define	CPT_SECT_MAX_INDEX	(CPT_SECT_SLM_REGOBJS+1)
+	CPT_SECT_EPOLL,
+	CPT_SECT_VSYSCALL,
+	CPT_SECT_INOTIFY,
+	CPT_SECT_SYSV_MSG,
+	CPT_SECT_SNMP_STATS,
+	CPT_SECT_CGROUPS,
+	CPT_SECT_POSIX_TIMERS,
+	CPT_SECT_NET_IP6TABLES,
+	CPT_SECT_MAX
+};
+
+struct cpt_major_tail
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_lazypages;
+	__u32	cpt_64bit;
+	__u64	cpt_sections[CPT_SECT_MAX_INDEX];
+	__u32	cpt_nsect;
+	__u8	cpt_signature[4];	/* Magic number */
+} __attribute__ ((aligned (8)));
+
+
+/* Common object header. */
+struct cpt_object_hdr
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+} __attribute__ ((aligned (8)));
+
+struct cpt_obj_tar
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_len;
+	__u32	cpt_pad;
+} __attribute__ ((aligned (8)));
+
+enum _cpt_content_type {
+	CPT_CONTENT_VOID,
+	CPT_CONTENT_ARRAY,
+	CPT_CONTENT_DATA,
+	CPT_CONTENT_NAME,
+
+	CPT_CONTENT_STACK,
+	CPT_CONTENT_X86_FPUSTATE_OLD,
+	CPT_CONTENT_X86_FPUSTATE,
+	CPT_CONTENT_MM_CONTEXT,
+	CPT_CONTENT_SEMARRAY,
+	CPT_CONTENT_SEMUNDO,
+	CPT_CONTENT_NLMARRAY,
+	CPT_CONTENT_X86_XSAVE,
+	CPT_CONTENT_PRAM,
+	CPT_CONTENT_MAX
+};
+
+/* CPT_OBJ_BITS: encodes an array of bytes */
+struct cpt_obj_bits
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_size;
+	__u32	__cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+/* CPT_OBJ_REF: a reference to another object */
+struct cpt_obj_ref
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_pos;
+} __attribute__ ((aligned (8)));
+
+struct cpt_timerfd_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_it_value;
+	__u64	cpt_it_interval;
+	__u64	cpt_ticks;
+	__u32	cpt_expired;
+	__u32	cpt_clockid;
+} __attribute__ ((aligned (8)));
+
+struct cpt_eventfd_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_count;
+	__u32	cpt_flags;
+} __attribute__ ((aligned (8)));
+
+/* CPT_OBJ_VEINFO: various VE-specific data */
+struct cpt_veinfo_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	/* ipc ctls */
+	__u32	shm_ctl_max;
+	__u32	shm_ctl_all;
+	__u32	shm_ctl_mni;
+	__u32	msg_ctl_max;
+	__u32	msg_ctl_mni;
+	__u32	msg_ctl_mnb;
+	__u32	sem_ctl_arr[4];
+
+	/* start time */
+	__u64	start_timespec_delta;
+	__u64	start_jiffies_delta;
+
+	/* later extension */
+	__u32	last_pid;
+	__u32	rnd_va_space;
+	__u32	vpid_max;
+	__u32	__cpt_pad1;
+	__u64	real_start_timespec_delta;
+	__u64	reserved[6];
+	__u64	aio_max_nr;
+} __attribute__ ((aligned (8)));
+
+struct cpt_cgroup_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_index;
+	__s32	cpt_parent;
+	__u32	cpt_flags;
+#define CPT_CGRP_NOTIFY_ON_RELEASE	0x1
+#define CPT_CGRP_SELF_DESTRUCTION	0x2
+};
+
+/* CPT_OBJ_FILE: one struct file */ 
+struct cpt_file_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_flags;
+	__u32	cpt_mode;
+	__u64	cpt_pos;
+	__u32	cpt_uid;
+	__u32	cpt_gid;
+
+	__u32	cpt_i_mode;
+	__u32	cpt_lflags;
+#define CPT_DENTRY_DELETED	1
+#define CPT_DENTRY_ROOT		2
+#define CPT_DENTRY_CLONING	4
+#define CPT_DENTRY_PROC		8
+#define CPT_DENTRY_EPOLL	0x10
+#define CPT_DENTRY_REPLACED	0x20
+#define CPT_DENTRY_INOTIFY	0x40
+#define CPT_DENTRY_FUTEX	0x80
+#define CPT_DENTRY_TUNTAP	0x100
+#define CPT_DENTRY_PROCPID_DEAD 0x200
+#define CPT_DENTRY_HARDLINKED	0x400
+#define CPT_DENTRY_SIGNALFD	0x800
+#define CPT_DENTRY_TIMERFD	0x1000
+#define CPT_DENTRY_EVENTFD	0x2000
+#define CPT_DENTRY_FAKEFILE	0x4000
+#define CPT_DENTRY_SILLYRENAME	0x20000
+	__u64	cpt_inode;
+	__u64	cpt_priv;
+
+	__u32	cpt_fown_fd;
+	__u32	cpt_fown_pid;
+#define CPT_FOWN_STRAY_PID	0
+	__u32	cpt_fown_uid;
+	__u32	cpt_fown_euid;
+	__u32	cpt_fown_signo;
+	__u32	__cpt_pad1;
+	__u64	cpt_vfsmount;
+} __attribute__ ((aligned (8)));
+/* Followed by file name, encoded as CPT_OBJ_NAME */
+
+struct cpt_epoll_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_file;
+} __attribute__ ((aligned (8)));
+/* Followed by array of struct cpt_epoll_file */
+
+struct cpt_epoll_file_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_file;
+	__u32	cpt_fd;
+	__u32	cpt_events;
+	__u64	cpt_data;
+	__u32	cpt_revents;
+	__u32	cpt_ready;
+} __attribute__ ((aligned (8)));
+
+struct cpt_inotify_wd_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_wd;
+	__u32	cpt_mask;
+} __attribute__ ((aligned (8)));
+/* Followed by cpt_file_image of inode to watch */
+
+struct cpt_inotify_ev_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_wd;
+	__u32	cpt_mask;
+	__u32	cpt_cookie;
+	__u32	cpt_namelen;
+} __attribute__ ((aligned (8)));
+/* Followed by name */
+
+struct cpt_inotify_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_file;
+	__u32	cpt_user;
+	__u32	cpt_max_events;
+	__u32	cpt_last_wd;
+	__u32	__cpt_pad1;
+} __attribute__ ((aligned (8)));
+/* Followed by array of struct cpt_inotify_wd_image and cpt_inotify_ev_image */
+
+
+/* CPT_OBJ_FILEDESC: one file descriptor */
+struct cpt_fd_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_fd;
+	__u32	cpt_flags;
+#define CPT_FD_FLAG_CLOSEEXEC	1
+	__u64	cpt_file;
+} __attribute__ ((aligned (8)));
+
+/* CPT_OBJ_FILES: one files_struct */
+struct cpt_files_struct_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_index;
+	__u32	cpt_max_fds;
+	__u32	cpt_next_fd;
+	__u32	__cpt_pad1;
+} __attribute__ ((aligned (8)));
+/* Followed by array of cpt_fd_image */
+
+/* CPT_OBJ_FS: one fs_struct */
+struct cpt_fs_struct_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_umask;
+	__u32	__cpt_pad1;
+} __attribute__ ((aligned (8)));
+/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */
+
+/* CPT_OBJ_INODE: one struct inode */
+struct cpt_inode_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_dev;
+	__u64	cpt_ino;
+	__u32	cpt_mode;
+	__u32	cpt_nlink;
+	__u32	cpt_uid;
+	__u32	cpt_gid;
+	__u64	cpt_rdev;
+	__u64	cpt_size;
+	__u64	cpt_blksize;
+	__u64	cpt_atime;
+	__u64	cpt_mtime;
+	__u64	cpt_ctime;
+	__u64	cpt_blocks;
+	__u32	cpt_sb;
+	__u32	__cpt_pad1;
+	__u64	cpt_vfsmount;
+} __attribute__ ((aligned (8)));
+
+/* CPT_OBJ_VFSMOUNT: one vfsmount */
+struct cpt_vfsmount_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_mntflags;
+#define CPT_MNT_BIND	0x80000000
+#define CPT_MNT_EXT	0x40000000
+#define CPT_MNT_DELAYFS	0x20000000
+#define CPT_MNT_PLOOP	0x10000000
+	__u32	cpt_flags;
+	__u64	cpt_mnt_bind;
+	__u64	cpt_mnt_parent;
+	__u64	cpt_mnt_shared;
+	__u64	cpt_mnt_master;
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_flock_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_owner;
+	__u32	cpt_pid;
+	__u64	cpt_start;
+	__u64	cpt_end;
+	__u32	cpt_flags;
+#define CPT_FLOCK_DELAYED	0x00010000
+	__u32	cpt_type;
+	__u32	cpt_svid;
+} __attribute__ ((aligned (8)));
+
+struct cpt_tty_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_flags;
+	__u32	cpt_link;
+	__u32	cpt_index;
+	__u32	cpt_drv_type;
+	__u32	cpt_drv_subtype;
+	__u32	cpt_drv_flags;
+	__u8	cpt_packet;
+	__u8	cpt_stopped;
+	__u8	cpt_hw_stopped;
+	__u8	cpt_flow_stopped;
+
+	__u32	cpt_canon_data;
+	__u32	cpt_canon_head;
+	__u32	cpt_canon_column;
+	__u32	cpt_column;
+	__u8	cpt_ctrl_status;
+	__u8	cpt_erasing;
+	__u8	cpt_lnext;
+	__u8	cpt_icanon;
+	__u8	cpt_raw;
+	__u8	cpt_real_raw;
+	__u8	cpt_closing;
+	__u8	__cpt_pad1;
+	__u16	cpt_minimum_to_wake;
+	__u16	__cpt_pad2;
+	__u32	cpt_pgrp;
+	__u32	cpt_session;
+	__u32	cpt_c_line;
+	__u8	cpt_name[64];
+	__u16	cpt_ws_row;
+	__u16	cpt_ws_col;
+	__u16	cpt_ws_prow;
+	__u16	cpt_ws_pcol;
+	__u8	cpt_c_cc[32];
+	__u32	cpt_c_iflag;
+	__u32	cpt_c_oflag;
+	__u32	cpt_c_cflag;
+	__u32	cpt_c_lflag;
+	__u32	cpt_read_flags[4096/32];
+} __attribute__ ((aligned (8)));
+
+struct cpt_sock_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_file;
+	__u32	cpt_parent;
+	__u32	cpt_index;
+
+	__u64	cpt_ssflags;
+	__u16	cpt_type;
+	__u16	cpt_family;
+	__u8	cpt_sstate;
+	__u8	cpt_passcred;
+	__u8	cpt_state;
+	__u8	cpt_reuse;
+
+	__u8	cpt_zapped;
+	__u8	cpt_shutdown;
+	__u8	cpt_userlocks;
+	__u8	cpt_no_check;
+	__u8	cpt_debug;
+	__u8	cpt_rcvtstamp;
+	__u8	cpt_localroute;
+	__u8	cpt_protocol;
+
+	__u32	cpt_err;
+	__u32	cpt_err_soft;
+
+	__u16	cpt_max_ack_backlog;
+	__u16   __cpt_pad1;
+	__u32	cpt_priority;
+
+	__u32	cpt_rcvlowat;
+	__u32	cpt_bound_dev_if;
+
+	__u64	cpt_rcvtimeo;
+	__u64	cpt_sndtimeo;
+	__u32	cpt_rcvbuf;
+	__u32	cpt_sndbuf;
+	__u64	cpt_flags;
+	__u64	cpt_lingertime;
+	__u32	cpt_peer_pid;
+	__u32	cpt_peer_uid;
+
+	__u32	cpt_peer_gid;
+	__u32	cpt_laddrlen;
+	__u32	cpt_laddr[128/4];
+	__u32	cpt_raddrlen;
+	__u32	cpt_raddr[128/4];
+	/* AF_UNIX */
+	__u32	cpt_peer;
+
+	__u8	cpt_socketpair;
+	__u8	cpt_sockflags;
+#define CPT_SOCK_DELETED	0x1
+#define CPT_SOCK_DELAYED	0x2
+
+	__u16	__cpt_pad4;
+	__u32	__cpt_pad5;
+/*
+	struct sk_filter      	*sk_filter;
+ */
+
+	__u64			cpt_stamp;
+	__u32			cpt_daddr;
+	__u16			cpt_dport;
+	__u16			cpt_sport;
+
+	union {
+		struct {
+			__u32	cpt_saddr;
+			__u32	cpt_rcv_saddr;
+		};
+
+		__u64		cpt_vfsmount_ref;
+	};
+
+
+	__u32			cpt_uc_ttl;
+	__u32			cpt_tos;
+
+	__u32			cpt_cmsg_flags;
+	__u32			cpt_mc_index;
+
+	__u32			cpt_mc_addr;
+/*
+	struct ip_options	*opt;
+ */
+	__u8			cpt_hdrincl;
+	__u8			cpt_mc_ttl;
+	__u8			cpt_mc_loop;
+	__u8			cpt_pmtudisc;
+
+	__u8			cpt_recverr;
+	__u8			cpt_freebind;
+	__u16			cpt_idcounter;
+	__u32			cpt_cork_flags;
+
+	__u32			cpt_cork_fragsize;
+	__u32			cpt_cork_length;
+	__u32			cpt_cork_addr;
+	__u32			cpt_cork_saddr;
+	__u32			cpt_cork_daddr;
+	__u32			cpt_cork_oif;
+
+	__u32			cpt_udp_pending;
+	__u32			cpt_udp_corkflag;
+	__u16			cpt_udp_encap;
+	__u16			cpt_udp_len;
+	__u32			__cpt_pad7;
+
+	__u64			cpt_saddr6[2];
+	__u64			cpt_rcv_saddr6[2];
+	__u64			cpt_daddr6[2];
+	__u32			cpt_flow_label6;
+	__u32			cpt_frag_size6;
+	__u32			cpt_hop_limit6;
+	__u32			cpt_mcast_hops6;
+
+	__u32			cpt_mcast_oif6;
+	__u8			cpt_rxopt6;
+	__u8			cpt_mc_loop6;
+	__u8			cpt_recverr6;
+	__u8			cpt_sndflow6;
+
+	__u8			cpt_pmtudisc6;
+	__u8			cpt_ipv6only6;
+	__u8			cpt_mapped;
+	__u8			__cpt_pad8;
+	__u32	cpt_pred_flags;
+
+	__u32	cpt_rcv_nxt;
+	__u32	cpt_snd_nxt;
+
+	__u32	cpt_snd_una;
+	__u32	cpt_snd_sml;
+
+	__u32	cpt_rcv_tstamp;
+	__u32	cpt_lsndtime;
+
+	__u8	cpt_tcp_header_len;
+	__u8	cpt_ack_pending;
+	__u8	cpt_quick;
+	__u8	cpt_pingpong;
+	__u8	cpt_blocked;
+	__u8	__cpt_pad9;
+	__u16	__cpt_pad10;
+
+	__u32	cpt_ato;
+	__u32	cpt_ack_timeout;
+
+	__u32	cpt_lrcvtime;
+	__u16	cpt_last_seg_size;
+	__u16	cpt_rcv_mss;
+
+	__u32	cpt_snd_wl1;
+	__u32	cpt_snd_wnd;
+
+	__u32	cpt_max_window;
+	__u32	cpt_pmtu_cookie;
+
+	__u32	cpt_mss_cache;
+	__u16	cpt_mss_cache_std;
+	__u16	cpt_mss_clamp;
+
+	__u16	cpt_ext_header_len;
+	__u16	cpt_ext2_header_len;
+	__u8	cpt_ca_state;
+	__u8	cpt_retransmits;
+	__u8	cpt_reordering;
+	__u8	cpt_frto_counter;
+
+	__u32	cpt_frto_highmark;
+	__u8	cpt_adv_cong;
+	__u8	cpt_defer_accept;
+	__u8	cpt_backoff;
+	__u8	__cpt_pad11;
+
+	__u32	cpt_srtt;
+	__u32	cpt_mdev;
+
+	__u32	cpt_mdev_max;
+	__u32	cpt_rttvar;
+
+	__u32	cpt_rtt_seq;
+	__u32	cpt_rto;
+
+	__u32	cpt_packets_out;
+	__u32	cpt_left_out;
+
+	__u32	cpt_retrans_out;
+	__u32	cpt_snd_ssthresh;
+
+	__u32	cpt_snd_cwnd;
+	__u16	cpt_snd_cwnd_cnt;
+	__u16	cpt_snd_cwnd_clamp;
+
+	__u32	cpt_snd_cwnd_used;
+	__u32	cpt_snd_cwnd_stamp;
+
+	__u32	cpt_timeout;
+	__u32	cpt_ka_timeout;
+
+	__u32	cpt_rcv_wnd;
+	__u32	cpt_rcv_wup;
+
+	__u32	cpt_write_seq;
+	__u32	cpt_pushed_seq;
+
+	__u32	cpt_copied_seq;
+	__u8	cpt_tstamp_ok;
+	__u8	cpt_wscale_ok;
+	__u8	cpt_sack_ok;
+	__u8	cpt_saw_tstamp;
+
+	__u8	cpt_snd_wscale;
+	__u8	cpt_rcv_wscale;
+	__u8	cpt_nonagle;
+	__u8	cpt_keepalive_probes;
+	__u32	cpt_rcv_tsval;
+
+	__u32	cpt_rcv_tsecr;
+	__u32	cpt_ts_recent;
+
+	__u64	cpt_ts_recent_stamp;
+	__u16	cpt_user_mss;
+	__u8	cpt_dsack;
+	__u8	unused; /* was cpt_eff_sacks */
+	__u32	cpt_sack_array[2*5];
+	__u32	cpt_window_clamp;
+
+	__u32	cpt_rcv_ssthresh;
+	__u8	cpt_probes_out;
+	__u8	cpt_num_sacks;
+	__u16	cpt_advmss;
+
+	__u8	cpt_syn_retries;
+	__u8	cpt_ecn_flags;
+	__u16	cpt_prior_ssthresh;
+	__u32	cpt_lost_out;
+
+	__u32   cpt_sacked_out;
+	__u32   cpt_fackets_out;
+
+	__u32   cpt_high_seq;
+	__u32	cpt_retrans_stamp;
+
+	__u32	cpt_undo_marker;
+	__u32	cpt_undo_retrans;
+
+	__u32	cpt_urg_seq;
+	__u16	cpt_urg_data;
+	__u8	cpt_pending;
+	__u8	unused2; /* was cpt_urg_mode */
+
+	__u32	cpt_snd_up;
+	__u32	cpt_keepalive_time;
+
+	__u32   cpt_keepalive_intvl;
+	__u32   cpt_linger2;
+
+	__u32	cpt_rcvrtt_rtt;
+	__u32	cpt_rcvrtt_seq;
+
+	__u32	cpt_rcvrtt_time;
+	__u32	__cpt_pad12;
+
+	__u16	cpt_i_mode;
+	__u16	__cpt_pad13;
+	__u32	__cpt_pad14;
+
+	__u32	cpt_i_uid;
+	__u32	cpt_i_gid;
+} __attribute__ ((aligned (8)));
+
+struct cpt_sockmc_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u16	cpt_family;
+	__u16	cpt_mode;
+	__u32	cpt_ifindex;
+	__u32	cpt_mcaddr[4];
+} __attribute__ ((aligned (8)));
+/* Followed by array of source addresses, each zero-padded to 16 bytes */
+
+struct cpt_sock_packet_ring_image {
+	__u32	cpt_tp_block_size;
+	__u32	cpt_tp_block_nr;
+	__u32	cpt_tp_frame_size;
+	__u32	cpt_tp_frame_nr;
+};
+
+struct cpt_sock_packet_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_stats_tp_packets;
+	__u32	cpt_stats_tp_drops;
+
+	__u8	cpt_tp_loss;
+	__u8	cpt_auxdata;
+	__u8	cpt_origdev;
+	__u8	cpt_pad1;
+	__u32	cpt_pad2;
+
+	__u32	cpt_copy_thresh;
+	__u32	cpt_tp_version;
+	__u32	cpt_tp_reserve;
+	__u32	cpt_tp_tstamp;
+
+	struct cpt_sock_packet_ring_image cpt_rx_ring;
+	struct cpt_sock_packet_ring_image cpt_tx_ring;
+} __attribute__ ((aligned (8)));
+/* Followed by array of cpt_sock_packet_mc_image */
+
+struct cpt_sock_packet_mc_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_ifindex;
+	__u32	cpt_count;
+	__u16	cpt_type;
+	__u16	cpt_alen;
+	__u8	cpt_addr[MAX_ADDR_LEN];
+} __attribute__ ((aligned (8)));
+
+struct cpt_openreq_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_rcv_isn;
+	__u32	cpt_snt_isn;
+
+	__u16	cpt_rmt_port;
+	__u16	cpt_mss;
+	__u8	cpt_family;
+	__u8	cpt_retrans;
+	__u8	cpt_snd_wscale;
+	__u8	cpt_rcv_wscale;
+
+	__u8	cpt_tstamp_ok;
+	__u8	cpt_sack_ok;
+	__u8	cpt_wscale_ok;
+	__u8	cpt_ecn_ok;
+	__u8	cpt_acked;
+	__u8	__cpt_pad1;
+	__u16	__cpt_pad2;
+
+	__u32	cpt_window_clamp;
+	__u32	cpt_rcv_wnd;
+	__u32	cpt_ts_recent;
+	__u32	cpt_iif;
+	__u64	cpt_expires;
+
+	__u64	cpt_loc_addr[2];
+	__u64	cpt_rmt_addr[2];
+/*
+	struct ip_options	*opt;
+ */
+
+} __attribute__ ((aligned (8)));
+
+struct cpt_skb_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_owner;
+	__u32	cpt_queue;
+#define CPT_SKB_NQ	0
+#define CPT_SKB_RQ	1
+#define CPT_SKB_WQ	2
+#define CPT_SKB_OFOQ	3
+
+	__u64	cpt_stamp;
+	__u32	cpt_len;
+	__u32	cpt_hspace;
+	__u32	cpt_tspace;
+	__u32	cpt_h;
+	__u32	cpt_nh;
+	__u32	cpt_mac;
+
+	__u64	cpt_cb[5];
+	__u32	cpt_mac_len;
+	__u32	cpt_csum;
+	__u8	cpt_local_df;
+	__u8	cpt_pkt_type;
+	__u8	cpt_ip_summed;
+	__u8	__cpt_pad1;
+	__u32	cpt_priority;
+	__u16	cpt_protocol;
+	__u16	cpt_security;
+	__u16	cpt_gso_segs;
+	__u16	cpt_gso_size;
+	__u16	cpt_gso_type;
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_sysvshm_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_key;
+	__u64	cpt_uid;
+	__u64	cpt_gid;
+	__u64	cpt_cuid;
+	__u64	cpt_cgid;
+	__u64	cpt_mode;
+	__u64	cpt_seq;
+
+	__u32	cpt_id;
+	__u32	cpt_mlockuser;
+	__u64	cpt_segsz;
+	__u64	cpt_atime;
+	__u64	cpt_ctime;
+	__u64	cpt_dtime;
+	__u64	cpt_creator;
+	__u64	cpt_last;
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_sysvsem_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_key;
+	__u64	cpt_uid;
+	__u64	cpt_gid;
+	__u64	cpt_cuid;
+	__u64	cpt_cgid;
+	__u64	cpt_mode;
+	__u64	cpt_seq;
+	__u32	cpt_id;
+	__u32	__cpt_pad1;
+
+	__u64	cpt_otime;
+	__u64	cpt_ctime;
+} __attribute__ ((aligned (8)));
+/* Content is an array of semval/sempid pairs */
+
+struct cpt_sysvsem_undo_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_id;
+	__u32	cpt_nsem;
+} __attribute__ ((aligned (8)));
+
+struct cpt_sysvmsg_msg_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_type;
+	__u64	cpt_size;
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_sysvmsg_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_key;
+	__u64	cpt_uid;
+	__u64	cpt_gid;
+	__u64	cpt_cuid;
+	__u64	cpt_cgid;
+	__u64	cpt_mode;
+	__u64	cpt_seq;
+	__u32	cpt_id;
+	__u32	__cpt_pad1;
+
+	__u64	cpt_stime;
+	__u64	cpt_rtime;
+	__u64	cpt_ctime;
+	__u64	cpt_last_sender;
+	__u64	cpt_last_receiver;
+	__u64	cpt_qbytes;
+} __attribute__ ((aligned (8)));
+/* Content is an array of sysv messages */
+
+
+struct cpt_mm_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_start_code;
+	__u64	cpt_end_code;
+	__u64	cpt_start_data;
+	__u64	cpt_end_data;
+	__u64	cpt_start_brk;
+	__u64	cpt_brk;
+	__u64	cpt_start_stack;
+	__u64	cpt_start_arg;
+	__u64	cpt_end_arg;
+	__u64	cpt_start_env;
+	__u64	cpt_end_env;
+	__u64	cpt_def_flags;
+	__u64	cpt_mmub;
+	__u8	cpt_dumpable;
+	__u8	cpt_vps_dumpable;
+	__u8	cpt_used_hugetlb;
+	__u8	__cpt_pad;
+	__u32	cpt_vdso;
+	__u64	cpt_mm_flags;
+} __attribute__ ((aligned (8)));
+
+struct cpt_page_block
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_start;
+	__u64	cpt_end;
+} __attribute__ ((aligned (8)));
+
+struct cpt_remappage_block
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_start;
+	__u64	cpt_end;
+	__u64	cpt_pgoff;
+} __attribute__ ((aligned (8)));
+
+struct cpt_copypage_block
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_start;
+	__u64	cpt_end;
+	__u64	cpt_source;
+} __attribute__ ((aligned (8)));
+
+struct cpt_lazypage_block
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_start;
+	__u64	cpt_end;
+	__u64	cpt_index;
+} __attribute__ ((aligned (8)));
+
+struct cpt_iterpage_block
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_start;
+	__u64	cpt_end;
+} __attribute__ ((aligned (8)));
+/* Followed by array of PFNs */
+
+struct cpt_vma_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_file;
+	__u32	cpt_type;
+#define CPT_VMA_TYPE_0		0
+#define CPT_VMA_TYPE_SHM	1
+#define CPT_VMA_VDSO		2
+#define CPT_VMA_VDSO_OLD	3 /* 64 bit rhel5 vdso */
+	__u32	cpt_anonvma;
+	__u64	cpt_anonvmaid;
+
+	__u64	cpt_start;
+	__u64	cpt_end;
+	__u64	cpt_flags;
+	__u64	cpt_pgprot;
+	__u64	cpt_pgoff;
+} __attribute__ ((aligned (8)));
+
+struct cpt_aio_ctx_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_max_reqs;
+	__u32	cpt_ring_pages;
+	__u32	cpt_tail;
+	__u32	cpt_nr;
+	__u64	cpt_mmap_base;
+	/* Data (io_events) and struct aio_ring are stored in user-space VM */
+} __attribute__ ((aligned (8)));
+
+
+/* Format of the MM section.
+ *
+ * It is an array of MM objects (mm_struct). Each MM object is a
+ * header, encoding the mm_struct, followed by an array of VMA objects.
+ * Each VMA consists of a VMA header, encoding the vm_area_struct; if
+ * the VMA contains copied pages, the header is followed by an array
+ * of start-end tuples, each followed by the page data.
+ *
+ * ATTN: no block/page alignment, only 64-bit alignment. This might not be good.
+ */
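+
+/*
+ * Editor's sketch of the nesting described above:
+ *
+ *	CPT_OBJ_MM (cpt_mm_image)
+ *	    CPT_OBJ_VMA (cpt_vma_image)
+ *	        CPT_OBJ_PAGES (cpt_page_block: cpt_start, cpt_end) + page data
+ *	        ...
+ *	    CPT_OBJ_VMA
+ *	        ...
+ */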
+
+struct cpt_restart_block {
+	__u64	fn;
+#define CPT_RBL_0			0
+#define CPT_RBL_NANOSLEEP		1
+#define CPT_RBL_COMPAT_NANOSLEEP	2
+#define CPT_RBL_POLL			3
+#define CPT_RBL_FUTEX_WAIT		4
+#define CPT_RBL_POSIX_CPU_NSLEEP	5
+	__u64	arg0;
+	__u64	arg1;
+	__u64	arg2;
+	__u64	arg3;
+} __attribute__ ((aligned (8)));
+
+struct cpt_siginfo_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_qflags;
+	__u32	cpt_signo;
+	__u32	cpt_errno;
+	__u32	cpt_code;
+
+	__u64	cpt_sigval;
+	__u32	cpt_pid;
+	__u32	cpt_uid;
+	__u64	cpt_utime;
+	__u64	cpt_stime;
+
+	__u64	cpt_user;
+
+	int	cpt_sifields[SI_PAD_SIZE];
+} __attribute__ ((aligned (8)));
+
+/* Portable presentations of segment registers */
+
+#define CPT_SEG_ZERO		0
+#define CPT_SEG_TLS1		1
+#define CPT_SEG_TLS2		2
+#define CPT_SEG_TLS3		3
+#define CPT_SEG_USER32_DS	4
+#define CPT_SEG_USER32_CS	5
+#define CPT_SEG_USER64_DS	6
+#define CPT_SEG_USER64_CS	7
+#define CPT_SEG_LDT		256
+
+struct cpt_x86_regs
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_debugreg[8];
+	__u32	cpt_fs;
+	__u32	cpt_gs;
+
+	__u32	cpt_ebx;
+	__u32	cpt_ecx;
+	__u32	cpt_edx;
+	__u32	cpt_esi;
+	__u32	cpt_edi;
+	__u32	cpt_ebp;
+	__u32	cpt_eax;
+	__u32	cpt_xds;
+	__u32	cpt_xes;
+	__u32	cpt_orig_eax;
+	__u32	cpt_eip;
+	__u32	cpt_xcs;
+	__u32	cpt_eflags;
+	__u32	cpt_esp;
+	__u32	cpt_xss;
+	__u32	cpt_ugs;
+};
+
+struct cpt_x86_64_regs
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_debugreg[8];
+
+	__u64	cpt_fsbase;
+	__u64	cpt_gsbase;
+	__u32	cpt_fsindex;
+	__u32	cpt_gsindex;
+	__u32	cpt_ds;
+	__u32	cpt_es;
+
+	__u64	cpt_r15;
+	__u64	cpt_r14;
+	__u64	cpt_r13;
+	__u64	cpt_r12;
+	__u64	cpt_rbp;
+	__u64	cpt_rbx;
+	__u64	cpt_r11;
+	__u64	cpt_r10;
+	__u64	cpt_r9;
+	__u64	cpt_r8;
+	__u64	cpt_rax;
+	__u64	cpt_rcx;
+	__u64	cpt_rdx;
+	__u64	cpt_rsi;
+	__u64	cpt_rdi;
+	__u64	cpt_orig_rax;
+	__u64	cpt_rip;
+	__u64	cpt_cs;
+	__u64	cpt_eflags;
+	__u64	cpt_rsp;
+	__u64	cpt_ss;
+};
+
+struct cpt_ia64_regs
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	gr[128];
+	__u64	fr[256];
+	__u64	br[8];
+	__u64	nat[2];
+
+	__u64	ar_bspstore;
+	__u64	num_regs;
+	__u64	loadrs;
+	__u64	ar_bsp;
+	__u64	ar_unat;
+	__u64	ar_pfs;
+	__u64	ar_ccv;
+	__u64	ar_fpsr;
+	__u64	ar_csd;
+	__u64	ar_ssd;
+	__u64	ar_ec;
+	__u64	ar_lc;
+	__u64	ar_rsc;
+	__u64	ar_rnat;
+
+	__u64	cr_iip;
+	__u64	cr_ipsr;
+
+	__u64	cfm;
+	__u64	pr;
+
+	__u64	ibr[8];
+	__u64	dbr[8];
+};
+
+
+struct cpt_task_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_state;
+	__u64	cpt_flags;
+#define CPT_TASK_FLAGS_MASK	(PF_EXITING | PF_FORKNOEXEC | \
+				 PF_SUPERPRIV | PF_DUMPCORE | PF_SIGNALED)
+	__u64	cpt_ptrace;
+	__u32	cpt_prio;
+	__u32	cpt_static_prio;
+	__u32	cpt_policy;
+	__u32	cpt_rt_priority;
+
+	/* struct thread_info */
+	__u64	cpt_exec_domain;
+	__u64	cpt_thrflags;
+	__u64	cpt_thrstatus;
+	__u64	cpt_addr_limit;
+
+	__u64	cpt_personality;
+
+	__u64	cpt_mm;
+	__u64	cpt_files;
+	__u64	cpt_fs;
+	__u64	cpt_signal;
+	__u64	cpt_sighand;
+	__u64	cpt_sigblocked;
+	__u64	cpt_sigrblocked;
+	__u64	cpt_sigpending;
+	__u64	cpt_namespace;
+	__u64	cpt_sysvsem_undo;
+	__u32	cpt_pid;
+	__u32	cpt_tgid;
+	__u32	cpt_ppid;
+	__u32	cpt_rppid;
+	__u32	cpt_pgrp;
+	__u32	cpt_session;
+	__u32	cpt_old_pgrp;
+	__u32	__cpt_pad;
+	__u32	cpt_leader;
+	__u8	cpt_pn_state;
+	__u8	cpt_stopped_state;
+	__u8	cpt_sigsuspend_state;
+	__u8	cpt_64bit;
+	__u64	cpt_set_tid;
+	__u64	cpt_clear_tid;
+	__u32	cpt_exit_code;
+	__u32	cpt_exit_signal;
+	__u32	cpt_pdeath_signal;
+	__u32	cpt_user;
+	__u32	cpt_uid;
+	__u32	cpt_euid;
+	__u32	cpt_suid;
+	__u32	cpt_fsuid;
+	__u32	cpt_gid;
+	__u32	cpt_egid;
+	__u32	cpt_sgid;
+	__u32	cpt_fsgid;
+	__u32	cpt_ngids;
+	__u32	cpt_gids[32];
+	__u8	cpt_prctl_uac;
+	__u8	cpt_prctl_fpemu;
+	__u16	__cpt_pad1;
+	__u64	cpt_ecap;
+	__u64	cpt_icap;
+	__u64	cpt_pcap;
+	__u8	cpt_comm[16];
+	__u64	cpt_tls[3];
+	struct cpt_restart_block cpt_restart;
+	__u64	cpt_it_real_value;	/* V8: jiffies, V9..: nsec */
+	__u64	cpt_it_real_incr;	/* V8: jiffies, V9..: nsec */
+	__u64	cpt_it_prof_value;
+	__u64	cpt_it_prof_incr;
+	__u64	cpt_it_virt_value;
+	__u64	cpt_it_virt_incr;
+
+	__u16	cpt_used_math;
+	__u8	cpt_keepcap;
+	__u8	cpt_did_exec;
+	__u32	cpt_ptrace_message;
+
+	__u64	cpt_utime;
+	__u64	cpt_stime;
+	__u64	cpt_starttime;		/* V8: jiffies, V9...: timespec */
+	__u64	cpt_nvcsw;
+	__u64	cpt_nivcsw;
+	__u64	cpt_min_flt;
+	__u64	cpt_maj_flt;
+
+	__u64	cpt_sigsuspend_blocked;
+	__u64	cpt_cutime, cpt_cstime;
+	__u64	cpt_cnvcsw, cpt_cnivcsw;
+	__u64	cpt_cmin_flt, cpt_cmaj_flt;
+
+#define CPT_RLIM_NLIMITS 16
+	__u64	cpt_rlim_cur[CPT_RLIM_NLIMITS];
+	__u64	cpt_rlim_max[CPT_RLIM_NLIMITS];
+
+	__u64	cpt_task_ub;
+	__u64	cpt_exec_ub;
+	__u64	cpt_mm_ub;
+	__u64	cpt_fork_sub;
+	__u64	cpt_posix_timers;
+	__u64	cpt_bcap;
+} __attribute__ ((aligned (8)));
+
+struct cpt_sigaltstack_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_stack;
+	__u32	cpt_stacksize;
+	__u32	__cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+struct cpt_task_aux_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_robust_list;
+	__u64	__cpt_future[16];
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_signal_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_leader;
+	__u8	cpt_pgrp_type;
+	__u8	cpt_old_pgrp_type;
+	__u8	cpt_session_type;
+#define CPT_PGRP_NORMAL		0
+#define CPT_PGRP_ORPHAN		1
+#define CPT_PGRP_STRAY		2
+	__u8	__cpt_pad1;
+	__u64	cpt_pgrp;
+	__u64	cpt_old_pgrp;
+	__u64	cpt_session;
+	__u64	cpt_sigpending;
+	__u64	cpt_ctty;
+
+	__u32	cpt_curr_target;
+	__u32	cpt_group_exit;
+	__u32	cpt_group_exit_code;
+	__u32	cpt_group_exit_task;
+	__u32	cpt_notify_count;
+	__u32	cpt_group_stop_count;
+	__u32	cpt_stop_state;
+	__u32	__cpt_pad2;
+
+	__u64	cpt_utime, cpt_stime, cpt_cutime, cpt_cstime;
+	__u64	cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw;
+	__u64	cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt;
+
+	__u64	cpt_rlim_cur[CPT_RLIM_NLIMITS];
+	__u64	cpt_rlim_max[CPT_RLIM_NLIMITS];
+#define CPT_SIGNAL_STOP_STOPPED	(1ull << 0)
+#define CPT_SIGNAL_STOP_CONTINUED 	(1ull << 1)
+#define CPT_SIGNAL_CLD_STOPPED		(1ull << 2)
+#define CPT_SIGNAL_CLD_CONTINUED	(1ull << 3)
+	__u64	cpt_flags;
+} __attribute__ ((aligned (8)));
+/* Followed by list of posix timers. */
+
+struct cpt_sighand_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+} __attribute__ ((aligned (8)));
+/* Followed by list of sighandles. */
+
+struct cpt_sighandler_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_signo;
+	__u32	__cpt_pad1;
+	__u64	cpt_handler;
+	__u64	cpt_restorer;
+	__u64	cpt_flags;
+	__u64	cpt_mask;
+} __attribute__ ((aligned (8)));
+
+struct cpt_posix_timer_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_timer_id;
+	__u32	cpt_timer_clock;
+	__u32	cpt_timer_overrun;
+	__u32	cpt_timer_overrun_last;
+	__u32	cpt_timer_signal_pending;
+	__u32	__cpt_pad1;
+	__u64	cpt_timer_interval;
+	__u64	cpt_timer_value;
+
+	__u64	cpt_sigev_value;
+	__u32	cpt_sigev_signo;
+	__u32	cpt_sigev_notify;
+	__u32	cpt_sigev_notify_tid;
+	__u32	__cpt_pad2;
+	__u64	cpt_dump_time;
+} __attribute__ ((aligned (8)));
+
+struct cpt_netdev_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_index;
+	__u32	cpt_flags;
+	__u8	cpt_name[16];
+	__u32	cpt_mtu;
+	__u32	cpt_pad;
+} __attribute__ ((aligned (8)));
+
+struct cpt_tuntap_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_owner;
+	__u32	unused; /* was cpt_attached */
+	__u64	cpt_flags;
+	__u64	cpt_bindfile;
+	__u64	cpt_if_flags;
+	__u8	cpt_dev_addr[6];
+	__u16	cpt_pad;
+	__u32	cpt_chr_filter[2];
+	__u32	cpt_net_filter[2];
+} __attribute__ ((aligned (8)));
+
+struct cpt_tap_filter_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_count;
+	__u32	cpt_mask[2];
+	__u8	cpt_addr[8][6];
+} __attribute__ ((aligned (8)));
+
+struct cpt_veth_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_allow_mac_change;
+	__u32	__cpt_pad;
+} __attribute__ ((aligned (8)));
+
+struct cpt_tunnel_image {
+	__u64   cpt_next;
+	__u32   cpt_object;
+	__u16   cpt_hdrlen;
+	__u16   cpt_content;
+
+	__u32   cpt_tnl_flags;
+#define CPT_TUNNEL_FBDEV        0x1
+#define CPT_TUNNEL_SIT          0x2
+#define CPT_TUNNEL_GRE          0x4
+#define CPT_TUNNEL_GRE_TAP      0x8
+	__u16   cpt_i_flags;
+	__u16   cpt_o_flags;
+	__u32   cpt_i_key;
+	__u32   cpt_o_key;
+	__u32   cpt_iphdr[5];
+	__u32   cpt_i_seqno;
+	__u32   cpt_o_seqno;
+	__u8	cpt_pad[4];
+	__u32	cpt_link;
+	__u8	cpt_pad1[4];
+} __attribute__ ((aligned (8)));
+
+struct cpt_br_nested_dev {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u8 name[16];
+};
+
+struct cpt_br_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64 designated_root;
+	__u64 bridge_id;
+	__u32 root_path_cost;
+	__u32 max_age;
+	__u32 hello_time;
+	__u32 forward_delay;
+	__u32 bridge_max_age;
+	__u32 bridge_hello_time;
+	__u32 bridge_forward_delay;
+	__u32 ageing_time;
+	__u8 root_port;
+	__u8 stp_enabled;
+	__u8 via_phys_dev;
+	__u8 pad[5];
+} __attribute__ ((aligned (8)));
+
+struct cpt_hwaddr_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u8	cpt_dev_addr[32];
+} __attribute__ ((aligned (8)));
+
+struct cpt_netstats_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_rx_packets;
+	__u64	cpt_tx_packets;
+	__u64	cpt_rx_bytes;
+	__u64	cpt_tx_bytes;
+	__u64	cpt_rx_errors;
+	__u64	cpt_tx_errors;
+	__u64	cpt_rx_dropped;
+	__u64	cpt_tx_dropped;
+	__u64	cpt_multicast;
+	__u64	cpt_collisions;
+	__u64	cpt_rx_length_errors;
+	__u64	cpt_rx_over_errors;
+	__u64	cpt_rx_crc_errors;
+	__u64	cpt_rx_frame_errors;
+	__u64	cpt_rx_fifo_errors;
+	__u64	cpt_rx_missed_errors;
+	__u64	cpt_tx_aborted_errors;
+	__u64	cpt_tx_carrier_errors;
+	__u64	cpt_tx_fifo_errors;
+	__u64	cpt_tx_heartbeat_errors;
+	__u64	cpt_tx_window_errors;
+	__u64	cpt_rx_compressed;
+	__u64	cpt_tx_compressed;
+	__u64	pad[4];
+} __attribute__ ((aligned (8)));
+
+struct cpt_idev_cnf_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_data[IPV4_DEVCONF_MAX];
+} __attribute__ ((aligned (8)));
+
+struct cpt_ifaddr_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u32	cpt_index;
+	__u8	cpt_family;
+	__u8	cpt_masklen;
+	__u8	cpt_flags;
+	__u8	cpt_scope;
+	__u32	cpt_address[4];
+	__u32	cpt_peer[4];
+	__u32	cpt_broadcast[4];
+	__u8	cpt_label[16];
+	__u32	cpt_valid_lft;
+	__u32	cpt_prefered_lft;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ipct_tuple
+{
+	__u32	cpt_src;
+	__u16	cpt_srcport;
+	__u16	__cpt_pad1;
+
+	__u32	cpt_dst;
+	__u16	cpt_dstport;
+	__u8	cpt_protonum;
+	__u8	cpt_dir;	/* TEMPORARY HACK TO VALIDATE CODE */
+	__u16	cpt_l3num;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ipct_tuple_compat /* 2.6.18 */
+{
+	__u32	cpt_src;
+	__u16	cpt_srcport;
+	__u16	__cpt_pad1;
+
+	__u32	cpt_dst;
+	__u16	cpt_dstport;
+	__u8	cpt_protonum;
+	__u8	cpt_dir;	/* TEMPORARY HACK TO VALIDATE CODE */
+} __attribute__ ((aligned (8)));
+
+struct cpt_nat_manip
+{
+	__u8	cpt_direction;
+	__u8	cpt_hooknum;
+	__u8	cpt_maniptype;
+	__u8	__cpt_pad1;
+
+	__u32	cpt_manip_addr;
+	__u16	cpt_manip_port;
+	__u16	__cpt_pad2;
+	__u32	__cpt_pad3;
+} __attribute__ ((aligned (8)));
+
+struct cpt_nat_seq
+{
+	__u32	cpt_correction_pos;
+	__u32	cpt_offset_before;
+	__u32	cpt_offset_after;
+	__u32	__cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ip_connexpect_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_timeout;
+	__u32	cpt_sibling_conntrack;	/* Index of child conntrack */
+	__u32   cpt_pad1;
+
+	struct cpt_ipct_tuple	cpt_tuple;
+	struct cpt_ipct_tuple	cpt_mask;
+
+	__u8	cpt_dir;
+	__u8	cpt_flags;
+	__u8    cpt_pad2[6];
+
+	__u32   cpt_class;
+	__u32   cpt_pad3;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ip_connexpect_image_compat
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_timeout;
+	__u32	cpt_sibling_conntrack;	/* Index of child conntrack */
+	__u32	cpt_seq;		/* id in 2.6.15 */
+
+	struct cpt_ipct_tuple_compat	cpt_ct_tuple;	/* NU 2.6.15 */
+	struct cpt_ipct_tuple_compat	cpt_tuple;
+	struct cpt_ipct_tuple_compat	cpt_mask;
+
+	/* union ip_conntrack_expect_help. Used by ftp, irc, amanda */
+	__u32	cpt_help[3];			/* NU 2.6.15 */
+	__u16	cpt_manip_proto;
+	__u8	cpt_dir;
+	__u8	cpt_flags;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ip_conntrack_image
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	struct cpt_ipct_tuple cpt_tuple[2];
+	__u64	cpt_status;
+	__u64	cpt_timeout;
+	__u32	cpt_index;
+	__u8	cpt_ct_helper;
+	__u8	cpt_nat_helper;
+	__u16	cpt_pad1;
+
+	/* union ip_conntrack_proto. Used by tcp and icmp. */
+	__u32	cpt_proto_data[16];
+
+	/* union ip_conntrack_help. Used by ftp and pptp helper.
+	 * We do not support pptp...
+	 */
+	__u32	cpt_help_data[8];
+
+	struct	cpt_nat_seq	cpt_nat_seq[2];
+
+	__u32	cpt_masq_index;
+	__u32	cpt_id;
+	__u32	cpt_mark;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ip_conntrack_image_compat
+{
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	struct cpt_ipct_tuple_compat cpt_tuple[2];
+	__u64	cpt_status;
+	__u64	cpt_timeout;
+	__u32	cpt_index;
+	__u8	cpt_ct_helper;
+	__u8	cpt_nat_helper;
+	__u16	cpt_pad1;
+
+	/* union ip_conntrack_proto. Used by tcp and icmp. */
+	__u32	cpt_proto_data[12];
+
+	/* union ip_conntrack_help. Used by ftp and pptp helper.
+	 * We do not support pptp...
+	 */
+	__u32	cpt_help_data[6];
+
+	/* nat info */
+	__u32	cpt_initialized;	/* NU 2.6.15 */
+	__u32	cpt_num_manips;		/* NU 2.6.15 */
+	struct  cpt_nat_manip	cpt_nat_manips[6];	/* NU 2.6.15 */
+
+	struct	cpt_nat_seq	cpt_nat_seq[2];
+
+	__u32	cpt_masq_index;
+	__u32	cpt_id;
+	__u32	cpt_mark;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ubparm
+{
+	__u64	barrier;
+	__u64	limit;
+	__u64	held;
+	__u64	maxheld;
+	__u64	minheld;
+	__u64	failcnt;
+} __attribute__ ((aligned (8)));
+
+struct cpt_beancounter_image {
+	__u64	cpt_next;
+	__u32	cpt_object;
+	__u16	cpt_hdrlen;
+	__u16	cpt_content;
+
+	__u64	cpt_parent;
+	__u32	cpt_id;
+	__u16   cpt_ub_resources;
+	__u16   cpt_ub_flags;
+#define CPT_UB_NOSTORE		(1 << 0)
+	struct	cpt_ubparm	cpt_parms[32 * 2];
+} __attribute__ ((aligned (8)));
+
+struct cpt_slm_sgreg_image {
+	__u64   cpt_next;
+	__u32   cpt_object;
+	__u16   cpt_hdrlen;
+	__u16   cpt_content;
+
+	__u32   cpt_size;
+	__u32   __cpt_pad1;
+	__u32   cpt_id;
+	__u16   cpt_resource;
+	__u8    cpt_regname[32];
+	__u8	__cpt_pad2[2];
+} __attribute__ ((aligned (8)));
+
+struct cpt_slm_obj_image {
+	__u64   cpt_next;
+	__u32   cpt_object;
+	__u16   cpt_hdrlen;
+	__u16   cpt_content;
+
+	__u32   cpt_size;
+	__u32   __cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+#ifdef __KERNEL__
+
+static inline void __user * cpt_ptr_import(__u64 ptr)
+	return (void __user *)(unsigned long)ptr;
+	return (void*)(unsigned long)ptr;
+}
+
+static inline __u64 cpt_ptr_export(void __user *ptr)
+{
+	return (__u64)(unsigned long)ptr;
+}
+
+static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr)
+{
+	memcpy(sig, &ptr, sizeof(*sig));
+}
+
+static inline __u64 cpt_sigset_export(sigset_t *sig)
+{
+	return *(__u64*)sig;
+}
+
+static inline __u64 cpt_timespec_export(struct timespec *tv)
+{
+	return (((u64)tv->tv_sec) << 32) + tv->tv_nsec;
+}
+
+static inline void cpt_timespec_import(struct timespec *tv, __u64 val)
+{
+	tv->tv_sec = val>>32;
+	tv->tv_nsec = (val&0xFFFFFFFF);
+}
+
+static inline __u64 cpt_timeval_export(struct timeval *tv)
+{
+	return (((u64)tv->tv_sec) << 32) + tv->tv_usec;
+}
+
+static inline void cpt_timeval_import(struct timeval *tv, __u64 val)
+{
+	tv->tv_sec = val>>32;
+	tv->tv_usec = (val&0xFFFFFFFF);
+}
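+
+/*
+ * Editor's note: both pairs of helpers pack seconds into the high 32 bits
+ * and the sub-second part into the low 32 bits, so the round trip is exact
+ * for in-range values:
+ *
+ *	struct timespec ts = { .tv_sec = 5, .tv_nsec = 42 };
+ *	__u64 v = cpt_timespec_export(&ts);	(v == (5ULL << 32) + 42)
+ *	cpt_timespec_import(&ts, v);		(ts is unchanged)
+ */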
+
+#endif
+
+#endif /* __CPT_IMAGE_H_ */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_ioctl.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_ioctl.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_ioctl.h	2015-01-21 12:02:48.235093340 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_ioctl.h	2015-01-21 12:02:50.885022991 +0300
@@ -0,0 +1,67 @@
+/*
+ *
+ *  include/linux/cpt_ioctl.h
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _CPT_IOCTL_H_
+#define _CPT_IOCTL_H_ 1
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define CPTCTLTYPE '-'
+#define CPT_SET_DUMPFD	_IOW(CPTCTLTYPE, 1, int)
+#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int)
+#define CPT_SET_LOCKFD	_IOW(CPTCTLTYPE, 3, int)
+#define CPT_SET_VEID	_IOW(CPTCTLTYPE, 4, int)
+#define CPT_SUSPEND	_IO(CPTCTLTYPE, 5)
+#define CPT_DUMP	_IO(CPTCTLTYPE, 6)
+#define CPT_UNDUMP	_IO(CPTCTLTYPE, 7)
+#define CPT_RESUME	_IO(CPTCTLTYPE, 8)
+#define CPT_KILL	_IO(CPTCTLTYPE, 9)
+#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10)
+#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int)
+#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12)
+#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int)
+#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int)
+#define CPT_PAGEIND	_IO(CPTCTLTYPE, 15)
+#define CPT_VMPREP	_IOW(CPTCTLTYPE, 16, int)
+#define CPT_SET_LAZY	_IOW(CPTCTLTYPE, 17, int)
+#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int)
+#define CPT_TEST_CAPS	_IOW(CPTCTLTYPE, 19, unsigned int)
+#define CPT_TEST_VECAPS	_IOW(CPTCTLTYPE, 20, unsigned int)
+#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int)
+
+#define CPT_ITER	_IOW(CPTCTLTYPE, 23, int)
+#define CPT_LINKDIR_ADD	_IOW(CPTCTLTYPE, 24, int)
+#define CPT_HARDLNK_ON	_IOW(CPTCTLTYPE, 25, int)
+
+#define CPT_TEST_VERSION _IOW(CPTCTLTYPE, 26, int)
+#define CPT_SET_LOCKFD2 _IOW(CPTCTLTYPE, 27, int)
+
+#define CPT_SET_PRAM	_IOW(CPTCTLTYPE, 28, int)
+
+#define CPT_STOP_TRACKER	_IOW(CPTCTLTYPE, 29, int)
+
+#define CPT_TEST_VECAPS2	_IOW(CPTCTLTYPE, 30, unsigned int)
+
+/* CPT_TEST_VECAPS return codes */
+#define VECAPS_OK			0
+#define VECAPS_NO_CPU_FEATURE		1
+#define VECAPS_UNSUPPORTED_FEATURE	2
+#define VECAPS_NO_IPV6_MODULE		3
+#define VECAPS_NO_SLM_MODULE		4
+#define VECAPS_NO_MNT_NAMESPACES	5
+
+struct vecaps {
+	__u32	dst_flags;
+	__u32	features;
+};
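+
+/*
+ * Editor's sketch of an assumed userspace sequence, inferred from the
+ * command names above (the device path is an assumption, not taken from
+ * this header):
+ *
+ *	int fd = open("/proc/cpt", O_RDWR);
+ *	ioctl(fd, CPT_SET_VEID, veid);
+ *	ioctl(fd, CPT_SET_DUMPFD, dumpfd);
+ *	if (!ioctl(fd, CPT_SUSPEND, 0) && !ioctl(fd, CPT_DUMP, 0))
+ *		ioctl(fd, CPT_RESUME, 0);
+ */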
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_obj.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_obj.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/cpt_obj.h	2015-01-21 12:02:49.748053174 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpt_obj.h	2015-01-21 12:02:49.859050229 +0300
@@ -0,0 +1,72 @@
+#ifndef __CPT_OBJ_H_
+#define __CPT_OBJ_H_ 1
+
+#undef ITER_DEBUG
+
+#include <linux/list.h>
+#include <linux/cpt_image.h>
+
+typedef struct _cpt_object
+{
+	struct list_head	o_list;
+	struct list_head	o_hash;
+	int			o_count;
+	int			o_index;
+	int			o_lock;
+	loff_t			o_pos;
+	loff_t			o_ppos;
+	void			*o_obj;
+	void			*o_image;
+	void			*o_parent;
+	unsigned int		o_flags;
+#define CPT_INODE_HARDLINKED	0x1
+#define CPT_VFSMOUNT_DELAYFS	0x1
+#define CPT_FILE_DELAYFS	0x1
+#define CPT_FILE_SILLYRENAME	0x2
+#define CPT_FILE_SYSVIPC	0x4
+#define CPT_TTY_NOPAIR		0x1
+#define CPT_NAMESPACE_MAIN	0x1
+} cpt_object_t;
+
+struct cpt_context;
+
+#define for_each_object(obj, type) list_for_each_entry(obj, &ctx->object_array[type], o_list)
+#define for_each_object_safe(obj, nobj, type) list_for_each_entry_safe(obj, nobj, &ctx->object_array[type], o_list)
+
+
+extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx);
+extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx);
+
+cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
+cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx);
+cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx);
+
+static inline void cpt_obj_setpos(cpt_object_t *cpt, loff_t pos, struct cpt_context *ctx)
+{
+	cpt->o_pos = pos;
+	/* Add to pos hash table */
+}
+
+static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx)
+{
+	cpt->o_obj = ptr;
+	/* Add to hash table */
+}
+
+static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx)
+{
+	cpt->o_index = index;
+	/* Add to index hash table */
+}
+
+
+extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx);
+extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx);
+extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
+extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx);
+extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
+
+extern int cpt_object_init(struct cpt_context *ctx);
+extern int cpt_object_destroy(struct cpt_context *ctx);
+
+#endif /* __CPT_OBJ_H_ */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/cpuset.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpuset.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/cpuset.h	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cpuset.h	2015-01-21 12:02:53.664949197 +0300
@@ -16,6 +16,8 @@
 
 #ifdef CONFIG_CPUSETS
 
+extern int sysctl_strict_mem_cpuset;
+
 extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
@@ -128,6 +130,11 @@ static inline void set_mems_allowed(node
 	task_unlock(current);
 }
 
+extern int cgroup_set_cpumask(struct cgroup *cgrp,
+			      const struct cpumask *cpus_allowed);
+extern int cgroup_set_nodemask(struct cgroup *cgrp,
+			       const nodemask_t *nodes_allowed);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init(void) { return 0; }
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/crc32c.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/crc32c.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/crc32c.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/crc32c.h	2015-01-21 12:02:53.063965151 +0300
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 
 extern u32 crc32c(u32 crc, const void *address, unsigned int length);
+extern u32 crc32c_generic(u32 crc, const void *address, unsigned int length);
 
 /* This macro exists for backwards-compatibility. */
 #define crc32c_le crc32c
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/cred.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cred.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/cred.h	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cred.h	2015-01-21 12:02:42.736239328 +0300
@@ -157,7 +157,6 @@ extern const struct cred *get_task_cred(
 extern struct cred *cred_alloc_blank(void);
 extern struct cred *prepare_creds(void);
 extern struct cred *prepare_exec_creds(void);
-extern struct cred *prepare_usermodehelper_creds(void);
 extern int commit_creds(struct cred *);
 extern void abort_creds(struct cred *);
 extern const struct cred *override_creds(const struct cred *);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/cryptohash.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cryptohash.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/cryptohash.h	2014-12-12 23:29:10.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/cryptohash.h	2015-01-21 12:02:52.174988749 +0300
@@ -2,10 +2,12 @@
 #define __CRYPTOHASH_H
 
 #define SHA_DIGEST_WORDS 5
-#define SHA_WORKSPACE_WORDS 80
+#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
+#define SHA_WORKSPACE_WORDS 16
 
 void sha_init(__u32 *buf);
 void sha_transform(__u32 *digest, const char *data, __u32 *W);
+extern void (*sha_batch_transform)(__u32 *, const char *, unsigned);
 
 #define MD5_DIGEST_WORDS 4
 #define MD5_MESSAGE_BYTES 64
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/dcache.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/dcache.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/dcache.h	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/dcache.h	2015-01-21 12:02:43.156228178 +0300
@@ -102,6 +102,7 @@ struct dentry {
 	struct qstr d_name;
 
 	struct list_head d_lru;		/* LRU list */
+	struct list_head d_bclru;		/* LRU list */
 	/*
 	 * d_child and d_rcu can share memory
 	 */
@@ -116,9 +117,15 @@ struct dentry {
 	struct super_block *d_sb;	/* The root of the dentry tree */
 	void *d_fsdata;			/* fs-specific data */
 
+	unsigned int d_lru_time;
+
+	struct user_beancounter *d_ub;
 	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
 };
 
+#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
+
+extern struct kmem_cache *dentry_cache;
 /*
  * dentry->d_lock spinlock nesting subclasses:
  *
@@ -194,6 +201,8 @@ d_automount:	no		no		no	 yes
 #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080
      /* Parent inode is watched by some fsnotify listener */
 
+#define DCACHE_BCTOP		0x0100
+
 #define DCACHE_MOUNTED		0x10000	/* is a mountpoint */
 #define DCACHE_NEED_AUTOMOUNT	0x20000	/* handle automount on this dir */
 #define DCACHE_MANAGE_TRANSIT	0x40000	/* manage transit from this dirent */
@@ -258,6 +267,7 @@ extern struct dentry * d_obtain_alias(st
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
 extern void shrink_dcache_for_umount(struct super_block *);
+extern int __shrink_dcache_ub(struct user_beancounter *ub, int count, int popup);
 extern int d_invalidate(struct dentry *);
 
 /* only used at mount-time */
@@ -329,6 +339,7 @@ extern char *dynamic_dname(struct dentry
 extern char *__d_path(const struct path *path, struct path *root, char *, int);
 extern char *d_path(const struct path *, char *, int);
 extern char *dentry_path(struct dentry *, char *, int);
+extern int d_root_check(struct path *path);
 
 /* Allocation counts.. */
 
@@ -384,6 +395,7 @@ static inline struct dentry *dget_parent
 }
 
 extern void dput(struct dentry *);
+extern void dput_nocache(struct dentry *dentry, int nocache);
 
 static inline bool d_managed(struct dentry *dentry)
 {
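
The DNAME_INLINE_LEN definition added above relies on d_iname being the last
member of struct dentry: every byte from its offset to the end of the
structure, including any trailing padding, can hold inline name bytes. A toy
userspace analogue of the same trick (struct toy_dentry is hypothetical):

#include <stdio.h>
#include <stddef.h>

struct toy_dentry {
	void *d_parent;
	unsigned int d_lru_time;
	unsigned char d_iname[36];	/* last member, like d_iname above */
};

#define TOY_INLINE_LEN \
	(sizeof(struct toy_dentry) - offsetof(struct toy_dentry, d_iname))

int main(void)
{
	/* usable length >= declared length, thanks to tail padding */
	printf("declared %zu, usable %zu\n",
	       sizeof(((struct toy_dentry *)0)->d_iname),
	       (size_t)TOY_INLINE_LEN);
	return 0;
}
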
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/delayacct.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/delayacct.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/delayacct.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/delayacct.h	2015-01-21 12:02:54.057938765 +0300
@@ -101,6 +101,25 @@ static inline int delayacct_add_tsk(stru
 	return __delayacct_add_tsk(d, tsk);
 }
 
+static inline void delayacct_add_stats(struct taskstats *d,
+					struct taskstats *s)
+{
+	if (!delayacct_on)
+		return;
+
+	d->cpu_count			+= s->cpu_count;
+	d->cpu_delay_total		+= s->cpu_delay_total;
+	d->cpu_run_real_total		+= s->cpu_run_real_total;
+	d->cpu_run_virtual_total	+= s->cpu_run_virtual_total;
+	d->cpu_scaled_run_real_total	+= s->cpu_scaled_run_real_total;
+	d->blkio_count			+= s->blkio_count;
+	d->blkio_delay_total		+= s->blkio_delay_total;
+	d->swapin_count			+= s->swapin_count;
+	d->swapin_delay_total		+= s->swapin_delay_total;
+	d->freepages_count		+= s->freepages_count;
+	d->freepages_delay_total	+= s->freepages_delay_total;
+}
+
 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 {
 	if (tsk->delays)
@@ -138,6 +157,9 @@ static inline void delayacct_blkio_end(v
 static inline int delayacct_add_tsk(struct taskstats *d,
 					struct task_struct *tsk)
 { return 0; }
+static inline void delayacct_add_stats(struct taskstats *d,
+					struct taskstats *s)
+{}
 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 { return 0; }
 static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/device.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/device.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/device.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/device.h	2015-01-21 12:02:47.466113754 +0300
@@ -220,8 +220,18 @@ struct class_dev_iter {
 	const struct device_type	*type;
 };
 
+#ifndef CONFIG_VE
 extern struct kobject *sysfs_dev_block_kobj;
 extern struct kobject *sysfs_dev_char_kobj;
+#define ve_sysfs_dev_block_kobj sysfs_dev_block_kobj
+#define ve_sysfs_dev_char_kobj sysfs_dev_char_kobj
+#define ve_sysfs_block_kobj sysfs_block_kobj
+#else
+#define ve_sysfs_dev_block_kobj (get_exec_env()->dev_block_kobj)
+#define ve_sysfs_dev_char_kobj (get_exec_env()->dev_char_kobj)
+#define ve_sysfs_block_kobj (get_exec_env()->block_kobj)
+#endif
+
 extern int __must_check __class_register(struct class *class,
 					 struct lock_class_key *key);
 extern void class_unregister(struct class *class);
@@ -290,6 +300,15 @@ extern struct class * __must_check __cla
 						  struct lock_class_key *key);
 extern void class_destroy(struct class *cls);
 
+extern struct class net_class;
+extern struct kset *class_kset;
+
+int classes_init(void);
+void classes_fini(void);
+
+int devices_init(void);
+void devices_fini(void);
+
 /* This is a #define to keep the compiler from merging different
  * instances of the __key variable */
 #define class_create(owner, name)		\
@@ -645,6 +664,7 @@ extern void put_device(struct device *de
 extern void wait_for_device_probe(void);
 
 #ifdef CONFIG_DEVTMPFS
+extern struct file_system_type dev_fs_type;
 extern int devtmpfs_create_node(struct device *dev);
 extern int devtmpfs_delete_node(struct device *dev);
 extern int devtmpfs_mount(const char *mountpoint);
@@ -660,6 +680,8 @@ extern void device_shutdown(void);
 /* drivers/base/sys.c */
 extern void sysdev_shutdown(void);
 
+/* net/core/net-sysfs.c */
+int is_dev_netdev(struct device *dev);
 /* debugging and troubleshooting/diagnostic helpers. */
 extern const char *dev_driver_string(const struct device *dev);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/device_cgroup.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/device_cgroup.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/device_cgroup.h	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/device_cgroup.h	2015-01-21 12:02:44.395195283 +0300
@@ -12,9 +12,14 @@ static inline int devcgroup_inode_permis
 		return 0;
 	return __devcgroup_inode_permission(inode, mask);
 }
+extern int devcgroup_device_visible(int type, int major,
+		int start_minor, int nr_minors);
 #else
 static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 { return 0; }
 static inline int devcgroup_inode_mknod(int mode, dev_t dev)
 { return 0; }
+static inline int devcgroup_device_visible(int type, int major,
+		int start_minor, int nr_minors)
+{ return 0; }
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/devpts_fs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/devpts_fs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/devpts_fs.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/devpts_fs.h	2015-01-21 12:02:43.826210389 +0300
@@ -26,6 +26,7 @@ struct tty_struct *devpts_get_tty(struct
 /* unlink */
 void devpts_pty_kill(struct tty_struct *tty);
 
+extern struct file_system_type devpts_fs_type;
 #else
 
 /* Dummy stubs in the no-pty case */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/drbd.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/drbd.h	2015-01-21 12:02:58.389823779 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd.h	2015-01-21 12:02:58.389823779 +0300
@@ -0,0 +1,383 @@
+/*
+  drbd.h
+  Kernel module for 2.6.x Kernels
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+#ifndef DRBD_H
+#define DRBD_H
+#include <linux/connector.h>
+
+#include <asm/types.h>
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#else
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <limits.h>
+
+/* Although the Linux source code makes a difference between
+   generic endianness and the bitfields' endianness, there is no
+   architecture as of Linux-2.6.24-rc4 where the bitfields' endianness
+   does not match the generic endianness. */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define __LITTLE_ENDIAN_BITFIELD
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define __BIG_ENDIAN_BITFIELD
+#else
+# error "sorry, weird endianness on this box"
+#endif
+
+#endif
+
+
+enum drbd_io_error_p {
+	EP_PASS_ON, /* FIXME: should this better be named "Ignore"? */
+	EP_CALL_HELPER,
+	EP_DETACH
+};
+
+enum drbd_fencing_p {
+	FP_DONT_CARE,
+	FP_RESOURCE,
+	FP_STONITH
+};
+
+enum drbd_disconnect_p {
+	DP_RECONNECT,
+	DP_DROP_NET_CONF,
+	DP_FREEZE_IO
+};
+
+enum drbd_after_sb_p {
+	ASB_DISCONNECT,
+	ASB_DISCARD_YOUNGER_PRI,
+	ASB_DISCARD_OLDER_PRI,
+	ASB_DISCARD_ZERO_CHG,
+	ASB_DISCARD_LEAST_CHG,
+	ASB_DISCARD_LOCAL,
+	ASB_DISCARD_REMOTE,
+	ASB_CONSENSUS,
+	ASB_DISCARD_SECONDARY,
+	ASB_CALL_HELPER,
+	ASB_VIOLENTLY
+};
+
+enum drbd_on_no_data {
+	OND_IO_ERROR,
+	OND_SUSPEND_IO
+};
+
+enum drbd_on_congestion {
+	OC_BLOCK,
+	OC_PULL_AHEAD,
+	OC_DISCONNECT,
+};
+
+/* KEEP the order, do not delete or insert. Only append. */
+enum drbd_ret_code {
+	ERR_CODE_BASE		= 100,
+	NO_ERROR		= 101,
+	ERR_LOCAL_ADDR		= 102,
+	ERR_PEER_ADDR		= 103,
+	ERR_OPEN_DISK		= 104,
+	ERR_OPEN_MD_DISK	= 105,
+	ERR_DISK_NOT_BDEV	= 107,
+	ERR_MD_NOT_BDEV		= 108,
+	ERR_DISK_TOO_SMALL	= 111,
+	ERR_MD_DISK_TOO_SMALL	= 112,
+	ERR_BDCLAIM_DISK	= 114,
+	ERR_BDCLAIM_MD_DISK	= 115,
+	ERR_MD_IDX_INVALID	= 116,
+	ERR_IO_MD_DISK		= 118,
+	ERR_MD_INVALID          = 119,
+	ERR_AUTH_ALG		= 120,
+	ERR_AUTH_ALG_ND		= 121,
+	ERR_NOMEM		= 122,
+	ERR_DISCARD		= 123,
+	ERR_DISK_CONFIGURED	= 124,
+	ERR_NET_CONFIGURED	= 125,
+	ERR_MANDATORY_TAG	= 126,
+	ERR_MINOR_INVALID	= 127,
+	ERR_INTR		= 129, /* EINTR */
+	ERR_RESIZE_RESYNC	= 130,
+	ERR_NO_PRIMARY		= 131,
+	ERR_SYNC_AFTER		= 132,
+	ERR_SYNC_AFTER_CYCLE	= 133,
+	ERR_PAUSE_IS_SET	= 134,
+	ERR_PAUSE_IS_CLEAR	= 135,
+	ERR_PACKET_NR		= 137,
+	ERR_NO_DISK		= 138,
+	ERR_NOT_PROTO_C		= 139,
+	ERR_NOMEM_BITMAP	= 140,
+	ERR_INTEGRITY_ALG	= 141, /* DRBD 8.2 only */
+	ERR_INTEGRITY_ALG_ND	= 142, /* DRBD 8.2 only */
+	ERR_CPU_MASK_PARSE	= 143, /* DRBD 8.2 only */
+	ERR_CSUMS_ALG		= 144, /* DRBD 8.2 only */
+	ERR_CSUMS_ALG_ND	= 145, /* DRBD 8.2 only */
+	ERR_VERIFY_ALG		= 146, /* DRBD 8.2 only */
+	ERR_VERIFY_ALG_ND	= 147, /* DRBD 8.2 only */
+	ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */
+	ERR_VERIFY_RUNNING	= 149, /* DRBD 8.2 only */
+	ERR_DATA_NOT_CURRENT	= 150,
+	ERR_CONNECTED		= 151, /* DRBD 8.3 only */
+	ERR_PERM		= 152,
+	ERR_NEED_APV_93		= 153,
+	ERR_STONITH_AND_PROT_A  = 154,
+	ERR_CONG_NOT_PROTO_A	= 155,
+	ERR_PIC_AFTER_DEP	= 156,
+	ERR_PIC_PEER_DEP	= 157,
+
+	/* insert new ones above this line */
+	AFTER_LAST_ERR_CODE
+};
+
+#define DRBD_PROT_A   1
+#define DRBD_PROT_B   2
+#define DRBD_PROT_C   3
+
+enum drbd_role {
+	R_UNKNOWN = 0,
+	R_PRIMARY = 1,     /* role */
+	R_SECONDARY = 2,   /* role */
+	R_MASK = 3,
+};
+
+/* The order of these constants is important.
+ * The lower ones (<C_WF_REPORT_PARAMS) indicate
+ * that there is no socket!
+ * >=C_WF_REPORT_PARAMS ==> There is a socket
+ */
+enum drbd_conns {
+	C_STANDALONE,
+	C_DISCONNECTING,  /* Temporary state on the way to StandAlone. */
+	C_UNCONNECTED,    /* >= C_UNCONNECTED -> inc_net() succeeds */
+
+	/* These temporary states are all used on the way
+	 * from >= C_CONNECTED to Unconnected.
+	 * They are the 'disconnect reason' states;
+	 * changing between them is not allowed. */
+	C_TIMEOUT,
+	C_BROKEN_PIPE,
+	C_NETWORK_FAILURE,
+	C_PROTOCOL_ERROR,
+	C_TEAR_DOWN,
+
+	C_WF_CONNECTION,
+	C_WF_REPORT_PARAMS, /* we have a socket */
+	C_CONNECTED,      /* we have introduced each other */
+	C_STARTING_SYNC_S,  /* starting full sync by admin request. */
+	C_STARTING_SYNC_T,  /* starting full sync by admin request. */
+	C_WF_BITMAP_S,
+	C_WF_BITMAP_T,
+	C_WF_SYNC_UUID,
+
+	/* All SyncStates are tested with this comparison
+	 * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */
+	C_SYNC_SOURCE,
+	C_SYNC_TARGET,
+	C_VERIFY_S,
+	C_VERIFY_T,
+	C_PAUSED_SYNC_S,
+	C_PAUSED_SYNC_T,
+
+	C_AHEAD,
+	C_BEHIND,
+
+	C_MASK = 31
+};
+
+enum drbd_disk_state {
+	D_DISKLESS,
+	D_ATTACHING,      /* In the process of reading the meta-data */
+	D_FAILED,         /* Becomes D_DISKLESS as soon as we have told the peer */
+			/* when >= D_FAILED it is legal to access mdev->bc */
+	D_NEGOTIATING,    /* Late attaching state, we need to talk to the peer */
+	D_INCONSISTENT,
+	D_OUTDATED,
+	D_UNKNOWN,       /* Only used for the peer, never for myself */
+	D_CONSISTENT,     /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */
+	D_UP_TO_DATE,       /* Only this disk state allows applications' IO ! */
+	D_MASK = 15
+};
+
+union drbd_state {
+/* According to gcc's docs, the order of allocation of bit-fields within
+ * a unit (C90 6.5.2.1, C99 6.7.2.1) is determined by the ABI, as pointed
+ * out by Maxim Uvarov <muvarov@ru.mvista.com>.
+ * So even though we transmit the state as "cpu_to_be32(state)",
+ * the offsets of the bitfields still need to be swapped
+ * on different endianness.
+ */
+	struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+		unsigned role:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned peer:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned conn:5 ;   /* 17/32	 cstates */
+		unsigned disk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned pdsk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned susp:1 ;   /* 2/2	 IO suspended no/yes (by user) */
+		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+		unsigned peer_isp:1 ;
+		unsigned user_isp:1 ;
+		unsigned susp_nod:1 ; /* IO suspended because no data */
+		unsigned susp_fen:1 ; /* IO suspended because fence peer handler runs*/
+		unsigned _pad:9;   /* 0	 unused */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+		unsigned _pad:9;
+		unsigned susp_fen:1 ;
+		unsigned susp_nod:1 ;
+		unsigned user_isp:1 ;
+		unsigned peer_isp:1 ;
+		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+		unsigned susp:1 ;   /* 2/2	 IO suspended  no/yes */
+		unsigned pdsk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned disk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned conn:5 ;   /* 17/32	 cstates */
+		unsigned peer:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned role:2 ;   /* 3/4	 primary/secondary/unknown */
+#else
+# error "this endianness is not supported"
+#endif
+#ifndef DRBD_DEBUG_STATE_CHANGES
+# ifdef CONFIG_DYNAMIC_DEBUG
+#  define DRBD_DEBUG_STATE_CHANGES 1
+# else
+#  define DRBD_DEBUG_STATE_CHANGES 0
+# endif
+#endif
+#if DRBD_DEBUG_STATE_CHANGES
+		unsigned int line;
+		const char *func;
+		unsigned long long seq;
+#endif
+	};
+	unsigned int i;
+};
+
+enum drbd_state_rv {
+	SS_CW_NO_NEED = 4,
+	SS_CW_SUCCESS = 3,
+	SS_NOTHING_TO_DO = 2,
+	SS_SUCCESS = 1,
+	SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */
+	SS_TWO_PRIMARIES = -1,
+	SS_NO_UP_TO_DATE_DISK = -2,
+	SS_NO_LOCAL_DISK = -4,
+	SS_NO_REMOTE_DISK = -5,
+	SS_CONNECTED_OUTDATES = -6,
+	SS_PRIMARY_NOP = -7,
+	SS_RESYNC_RUNNING = -8,
+	SS_ALREADY_STANDALONE = -9,
+	SS_CW_FAILED_BY_PEER = -10,
+	SS_IS_DISKLESS = -11,
+	SS_DEVICE_IN_USE = -12,
+	SS_NO_NET_CONFIG = -13,
+	SS_NO_VERIFY_ALG = -14,       /* drbd-8.2 only */
+	SS_NEED_CONNECTION = -15,    /* drbd-8.2 only */
+	SS_LOWER_THAN_OUTDATED = -16,
+	SS_NOT_SUPPORTED = -17,      /* drbd-8.2 only */
+	SS_IN_TRANSIENT_STATE = -18,  /* Retry after the next state change */
+	SS_CONCURRENT_ST_CHG = -19,   /* Concurrent cluster side state change! */
+	SS_AFTER_LAST_ERROR = -20,    /* Keep this at bottom */
+};
+
+/* from drbd_strings.c */
+extern const char *drbd_conn_str(enum drbd_conns);
+extern const char *drbd_role_str(enum drbd_role);
+extern const char *drbd_disk_str(enum drbd_disk_state);
+extern const char *drbd_set_st_err_str(enum drbd_state_rv);
+
+#define SHARED_SECRET_MAX 64
+
+#define MDF_CONSISTENT		(1 << 0)
+#define MDF_PRIMARY_IND		(1 << 1)
+#define MDF_CONNECTED_IND	(1 << 2)
+#define MDF_FULL_SYNC		(1 << 3)
+#define MDF_WAS_UP_TO_DATE	(1 << 4)
+#define MDF_PEER_OUT_DATED	(1 << 5)
+#define MDF_CRASHED_PRIMARY      (1 << 6)
+
+enum drbd_uuid_index {
+	UI_CURRENT,
+	UI_BITMAP,
+	UI_HISTORY_START,
+	UI_HISTORY_END,
+	UI_SIZE,      /* nl-packet: number of dirty bits */
+	UI_FLAGS,     /* nl-packet: flags */
+	UI_EXTENDED_SIZE   /* Everything. */
+};
+
+enum drbd_timeout_flag {
+	UT_DEFAULT      = 0,
+	UT_DEGRADED     = 1,
+	UT_PEER_OUTDATED = 2,
+};
+
+#define UUID_JUST_CREATED ((__u64)4)
+
+#define DRBD_MAGIC 0x83740267
+#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
+#define DRBD_MAGIC_BIG 0x835a
+#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG)
+
+/* these are of type "int" */
+#define DRBD_MD_INDEX_INTERNAL -1
+#define DRBD_MD_INDEX_FLEX_EXT -2
+#define DRBD_MD_INDEX_FLEX_INT -3
+
+/* Start of the new netlink/connector stuff */
+
+#define DRBD_NL_CREATE_DEVICE 0x01
+#define DRBD_NL_SET_DEFAULTS  0x02
+
+/* The following line should be moved over to linux/connector.h
+ * when the time comes */
+#ifndef CN_IDX_DRBD
+# define CN_IDX_DRBD			0x4
+/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */
+#endif
+#define CN_VAL_DRBD			0x1
+
+/* For searching a vacant cn_idx value */
+#define CN_IDX_STEP			6977
+
+struct drbd_nl_cfg_req {
+	int packet_type;
+	unsigned int drbd_minor;
+	int flags;
+	unsigned short tag_list[];
+};
+
+struct drbd_nl_cfg_reply {
+	int packet_type;
+	unsigned int minor;
+	/* FIXME: This is super ugly. */
+	int ret_code; /* enum drbd_ret_code or enum drbd_state_rv */
+	unsigned short tag_list[]; /* only used with get_* calls */
+};
+
+#endif
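
The union drbd_state above overlays bitfields on a single 32-bit word so the
whole cluster state travels as one integer. A toy userspace model of the idea
(union toy_state is hypothetical; the field widths mirror the little-endian
layout, and the exact bit placement is ABI-dependent - which is exactly why
the header carries both endianness variants):

#include <stdio.h>

union toy_state {
	struct {	/* little-endian bitfield layout assumed */
		unsigned role:2;
		unsigned peer:2;
		unsigned conn:5;
		unsigned disk:4;
		unsigned pdsk:4;
		unsigned rest:15;
	};
	unsigned int i;
};

int main(void)
{
	union toy_state s = { .i = 0 };

	s.role = 1;	/* R_PRIMARY */
	s.conn = 10;	/* C_CONNECTED, per the enum ordering above */
	printf("wire word: 0x%08x\n", s.i);
	return 0;
}
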
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/drbd_config.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd_config.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/drbd_config.h	2015-01-21 12:02:58.389823779 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd_config.h	2015-01-21 12:02:58.399823514 +0300
@@ -0,0 +1,173 @@
+/*
+  drbd_config.h
+  DRBD's compile time configuration.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#ifndef DRBD_CONFIG_H
+#define DRBD_CONFIG_H
+
+extern const char *drbd_buildtag(void);
+
+/* Necessary to build the external module against >= Linux-2.6.33 */
+#ifdef REL_VERSION
+#undef REL_VERSION
+#undef API_VERSION
+#undef PRO_VERSION_MIN
+#undef PRO_VERSION_MAX
+#endif
+
+/* End of external module for 2.6.33 stuff */
+
+#define REL_VERSION "8.3.13"
+#define API_VERSION 88
+#define PRO_VERSION_MIN 86
+#define PRO_VERSION_MAX 96
+
+#ifndef __CHECKER__   /* for a sparse run, we need all STATICs */
+#define DBG_ALL_SYMBOLS /* no static functs, improves quality of OOPS traces */
+#endif
+
+/* drbd_assert_breakpoint() function
+#define DBG_ASSERTS
+ */
+
+/* Dump all cstate changes */
+#define DUMP_MD 2
+
+/* some extra checks
+#define PARANOIA
+ */
+
+/* Enable fault insertion code */
+#define DRBD_ENABLE_FAULTS
+
+/* RedHat's 2.6.9 kernels have the gfp_t type. Mainline has this feature
+ * since 2.6.16. If you build for RedHat enable the line below. */
+#define KERNEL_HAS_GFP_T
+
+/* kernel.org has atomic_add_return since 2.6.10. some vendor kernels
+ * have it backported, though. Others don't. */
+//#define NEED_BACKPORT_OF_ATOMIC_ADD
+
+/* 2.6.something has deprecated kmem_cache_t;
+ * some older kernels still use it.
+ * Some have it defined as struct kmem_cache_s, some as struct kmem_cache. */
+//#define USE_KMEM_CACHE_S
+
+/* 2.6.something has sock_create_kern (SE-linux security context stuff)
+ * some older distribution kernels don't. */
+//#define DEFINE_SOCK_CREATE_KERN
+
+/* 2.6.24 and later have kernel_sock_shutdown.
+ * some older distribution kernels may also have a backport. */
+//#define DEFINE_KERNEL_SOCK_SHUTDOWN
+
+/* in older kernels (vanilla < 2.6.16) struct netlink_skb_parms has a
+ * member called dst_groups. Later it is called dst_group (without 's'). */
+//#define DRBD_NL_DST_GROUPS
+
+/* in older kernels (vanilla < 2.6.14) there is no kzalloc() */
+//#define NEED_BACKPORT_OF_KZALLOC
+
+// some vendor kernels have it, some don't
+//#define NEED_SG_SET_BUF
+#define HAVE_LINUX_SCATTERLIST_H
+
+/* 2.6.29 and up no longer have swabb.h */
+//#define HAVE_LINUX_BYTEORDER_SWABB_H
+
+/* some vendor kernel have it backported. */
+#define HAVE_SET_CPUS_ALLOWED_PTR
+
+/* Some vendor kernels < 2.6.7 might define msleep in one way or
+ * another. */
+
+#define KERNEL_HAS_MSLEEP
+
+/* Some other kernels < 2.6.8 do not have struct kvec,
+ * others do.. */
+
+#define KERNEL_HAS_KVEC
+
+/* Actually available since 2.6.26, but vendors have backported...
+ */
+#define KERNEL_HAS_PROC_CREATE_DATA
+
+/* In 2.6.32 we finally fixed connector to pass netlink_skb_parms to the callback
+ */
+#define KERNEL_HAS_CN_SKB_PARMS
+/* 2.6.39 converts connector to be synchronous, and removes .eff_cap from
+ * the parameters. We then need to test on current_cap() instead. */
+#define HAVE_NL_SKB_EFF_CAP
+
+/* In the 2.6.34 merge window blk_queue_max_sectors() became blk_queue_max_hw_sectors() and
+   blk_queue_max_(phys|hw)_segments() became blk_queue_max_segments()
+   See Linux commits: 086fa5ff0854c676ec333 8a78362c4eefc1deddbef */
+//#define NEED_BLK_QUEUE_MAX_HW_SECTORS
+//#define NEED_BLK_QUEUE_MAX_SEGMENTS
+
+/* For kernel versions 2.6.31 to 2.6.33 inclusive, even though
+ * blk_queue_max_hw_sectors is present, we actually need to use
+ * blk_queue_max_sectors to set max_hw_sectors. :-(
+ * RHEL6 2.6.32 chose to be different and already has eliminated
+ * blk_queue_max_sectors as upstream 2.6.34 did.
+ * I check it into the git repo as defined,
+ * because if someone does not run our compat adjust magic, it otherwise would
+ * silently compile broken code on affected kernel versions, which is worse
+ * than the compile error it may cause on more recent kernels.
+ */
+// #define USE_BLK_QUEUE_MAX_SECTORS_ANYWAYS
+
+/* For kernel versions > 2.6.38, open_bdev_excl has been replaced with
+ * blkdev_get_by_path. See e525fd89 and d4d77629 */
+//#define COMPAT_HAVE_BLKDEV_GET_BY_PATH
+
+/* before open_bdev_exclusive, there was a open_bdev_excl,
+ * see 30c40d2 */
+#define COMPAT_HAVE_OPEN_BDEV_EXCLUSIVE
+
+/* some old kernels do not have atomic_add_unless() */
+//#define NEED_ATOMIC_ADD_UNLESS
+
+/* some old kernels do not have the bool type */
+//#define NEED_BOOL_TYPE
+
+/* some older kernels do not have schedule_timeout_interruptible() */
+//#define NEED_SCHEDULE_TIMEOUT_INTERR
+
+/* Stone old kernels lack the fmode_t type */
+#define COMPAT_HAVE_FMODE_T
+
+/* In commit c4945b9e (v2.6.39-rc1), the little-endian bit ops got renamed */
+// #define COMPAT_HAVE_FIND_NEXT_ZERO_BIT_LE
+
+/* In ancient kernels (2.6.5) kref_put() only takes a kref as argument */
+//#define COMPAT_KREF_PUT_HAS_SINGLE_ARG
+
+/* In commit 5a7bbad27a410350e64a2d7f5ec18fc73836c14f (between Linux-3.1 and 3.2)
+   make_request() became type void. Before it had type int. */
+// #define COMPAT_HAVE_VOID_MAKE_REQUEST
+
+/* mempool_create_page_pool did not exist prior to 2.6.16 */
+#define COMPAT_HAVE_MEMPOOL_CREATE_PAGE_POOL
+
+/* bioset_create did change its signature a few times */
+#define COMPAT_HAVE_BIOSET_CREATE
+#define COMPAT_HAVE_BIOSET_CREATE_FRONT_PAD
+//#define COMPAT_BIOSET_CREATE_HAS_THREE_PARAMETERS
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/drbd_limits.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd_limits.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/drbd_limits.h	2015-01-21 12:02:58.389823779 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd_limits.h	2015-01-21 12:02:58.389823779 +0300
@@ -0,0 +1,173 @@
+/*
+  drbd_limits.h
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+*/
+
+/*
+ * Our current limitations.
+ * Some of them are hard limits,
+ * some of them are arbitrary range limits, that make it easier to provide
+ * feedback about nonsense settings for certain configurable values.
+ */
+
+#ifndef DRBD_LIMITS_H
+#define DRBD_LIMITS_H 1
+
+#define DEBUG_RANGE_CHECK 0
+
+#define DRBD_MINOR_COUNT_MIN 1
+#define DRBD_MINOR_COUNT_MAX 256
+#define DRBD_MINOR_COUNT_DEF 32
+
+#define DRBD_DIALOG_REFRESH_MIN 0
+#define DRBD_DIALOG_REFRESH_MAX 600
+
+/* valid port number */
+#define DRBD_PORT_MIN 1
+#define DRBD_PORT_MAX 0xffff
+
+/* startup { */
+  /* if you want more than 3.4 days, disable */
+#define DRBD_WFC_TIMEOUT_MIN 0
+#define DRBD_WFC_TIMEOUT_MAX 300000
+#define DRBD_WFC_TIMEOUT_DEF 0
+
+#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
+#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
+#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
+
+#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
+#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
+#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
+/* }*/
+
+/* net { */
+  /* timeout, unit: centiseconds;
+   * a timeout of more than one minute is not useful */
+#define DRBD_TIMEOUT_MIN 1
+#define DRBD_TIMEOUT_MAX 600
+#define DRBD_TIMEOUT_DEF 60       /* 6 seconds */
+
+ /* If backing disk takes longer than disk_timeout, mark the disk as failed */
+#define DRBD_DISK_TIMEOUT_MIN 0    /* 0 = disabled */
+#define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */
+#define DRBD_DISK_TIMEOUT_DEF 0    /* disabled */
+
+  /* active connection retries when C_WF_CONNECTION */
+#define DRBD_CONNECT_INT_MIN 1
+#define DRBD_CONNECT_INT_MAX 120
+#define DRBD_CONNECT_INT_DEF 10   /* seconds */
+
+  /* keep-alive probes when idle */
+#define DRBD_PING_INT_MIN 1
+#define DRBD_PING_INT_MAX 120
+#define DRBD_PING_INT_DEF 10
+
+ /* timeout for the ping packets.*/
+#define DRBD_PING_TIMEO_MIN  1
+#define DRBD_PING_TIMEO_MAX  300
+#define DRBD_PING_TIMEO_DEF  5
+
+  /* max number of write requests between write barriers */
+#define DRBD_MAX_EPOCH_SIZE_MIN 1
+#define DRBD_MAX_EPOCH_SIZE_MAX 20000
+#define DRBD_MAX_EPOCH_SIZE_DEF 2048
+
+  /* I don't think that a tcp send buffer of more than 10M is useful */
+#define DRBD_SNDBUF_SIZE_MIN  0
+#define DRBD_SNDBUF_SIZE_MAX  (10<<20)
+#define DRBD_SNDBUF_SIZE_DEF  0
+
+#define DRBD_RCVBUF_SIZE_MIN  0
+#define DRBD_RCVBUF_SIZE_MAX  (10<<20)
+#define DRBD_RCVBUF_SIZE_DEF  0
+
+  /* @4k PageSize -> 128kB - 512MB */
+#define DRBD_MAX_BUFFERS_MIN  32
+#define DRBD_MAX_BUFFERS_MAX  131072
+#define DRBD_MAX_BUFFERS_DEF  2048
+
+  /* @4k PageSize -> 4kB - 512MB */
+#define DRBD_UNPLUG_WATERMARK_MIN  1
+#define DRBD_UNPLUG_WATERMARK_MAX  131072
+#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
+
+  /* 0 is disabled.
+   * 200 should be more than enough even for very short timeouts */
+#define DRBD_KO_COUNT_MIN  0
+#define DRBD_KO_COUNT_MAX  200
+#define DRBD_KO_COUNT_DEF  0
+/* } */
+
+/* syncer { */
+  /* FIXME allow rate to be zero? */
+#define DRBD_RATE_MIN 1
+/* channel bonding 10 GbE, or other hardware */
+#define DRBD_RATE_MAX (4 << 20)
+#define DRBD_RATE_DEF 250  /* kb/second */
+
+  /* less than 7 would hurt performance unnecessarily.
+   * 3833 is the largest prime that still fits
+   * into 64 sectors of the activity log */
+#define DRBD_AL_EXTENTS_MIN  7
+#define DRBD_AL_EXTENTS_MAX  3833
+#define DRBD_AL_EXTENTS_DEF  127
+
+#define DRBD_AFTER_MIN  -1
+#define DRBD_AFTER_MAX  255
+#define DRBD_AFTER_DEF  -1
+
+/* } */
+
+/* drbdsetup XY resize -d Z
+ * you are free to reduce the device size to nothing, if you want to.
+ * the upper limit with 64bit kernel, enough ram and flexible meta data
+ * is 1 PiB, currently. */
+/* DRBD_MAX_SECTORS */
+#define DRBD_DISK_SIZE_SECT_MIN  0
+#define DRBD_DISK_SIZE_SECT_MAX  (1 * (2LLU << 40))
+#define DRBD_DISK_SIZE_SECT_DEF  0 /* = disabled = no user size... */
+
+#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
+#define DRBD_FENCING_DEF FP_DONT_CARE
+#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
+#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
+#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
+#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
+#define DRBD_ON_NO_DATA_DEF OND_IO_ERROR
+#define DRBD_ON_CONGESTION_DEF OC_BLOCK
+
+#define DRBD_MAX_BIO_BVECS_MIN 0
+#define DRBD_MAX_BIO_BVECS_MAX 128
+#define DRBD_MAX_BIO_BVECS_DEF 0
+
+#define DRBD_C_PLAN_AHEAD_MIN  0
+#define DRBD_C_PLAN_AHEAD_MAX  300
+#define DRBD_C_PLAN_AHEAD_DEF  0 /* RS rate controller disabled by default */
+
+#define DRBD_C_DELAY_TARGET_MIN 1
+#define DRBD_C_DELAY_TARGET_MAX 100
+#define DRBD_C_DELAY_TARGET_DEF 10
+
+#define DRBD_C_FILL_TARGET_MIN 0
+#define DRBD_C_FILL_TARGET_MAX (1<<20) /* ~500MByte in sectors */
+#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */
+
+#define DRBD_C_MAX_RATE_MIN     250 /* kByte/sec */
+#define DRBD_C_MAX_RATE_MAX     (4 << 20)
+#define DRBD_C_MAX_RATE_DEF     102400
+
+#define DRBD_C_MIN_RATE_MIN     0 /* kByte/sec */
+#define DRBD_C_MIN_RATE_MAX     (4 << 20)
+#define DRBD_C_MIN_RATE_DEF     4096
+
+#define DRBD_CONG_FILL_MIN	0
+#define DRBD_CONG_FILL_MAX	(10<<21) /* 10GByte in sectors */
+#define DRBD_CONG_FILL_DEF	0
+
+#define DRBD_CONG_EXTENTS_MIN	DRBD_AL_EXTENTS_MIN
+#define DRBD_CONG_EXTENTS_MAX	DRBD_AL_EXTENTS_MAX
+#define DRBD_CONG_EXTENTS_DEF	DRBD_AL_EXTENTS_DEF
+
+#undef RANGE
+#endif
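
The MIN/MAX/DEF triples above exist so that configuration code can give
feedback about nonsense settings. A hedged sketch of the typical consumption
pattern (check_range and the duplicated constants are illustrative, not the
actual drbdsetup logic):

#include <stdio.h>

#define TIMEOUT_MIN 1	/* mirrors DRBD_TIMEOUT_MIN above */
#define TIMEOUT_MAX 600	/* mirrors DRBD_TIMEOUT_MAX above */
#define TIMEOUT_DEF 60	/* mirrors DRBD_TIMEOUT_DEF above */

static int check_range(const char *name, int val, int min, int max, int def)
{
	if (val < min || val > max) {
		fprintf(stderr, "%s=%d out of [%d..%d], using default %d\n",
			name, val, min, max, def);
		return def;
	}
	return val;
}

int main(void)
{
	int t = check_range("timeout", 6000, TIMEOUT_MIN, TIMEOUT_MAX,
			    TIMEOUT_DEF);

	printf("timeout = %d centiseconds\n", t);
	return 0;
}
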
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/drbd_nl.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd_nl.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/drbd_nl.h	2015-01-21 12:02:58.390823752 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd_nl.h	2015-01-21 12:02:58.390823752 +0300
@@ -0,0 +1,162 @@
+/*
+   PAKET( name,
+	  TYPE ( pn, pr, member )
+	  ...
+   )
+
+   You may never reissue one of the pn arguments
+*/
+
+#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
+#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
+#endif
+
+NL_PACKET(primary, 1,
+       NL_BIT(		1,	T_MAY_IGNORE,	primary_force)
+)
+
+NL_PACKET(secondary, 2, )
+
+NL_PACKET(disk_conf, 3,
+	NL_INT64(	2,	T_MAY_IGNORE,	disk_size)
+	NL_STRING(	3,	T_MANDATORY,	backing_dev,	128)
+	NL_STRING(	4,	T_MANDATORY,	meta_dev,	128)
+	NL_INTEGER(	5,	T_MANDATORY,	meta_dev_idx)
+	NL_INTEGER(	6,	T_MAY_IGNORE,	on_io_error)
+	NL_INTEGER(	7,	T_MAY_IGNORE,	fencing)
+	NL_BIT(		37,	T_MAY_IGNORE,	use_bmbv)
+	NL_BIT(		53,	T_MAY_IGNORE,	no_disk_flush)
+	NL_BIT(		54,	T_MAY_IGNORE,	no_md_flush)
+	  /*  55 max_bio_size was available in 8.2.6rc2 */
+	NL_INTEGER(	56,	T_MAY_IGNORE,	max_bio_bvecs)
+	NL_BIT(		57,	T_MAY_IGNORE,	no_disk_barrier)
+	NL_BIT(		58,	T_MAY_IGNORE,	no_disk_drain)
+	NL_INTEGER(	89,	T_MAY_IGNORE,	disk_timeout)
+)
+
+NL_PACKET(detach, 4,
+	NL_BIT(		88,	T_MANDATORY,	detach_force)
+)
+
+NL_PACKET(net_conf, 5,
+	NL_STRING(	8,	T_MANDATORY,	my_addr,	128)
+	NL_STRING(	9,	T_MANDATORY,	peer_addr,	128)
+	NL_STRING(	10,	T_MAY_IGNORE,	shared_secret,	SHARED_SECRET_MAX)
+	NL_STRING(	11,	T_MAY_IGNORE,	cram_hmac_alg,	SHARED_SECRET_MAX)
+	NL_STRING(	44,	T_MAY_IGNORE,	integrity_alg,	SHARED_SECRET_MAX)
+	NL_INTEGER(	14,	T_MAY_IGNORE,	timeout)
+	NL_INTEGER(	15,	T_MANDATORY,	wire_protocol)
+	NL_INTEGER(	16,	T_MAY_IGNORE,	try_connect_int)
+	NL_INTEGER(	17,	T_MAY_IGNORE,	ping_int)
+	NL_INTEGER(	18,	T_MAY_IGNORE,	max_epoch_size)
+	NL_INTEGER(	19,	T_MAY_IGNORE,	max_buffers)
+	NL_INTEGER(	20,	T_MAY_IGNORE,	unplug_watermark)
+	NL_INTEGER(	21,	T_MAY_IGNORE,	sndbuf_size)
+	NL_INTEGER(	22,	T_MAY_IGNORE,	ko_count)
+	NL_INTEGER(	24,	T_MAY_IGNORE,	after_sb_0p)
+	NL_INTEGER(	25,	T_MAY_IGNORE,	after_sb_1p)
+	NL_INTEGER(	26,	T_MAY_IGNORE,	after_sb_2p)
+	NL_INTEGER(	39,	T_MAY_IGNORE,	rr_conflict)
+	NL_INTEGER(	40,	T_MAY_IGNORE,	ping_timeo)
+	NL_INTEGER(	67,	T_MAY_IGNORE,	rcvbuf_size)
+	NL_INTEGER(	81,	T_MAY_IGNORE,	on_congestion)
+	NL_INTEGER(	82,	T_MAY_IGNORE,	cong_fill)
+	NL_INTEGER(	83,	T_MAY_IGNORE,	cong_extents)
+	  /* 59 addr_family was available in GIT, never released */
+	NL_BIT(		60,	T_MANDATORY,	mind_af)
+	NL_BIT(		27,	T_MAY_IGNORE,	want_lose)
+	NL_BIT(		28,	T_MAY_IGNORE,	two_primaries)
+	NL_BIT(		41,	T_MAY_IGNORE,	always_asbp)
+	NL_BIT(		61,	T_MAY_IGNORE,	no_cork)
+	NL_BIT(		62,	T_MANDATORY,	auto_sndbuf_size)
+	NL_BIT(		70,	T_MANDATORY,	dry_run)
+)
+
+NL_PACKET(disconnect, 6,
+	NL_BIT(		84,	T_MAY_IGNORE,	force)
+)
+
+NL_PACKET(resize, 7,
+	NL_INT64(		29,	T_MAY_IGNORE,	resize_size)
+	NL_BIT(			68,	T_MAY_IGNORE,	resize_force)
+	NL_BIT(			69,	T_MANDATORY,	no_resync)
+)
+
+NL_PACKET(syncer_conf, 8,
+	NL_INTEGER(	30,	T_MAY_IGNORE,	rate)
+	NL_INTEGER(	31,	T_MAY_IGNORE,	after)
+	NL_INTEGER(	32,	T_MAY_IGNORE,	al_extents)
+	  /* NL_INTEGER(     71,	T_MAY_IGNORE,	dp_volume) */
+	  /* NL_INTEGER(     72,	T_MAY_IGNORE,	dp_interval) */
+	  /* NL_INTEGER(     73,	T_MAY_IGNORE,	throttle_th) removed */
+	  /* NL_INTEGER(     74,	T_MAY_IGNORE,	hold_off_th) removed */
+	NL_STRING(      52,     T_MAY_IGNORE,   verify_alg,     SHARED_SECRET_MAX)
+	NL_STRING(      51,     T_MAY_IGNORE,   cpu_mask,       32)
+	NL_STRING(	64,	T_MAY_IGNORE,	csums_alg,	SHARED_SECRET_MAX)
+	NL_BIT(         65,     T_MAY_IGNORE,   use_rle)
+	NL_INTEGER(	75,	T_MAY_IGNORE,	on_no_data)
+	NL_INTEGER(	76,	T_MAY_IGNORE,	c_plan_ahead)
+	NL_INTEGER(     77,	T_MAY_IGNORE,	c_delay_target)
+	NL_INTEGER(     78,	T_MAY_IGNORE,	c_fill_target)
+	NL_INTEGER(     79,	T_MAY_IGNORE,	c_max_rate)
+	NL_INTEGER(     80,	T_MAY_IGNORE,	c_min_rate)
+)
+
+NL_PACKET(invalidate, 9, )
+NL_PACKET(invalidate_peer, 10, )
+NL_PACKET(pause_sync, 11, )
+NL_PACKET(resume_sync, 12, )
+NL_PACKET(suspend_io, 13, )
+NL_PACKET(resume_io, 14, )
+NL_PACKET(outdate, 15, )
+NL_PACKET(get_config, 16, )
+NL_PACKET(get_state, 17,
+	NL_INTEGER(	33,	T_MAY_IGNORE,	state_i)
+)
+
+NL_PACKET(get_uuids, 18,
+	NL_STRING(	34,	T_MAY_IGNORE,	uuids,	(UI_SIZE*sizeof(__u64)))
+	NL_INTEGER(	35,	T_MAY_IGNORE,	uuids_flags)
+)
+
+NL_PACKET(get_timeout_flag, 19,
+	NL_BIT(		36,	T_MAY_IGNORE,	use_degraded)
+)
+
+NL_PACKET(call_helper, 20,
+	NL_STRING(	38,	T_MAY_IGNORE,	helper,		32)
+)
+
+/* Tag nr 42 already allocated in drbd-8.1 development. */
+
+NL_PACKET(sync_progress, 23,
+	NL_INTEGER(	43,	T_MAY_IGNORE,	sync_progress)
+)
+
+NL_PACKET(dump_ee, 24,
+	NL_STRING(	45,	T_MAY_IGNORE,	dump_ee_reason, 32)
+	NL_STRING(	46,	T_MAY_IGNORE,	seen_digest, SHARED_SECRET_MAX)
+	NL_STRING(	47,	T_MAY_IGNORE,	calc_digest, SHARED_SECRET_MAX)
+	NL_INT64(	48,	T_MAY_IGNORE,	ee_sector)
+	NL_INT64(	49,	T_MAY_IGNORE,	ee_block_id)
+	NL_STRING(	50,	T_MAY_IGNORE,	ee_data,	32 << 10)
+)
+
+NL_PACKET(start_ov, 25,
+	NL_INT64(	66,	T_MAY_IGNORE,	start_sector)
+)
+
+NL_PACKET(new_c_uuid, 26,
+       NL_BIT(		63,	T_MANDATORY,	clear_bm)
+)
+
+#ifdef NL_RESPONSE
+NL_RESPONSE(return_code_only, 27)
+#endif
+
+#undef NL_PACKET
+#undef NL_INTEGER
+#undef NL_INT64
+#undef NL_BIT
+#undef NL_STRING
+#undef NL_RESPONSE
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/drbd_tag_magic.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd_tag_magic.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/drbd_tag_magic.h	2015-01-21 12:02:58.390823752 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/drbd_tag_magic.h	2015-01-21 12:02:58.390823752 +0300
@@ -0,0 +1,84 @@
+#ifndef DRBD_TAG_MAGIC_H
+#define DRBD_TAG_MAGIC_H
+
+#define TT_END     0
+#define TT_REMOVED 0xE000
+
+/* declare packet_type enums */
+enum packet_types {
+#define NL_PACKET(name, number, fields) P_ ## name = number,
+#define NL_RESPONSE(name, number) P_ ## name = number,
+#define NL_INTEGER(pn, pr, member)
+#define NL_INT64(pn, pr, member)
+#define NL_BIT(pn, pr, member)
+#define NL_STRING(pn, pr, member, len)
+#include "drbd_nl.h"
+	P_nl_after_last_packet,
+};
+
+/* These structs are used to deduce the size of the tag lists: */
+#define NL_PACKET(name, number, fields)	\
+	struct name ## _tag_len_struct { fields };
+#define NL_INTEGER(pn, pr, member)		\
+	int member; int tag_and_len ## member;
+#define NL_INT64(pn, pr, member)		\
+	__u64 member; int tag_and_len ## member;
+#define NL_BIT(pn, pr, member)		\
+	unsigned char member:1; int tag_and_len ## member;
+#define NL_STRING(pn, pr, member, len)	\
+	unsigned char member[len]; int member ## _len; \
+	int tag_and_len ## member;
+#include "linux/drbd_nl.h"
+
+/* declare tag-list-sizes */
+static const int tag_list_sizes[] = {
+#define NL_PACKET(name, number, fields) 2 fields ,
+#define NL_INTEGER(pn, pr, member)      + 4 + 4
+#define NL_INT64(pn, pr, member)        + 4 + 8
+#define NL_BIT(pn, pr, member)          + 4 + 1
+#define NL_STRING(pn, pr, member, len)  + 4 + (len)
+#include "drbd_nl.h"
+};
+
+/* The two highest bits are used for the tag type */
+#define TT_MASK      0xC000
+#define TT_INTEGER   0x0000
+#define TT_INT64     0x4000
+#define TT_BIT       0x8000
+#define TT_STRING    0xC000
+/* The next bit indicates if processing of the tag is mandatory */
+#define T_MANDATORY  0x2000
+#define T_MAY_IGNORE 0x0000
+#define TN_MASK      0x1fff
+/* The remaining 13 bits are used to enumerate the tags */
+
+#define tag_type(T)   ((T) & TT_MASK)
+#define tag_number(T) ((T) & TN_MASK)
+
+/* declare tag enums */
+#define NL_PACKET(name, number, fields) fields
+enum drbd_tags {
+#define NL_INTEGER(pn, pr, member)     T_ ## member = pn | TT_INTEGER | pr ,
+#define NL_INT64(pn, pr, member)       T_ ## member = pn | TT_INT64   | pr ,
+#define NL_BIT(pn, pr, member)         T_ ## member = pn | TT_BIT     | pr ,
+#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING  | pr ,
+#include "drbd_nl.h"
+};
+
+struct tag {
+	const char *name;
+	int type_n_flags;
+	int max_len;
+};
+
+/* declare tag names */
+#define NL_PACKET(name, number, fields) fields
+static const struct tag tag_descriptions[] = {
+#define NL_INTEGER(pn, pr, member)     [ pn ] = { #member, TT_INTEGER | pr, sizeof(int)   },
+#define NL_INT64(pn, pr, member)       [ pn ] = { #member, TT_INT64   | pr, sizeof(__u64) },
+#define NL_BIT(pn, pr, member)         [ pn ] = { #member, TT_BIT     | pr, sizeof(int)   },
+#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING  | pr, (len)         },
+#include "drbd_nl.h"
+};
+
+#endif
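
drbd_nl.h and drbd_tag_magic.h together form a classic X-macro construction:
the same field list is expanded several times - into packet enums, tag-length
structs, a size table and a name table - by redefining the NL_* macros before
each #include. A self-contained demo of the technique (the field names are
hypothetical, and it parameterizes the list instead of re-including a header,
but the expansion principle is the same):

#include <stdio.h>

#define FIELDS(X)	\
	X(timeout)	\
	X(max_buffers)	\
	X(ping_int)

/* expansion 1: an enum of tags */
#define AS_ENUM(name) T_ ## name,
enum tags { FIELDS(AS_ENUM) T_after_last };
#undef AS_ENUM

/* expansion 2: a name table indexed by the enum */
#define AS_NAME(name) #name,
static const char *tag_names[] = { FIELDS(AS_NAME) };
#undef AS_NAME

int main(void)
{
	int i;

	for (i = 0; i < T_after_last; i++)
		printf("%d -> %s\n", i, tag_names[i]);
	return 0;
}
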
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/elf.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/elf.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/elf.h	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/elf.h	2015-01-21 12:02:48.001099552 +0300
@@ -444,5 +444,6 @@ static inline int elf_coredump_extra_not
 extern int elf_coredump_extra_notes_size(void);
 extern int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset);
 #endif
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_ELF_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/eventfd.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/eventfd.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/eventfd.h	2014-12-12 23:28:52.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/eventfd.h	2015-01-21 12:02:48.123096314 +0300
@@ -11,6 +11,7 @@
 #include <linux/fcntl.h>
 #include <linux/file.h>
 #include <linux/wait.h>
+#include <linux/kref.h>
 
 /*
  * CAREFUL: Check include/asm-generic/fcntl.h when defining
@@ -26,6 +27,23 @@
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
+struct eventfd_ctx {
+	struct kref kref;
+	wait_queue_head_t wqh;
+	/*
+	 * Every time that a write(2) is performed on an eventfd, the
+	 * value of the __u64 being written is added to "count" and a
+	 * wakeup is performed on "wqh". A read(2) will return the "count"
+	 * value to userspace, and will reset "count" to zero. The
+	 * kernel-side eventfd_signal() also adds to the "count" counter
+	 * and issues a wakeup.
+	 */
+	__u64 count;
+	unsigned int flags;
+};
+
+extern const struct file_operations eventfd_fops;
+
 #ifdef CONFIG_EVENTFD
 
 struct file *eventfd_file_create(unsigned int count, int flags);
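
The counter semantics documented in struct eventfd_ctx above are easiest to
see from userspace: every write(2) adds its 64-bit value to "count", and a
read(2) (without EFD_SEMAPHORE) returns the accumulated value and resets it
to zero. For example:

#include <sys/eventfd.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t v;
	int fd = eventfd(0, 0);

	if (fd < 0)
		return 1;
	v = 3;
	if (write(fd, &v, sizeof(v)) != sizeof(v))
		return 1;
	v = 4;
	if (write(fd, &v, sizeof(v)) != sizeof(v))
		return 1;
	if (read(fd, &v, sizeof(v)) != sizeof(v))
		return 1;
	printf("count = %llu\n", (unsigned long long)v);	/* prints 7 */
	close(fd);
	return 0;
}
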
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/eventpoll.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/eventpoll.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/eventpoll.h	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/eventpoll.h	2015-01-21 12:02:57.966835006 +0300
@@ -17,6 +17,7 @@
 /* For O_CLOEXEC */
 #include <linux/fcntl.h>
 #include <linux/types.h>
+#include <linux/fs.h>
 
 /* Flags for epoll_create1.  */
 #define EPOLL_CLOEXEC O_CLOEXEC
@@ -63,6 +64,107 @@ static inline void eventpoll_init_file(s
 	INIT_LIST_HEAD(&file->f_ep_links);
 }
 
+struct epoll_filefd {
+	struct file *file;
+	int fd;
+	int added;
+};
+
+/*
+ * This structure is stored inside the "private_data" member of the file
+ * structure and represents the main data structure for the eventpoll
+ * interface.
+ */
+struct eventpoll {
+	/* Protects access to this structure */
+	spinlock_t lock;
+
+	/*
+	 * This mutex is used to ensure that files are not removed
+	 * while epoll is using them. This is held during the event
+	 * collection loop, the file cleanup path, the epoll file exit
+	 * code and the ctl operations.
+	 */
+	struct mutex mtx;
+
+	/* Wait queue used by sys_epoll_wait() */
+	wait_queue_head_t wq;
+
+	/* Wait queue used by file->poll() */
+	wait_queue_head_t poll_wait;
+
+	/* List of ready file descriptors */
+	struct list_head rdllist;
+
+	/* RB tree root used to store monitored fd structs */
+	struct rb_root rbr;
+
+	/*
+	 * This is a singly-linked list that chains all the "struct epitem"
+	 * that got ready while transferring ready events to userspace
+	 * without holding ->lock.
+	 */
+	struct epitem *ovflist;
+
+	/* The user that created the eventpoll descriptor */
+	struct user_struct *user;
+
+	struct file *file;
+
+	/* used to optimize loop detection check */
+	int visited;
+	struct list_head visited_list_link;
+};
+
+/*
+ * Each file descriptor added to the eventpoll interface will
+ * have an entry of this type linked to the "rbr" RB tree.
+ */
+struct epitem {
+	union {
+		/* RB tree node links this structure to the eventpoll RB tree */
+		struct rb_node rbn;
+		/* Used to free the struct epitem */
+		struct rcu_head rcu;
+	};
+
+	/* List header used to link this structure to the eventpoll ready list */
+	struct list_head rdllink;
+
+	/*
+	 * Works together with "struct eventpoll"->ovflist in keeping the
+	 * singly-linked chain of items.
+	 */
+	struct epitem *next;
+
+	/* The file descriptor information this item refers to */
+	struct epoll_filefd ffd;
+
+	/* Number of active wait queues attached to poll operations */
+	int nwait;
+
+	/* List containing poll wait queues */
+	struct list_head pwqlist;
+
+	/* The "container" of this item */
+	struct eventpoll *ep;
+
+	/* List header used to link this item to the "struct file" items list */
+	struct list_head fllink;
+
+	/* The structure that describes the interested events and the source fd */
+	struct epoll_event event;
+
+	/* The user that created the eventpoll descriptor */
+	struct user_struct *user;
+};
+
+extern struct semaphore epsem;
+extern const struct file_operations eventpoll_fops;
+extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
+extern int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+		     struct file *tfile, int fd, int full_check);
+extern void clear_tfile_check_list(void);
 
 /* Used to release the epoll bits inside the "struct file" */
 void eventpoll_release_file(struct file *file);
@@ -95,6 +197,8 @@ static inline void eventpoll_release(str
 	eventpoll_release_file(file);
 }
 
+extern struct mutex epmutex;
+
 #else
 
 static inline void eventpoll_init_file(struct file *file) {}
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/exportfs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/exportfs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/exportfs.h	2014-12-12 23:28:55.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/exportfs.h	2015-01-21 12:02:52.023992756 +0300
@@ -8,6 +8,9 @@ struct inode;
 struct super_block;
 struct vfsmount;
 
+/* limit the handle size to NFSv4 handle size now */
+#define MAX_HANDLE_SZ 128
+
 /*
  * The fileid_type identifies how the file within the filesystem is encoded.
  * In theory this is freely set and parsed by the filesystem, but we try to
@@ -108,8 +111,10 @@ struct fid {
  *    set, the encode_fh() should store sufficient information so that a good
 *    attempt can be made to find not only the file but also its place in the
  *    filesystem.   This typically means storing a reference to de->d_parent in
- *    the filehandle fragment.  encode_fh() should return the number of bytes
- *    stored or a negative error code such as %-ENOSPC
+ *    the filehandle fragment.  encode_fh() should return the fileid_type on
+ *    success, or 255 on error (i.e. if the space needed to encode the fh is
+ *    greater than @max_len*4 bytes). On error, @max_len contains the minimum
+ *    size (in 4-byte units) needed to encode the file handle.
  *
  * fh_to_dentry:
  *    @fh_to_dentry is given a &struct super_block (@sb) and a file handle
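
A minimal sketch of the encode_fh() contract just described (toy_encode_fh,
TOY_FID_TYPE and the two-word handle layout are hypothetical, not a real
filesystem's export_operations):

#include <stdio.h>

#define TOY_FID_TYPE	1	/* hypothetical fileid_type value */
#define TOY_FH_WORDS	2	/* ino + generation, one word each */

/* Return the fileid type on success; if the caller's buffer is too small,
 * report the required size (in 4-byte words) via *max_len and return 255. */
static int toy_encode_fh(unsigned int *fh, int *max_len,
			 unsigned int ino, unsigned int gen)
{
	if (*max_len < TOY_FH_WORDS) {
		*max_len = TOY_FH_WORDS;
		return 255;
	}
	fh[0] = ino;
	fh[1] = gen;
	*max_len = TOY_FH_WORDS;
	return TOY_FID_TYPE;
}

int main(void)
{
	unsigned int fh[4];
	int len = 1;	/* deliberately too small */

	if (toy_encode_fh(fh, &len, 42, 7) == 255)
		printf("handle needs %d words\n", len);
	len = 4;
	printf("fileid type %d\n", toy_encode_fh(fh, &len, 42, 7));
	return 0;
}
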
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/fadvise.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fadvise.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/fadvise.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fadvise.h	2015-01-21 12:02:42.170254356 +0300
@@ -18,4 +18,7 @@
 #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
 #endif
 
+#ifdef __KERNEL__
+extern int generic_fadvise(struct file* file, loff_t off, loff_t len, int adv);
+#endif
 #endif	/* FADVISE_H_INCLUDED */
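
The generic_fadvise() prototype added above is the kernel-side worker behind
these POSIX_FADV_* advice values. From userspace the same advice is issued
with posix_fadvise(2); for example, dropping a file's page-cache pages:

#define _XOPEN_SOURCE 600	/* for posix_fadvise() */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd, err;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	/* len == 0 means "to the end of the file" */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
	if (err)
		fprintf(stderr, "posix_fadvise: error %d\n", err);
	close(fd);
	return 0;
}
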
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/fairsched.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fairsched.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/fairsched.h	2015-01-21 12:02:53.654949462 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fairsched.h	2015-01-21 12:02:54.149936322 +0300
@@ -0,0 +1,79 @@
+/*
+ * Fair Scheduler
+ *
+ * Copyright (C) 2000-2008  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __LINUX_FAIRSCHED_H__
+#define __LINUX_FAIRSCHED_H__
+
+#define FAIRSCHED_SET_RATE	0
+#define FAIRSCHED_DROP_RATE	1
+#define FAIRSCHED_GET_RATE	2
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_VZ_FAIRSCHED
+
+#define FSCHWEIGHT_MAX		((1 << 16) - 1)
+#define FSCHRATE_SHIFT		10
+#define FSCH_TIMESLICE		16
+
+/******************************************************************************
+ * cfs group shares = FSCHWEIGHT_BASE / fairsched weight
+ *
+ * vzctl cpuunits default 1000
+ * cfs shares default value is 1024 (see init_task_group_load in sched.c)
+ * cpuunits = 1000 --> weight = 500000 / cpuunits = 500 --> shares = 1024
+ *				^--- from vzctl
+ * weight in 1..65535  -->  shares in 7..512000
+ * shares should be >1 (see comment in sched_group_set_shares function)
+ *****************************************************************************/
+
+#define FSCHWEIGHT_BASE		512000UL
+
+asmlinkage long sys_fairsched_mknod(unsigned int parent, unsigned int weight,
+				   unsigned int newid);
+asmlinkage long sys_fairsched_rmnod(unsigned int id);
+asmlinkage long sys_fairsched_mvpr(pid_t pid, unsigned int id);
+asmlinkage long sys_fairsched_vcpus(unsigned int id, unsigned int vcpus);
+asmlinkage long sys_fairsched_chwt(unsigned int id, unsigned int weight);
+asmlinkage long sys_fairsched_rate(unsigned int id, int op, unsigned int rate);
+asmlinkage long sys_fairsched_cpumask(unsigned int id, unsigned int len,
+				      unsigned long __user *user_mask_ptr);
+asmlinkage long sys_fairsched_nodemask(unsigned int id, unsigned int len,
+				       unsigned long __user *user_mask_ptr);
+
+int fairsched_new_node(int id, unsigned int vcpus);
+int fairsched_move_task(int id, struct task_struct *tsk);
+void fairsched_drop_node(int id, int leave);
+
+struct kernel_cpustat;
+void cpu_cgroup_get_stat(struct cgroup *cgrp, struct kernel_cpustat *kstat);
+int fairsched_get_cpu_stat(int id, struct kernel_cpustat *kstat);
+
+int cpu_cgroup_get_avenrun(struct cgroup *cgrp, unsigned long *avenrun);
+int fairsched_get_cpu_avenrun(int id, unsigned long *avenrun);
+
+struct cftype;
+int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
+				struct seq_file *p);
+int fairsched_show_stat(struct seq_file *p, int id);
+
+#else /* CONFIG_VZ_FAIRSCHED */
+
+static inline int fairsched_new_node(int id, unsigned int vcpus) { return 0; }
+static inline int fairsched_move_task(int id, struct task_struct *tsk) { return 0; }
+static inline void fairsched_drop_node(int id, int leave) { }
+static inline int fairsched_show_stat(struct seq_file *p, int id) { return -ENOSYS; }
+static inline int fairsched_get_cpu_avenrun(int id, unsigned long *avenrun) { return -ENOSYS; }
+static inline int fairsched_get_cpu_stat(int id, struct kernel_cpustat *kstat) { return -ENOSYS; }
+
+#endif /* CONFIG_VZ_FAIRSCHED */
+#endif /* __KERNEL__ */
+
+#endif /* __LINUX_FAIRSCHED_H__ */
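
The shares arithmetic from the fairsched comment block above, worked as a
standalone calculation (userspace illustration only; 500000 is the vzctl-side
constant quoted in that comment):

#include <stdio.h>

#define FSCHWEIGHT_BASE 512000UL	/* as defined above */

int main(void)
{
	unsigned long cpuunits[] = { 250, 1000, 4000 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long weight = 500000UL / cpuunits[i];	/* vzctl side */
		unsigned long shares = FSCHWEIGHT_BASE / weight;

		/* cpuunits=1000 reproduces the default: weight=500, shares=1024 */
		printf("cpuunits=%lu -> weight=%lu -> shares=%lu\n",
		       cpuunits[i], weight, shares);
	}
	return 0;
}
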
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/falloc.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/falloc.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/falloc.h	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/falloc.h	2015-01-21 12:02:43.106229505 +0300
@@ -4,6 +4,9 @@
 #define FALLOC_FL_KEEP_SIZE	0x01 /* default is extend size */
 #define FALLOC_FL_PUNCH_HOLE	0x02 /* de-allocates range */
 
+#define FALLOC_FL_CONVERT_AND_EXTEND 0x100 /* mark extents as initialized
+					    * and extend i_size */
+
 #ifdef __KERNEL__
 
 /*
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/fcntl.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fcntl.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/fcntl.h	2014-12-12 23:29:10.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fcntl.h	2015-01-21 12:02:52.023992756 +0300
@@ -40,6 +40,7 @@
                                            unlinking file.  */
 #define AT_SYMLINK_FOLLOW	0x400   /* Follow symbolic links.  */
 #define AT_NO_AUTOMOUNT		0x800	/* Suppress terminal automount traversal */
+#define AT_EMPTY_PATH		0x1000	/* Allow empty relative pathname */
 
 #ifdef __KERNEL__
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/fdtable.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fdtable.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/fdtable.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fdtable.h	2015-01-21 12:02:57.967834979 +0300
@@ -64,6 +64,7 @@ struct vfsmount;
 struct dentry;
 
 extern int expand_files(struct files_struct *, int nr);
+extern int expand_fdtable(struct files_struct *files, int nr);
 extern void free_fdtable_rcu(struct rcu_head *rcu);
 extern void __init files_defer_init(void);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/fence-watchdog.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fence-watchdog.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/fence-watchdog.h	2015-01-21 12:02:58.505820701 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fence-watchdog.h	2015-01-21 12:02:58.548819559 +0300
@@ -0,0 +1,7 @@
+#ifndef _LINUX_FENCE_WATCHDOG_H_
+#define _LINUX_FENCE_WATCHDOG_H_
+
+inline int fence_wdog_check_timer(void);
+bool fence_wdog_tmo_match(void);
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/file.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/file.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/file.h	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/file.h	2015-01-21 12:02:50.194041336 +0300
@@ -28,6 +28,7 @@ static inline void fput_light(struct fil
 		fput(file);
 }
 
+extern struct file *get_empty_filp(void);
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_light(unsigned int fd, int *fput_needed);
 extern void set_close_on_exec(unsigned int fd, int flag);
@@ -39,4 +40,7 @@ extern void put_unused_fd(unsigned int f
 
 extern void fd_install(unsigned int fd, struct file *file);
 
+struct file *get_task_file(pid_t pid, int fd);
+extern struct kmem_cache *filp_cachep;
+
 #endif /* __LINUX_FILE_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/freezer.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/freezer.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/freezer.h	2014-12-12 23:29:11.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/freezer.h	2015-01-21 12:02:48.066097826 +0300
@@ -63,9 +63,20 @@ static inline int try_to_freeze(void)
 extern bool freeze_task(struct task_struct *p, bool sig_only);
 extern void cancel_freezing(struct task_struct *p);
 
+enum freezer_state {
+	CGROUP_THAWED = 0,
+	CGROUP_FREEZING,
+	CGROUP_FROZEN,
+};
+
 #ifdef CONFIG_CGROUP_FREEZER
+extern int freezer_change_state(struct cgroup *, enum freezer_state);
 extern int cgroup_freezing_or_frozen(struct task_struct *task);
 #else /* !CONFIG_CGROUP_FREEZER */
+static inline int freezer_change_state(struct cgroup *c, enum freezer_state s)
+{
+	return -ENOSYS;
+}
 static inline int cgroup_freezing_or_frozen(struct task_struct *task)
 {
 	return 0;
@@ -139,7 +150,7 @@ static inline void set_freezable_with_si
  * if it ends up racing with the freezer. Callers must be able to deal with
  * spurious wakeups.
  */
-#define freezable_schedule()						\
+#define __freezable_schedule()						\
 ({									\
 	freezer_do_not_count();						\
 	if (!try_to_freeze())						\
@@ -228,7 +239,7 @@ static inline int freezer_should_skip(st
 static inline void set_freezable(void) {}
 static inline void set_freezable_with_signal(void) {}
 
-#define freezable_schedule()  schedule()
+#define __freezable_schedule()  schedule()
 
 #define freezable_schedule_timeout_killable(timeout)			\
 	schedule_timeout_killable(timeout)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/fs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/fs.h	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fs.h	2015-01-21 12:02:58.723814914 +0300
@@ -62,6 +62,8 @@ struct inodes_stat_t {
 #define MAY_APPEND 8
 #define MAY_ACCESS 16
 #define MAY_OPEN 32
+#define MAY_QUOTACTL 64 /* for devgroup-vs-openvz only */
+#define MAY_MOUNT 128
 
 /*
  * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
@@ -103,6 +105,12 @@ struct inodes_stat_t {
 /* Expect random access pattern */
 #define FMODE_RANDOM		((__force fmode_t)4096)
 
+/* Can do sys_quotactl (for devperms) */
+#define FMODE_QUOTACTL		((__force fmode_t)0x2000)
+
+/* Exclusive access; usually due to mount(2) */
+#define FMODE_EXCLUSIVE		((__force fmode_t)0x4000)
+
 /*
  * The below are the various read and write types that we support. Some of
  * them include behavioral modifiers that send information down to the
@@ -198,6 +206,9 @@ struct inodes_stat_t {
 #define FS_REQUIRES_DEV 1 
 #define FS_BINARY_MOUNTDATA 2
 #define FS_HAS_SUBTYPE 4
+#define FS_VIRTUALIZED 64	/* Can mount this fstype inside ve */
+#define FS_MANGLE_PROC 128	/* hide some /proc/mounts info inside VE */
+#define FS_NFS_EXPORTABLE 256
 #define FS_HAS_NEW_FREEZE 512	/* new freeze mechanism */
 #define FS_REVAL_DOT	16384	/* Check the paths ".", ".." for staleness */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move()
@@ -206,6 +217,12 @@ struct inodes_stat_t {
 #define FS_HANDLE_QUOTA		(1<<16)	/* FS handle quota disable/enable */
 #define FS_WEAK_REVALIDATE (1<<17) /* FS has d_op->d_weak_revalidate. Must
 					also have FS_REVAL_DOT set! */
+/*
+ * f_op->mmap must be called with vma=NULL before taking mmap_sem;
+ * workaround for wrong i_mutex vs mmap_sem lock ordering in pfcache
+ * (PSBM-23133) - vdavydov@
+ */
+#define FS_HAS_MMAP_PREP	(1<<18)
 
  /*
   * the fs is built with the new s_writers member in the superblock
@@ -224,6 +241,7 @@ struct inodes_stat_t {
 #define MS_REMOUNT	32	/* Alter flags of a mounted FS */
 #define MS_MANDLOCK	64	/* Allow mandatory locks on an FS */
 #define MS_DIRSYNC	128	/* Directory modifications are synchronous */
+#define MS_CPTMOUNT	256
 #define MS_NOATIME	1024	/* Do not update access times. */
 #define MS_NODIRATIME	2048	/* Do not update directory access times */
 #define MS_BIND		4096
@@ -252,6 +270,12 @@ struct inodes_stat_t {
 #define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION)
 
 /*
+ * Remounts that change any flags other than the ones listed below
+ * are forbidden inside containers.
+ */
+#define MS_VE_RMT_MASK	MS_RDONLY
+
+/*
  * Old magic mount flag and mask
  */
 #define MS_MGC_VAL 0xC0ED0000
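
A sketch of how MS_VE_RMT_MASK can be applied when validating a remount request from inside a container; ve_may_remount() and the flag plumbing are illustrative, only the mask test itself follows from the definition above:

/* Hypothetical check (sketch): only MS_RDONLY may differ on a
 * remount issued inside a container. */
static int ve_may_remount(struct super_block *sb, unsigned long flags)
{
	if ((flags ^ sb->s_flags) & MS_RMT_MASK & ~MS_VE_RMT_MASK)
		return -EPERM;
	return 0;
}
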
@@ -272,6 +296,9 @@ struct inodes_stat_t {
 #define S_AUTOMOUNT	2048	/* Automount/referral quasi-directory */
 #define S_AOP_EXT	16384 /* fs supports extended aops */
 
+/* VZ flags -- These are not upstream! */
+#define S_NOUNUSE	(1 << 17) /* just destroy inode in cleanup */
+
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
  * flags just means all the inodes inherit those flags by default. It might be
@@ -412,7 +439,6 @@ struct inodes_stat_t {
 #include <linux/path.h>
 #include <linux/stat.h>
 #include <linux/cache.h>
-#include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/radix-tree.h>
 #include <linux/prio_tree.h>
@@ -424,6 +450,7 @@ struct inodes_stat_t {
 #include <linux/fiemap.h>
 #include <linux/lockdep.h>
 #include <linux/percpu_counter.h>
+#include <linux/workqueue.h>
 
 #include <asm/atomic.h>
 #include <asm/byteorder.h>
@@ -453,6 +480,12 @@ extern int leases_enable, lease_break_ti
 extern int dir_notify_enable;
 #endif
 
+int ve_fsync_behavior(void);
+
+#define FSYNC_NEVER	0	/* ve syncs are ignored */
+#define FSYNC_ALWAYS	1	/* ve syncs work as usual */
+#define FSYNC_FILTERED	2	/* ve syncs only its files */
+
 struct buffer_head;
 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
@@ -460,6 +493,8 @@ typedef void (dio_iodone_t)(struct kiocb
 			ssize_t bytes, void *private, int ret,
 			bool is_async);
 
+int may_use_odirect(void);
+
 /*
  * Attribute flags.  These should be or-ed together to figure out what
  * has been changed!
@@ -509,10 +544,15 @@ struct iattr {
 	struct file	*ia_file;
 };
 
+#include <linux/kobject.h>
+
 /*
  * Includes for diskquotas.
  */
 #include <linux/quota.h>
+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
+#include <linux/vzquota_qlnk.h>
+#endif
 
 /** 
  * enum positive_aop_returns - aop return codes with specific semantics
@@ -557,33 +597,141 @@ enum positive_aop_returns {
 struct page;
 struct address_space;
 struct writeback_control;
+struct file_ra_state;
 
 struct iov_iter {
-	const struct iovec *iov;
+	struct iov_iter_ops *ops;
+	unsigned long data;
 	unsigned long nr_segs;
 	size_t iov_offset;
 	size_t count;
 };
 
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes);
-size_t iov_iter_copy_from_user(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes);
-void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
-size_t iov_iter_single_seg_count(const struct iov_iter *i);
+struct iov_iter_ops {
+	size_t (*ii_copy_to_user_atomic)(struct page *, struct iov_iter *,
+					 unsigned long, size_t);
+	size_t (*ii_copy_to_user)(struct page *, struct iov_iter *,
+				  unsigned long, size_t);
+	size_t (*ii_copy_from_user_atomic)(struct page *, struct iov_iter *,
+					   unsigned long, size_t);
+	size_t (*ii_copy_from_user)(struct page *, struct iov_iter *,
+					  unsigned long, size_t);
+	void (*ii_advance)(struct iov_iter *, size_t);
+	int (*ii_fault_in_readable)(struct iov_iter *, size_t);
+	size_t (*ii_single_seg_count)(const struct iov_iter *);
+	int (*ii_shorten)(struct iov_iter *, size_t);
+};
+
+static inline size_t iov_iter_copy_to_user_atomic(struct page *page,
+                struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_to_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_to_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_to_user(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user_atomic(struct page *page,
+                struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_from_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_from_user(page, i, offset, bytes);
+}
+static inline void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+	return i->ops->ii_advance(i, bytes);
+}
+static inline int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	return i->ops->ii_fault_in_readable(i, bytes);
+}
+static inline size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+	return i->ops->ii_single_seg_count(i);
+}
+static inline int iov_iter_shorten(struct iov_iter *i, size_t count)
+{
+	return i->ops->ii_shorten(i, count);
+}
+
+extern struct iov_iter_ops ii_bvec_ops;
+
+struct bio_vec;
+static inline void iov_iter_init_bvec(struct iov_iter *i,
+				      struct bio_vec *bvec,
+				      unsigned long nr_segs,
+				      size_t count, size_t written)
+{
+	i->ops = &ii_bvec_ops;
+	i->data = (unsigned long)bvec;
+	i->nr_segs = nr_segs;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_bvec(struct iov_iter *i)
+{
+	return i->ops == &ii_bvec_ops;
+}
+static inline struct bio_vec *iov_iter_bvec(struct iov_iter *i)
+{
+	BUG_ON(!iov_iter_has_bvec(i));
+	return (struct bio_vec *)i->data;
+}
+
+extern struct iov_iter_ops ii_page_ops;
+
+static inline void iov_iter_init_page(struct iov_iter *i,
+				      struct page *page,
+				      size_t count, size_t written)
+{
+	i->ops = &ii_page_ops;
+	i->data = (unsigned long)page;
+	i->nr_segs = 1;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_page(struct iov_iter *i)
+{
+	return i->ops == &ii_page_ops;
+}
+static inline struct page *iov_iter_page(struct iov_iter *i)
+{
+	BUG_ON(!iov_iter_has_page(i));
+	return (struct page *)i->data;
+}
+
+extern struct iov_iter_ops ii_iovec_ops;
 
 static inline void iov_iter_init(struct iov_iter *i,
 			const struct iovec *iov, unsigned long nr_segs,
 			size_t count, size_t written)
 {
-	i->iov = iov;
+	i->ops = &ii_iovec_ops;
+	i->data = (unsigned long)iov;
 	i->nr_segs = nr_segs;
 	i->iov_offset = 0;
 	i->count = count + written;
 
 	iov_iter_advance(i, written);
 }
+static inline int iov_iter_has_iovec(const struct iov_iter *i)
+{
+	return i->ops == &ii_iovec_ops;
+}
+static inline struct iovec *iov_iter_iovec(struct iov_iter *i)
+{
+	BUG_ON(!iov_iter_has_iovec(i));
+	return (struct iovec *)i->data;
+}
 
 static inline size_t iov_iter_count(struct iov_iter *i)
 {
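
This hunk turns iov_iter from an iovec-only cursor into an ops-table abstraction: data holds a type-erased pointer to the backing store (iovec array, bio_vec array, or single page) and every operation dispatches through iov_iter_ops, so generic read/write paths no longer care what they iterate over. A self-contained user-space sketch of the same pattern; all names here are illustrative, not kernel API:

#include <stdio.h>
#include <string.h>

struct iter;

struct iter_ops {
	size_t (*copy_out)(struct iter *it, char *dst, size_t n);
};

struct iter {
	const struct iter_ops *ops;
	unsigned long data;	/* type-erased backing store */
	size_t off, count;
};

/* One backend: iterate over a plain C string. */
static size_t str_copy_out(struct iter *it, char *dst, size_t n)
{
	const char *s = (const char *)it->data;

	if (n > it->count)
		n = it->count;
	memcpy(dst, s + it->off, n);
	it->off += n;
	it->count -= n;
	return n;
}

static const struct iter_ops str_ops = { .copy_out = str_copy_out };

int main(void)
{
	struct iter it = { &str_ops, (unsigned long)"hello, iter", 0, 11 };
	char buf[16];
	size_t n = it.ops->copy_out(&it, buf, sizeof(buf) - 1);

	buf[n] = '\0';
	printf("%s\n", buf);	/* prints "hello, iter" */
	return 0;
}
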
@@ -639,6 +787,10 @@ struct address_space_operations {
 	int (*releasepage) (struct page *, gfp_t);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
+	ssize_t (*direct_IO_bvec)(int, struct kiocb *, struct bio_vec *bvec,
+			loff_t offset, unsigned long bvec_len);
+	ssize_t (*direct_IO_page)(int, struct kiocb *, struct page *page,
+			loff_t offset);
 	int (*get_xip_mem)(struct address_space *, pgoff_t, int,
 						void **, unsigned long *);
 	/*
@@ -697,6 +849,8 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	struct user_beancounter *dirtied_ub;
+	struct list_head	i_peer_list;
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -816,6 +970,9 @@ struct inode {
 #ifdef CONFIG_QUOTA
 	struct dquot		*i_dquot[MAXQUOTAS];
 #endif
+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
+	struct vz_quota_ilink	i_qlnk;
+#endif
 	struct list_head	i_devices;
 	union {
 		struct pipe_inode_info	*i_pipe;
@@ -848,9 +1005,11 @@ struct inode {
 	struct posix_acl	*i_acl;
 	struct posix_acl	*i_default_acl;
 #endif
+	struct file		*i_peer_file;
 	void			*i_private; /* fs or device private pointer */
 };
 
+extern struct kmem_cache *inode_cachep;
 /*
  * inode->i_mutex nesting subclasses for the lock validator:
  *
@@ -990,13 +1149,21 @@ struct file {
 #define f_vfsmnt	f_path.mnt
 	const struct file_operations	*f_op;
 	spinlock_t		f_lock;  /* f_ep_links, f_flags, no IRQ */
+#ifdef CONFIG_SMP
+	int			f_sb_list_cpu;
+#endif
 	atomic_long_t		f_count;
 	unsigned int 		f_flags;
 	fmode_t			f_mode;
+	char			f_heavy;
 	loff_t			f_pos;
 	struct fown_struct	f_owner;
 	const struct cred	*f_cred;
-	struct file_ra_state	f_ra;
+	union {
+		struct file_ra_state	f_ra;
+		struct work_struct	f_work;
+	};
+	struct user_beancounter	*f_ub;
 
 	u64			f_version;
 #ifdef CONFIG_SECURITY
@@ -1014,9 +1181,17 @@ struct file {
 	unsigned long f_mnt_write_state;
 #endif
 };
-extern spinlock_t files_lock;
-#define file_list_lock() spin_lock(&files_lock);
-#define file_list_unlock() spin_unlock(&files_lock);
+
+struct file_handle {
+	__u32 handle_bytes;
+	int handle_type;
+	/* file identifier */
+	unsigned char f_handle[0];
+};
+
+extern int vfs_inode_fhandle(struct inode *, struct file_handle *, int size);
+extern struct dentry *vfs_fhandle_to_dentry(struct super_block *,
+					    struct file_handle *);
 
 #define get_file(x)	atomic_long_inc(&(x)->f_count)
 #define file_count(x)	atomic_long_read(&(x)->f_count)
@@ -1081,6 +1256,7 @@ static inline int file_check_writeable(s
 #define FL_LEASE	32	/* lease held on this file */
 #define FL_CLOSE	64	/* unlock on close */
 #define FL_SLEEP	128	/* A blocking lock */
+#define FL_LOCAL	256	/* A local lock */
 
 /*
  * Special return value from posix_lock_file() and vfs_lock_file() for
@@ -1100,6 +1276,7 @@ typedef struct files_struct *fl_owner_t;
 struct file_lock_operations {
 	void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
 	void (*fl_release_private)(struct file_lock *);
+	int (*fl_owner_id)(struct file_lock *);
 };
 
 struct lock_manager_operations {
@@ -1129,8 +1306,11 @@ struct file_lock {
 	struct list_head fl_link;	/* doubly linked list of all locks */
 	struct list_head fl_block;	/* circular list of blocked processes */
 	fl_owner_t fl_owner;
-	unsigned char fl_flags;
+	unsigned short fl_flags;
 	unsigned char fl_type;
+#ifdef CONFIG_BEANCOUNTERS
+	unsigned char fl_charged;
+#endif
 	unsigned int fl_pid;
 	struct pid *fl_nspid;
 	wait_queue_head_t fl_wait;
@@ -1160,6 +1340,9 @@ struct file_lock {
 #define OFFT_OFFSET_MAX	INT_LIMIT(off_t)
 #endif
 
+struct file_lock *locks_alloc_lock(int);
+void locks_free_lock(struct file_lock *);
+
 #include <linux/fcntl.h>
 
 extern void send_sigio(struct fown_struct *fown, int fd, int band);
@@ -1168,6 +1351,8 @@ extern void send_sigio(struct fown_struc
 extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
 			loff_t endbyte, unsigned int flags);
 
+extern void generic_set_file_flags_unlocked(struct file*, unsigned int arg);
+extern int generic_set_file_flags(struct file*, unsigned int arg);
 #ifdef CONFIG_FILE_LOCKING
 extern int fcntl_getlk(struct file *, struct flock __user *);
 extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
@@ -1386,6 +1571,8 @@ struct mm_struct;
 extern struct list_head super_blocks;
 extern spinlock_t sb_lock;
 
+struct pramcache_struct;
+
 /* Possible states of 'frozen' field */
 enum {
 	SB_UNFROZEN = 0,		/* FS is unfrozen */
@@ -1441,7 +1628,11 @@ struct super_block {
 
 	struct list_head	s_inodes;	/* all inodes */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
+#ifdef CONFIG_SMP
+	struct list_head	*s_files;
+#else
 	struct list_head	s_files;
+#endif
 	/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
 	struct list_head	s_dentry_lru;	/* unused dentry lru */
 	int			s_nr_dentry_unused;	/* # of dentry on lru */
@@ -1456,6 +1647,9 @@ struct super_block {
 	wait_queue_head_t	s_wait_unfrozen;
 
 	char s_id[32];				/* Informational name */
+	u8 s_uuid[16];				/* UUID */
+
+	unsigned int		s_mnt_count;
 
 	void 			*s_fs_info;	/* Filesystem private info */
 	fmode_t			s_mode;
@@ -1701,7 +1895,9 @@ struct file_operations {
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *, loff_t);
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *, loff_t);
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
@@ -1717,11 +1913,13 @@ struct file_operations {
 	int (*lock) (struct file *, int, struct file_lock *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-	int (*check_flags)(int);
+	int (*set_flags)(struct file *, int);
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+	int (*fadvise)(struct file* file, loff_t offset, loff_t len, int advice);
 	int (*setlease)(struct file *, long, struct file_lock **);
+	struct file * (*get_host)(struct file *);
 };
 
 struct inode_operations {
@@ -1788,12 +1986,18 @@ struct super_operations {
 	void (*umount_begin) (struct super_block *);
 
 	int (*show_options)(struct seq_file *, struct vfsmount *);
+	void (*show_type)(struct seq_file *, struct super_block *sb);
 	int (*show_stats)(struct seq_file *, struct vfsmount *);
 #ifdef CONFIG_QUOTA
 	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
+	ssize_t (*quota_read_ino)(struct super_block *, struct inode *, char *, size_t, loff_t);
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+	ssize_t (*quota_write_ino)(struct super_block *, struct inode *, const char *, size_t, loff_t);
+	struct inode *(*get_quota_root)(struct super_block *);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+	int (*start_write)(struct super_block *sb, int level, bool wait);
+	void (*end_write)(struct super_block *sb, int level);
 };
 
 /*
@@ -1965,6 +2169,7 @@ struct file_system_type {
 
 	struct lock_class_key s_lock_key;
 	struct lock_class_key s_umount_key;
+	struct lock_class_key s_rename_mutex_key;
 
 	struct lock_class_key i_lock_key;
 	struct lock_class_key i_mutex_key;
@@ -1973,8 +2178,13 @@ struct file_system_type {
 #ifndef __GENKSYMS__
 	struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
 #endif
+	struct file_system_type *proto;
+	struct ve_struct *owner_env;
 };
 
+void get_filesystem(struct file_system_type *fs);
+void put_filesystem(struct file_system_type *fs);
+
 extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
 	int (*fill_super)(struct super_block *, void *, int),
 	struct vfsmount *mnt);
@@ -2029,9 +2239,24 @@ extern int register_filesystem(struct fi
 extern int unregister_filesystem(struct file_system_type *);
 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 #define kern_mount(type) kern_mount_data(type, NULL)
+
+#ifdef CONFIG_VE
+extern int register_ve_fs_type_data(struct ve_struct *, struct file_system_type *,
+		struct file_system_type **, struct vfsmount **, void *data);
+#define register_ve_fs_type(ve, tmpl, pfstype, pmnt) \
+	register_ve_fs_type_data(ve, tmpl, pfstype, pmnt, NULL)
+extern int register_ve_fs_type_data_flags(struct ve_struct *ve, struct file_system_type *template,
+					  struct file_system_type **p_fs_type, struct vfsmount **p_mnt,
+					  void *data, int flags);
+extern void unregister_ve_fs_type(struct file_system_type *, struct vfsmount *);
+extern void umount_ve_fs_type(struct file_system_type *local_fs_type, int veid);
+#endif
+
+#define kern_umount mntput
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
 extern long do_mount(char *, const char *, char *, unsigned long, void *);
+extern int do_remount(struct path *path, int flags, int mnt_flags, void *data);
 extern struct vfsmount *collect_mounts(struct path *);
 extern void drop_collected_mounts(struct vfsmount *);
 
@@ -2042,6 +2267,8 @@ extern int statfs_by_dentry(struct dentr
 
 extern int current_umask(void);
 
+extern int ve_devmnt_process(struct ve_struct *, dev_t, void **, int);
+
 /* /sys/fs */
 extern struct kobject *fs_kobj;
 
@@ -2162,9 +2389,14 @@ extern long do_sys_open(int dfd, const c
 extern struct file *filp_open(const char *, int, int);
 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
 				 const struct cred *);
+extern struct file *nameidata_to_filp(struct nameidata *nd);
 extern int filp_close(struct file *, fl_owner_t id);
 extern struct filename *getname(const char __user *);
 
+extern int open_inode_peer(struct inode *, struct path *, const struct cred *);
+extern void close_inode_peer(struct inode *);
+extern void peer_fput(struct file *file);
+
 /* fs/ioctl.c */
 
 extern int ioctl_preallocate(struct file *filp, void __user *argp);
@@ -2216,6 +2448,7 @@ static inline int thaw_bdev(struct block
 }
 #endif
 extern int sync_filesystem(struct super_block *);
+extern int __sync_filesystem(struct super_block *i, struct user_beancounter *, int);
 extern const struct file_operations def_blk_fops;
 extern const struct file_operations def_chr_fops;
 extern const struct file_operations bad_sock_fops;
@@ -2283,6 +2516,11 @@ extern void init_special_inode(struct in
 extern void make_bad_inode(struct inode *);
 extern int is_bad_inode(struct inode *);
 
+#if IS_ENABLED(CONFIG_VZ_CHECKPOINT)
+extern struct inode *anon_inode_inode;
+extern const struct file_operations bad_file_ops;
+#endif
+
 extern const struct file_operations read_pipefifo_fops;
 extern const struct file_operations write_pipefifo_fops;
 extern const struct file_operations rdwr_pipefifo_fops;
@@ -2307,7 +2545,8 @@ extern int check_disk_change(struct bloc
 extern int __invalidate_device(struct block_device *, bool);
 extern int invalidate_partition(struct gendisk *, int);
 #endif
-extern int invalidate_inodes(struct super_block *, bool);
+extern int invalidate_inodes_check(struct super_block *, bool, int check);
+#define invalidate_inodes(sb, kd) invalidate_inodes_check(sb, kd, 0)
 unsigned long invalidate_mapping_pages(struct address_space *mapping,
 					pgoff_t start, pgoff_t end);
 
@@ -2378,6 +2617,9 @@ extern struct file *create_read_pipe(str
 extern struct file *create_write_pipe(int flags);
 extern void free_write_pipe(struct file *);
 
+extern void release_open_intent(struct nameidata *nd);
+
+extern int path_walk(struct filename *name, struct nameidata *nd);
 extern struct file *do_filp_open(int dfd, struct filename *filename,
 		int open_flag, int mode, int acc_mode);
 extern int may_open(struct path *, int, int);
@@ -2428,6 +2670,7 @@ extern void clear_inode(struct inode *);
 extern void destroy_inode(struct inode *);
 extern void __destroy_inode(struct inode *);
 extern struct inode *new_inode(struct super_block *);
+extern struct inode *new_inode_pseudo(struct super_block *);
 extern int should_remove_suid(struct dentry *);
 extern int file_remove_suid(struct file *);
 
@@ -2437,8 +2680,6 @@ static inline void insert_inode_hash(str
 	__insert_inode_hash(inode, inode->i_ino);
 }
 
-extern void file_move(struct file *f, struct list_head *list);
-extern void file_kill(struct file *f);
 #ifdef CONFIG_BLOCK
 struct bio;
 extern void submit_bio(int, struct bio *);
@@ -2453,13 +2694,20 @@ extern int generic_file_readonly_mmap(st
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *, loff_t);
 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
 		loff_t *);
+extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t *);
 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t);
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
 		unsigned long *, loff_t, loff_t *, size_t, size_t);
+extern ssize_t generic_file_direct_write_iter(struct kiocb *, struct iov_iter *,
+		loff_t, loff_t *, size_t);
 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t, loff_t *, size_t, ssize_t);
+extern ssize_t generic_file_buffered_write_iter(struct kiocb *, struct iov_iter *,
+		loff_t, loff_t *, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
 extern int generic_segment_checks(const struct iovec *iov,
@@ -2751,6 +2999,17 @@ ssize_t simple_attr_read(struct file *fi
 ssize_t simple_attr_write(struct file *file, const char __user *buf,
 			  size_t len, loff_t *ppos);
 
+static inline void *file_private(struct file *file)
+{
+	struct file *host = file;
+
+	while (host->f_op->get_host) {
+		host = host->f_op->get_host(host);
+		BUG_ON(host->f_mapping != file->f_mapping);
+	}
+	return host->private_data;
+}
+
 struct ctl_table;
 int proc_nr_files(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
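
file_private() above walks the ->get_host chain so that a stacked file (for example a checkpoint/restore or pfcache wrapper over a host file) resolves to the innermost private_data; the BUG_ON asserts that every layer shares the same page-cache mapping. A hypothetical wrapper layer, sketched (not code from this patch):

/* Hypothetical stacked-file layer: the wrapper keeps the host file in
 * its own private_data and exposes it via ->get_host, so that
 * file_private(wrapper) yields the host's private data. */
struct wrap_private {
	struct file *host;	/* must share f_mapping with the wrapper */
};

static struct file *wrap_get_host(struct file *file)
{
	struct wrap_private *wp = file->private_data;

	return wp->host;
}
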
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/fs_struct.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fs_struct.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/fs_struct.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fs_struct.h	2015-01-21 12:02:50.195041309 +0300
@@ -5,7 +5,7 @@
 
 struct fs_struct {
 	int users;
-	rwlock_t lock;
+	spinlock_t lock;
 	int umask;
 	int in_exec;
 	struct path root, pwd;
@@ -20,5 +20,33 @@ extern struct fs_struct *copy_fs_struct(
 extern void free_fs_struct(struct fs_struct *);
 extern void daemonize_fs_struct(void);
 extern int unshare_fs_struct(void);
+extern int open_to_namei_flags(int flag);
+
+static inline void get_fs_root(struct fs_struct *fs, struct path *root)
+{
+	spin_lock(&fs->lock);
+	*root = fs->root;
+	path_get(root);
+	spin_unlock(&fs->lock);
+}
+
+static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
+{
+	spin_lock(&fs->lock);
+	*pwd = fs->pwd;
+	path_get(pwd);
+	spin_unlock(&fs->lock);
+}
+
+static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root,
+				       struct path *pwd)
+{
+	spin_lock(&fs->lock);
+	*root = fs->root;
+	path_get(root);
+	*pwd = fs->pwd;
+	path_get(pwd);
+	spin_unlock(&fs->lock);
+}
 
 #endif /* _LINUX_FS_STRUCT_H */
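
The new accessors snapshot a path and take a reference while holding the (now spin-) lock, so callers may use the result after unlocking; dropping the reference is the caller's job. A typical caller, sketched:

/* Sketch: take a stable snapshot of a task's root, use it without
 * holding fs->lock, then drop the reference. */
static void use_task_root(struct task_struct *tsk)
{
	struct path root;

	get_fs_root(tsk->fs, &root);
	/* ... root.mnt / root.dentry are safe to use here ... */
	path_put(&root);
}
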
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/fsnotify_backend.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fsnotify_backend.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/fsnotify_backend.h	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fsnotify_backend.h	2015-01-21 12:02:47.929101463 +0300
@@ -85,6 +85,7 @@ struct fsnotify_ops {
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
 	void (*free_event_priv)(struct fsnotify_event_private_data *priv);
+	void (*detach_mnt)(struct fsnotify_mark_entry *e);
 };
 
 /*
@@ -351,6 +352,7 @@ extern void fsnotify_clear_marks_by_grou
 extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry);
 extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
 extern void fsnotify_unmount_inodes(struct super_block *sb);
+extern void fsnotify_unmount_mnt(struct vfsmount *mnt);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
@@ -383,6 +385,7 @@ static inline u32 fsnotify_get_cookie(vo
 static inline void fsnotify_unmount_inodes(struct super_block *sb)
 {}
 
+static inline void fsnotify_unmount_mnt(struct vfsmount *mnt) { }
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/fuse.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fuse.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/fuse.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/fuse.h	2015-01-21 12:02:51.643002871 +0300
@@ -270,6 +270,7 @@ enum fuse_notify_code {
 	FUSE_NOTIFY_POLL   = 1,
 	FUSE_NOTIFY_INVAL_INODE = 2,
 	FUSE_NOTIFY_INVAL_ENTRY = 3,
+	FUSE_NOTIFY_INVAL_FILES = 4,
 	FUSE_NOTIFY_CODE_MAX,
 };
 
@@ -596,4 +597,8 @@ struct fuse_notify_inval_entry_out {
 	__u32	padding;
 };
 
+struct fuse_notify_inval_files_out {
+	__u64	ino;
+};
+
 #endif /* _LINUX_FUSE_H */
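
FUSE_NOTIFY_INVAL_FILES and its payload let a daemon ask the kernel to invalidate open files of an inode. Assuming the conventional FUSE notify framing (a fuse_out_header with unique == 0 and the notify code in the error field; the VZ-specific plumbing may differ), a daemon-side sketch could look like:

#include <stdint.h>
#include <sys/uio.h>
#include <linux/fuse.h>

/* Sketch: send FUSE_NOTIFY_INVAL_FILES for inode `ino` over /dev/fuse. */
static int notify_inval_files(int fuse_fd, uint64_t ino)
{
	struct fuse_out_header oh = {
		.len    = sizeof(oh) + sizeof(struct fuse_notify_inval_files_out),
		.error  = FUSE_NOTIFY_INVAL_FILES,
		.unique = 0,		/* unique == 0 marks a notification */
	};
	struct fuse_notify_inval_files_out arg = { .ino = ino };
	struct iovec iov[2] = {
		{ .iov_base = &oh,  .iov_len = sizeof(oh)  },
		{ .iov_base = &arg, .iov_len = sizeof(arg) },
	};

	return writev(fuse_fd, iov, 2) < 0 ? -1 : 0;
}
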
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/futex.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/futex.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/futex.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/futex.h	2015-01-21 12:02:57.967834979 +0300
@@ -132,6 +132,7 @@ union ktime;
 long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
 
+extern long futex_wait_restart(struct restart_block *restart);
 extern int
 handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/genhd.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/genhd.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/genhd.h	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/genhd.h	2015-01-21 12:02:47.211120524 +0300
@@ -22,6 +22,7 @@
 #define part_to_dev(part)	(&((part)->__dev))
 
 extern struct device_type part_type;
+extern struct device_type disk_type;
 extern struct kobject *block_depr;
 extern struct class block_class;
 
@@ -89,7 +90,13 @@ struct disk_stats {
 	
 struct hd_struct {
 	sector_t start_sect;
+	/*
+	 * nr_sects is protected by a sequence counter: a partition may
+	 * be extended while IO is in flight to it, and the update of
+	 * nr_sects is non-atomic on 32-bit machines with a 64-bit sector_t.
+	 */
 	sector_t nr_sects;
+	seqcount_t seq;
 	sector_t alignment_offset;
 	unsigned int discard_alignment;
 	struct device __dev;
@@ -539,6 +546,8 @@ extern struct hd_struct * __must_check a
 						     sector_t len, int flags);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
+extern sector_t part_nr_sects_read(struct hd_struct *part);
+extern void part_nr_sects_write(struct hd_struct *part, sector_t val);
 
 extern struct gendisk *alloc_disk_node(int minors, int node_id);
 extern struct gendisk *alloc_disk(int minors);
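
part_nr_sects_read()/part_nr_sects_write() presumably pair the new seqcount with the standard read-retry loop. One plausible shape of the reader, assuming the usual seqcount API (the real definitions live elsewhere in the patch):

/* Plausible reader (sketch): retry until a consistent nr_sects is seen. */
static inline sector_t part_nr_sects_read_sketch(struct hd_struct *part)
{
	sector_t nr_sects;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&part->seq);
		nr_sects = part->nr_sects;
	} while (read_seqcount_retry(&part->seq, seq));

	return nr_sects;
}
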
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/gfp.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/gfp.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/gfp.h	2014-12-12 23:29:11.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/gfp.h	2015-01-21 12:02:58.862811224 +0300
@@ -52,6 +52,8 @@ struct vm_area_struct;
 #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
 #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
 #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
+#define __GFP_UBC	((__force gfp_t)0x100000u)/* charge kmem in buddy and slab */
+#define __GFP_SOFT_UBC	((__force gfp_t)0x800000u)/* use soft charging */
 
 #ifdef CONFIG_KMEMCHECK
 #define __GFP_NOTRACK	((__force gfp_t)0x200000u)  /* Don't track with kmemcheck */
@@ -68,19 +70,22 @@ struct vm_area_struct;
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 23	/* Room for 23 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 24	/* Room for __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
 #define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
 /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
 #define GFP_ATOMIC	(__GFP_HIGH)
+#define GFP_ATOMIC_UBC	(__GFP_HIGH | __GFP_UBC)
 #define GFP_NOIO	(__GFP_WAIT)
 #define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
 #define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
+#define GFP_KERNEL_UBC	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC)
 #define GFP_TEMPORARY	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
 			 __GFP_RECLAIMABLE)
 #define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
+#define GFP_USER_UBC	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC)
 #define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
 			 __GFP_HIGHMEM)
 #define GFP_HIGHUSER_MOVABLE	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
@@ -341,6 +346,7 @@ void *alloc_pages_exact_nid(int nid, siz
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 extern void free_hot_page(struct page *page);
+extern void free_hot_cold_page_list(struct list_head *list, int cold);
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr),0)
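
__GFP_UBC marks an allocation for kmem charging to the current user beancounter, and the *_UBC composites simply OR it into the standard masks, so converting a call site is a one-flag change. A hypothetical example:

/* Hypothetical call site: account this buffer to the caller's
 * beancounter rather than leaving it uncharged. */
buf = kmalloc(len, GFP_KERNEL_UBC);
if (!buf)
	return -ENOMEM;
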
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/hardirq.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/hardirq.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/hardirq.h	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/hardirq.h	2015-01-21 12:02:54.123937014 +0300
@@ -2,7 +2,7 @@
 #define LINUX_HARDIRQ_H
 
 #include <linux/preempt.h>
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 #include <linux/smp_lock.h>
 #endif
 #include <linux/lockdep.h>
@@ -10,6 +10,9 @@
 #include <asm/hardirq.h>
 #include <asm/system.h>
 
+#include <bc/task.h>
+#include <linux/ve_task.h>
+
 /*
  * We put the hardirq and softirq counter into the preemption
  * counter. The bitmask has the following meaning:
@@ -97,7 +100,7 @@
  */
 #define in_nmi()	(preempt_count() & NMI_MASK)
 
-#if defined(CONFIG_PREEMPT)
+#if defined(CONFIG_PREEMPT_COUNT)
 # define PREEMPT_INATOMIC_BASE kernel_locked()
 # define PREEMPT_CHECK_OFFSET 1
 #else
@@ -121,7 +124,7 @@
 #define in_atomic_preempt_off() \
 		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 # define preemptible()	(preempt_count() == 0 && !irqs_disabled())
 # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
 #else
@@ -155,6 +158,24 @@ extern void rcu_nmi_exit(void);
 # define rcu_nmi_exit() do { } while (0)
 #endif /* #if defined(CONFIG_NO_HZ) */
 
+#define save_context()		do {				\
+		struct task_struct *tsk;			\
+		if (hardirq_count() == HARDIRQ_OFFSET) {	\
+			tsk = current;				\
+			ve_save_context(tsk);			\
+			ub_save_context(tsk);			\
+		}						\
+	} while (0)
+
+#define restore_context()		do {			\
+		struct task_struct *tsk;			\
+		if (hardirq_count() == HARDIRQ_OFFSET) {	\
+			tsk = current;				\
+			ve_restore_context(tsk);		\
+			ub_restore_context(tsk);		\
+		}						\
+	} while (0)
+
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
  * because NMI handlers may not preempt and the ops are
@@ -165,6 +186,7 @@ extern void rcu_nmi_exit(void);
 	do {						\
 		account_system_vtime(current);		\
 		add_preempt_count(HARDIRQ_OFFSET);	\
+		save_context();				\
 		trace_hardirq_enter();			\
 	} while (0)
 
@@ -180,6 +202,7 @@ extern void irq_enter(void);
 	do {						\
 		trace_hardirq_exit();			\
 		account_system_vtime(current);		\
+		restore_context();			\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
 	} while (0)
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/hrtimer.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/hrtimer.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/hrtimer.h	2014-12-12 23:29:17.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/hrtimer.h	2015-01-21 12:02:57.967834979 +0300
@@ -149,7 +149,21 @@ struct hrtimer_clock_base {
 	ktime_t			offset;
 };
 
-#define HRTIMER_MAX_CLOCK_BASES 2
+/*
+ * This enum is not in the same order
+ * as in the vanilla kernel; it was
+ * rearranged to make the CLOCK_BOOTTIME
+ * backport easier. Once the reworked
+ * hrtimers are merged, this hunk
+ * should be dropped.
+ * 	-- gorcunov@
+ */
+enum  hrtimer_base_type {
+	HRTIMER_BASE_REALTIME,
+	HRTIMER_BASE_MONOTONIC,
+	HRTIMER_BASE_BOOTTIME,
+	HRTIMER_MAX_CLOCK_BASES,
+};
 
 /*
  * struct hrtimer_cpu_base - the per cpu clock bases
@@ -319,7 +333,7 @@ extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_real(void);
 extern ktime_t ktime_get_boottime(void);
 extern ktime_t ktime_get_monotonic_offset(void);
-extern ktime_t ktime_get_update_offsets(ktime_t *offs_real);
+extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot);
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 
@@ -423,6 +437,9 @@ extern long hrtimer_nanosleep(struct tim
 			      const enum hrtimer_mode mode,
 			      const clockid_t clockid);
 extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
+#ifdef CONFIG_COMPAT
+extern long compat_nanosleep_restart(struct restart_block *restart);
+#endif
 
 extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 				 struct task_struct *tsk);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/huge_mm.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/huge_mm.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/huge_mm.h	2014-12-12 23:29:27.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/huge_mm.h	2015-01-21 12:02:58.019833600 +0300
@@ -78,6 +78,7 @@ extern pmd_t *page_check_address_pmd(str
 extern unsigned long transparent_hugepage_flags;
 extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			  pmd_t *dst_pmd, pmd_t *src_pmd,
+			  struct vm_area_struct *dst_vma,
 			  struct vm_area_struct *vma,
 			  unsigned long addr, unsigned long end);
 extern int handle_pte_fault(struct mm_struct *mm,
@@ -108,9 +109,9 @@ extern void __split_huge_page_pmd(struct
 #if HPAGE_PMD_ORDER > MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
 #endif
-extern int hugepage_madvise(unsigned long *vm_flags, int advice);
+extern int hugepage_madvise(struct vm_area_struct *vma,
+			    unsigned long *vm_flags, int advice);
 
-extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma);
 extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
 				    unsigned long start,
 				    unsigned long end,
@@ -191,7 +192,8 @@ static inline int split_huge_page(struct
 	do { } while (0)
 #define wait_split_huge_page(__anon_vma, __pmd)	\
 	do { } while (0)
-static inline int hugepage_madvise(unsigned long *vm_flags, int advice)
+static inline int hugepage_madvise(struct vm_area_struct *vma,
+				   unsigned long *vm_flags, int advice)
 {
 	BUG();
 	return 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/hugetlb.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/hugetlb.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/hugetlb.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/hugetlb.h	2015-01-21 12:02:41.773264895 +0300
@@ -50,7 +50,7 @@ int hugetlb_fault(struct mm_struct *mm, 
 			unsigned long address, unsigned int flags);
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
-						int acctflags);
+						vm_flags_t vm_flags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 int dequeue_hwpoisoned_huge_page(struct page *page);
 void copy_huge_page(struct page *dst, struct page *src);
@@ -177,7 +177,7 @@ static inline struct hugetlbfs_sb_info *
 
 extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, size_t size, int acct,
+struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags);
 
 static inline int is_file_hugepages(struct file *file)
@@ -199,7 +199,7 @@ static inline void set_file_hugepages(st
 #define is_file_hugepages(file)			0
 #define set_file_hugepages(file)		BUG()
 static inline struct file *hugetlb_file_setup(const char *name, size_t size,
-		int acctflag, struct user_struct **user, int creat_flags)
+		vm_flags_t acctflag, struct user_struct **user, int creat_flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/hugetlb_inline.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/hugetlb_inline.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/hugetlb_inline.h	2014-12-12 23:28:59.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/hugetlb_inline.h	2015-01-21 12:02:41.773264895 +0300
@@ -7,7 +7,7 @@
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-	return vma->vm_flags & VM_HUGETLB;
+	return !!(vma->vm_flags & VM_HUGETLB);
 }
 
 #else
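
The !! normalization matters once vm_flags widens beyond 32 bits (note the vm_flags_t changes in the hugetlb.h hunk above): returning the raw masked value as int would truncate a high flag bit to zero. A self-contained demonstration of the failure mode, with FLAG_HIGH standing in for a high vm_flags bit:

#include <stdio.h>
#include <stdint.h>

#define FLAG_HIGH (1ULL << 40)	/* stand-in for a high vm_flags bit */

/* Implicit uint64 -> int conversion drops the high bit: returns 0. */
static int bad_test(uint64_t flags)  { return flags & FLAG_HIGH; }
/* !! collapses the result to 0/1 before the conversion: returns 1. */
static int good_test(uint64_t flags) { return !!(flags & FLAG_HIGH); }

int main(void)
{
	printf("%d %d\n", bad_test(FLAG_HIGH), good_test(FLAG_HIGH)); /* 0 1 */
	return 0;
}
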
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/idr.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/idr.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/idr.h	2014-12-12 23:29:11.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/idr.h	2015-01-21 12:02:50.306038362 +0300
@@ -42,12 +42,6 @@
 #define MAX_ID_BIT (1U << MAX_ID_SHIFT)
 #define MAX_ID_MASK (MAX_ID_BIT - 1)
 
-/* Leave the possibility of an incomplete final layer */
-#define MAX_LEVEL (MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS
-
-/* Number of id_layer structs to leave in free list */
-#define IDR_FREE_MAX MAX_LEVEL + MAX_LEVEL
-
 struct idr_layer {
 	unsigned long		 bitmap; /* A zero bit means "space here" */
 	struct idr_layer	*ary[1<<IDR_BITS];
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/if_bridge.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/if_bridge.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/if_bridge.h	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/if_bridge.h	2015-01-21 12:02:49.623056493 +0300
@@ -45,6 +45,7 @@
 #define BRCTL_SET_PORT_PRIORITY 16
 #define BRCTL_SET_PATH_COST 17
 #define BRCTL_GET_FDB_ENTRIES 18
+#define BRCTL_SET_VIA_ORIG_DEV 19
 
 #define BR_STATE_DISABLED 0
 #define BR_STATE_LISTENING 1
@@ -73,6 +74,7 @@ struct __bridge_info
 	__u32 tcn_timer_value;
 	__u32 topology_change_timer_value;
 	__u32 gc_timer_value;
+	__u8 via_phys_dev;
 };
 
 struct __port_info
@@ -165,9 +167,12 @@ struct br_mdb_entry {
 
 #include <linux/netdevice.h>
 
+#define BR_ALREADY_SEEN 1
+
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
 extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
 					       struct sk_buff *skb);
+extern int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port);
 extern int (*br_should_route_hook)(struct sk_buff *skb);
 
 /*
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/if_macvlan.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/if_macvlan.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/if_macvlan.h	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/if_macvlan.h	2015-01-21 12:02:51.143016142 +0300
@@ -86,7 +86,7 @@ extern void macvlan_count_rx(const struc
 			     unsigned int len, bool success,
 			     bool multicast);
 
-extern void macvlan_dellink(struct net_device *dev);
+extern void macvlan_dellink(struct net_device *dev, struct list_head *head);
 
 extern int macvlan_link_register(struct rtnl_link_ops *ops);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/if_vlan.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/if_vlan.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/if_vlan.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/if_vlan.h	2015-01-21 12:02:45.061177602 +0300
@@ -89,6 +89,9 @@ struct vlan_group {
 	struct hlist_node	hlist;	/* linked list */
 	struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS];
 	struct rcu_head		rcu;
+#ifdef CONFIG_VE
+	struct ve_struct	*owner;
+#endif
 };
 
 struct vlan_group* vlan_find_group(struct net_device *dev);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/inetdevice.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/inetdevice.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/inetdevice.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/inetdevice.h	2015-01-21 12:02:57.967834979 +0300
@@ -134,6 +134,9 @@ struct in_ifaddr
 extern int register_inetaddr_notifier(struct notifier_block *nb);
 extern int unregister_inetaddr_notifier(struct notifier_block *nb);
 
+extern struct in_ifaddr *inet_alloc_ifa(void);
+extern struct in_device *inetdev_init(struct net_device *dev);
+extern int inet_insert_ifa(struct in_ifaddr *ifa);
 extern struct net_device *ip_dev_find(struct net *net, __be32 addr);
 extern int		inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b);
 extern int		devinet_ioctl(struct net *net, unsigned int cmd, void __user *);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/init_task.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/init_task.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/init_task.h	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/init_task.h	2015-01-21 12:02:44.548191222 +0300
@@ -10,6 +10,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/securebits.h>
+#include <linux/seqlock.h>
 #include <net/net_namespace.h>
 
 extern struct files_struct init_files;
@@ -37,13 +38,29 @@ extern struct fs_struct init_fs;
 		.running = 0,						\
 		.lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
 	},								\
+	.oom_score_adj  = OOM_SCORE_ADJ_UNSET,				\
 	INIT_THREADGROUP_FORK_LOCK(sig)					\
 }
 
+#ifdef CONFIG_VE
+/* one for ve0, one for init_task */
+#define INIT_NSPROXY_COUNT	ATOMIC_INIT(2)
+#define INIT_VE_TASK_INFO						\
+	.ve_task_info.exec_env	= &ve0,					\
+	.ve_task_info.owner_env	= &ve0,					\
+	.ve_task_info.sleep_time	= 0,				\
+	.ve_task_info.wakeup_stamp	= 0,				\
+	.ve_task_info.sched_time	= 0,				\
+	.ve_task_info.wakeup_lock	= SEQCNT_ZERO,
+#else
+#define INIT_NSPROXY_COUNT	ATOMIC_INIT(1)
+#define INIT_VE_TASK_INFO
+#endif
+
 extern struct nsproxy init_nsproxy;
 #define INIT_NSPROXY(nsproxy) {						\
 	.pid_ns		= &init_pid_ns,					\
-	.count		= ATOMIC_INIT(1),				\
+	.count		= INIT_NSPROXY_COUNT,				\
 	.uts_ns		= &init_uts_ns,					\
 	.mnt_ns		= NULL,						\
 	INIT_NET_NS(net_ns)                                             \
@@ -193,6 +210,7 @@ extern struct cred init_cred;
 	INIT_FTRACE_GRAPH						\
 	INIT_TRACE_RECURSION						\
 	INIT_TASK_RCU_PREEMPT(tsk)					\
+	INIT_VE_TASK_INFO						\
 }
 
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/inotify.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/inotify.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/inotify.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/inotify.h	2015-01-21 12:02:49.674055139 +0300
@@ -101,6 +101,11 @@ struct inotify_operations {
 	void (*destroy_watch)(struct inotify_watch *);
 };
 
+struct fsnotify_group;
+extern const struct file_operations inotify_fops;
+int __inotify_new_watch(struct fsnotify_group *group,
+			     struct path *path, __u32 mask, int wd);
+
 #ifdef CONFIG_INOTIFY
 
 /* Kernel API for producing events */
@@ -137,6 +142,8 @@ extern void put_inotify_watch(struct ino
 extern int pin_inotify_watch(struct inotify_watch *);
 extern void unpin_inotify_watch(struct inotify_watch *);
 
+extern struct file *inotify_create(int flags);
+
 #else
 
 static inline void inotify_d_instantiate(struct dentry *dentry,
@@ -239,6 +246,11 @@ extern inline void unpin_inotify_watch(s
 {
 }
 
+static inline struct file *inotify_create(int flags)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 #endif	/* CONFIG_INOTIFY */
 
 #endif	/* __KERNEL __ */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/iocontext.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/iocontext.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/iocontext.h	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/iocontext.h	2015-01-21 12:02:55.652896427 +0300
@@ -82,6 +82,9 @@ struct io_context {
 	struct radix_tree_root radix_root;
 	struct hlist_head cic_list;
 	void *ioc_data;
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ioc_ub;
+#endif
 };
 
 static inline struct io_context *ioc_task_link(struct io_context *ioc)
@@ -102,8 +105,10 @@ struct task_struct;
 #ifdef CONFIG_BLOCK
 int put_io_context(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
+void ioc_task_unlink(struct io_context *ioc);
 struct io_context *get_io_context(gfp_t gfp_flags, int node);
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+struct io_context *current_io_context(gfp_t gfp_flags, int node);
 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 #else
 static inline void exit_io_context(struct task_struct *task)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ioprio.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ioprio.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ioprio.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ioprio.h	2015-01-21 12:02:43.346223134 +0300
@@ -39,6 +39,7 @@ enum {
 	IOPRIO_WHO_PROCESS = 1,
 	IOPRIO_WHO_PGRP,
 	IOPRIO_WHO_USER,
+	IOPRIO_WHO_UBC = 1000,
 };
 
 /*
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ipc.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ipc.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ipc.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ipc.h	2015-01-21 12:02:57.967834979 +0300
@@ -79,6 +79,7 @@ struct ipc_kludge {
 
 #ifdef __KERNEL__
 #include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 
 #define IPCMNI 32768  /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
 
@@ -98,6 +99,15 @@ struct kern_ipc_perm
 	void		*security;
 };
 
+struct ipc_ids;
+
+extern struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
+static inline void ipc_unlock(struct kern_ipc_perm *perm)
+{
+	spin_unlock(&perm->lock);
+	rcu_read_unlock();
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_IPC_H */
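
ipc_unlock() spells out the locking contract: ipc_lock() is expected to enter an RCU read-side section and take perm->lock, which the unlock releases in reverse order. The usual call pattern, sketched:

/* Sketch of the usual pairing around an IPC object. */
struct kern_ipc_perm *perm;

perm = ipc_lock(ids, id);
if (IS_ERR(perm))
	return PTR_ERR(perm);
/* ... operate on the object under perm->lock ... */
ipc_unlock(perm);
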
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/irq.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/irq.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/irq.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/irq.h	2015-01-21 12:02:42.782238107 +0300
@@ -418,6 +418,9 @@ extern unsigned int create_irq_nr(unsign
 extern int create_irq(void);
 extern void destroy_irq(unsigned int irq);
 
+extern int __irq_to_vector(int nr);
+#define irq_to_vector(nr)	__irq_to_vector(nr)
+
 /* Test to see if a driver has successfully requested an irq */
 static inline int irq_has_action(unsigned int irq)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/kdev_t.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kdev_t.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/kdev_t.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kdev_t.h	2015-01-21 12:02:42.049257568 +0300
@@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 de
 	return dev & 0x3ffff;
 }
 
+#define UNNAMED_MAJOR_COUNT	16
+
+#if UNNAMED_MAJOR_COUNT > 1
+
+extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT];
+
+static inline dev_t make_unnamed_dev(int idx)
+{
+	/*
+	 * Transfer bits 8..8+log2(UNNAMED_MAJOR_COUNT) of the unnamed
+	 * device index into the major number.
+	 */
+	return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)],
+		     idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8));
+}
+
+static inline int unnamed_dev_idx(dev_t dev)
+{
+	int i;
+	for (i = 0; i < UNNAMED_MAJOR_COUNT &&
+				MAJOR(dev) != unnamed_dev_majors[i]; i++);
+	return MINOR(dev) | (i << 8);
+}
+
+static inline int is_unnamed_dev(dev_t dev)
+{
+	int i;
+	for (i = 0; i < UNNAMED_MAJOR_COUNT &&
+				MAJOR(dev) != unnamed_dev_majors[i]; i++);
+	return i < UNNAMED_MAJOR_COUNT;
+}
+
+#else /* UNNAMED_MAJOR_COUNT */
+
+static inline dev_t make_unnamed_dev(int idx)
+{
+	return MKDEV(0, idx);
+}
+
+static inline int unnamed_dev_idx(dev_t dev)
+{
+	return MINOR(dev);
+}
+
+static inline int is_unnamed_dev(dev_t dev)
+{
+	return MAJOR(dev) == 0;
+}
+
+#endif /* UNNAMED_MAJOR_COUNT */
+
 #else /* __KERNEL__ */
 
 /*
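
With UNNAMED_MAJOR_COUNT = 16, bits 8..11 of the unnamed-device index select one of sixteen majors while the remaining bits stay in the minor, and make_unnamed_dev()/unnamed_dev_idx() invert each other. A user-space round-trip check of the bit arithmetic (the major table itself is filled at boot and omitted here):

#include <stdio.h>

#define UNNAMED_MAJOR_COUNT 16

int main(void)
{
	int idx   = 0x1234;
	int slot  = (idx >> 8) & (UNNAMED_MAJOR_COUNT - 1);	/* 2 */
	int minor = idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8);	/* 0x1034 */
	int back  = minor | (slot << 8);			/* 0x1234 */

	printf("slot=%d minor=%#x back=%#x\n", slot, minor, back);
	return 0;
}
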
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/kernel.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kernel.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/kernel.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kernel.h	2015-01-21 12:02:43.837210097 +0300
@@ -416,6 +416,12 @@ extern struct ratelimit_state printk_rat
 extern int printk_ratelimit(void);
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
+asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args)
+	__attribute__ ((format (printf, 2, 0)));
+asmlinkage int ve_printk(int, const char * fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+void prepare_printk(void);
+
 
 extern int printk_delay_msec;
 extern int dmesg_restrict;
@@ -445,6 +451,15 @@ static inline int printk_ratelimit(void)
 static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \
 					  unsigned int interval_msec)	\
 		{ return false; }
+static inline int ve_printk(int d, const char *s, ...)
+	__attribute__ ((format (printf, 2, 3)));
+static inline int ve_printk(int d, const char *s, ...)
+{
+	return 0;
+}
+static inline void prepare_printk(void)
+{
+}
 
 /* No effect, but we still get type checking even in the !PRINTK case: */
 #define printk_once(x...) printk(x)
@@ -470,10 +485,18 @@ extern void asmlinkage __attribute__((fo
 	early_printk(const char *fmt, ...);
 
 unsigned long int_sqrt(unsigned long);
+extern int console_silence_loglevel;
+
+#define VE0_LOG		1
+#define VE_LOG		2
+#define VE_LOG_BOTH	(VE0_LOG | VE_LOG)
 
 static inline void console_silent(void)
 {
-	console_loglevel = 0;
+	if (console_loglevel > console_silence_loglevel) {
+		printk(KERN_EMERG "console shuts up ...\n");
+		console_loglevel = 0;
+	}
 }
 
 static inline void console_verbose(void)
@@ -487,6 +510,7 @@ extern void wake_up_klogd(void);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_timeout;
 extern int panic_on_oops;
+extern int decode_call_traces;
 extern int panic_on_unrecovered_nmi;
 extern int panic_on_io_nmi;
 extern const char *print_tainted(void);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/kernel_stat.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kernel_stat.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/kernel_stat.h	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kernel_stat.h	2015-01-21 12:02:54.646923131 +0300
@@ -27,6 +27,56 @@ struct cpu_usage_stat {
 	cputime64_t guest;
 };
 
+enum cpu_usage_stat_enum {
+	USER,
+	NICE,
+	SYSTEM,
+	IDLE,
+	IOWAIT,
+	USED,
+	STEAL,
+	NR_STATS,
+};
+
+struct kernel_cpustat {
+	u64 cpustat[NR_STATS];
+};
+
+static inline u64 kernel_cpustat_total_usage(const struct kernel_cpustat *p)
+{
+	return p->cpustat[USER] + p->cpustat[NICE] + p->cpustat[SYSTEM];
+}
+
+static inline u64 kernel_cpustat_total_idle(const struct kernel_cpustat *p)
+{
+	return p->cpustat[IDLE] + p->cpustat[IOWAIT];
+}
+
+static inline void kernel_cpustat_zero(struct kernel_cpustat *p)
+{
+	memset(p, 0, sizeof(*p));
+}
+
+static inline void kernel_cpustat_add(const struct kernel_cpustat *lhs,
+				      const struct kernel_cpustat *rhs,
+				      struct kernel_cpustat *res)
+{
+	int i;
+
+	for (i = 0; i < NR_STATS; i++)
+		res->cpustat[i] = lhs->cpustat[i] + rhs->cpustat[i];
+}
+
+static inline void kernel_cpustat_sub(const struct kernel_cpustat *lhs,
+				      const struct kernel_cpustat *rhs,
+				      struct kernel_cpustat *res)
+{
+	int i;
+
+	for (i = 0; i < NR_STATS; i++)
+		res->cpustat[i] = lhs->cpustat[i] - rhs->cpustat[i];
+}
+
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
 #ifndef CONFIG_GENERIC_HARDIRQS
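
The kernel_cpustat helpers reduce per-VE CPU accounting to array arithmetic; a typical use is sampling twice and diffing. A sketch:

/* Sketch: busy time accumulated between two samples. */
static u64 cpu_busy_delta(const struct kernel_cpustat *prev,
			  const struct kernel_cpustat *cur)
{
	struct kernel_cpustat d;

	kernel_cpustat_sub(cur, prev, &d);
	return kernel_cpustat_total_usage(&d);	/* USER + NICE + SYSTEM */
}
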
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/kexec.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kexec.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/kexec.h	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kexec.h	2015-01-21 12:02:53.077964777 +0300
@@ -219,7 +219,8 @@ extern size_t vmcoreinfo_size;
 extern size_t vmcoreinfo_max_size;
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
+		unsigned long long *crash_size, unsigned long long *crash_base,
+		int *strict);
 int crash_shrink_memory(unsigned long new_size);
 size_t crash_get_memory_size(void);
 void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
@@ -230,4 +231,11 @@ struct task_struct;
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #endif /* CONFIG_KEXEC */
+
+#ifdef CONFIG_KEXEC_REUSE_CRASH
+void kexec_crash_init(void);
+#else
+static inline void kexec_crash_init(void) { }
+#endif
+
 #endif /* LINUX_KEXEC_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/kmemleak.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kmemleak.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/kmemleak.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kmemleak.h	2015-01-21 12:02:58.438822479 +0300
@@ -32,8 +32,7 @@ extern void kmemleak_padding(const void 
 			     size_t size) __ref;
 extern void kmemleak_not_leak(const void *ptr) __ref;
 extern void kmemleak_ignore(const void *ptr) __ref;
-extern void kmemleak_scan_area(const void *ptr, unsigned long offset,
-			       size_t length, gfp_t gfp) __ref;
+extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
 extern void kmemleak_no_scan(const void *ptr) __ref;
 
 static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
@@ -84,8 +83,7 @@ static inline void kmemleak_not_leak(con
 static inline void kmemleak_ignore(const void *ptr)
 {
 }
-static inline void kmemleak_scan_area(const void *ptr, unsigned long offset,
-				      size_t length, gfp_t gfp)
+static inline void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
 {
 }
 static inline void kmemleak_erase(void **ptr)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/kmod.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kmod.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/kmod.h	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kmod.h	2015-01-21 12:02:45.507165762 +0300
@@ -25,6 +25,7 @@
 #include <linux/compiler.h>
 #include <linux/workqueue.h>
 #include <linux/sysctl.h>
+#include <linux/sched.h>
 
 #define KMOD_PATH_LEN 256
 
@@ -43,8 +44,18 @@ static inline int request_module_nowait(
 #define try_then_request_module(x, mod...) (x)
 #endif
 
+extern int ve0_request_module(const char *name,...) \
+	__attribute__((format(printf, 1, 2)));
+#define ve0_try_then_request_module(x, mod...) \
+	((x) ?: (ve0_request_module(mod), (x)))
 
-struct key;
+#ifdef CONFIG_VE_IPTABLES
+extern bool module_payload_allowed(const char *module);
+#else
+static inline bool module_payload_allowed(const char *module) { return true; }
+#endif
+
+struct cred;
 struct file;
 
 enum umh_wait {
@@ -58,13 +69,12 @@ enum umh_wait {
 struct subprocess_info {
 	struct work_struct work;
 	struct completion *complete;
-	struct cred *cred;
 	char *path;
 	char **argv;
 	char **envp;
 	enum umh_wait wait;
 	int retval;
-	int (*init)(struct subprocess_info *info);
+	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
 	void *data;
 };
@@ -74,36 +84,41 @@ struct subprocess_info *call_usermodehel
 						  char **envp, gfp_t gfp_mask);
 
 /* Set various pieces of state into the subprocess_info structure */
-void call_usermodehelper_setkeys(struct subprocess_info *info,
-				 struct key *session_keyring);
-int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
-				  struct file **filp);
 void call_usermodehelper_setfns(struct subprocess_info *info,
-		    int (*init)(struct subprocess_info *info),
+		    int (*init)(struct subprocess_info *info, struct cred *new),
 		    void (*cleanup)(struct subprocess_info *info),
 		    void *data);
 
 /* Actually execute the sub-process */
-int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait);
+int call_usermodehelper_exec_wq(struct subprocess_info *sub_info,
+				enum umh_wait wait,
+				struct workqueue_struct *khelper_wq);
+int call_usermodehelper_exec(struct subprocess_info *sub_info,
+			     enum umh_wait wait);
 
 /* Free the subprocess_info. This is only needed if you're not going
    to call call_usermodehelper_exec */
 void call_usermodehelper_freeinfo(struct subprocess_info *info);
 
-static inline int
+extern int
+call_usermodehelper_fns_wq(char *path, char **argv, char **envp,
+			enum umh_wait wait,
+			int (*init)(struct subprocess_info *info, struct cred *new),
+			void (*cleanup)(struct subprocess_info *), void *data,
+			struct workqueue_struct *khelper_wq);
+
+extern int
 call_usermodehelper_fns(char *path, char **argv, char **envp,
 			enum umh_wait wait,
-			int (*init)(struct subprocess_info *info),
-			void (*cleanup)(struct subprocess_info *), void *data)
-{
-	struct subprocess_info *info;
-	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+			int (*init)(struct subprocess_info *info, struct cred *new),
+			void (*cleanup)(struct subprocess_info *), void *data);
 
-	info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
-	if (info == NULL)
-		return -ENOMEM;
-	call_usermodehelper_setfns(info, init, cleanup, data);
-	return call_usermodehelper_exec(info, wait);
+static inline int
+call_usermodehelper_wq(char *path, char **argv, char **envp, enum umh_wait wait,
+					struct workqueue_struct *khelper_wq)
+{
+	return call_usermodehelper_fns_wq(path, argv, envp,
+				       wait, NULL, NULL, NULL, khelper_wq);
 }
 
 static inline int
@@ -113,21 +128,6 @@ call_usermodehelper(char *path, char **a
 				       wait, NULL, NULL, NULL);
 }
 
-static inline int
-call_usermodehelper_keys(char *path, char **argv, char **envp,
-			 struct key *session_keyring, enum umh_wait wait)
-{
-	struct subprocess_info *info;
-	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
-
-	info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
-	if (info == NULL)
-		return -ENOMEM;
-
-	call_usermodehelper_setkeys(info, session_keyring);
-	return call_usermodehelper_exec(info, wait);
-}
-
 extern struct ctl_table usermodehelper_table[];
 
 extern void usermodehelper_init(void);
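
The kmod.h rework above moves call_usermodehelper_fns() out of line, drops the keyring-based variant, and hands the helper's fresh credentials to the init callback. A hedged usage sketch (the helper path, argv/envp contents, and function names are illustrative):

#include <linux/kmod.h>

static int helper_init(struct subprocess_info *info, struct cred *new)
{
	/* runs in the helper's context; may adjust "new" before exec */
	return 0;
}

static int run_helper(void)
{
	char *argv[] = { "/sbin/example-helper", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	return call_usermodehelper_fns("/sbin/example-helper", argv, envp,
				       UMH_WAIT_PROC, helper_init, NULL, NULL);
}
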
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/kobject.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kobject.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/kobject.h	2014-12-12 23:29:02.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kobject.h	2015-01-21 12:02:44.809184293 +0300
@@ -34,7 +34,12 @@
 extern char uevent_helper[];
 
 /* counter to tag the uevent, read only except for the kobject core */
+#ifdef CONFIG_VE
+#define ve_uevent_seqnum	(get_exec_env()->_uevent_seqnum)
+#else
+#define ve_uevent_seqnum uevent_seqnum
 extern u64 uevent_seqnum;
+#endif
 
 /*
  * The actions here must match the index to the string array
@@ -51,6 +56,8 @@ enum kobject_action {
 	KOBJ_REMOVE,
 	KOBJ_CHANGE,
 	KOBJ_MOVE,
+	KOBJ_START,
+	KOBJ_STOP,
 	KOBJ_ONLINE,
 	KOBJ_OFFLINE,
 	KOBJ_MAX
@@ -69,6 +76,7 @@ struct kobject {
 	unsigned int state_add_uevent_sent:1;
 	unsigned int state_remove_uevent_sent:1;
 	unsigned int uevent_suppress:1;
+	struct list_head	env_head;
 };
 
 extern int kobject_set_name(struct kobject *kobj, const char *name, ...)
@@ -200,10 +208,17 @@ extern struct kobject *power_kobj;
 /* The global /sys/firmware/ kobject for people to chain off of */
 extern struct kobject *firmware_kobj;
 
+/* Initialize kernel sysfs part for VE */
+extern int ksysfs_init_ve(struct ve_struct *ve, struct kobject **kernel_kobj);
+/* Remove group attributes and put kernel sysfs directory */
+extern void ksysfs_fini_ve(struct ve_struct *ve, struct kobject **kernel_kobj);
+
 #if defined(CONFIG_HOTPLUG)
 int kobject_uevent(struct kobject *kobj, enum kobject_action action);
 int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 			char *envp[]);
+int kobject_uevent_env_one(struct kobject *kobj, enum kobject_action action,
+			char *envp[]);
 
 int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...)
 	__attribute__((format (printf, 2, 3)));
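
With the kobject.h additions above, uevents gain a per-VE sequence number plus KOBJ_START/KOBJ_STOP actions. A short sketch of emitting an environment-decorated event for an already-registered kobject (the env string is illustrative):

#include <linux/kobject.h>

static void announce_change(struct kobject *kobj)
{
	char *envp[] = { "REASON=container-reconfig", NULL };

	kobject_uevent_env(kobj, KOBJ_CHANGE, envp);
}
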
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/kthread.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kthread.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/kthread.h	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/kthread.h	2015-01-21 12:02:45.837157001 +0300
@@ -10,11 +10,19 @@ struct task_struct *kthread_create_on_no
 					   const char namefmt[], ...)
 	__attribute__((format(printf, 4, 5)));
 
-struct task_struct *kthread_create(int (*threadfn)(void *data),
+struct task_struct *kthread_create_ve(struct ve_struct *ve,
+				   int (*threadfn)(void *data),
 				   void *data,
 				   const char namefmt[], ...)
-	__attribute__((format(printf, 3, 4)));
+	__attribute__((format(printf, 4, 5)));
 
+#define kthread_create(threadfn, data, namefmt, ...)			\
+({									\
+	struct task_struct *__k						\
+		= kthread_create_ve(get_ve0(), threadfn, data, namefmt,	\
+				 ## __VA_ARGS__);			\
+	__k;								\
+})
 
 /**
  * kthread_run - create and wake a thread.
@@ -34,12 +42,29 @@ struct task_struct *kthread_create(int (
 	__k;								   \
 })
 
+/* Like kthread_run() but run a thread in VE context */
+#define kthread_run_ve(ve, threadfn, data, namefmt, ...)		   \
+({									   \
+	struct task_struct *__k						   \
+		= kthread_create_ve(ve, threadfn, data, namefmt,	   \
+				    ## __VA_ARGS__);			   \
+	if (!IS_ERR(__k))						   \
+		wake_up_process(__k);					   \
+	__k;								   \
+})
+
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 int kthread_stop(struct task_struct *k);
 int kthread_should_stop(void);
+int kthreadd_create(void);
+void kthreadd_stop(struct ve_struct *ve);
 
 int kthreadd(void *unused);
+#ifdef CONFIG_VE
+#define kthreadd_task get_exec_env()->_kthreadd_task
+#else
 extern struct task_struct *kthreadd_task;
+#endif
 extern int tsk_fork_get_node(struct task_struct *tsk);
 
 /*
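
kthread_create_ve() and the kthread_run_ve() wrapper above let a kernel thread be parented into a container's context rather than VE0. A hedged sketch (thread body and names are illustrative):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int ve_worker_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int start_ve_worker(struct ve_struct *ve)
{
	struct task_struct *t;

	t = kthread_run_ve(ve, ve_worker_fn, NULL, "ve-worker");
	return IS_ERR(t) ? PTR_ERR(t) : 0;
}
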
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/lglock.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/lglock.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/lglock.h	2015-01-21 12:02:42.784238053 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/lglock.h	2015-01-21 12:02:42.784238053 +0300
@@ -0,0 +1,172 @@
+/*
+ * Specialised local-global spinlock. Can only be declared as global variables
+ * to avoid overhead and keep things simple (and we don't want to start using
+ * these inside dynamically allocated structures).
+ *
+ * "local/global locks" (lglocks) can be used to:
+ *
+ * - Provide fast exclusive access to per-CPU data, with exclusive access to
+ *   another CPU's data allowed but possibly subject to contention, and to
+ *   provide very slow exclusive access to all per-CPU data.
+ * - Or to provide very fast and scalable read serialisation, and to provide
+ *   very slow exclusive serialisation of data (not necessarily per-CPU data).
+ *
+ * Brlocks are also implemented as a short-hand notation for the latter use
+ * case.
+ *
+ * Copyright 2009, 2010, Nick Piggin, Novell Inc.
+ */
+#ifndef __LINUX_LGLOCK_H
+#define __LINUX_LGLOCK_H
+
+#include <linux/spinlock.h>
+#include <linux/lockdep.h>
+#include <linux/percpu.h>
+
+/* can make br locks by using local lock for read side, global lock for write */
+#define br_lock_init(name)	name##_lock_init()
+#define br_read_lock(name)	name##_local_lock()
+#define br_read_unlock(name)	name##_local_unlock()
+#define br_write_lock(name)	name##_global_lock_online()
+#define br_write_unlock(name)	name##_global_unlock_online()
+
+#define DECLARE_BRLOCK(name)	DECLARE_LGLOCK(name)
+#define DEFINE_BRLOCK(name)	DEFINE_LGLOCK(name)
+
+
+#define lg_lock_init(name)	name##_lock_init()
+#define lg_local_lock(name)	name##_local_lock()
+#define lg_local_unlock(name)	name##_local_unlock()
+#define lg_local_lock_cpu(name, cpu)	name##_local_lock_cpu(cpu)
+#define lg_local_unlock_cpu(name, cpu)	name##_local_unlock_cpu(cpu)
+#define lg_global_lock(name)	name##_global_lock()
+#define lg_global_unlock(name)	name##_global_unlock()
+#define lg_global_lock_online(name) name##_global_lock_online()
+#define lg_global_unlock_online(name) name##_global_unlock_online()
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define LOCKDEP_INIT_MAP lockdep_init_map
+
+#define DEFINE_LGLOCK_LOCKDEP(name)					\
+ struct lock_class_key name##_lock_key;					\
+ struct lockdep_map name##_lock_dep_map;				\
+ EXPORT_SYMBOL(name##_lock_dep_map)
+
+#else
+#define LOCKDEP_INIT_MAP(a, b, c, d)
+
+#define DEFINE_LGLOCK_LOCKDEP(name)
+#endif
+
+
+#define DECLARE_LGLOCK(name)						\
+ extern void name##_lock_init(void);					\
+ extern void name##_local_lock(void);					\
+ extern void name##_local_unlock(void);					\
+ extern void name##_local_lock_cpu(int cpu);				\
+ extern void name##_local_unlock_cpu(int cpu);				\
+ extern void name##_global_lock(void);					\
+ extern void name##_global_unlock(void);				\
+ extern void name##_global_lock_online(void);				\
+ extern void name##_global_unlock_online(void);				\
+
+#define DEFINE_LGLOCK(name)						\
+									\
+ DEFINE_PER_CPU(raw_spinlock_t, name##_lock);				\
+ DEFINE_LGLOCK_LOCKDEP(name);						\
+									\
+ void name##_lock_init(void) {						\
+	int i;								\
+	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
+	for_each_possible_cpu(i) {					\
+		raw_spinlock_t *lock;					\
+		lock = &per_cpu(name##_lock, i);			\
+		*lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;	\
+	}								\
+ }									\
+ EXPORT_SYMBOL(name##_lock_init);					\
+									\
+ void name##_local_lock(void) {						\
+	raw_spinlock_t *lock;						\
+	preempt_disable();						\
+	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
+	lock = &__get_cpu_var(name##_lock);				\
+	__raw_spin_lock(lock);						\
+ }									\
+ EXPORT_SYMBOL(name##_local_lock);					\
+									\
+ void name##_local_unlock(void) {					\
+	raw_spinlock_t *lock;						\
+	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
+	lock = &__get_cpu_var(name##_lock);				\
+	__raw_spin_unlock(lock);					\
+	preempt_enable();						\
+ }									\
+ EXPORT_SYMBOL(name##_local_unlock);					\
+									\
+ void name##_local_lock_cpu(int cpu) {					\
+	raw_spinlock_t *lock;						\
+	preempt_disable();						\
+	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
+	lock = &per_cpu(name##_lock, cpu);				\
+	__raw_spin_lock(lock);						\
+ }									\
+ EXPORT_SYMBOL(name##_local_lock_cpu);					\
+									\
+ void name##_local_unlock_cpu(int cpu) {				\
+	raw_spinlock_t *lock;						\
+	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
+	lock = &per_cpu(name##_lock, cpu);				\
+	__raw_spin_unlock(lock);					\
+	preempt_enable();						\
+ }									\
+ EXPORT_SYMBOL(name##_local_unlock_cpu);				\
+									\
+ void name##_global_lock_online(void) {					\
+	int i;								\
+	preempt_disable();						\
+	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
+	for_each_online_cpu(i) {					\
+		raw_spinlock_t *lock;					\
+		lock = &per_cpu(name##_lock, i);			\
+		__raw_spin_lock(lock);					\
+	}								\
+ }									\
+ EXPORT_SYMBOL(name##_global_lock_online);				\
+									\
+ void name##_global_unlock_online(void) {				\
+	int i;								\
+	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
+	for_each_online_cpu(i) {					\
+		raw_spinlock_t *lock;					\
+		lock = &per_cpu(name##_lock, i);			\
+		__raw_spin_unlock(lock);				\
+	}								\
+	preempt_enable();						\
+ }									\
+ EXPORT_SYMBOL(name##_global_unlock_online);				\
+									\
+ void name##_global_lock(void) {					\
+	int i;								\
+	preempt_disable();						\
+	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
+	for_each_possible_cpu(i) {					\
+		raw_spinlock_t *lock;					\
+		lock = &per_cpu(name##_lock, i);			\
+		__raw_spin_lock(lock);					\
+	}								\
+ }									\
+ EXPORT_SYMBOL(name##_global_lock);					\
+									\
+ void name##_global_unlock(void) {					\
+	int i;								\
+	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
+	for_each_possible_cpu(i) {					\
+		raw_spinlock_t *lock;					\
+		lock = &per_cpu(name##_lock, i);			\
+		__raw_spin_unlock(lock);				\
+	}								\
+	preempt_enable();						\
+ }									\
+ EXPORT_SYMBOL(name##_global_unlock);
+#endif
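
Usage sketch for the lglock/brlock primitives defined above: readers take only their CPU-local spinlock, while a writer takes every online CPU's lock, keeping the read side cache-local at the cost of a deliberately slow write side ("my_lock" is illustrative):

#include <linux/lglock.h>

DEFINE_BRLOCK(my_lock);

static int __init my_lock_setup(void)
{
	br_lock_init(my_lock);
	return 0;
}

static void reader_path(void)
{
	br_read_lock(my_lock);
	/* fast, per-CPU-serialised read-side section */
	br_read_unlock(my_lock);
}

static void writer_path(void)
{
	br_write_lock(my_lock);
	/* globally exclusive update across all online CPUs */
	br_write_unlock(my_lock);
}
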
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/linux_logo.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/linux_logo.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/linux_logo.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/linux_logo.h	2015-01-21 12:02:58.151830096 +0300
@@ -47,6 +47,7 @@ extern const struct linux_logo logo_supe
 extern const struct linux_logo logo_superh_clut224;
 extern const struct linux_logo logo_m32r_clut224;
 extern const struct linux_logo logo_spe_clut224;
+extern const struct linux_logo logo_psbm_clut224;
 
 extern const struct linux_logo *fb_find_logo(int depth);
 #ifdef CONFIG_FB_LOGO_EXTRA
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/lockd/bind.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/lockd/bind.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/lockd/bind.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/lockd/bind.h	2015-01-21 12:02:51.095017417 +0300
@@ -55,5 +55,14 @@ extern int	nlmclnt_proc(struct nlm_host 
 					struct file_lock *fl);
 extern int	lockd_up(void);
 extern void	lockd_down(void);
+extern void	grace_ender(struct work_struct *work);
+
+extern int	nlmclnt_set_lockowner(struct inode *,
+		struct file_lock *, int);
+
+struct nlm_reserved_pid {
+	int pid;
+	struct hlist_node list;
+};
 
 #endif /* LINUX_LOCKD_BIND_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/lockd/lockd.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/lockd/lockd.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/lockd/lockd.h	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/lockd/lockd.h	2015-01-21 12:02:47.686107915 +0300
@@ -67,6 +67,7 @@ struct nlm_host {
 	struct list_head	h_reclaim;	/* Locks in RECLAIM state */
 	struct nsm_handle	*h_nsmhandle;	/* NSM status handle */
 	char			*h_addrbuf;	/* address eyecatcher */
+	struct ve_struct *	owner_env;	/* VE owning the host */
 };
 
 /*
@@ -193,8 +194,7 @@ extern struct svc_procedure	nlmsvc_proce
 #ifdef CONFIG_LOCKD_V4
 extern struct svc_procedure	nlmsvc_procedures4[];
 #endif
-extern int			nlmsvc_grace_period;
-extern unsigned long		nlmsvc_timeout;
+
 extern int			nsm_use_hostnames;
 extern u32			nsm_local_state;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/lru_cache.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/lru_cache.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/lru_cache.h	2015-01-21 12:02:58.390823752 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/lru_cache.h	2015-01-21 12:02:58.390823752 +0300
@@ -0,0 +1,318 @@
+/*
+   lru_cache.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#ifndef LRU_CACHE_H
+#define LRU_CACHE_H
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/string.h> /* for memset */
+#include <linux/seq_file.h>
+
+/* { compatibility crap */
+
+/* needs to be included here,
+ * because of various old kernel compatibility wrappers */
+#include <linux/drbd_config.h>
+#ifdef USE_KMEM_CACHE_S
+#define kmem_cache kmem_cache_s
+#endif
+
+#ifdef NEED_BACKPORT_OF_KZALLOC
+static inline void *kzalloc(size_t size, int flags)
+{
+	void *rv = kmalloc(size, flags);
+	if (rv)
+		memset(rv, 0, size);
+
+	return rv;
+}
+#undef NEED_BACKPORT_OF_KZALLOC
+#endif
+
+/* } compatibility crap */
+
+
+/*
+This header file (and its .c file; see there for the kernel-doc of the functions)
+  defines a helper framework to easily keep track of index:label associations,
+  and changes to an "active set" of objects, as well as pending transactions,
+  to persistently record those changes.
+
+  We use an LRU policy if it is necessary to "cool down" a region currently in
+  the active set before we can "heat" a previously unused region.
+
+  Because of this latter property, it is called "lru_cache".
+  As it actually Tracks Objects in an Active SeT, we could also call it
+  toast (incidentally that is what may happen to the data on the
+  backend storage upon next resync, if we don't get it right).
+
+What for?
+
+We replicate IO (more or less synchronously) to local and remote disk.
+
+For crash recovery after replication node failure,
+  we need to resync all regions that have been the target of in-flight WRITE IO
+  (in use, or "hot", regions), as we don't know whether or not those WRITEs have
+  made it to stable storage.
+
+  To avoid a "full resync", we need to persistently track these regions.
+
+  This is known as "write intent log", and can be implemented as on-disk
+  (coarse or fine grained) bitmap, or other meta data.
+
+  To avoid the overhead of frequent extra writes to this meta data area,
+  usually the condition is softened to regions that _may_ have been the target of
+  in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
+  bitmap, trading frequency of meta data transactions against amount of
+  (possibly unnecessary) resync traffic.
+
+  If we set a hard limit on the area that may be "hot" at any given time, we
+  limit the amount of resync traffic needed for crash recovery.
+
+For recovery after replication link failure,
+  we need to resync all blocks that have been changed on the other replica
+  in the meantime, or, if both replicas have been changed independently [*],
+  all blocks that have been changed on either replica in the meantime.
+  [*] usually as a result of a cluster split-brain and insufficient protection,
+      but there are valid use cases to do this on purpose.
+
+  Tracking those blocks can be implemented as "dirty bitmap".
+  Having it fine-grained reduces the amount of resync traffic.
+  It should also be persistent, to allow for reboots (or crashes)
+  while the replication link is down.
+
+There are various possible implementations for persistently storing
+write intent log information, three of which are mentioned here.
+
+"Chunk dirtying"
+  The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
+  To reduce the frequency of bitmap updates for write-intent log purposes,
+  one could dirty "chunks" (of some size) of the (fine-grained) on-disk
+  bitmap at a time, while keeping the in-memory "dirty" bitmap as clean as
+  possible, flushing it to disk again when a previously "hot" (and on-disk
+  dirtied as full chunk) area "cools down" again (no IO in flight anymore,
+  and none expected in the near future either).
+
+"Explicit (coarse) write intent bitmap"
+  Another implementation could choose a (probably coarse) explicit bitmap,
+  for write-intent log purposes, additionally to the fine grained dirty bitmap.
+
+"Activity log"
+  Yet another implementation may keep track of the hot regions, by starting
+  with an empty set, and writing down a journal of region numbers that have
+  become "hot", or have "cooled down" again.
+
+  To be able to use a ring buffer for this journal of changes to the active
+  set, we not only record the actual changes to that set, but also record the
+  not changing members of the set in a round robin fashion. To do so, we use a
+  fixed (but configurable) number of slots which we can identify by index, and
+  associate region numbers (labels) with these indices.
+  For each transaction recording a change to the active set, we record the
+  change itself (index: -old_label, +new_label), and which index is associated
+  with which label (index: current_label) within a certain sliding window that
+  is moved further over the available indices with each such transaction.
+
+  Thus, for crash recovery, if the ring buffer is sufficiently large, we can
+  accurately reconstruct the active set.
+
+  "Sufficiently large" depends only on the maximum number of active objects, and the
+  size of the sliding window recording "index: current_label" associations within
+  each transaction.
+
+  This is what we call the "activity log".
+
+  Currently we need one activity log transaction per single label change, which
+  does not give much benefit over the "dirty chunks of bitmap" approach, other
+  than potentially fewer seeks.
+
+  We plan to change the transaction format to support multiple changes per
+  transaction, which then would reduce several (disjoint, "random") updates to
+  the bitmap into one transaction to the activity log ring buffer.
+*/
+
+/* this defines an element in a tracked set
+ * .colision is for hash table lookup.
+ * When we process a new IO request, we know its sector, thus can deduce the
+ * region number (label) easily.  To do the label -> object lookup without a
+ * full list walk, we use a simple hash table.
+ *
+ * .list is on one of three lists:
+ *  in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
+ *     lru: unused but ready to be reused or recycled
+ *          (lc_refcnt == 0, lc_number != LC_FREE),
+ *    free: unused but ready to be recycled
+ *          (lc_refcnt == 0, lc_number == LC_FREE),
+ *
+ * an element is said to be "in the active set",
+ * if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
+ *
+ * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
+ * (total memory usage 2 pages), and up to 3833 elements on the act_log
+ * lru_cache, totalling ~215 kB for 64bit architecture, ~53 pages.
+ *
+ * We usually do not actually free these objects again, but only "recycle"
+ * them, as the change "index: -old_label, +LC_FREE" would need a transaction
+ * as well.  Which also means that using a kmem_cache to allocate the objects
+ * from wastes some resources.
+ * But it avoids high order page allocations in kmalloc.
+ */
+struct lc_element {
+	struct hlist_node colision;
+	struct list_head list;		 /* LRU list or free list */
+	unsigned refcnt;
+	/* back "pointer" into lc_cache->element[index],
+	 * for paranoia, and for "lc_element_to_index" */
+	unsigned lc_index;
+	/* if we want to track a larger set of objects,
+	 * it needs to become an arch-independent u64 */
+	unsigned lc_number;
+
+	/* special label when on free list */
+#define LC_FREE (~0U)
+};
+
+struct lru_cache {
+	/* the least recently used item is kept at lru->prev */
+	struct list_head lru;
+	struct list_head free;
+	struct list_head in_use;
+
+	/* the pre-created kmem cache to allocate the objects from */
+	struct kmem_cache *lc_cache;
+
+	/* size of tracked objects, used to memset(,0,) them in lc_reset */
+	size_t element_size;
+	/* offset of struct lc_element member in the tracked object */
+	size_t element_off;
+
+	/* number of elements (indices) */
+	unsigned int  nr_elements;
+	/* Arbitrary limit on maximum tracked objects. Practical limit is much
+	 * lower due to allocation failures, probably. For typical use cases,
+	 * nr_elements should be a few thousand at most.
+	 * This also limits the maximum value of lc_element.lc_index, allowing the
+	 * 8 high bits of .lc_index to be overloaded with flags in the future. */
+#define LC_MAX_ACTIVE	(1<<24)
+
+	/* statistics */
+	unsigned used; /* number of elements currently on in_use list */
+	unsigned long hits, misses, starving, dirty, changed;
+
+	/* see below: flag-bits for lru_cache */
+	unsigned long flags;
+
+	/* when changing the label of an index element */
+	unsigned int  new_number;
+
+	/* for paranoia when changing the label of an index element */
+	struct lc_element *changing_element;
+
+	void  *lc_private;
+	const char *name;
+
+	/* nr_elements there */
+	struct hlist_head *lc_slot;
+	struct lc_element **lc_element;
+};
+
+
+/* flag-bits for lru_cache */
+enum {
+	/* debugging aid, to catch concurrent access early.
+	 * user needs to guarantee exclusive access by proper locking! */
+	__LC_PARANOIA,
+	/* if we need to change the set, but currently there is a changing
+	 * transaction pending, we are "dirty", and must defer further
+	 * changing requests */
+	__LC_DIRTY,
+	/* if we need to change the set, but currently there is no free nor
+	 * unused element available, we are "starving", and must not give out
+	 * further references, to guarantee that eventually some refcnt will
+	 * drop to zero and we will be able to make progress again, changing
+	 * the set, writing the transaction.
+	 * if the statistics say we are frequently starving,
+	 * nr_elements is too small. */
+	__LC_STARVING,
+};
+#define LC_PARANOIA (1<<__LC_PARANOIA)
+#define LC_DIRTY    (1<<__LC_DIRTY)
+#define LC_STARVING (1<<__LC_STARVING)
+
+extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
+		unsigned e_count, size_t e_size, size_t e_off);
+extern void lc_reset(struct lru_cache *lc);
+extern void lc_destroy(struct lru_cache *lc);
+extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
+extern void lc_del(struct lru_cache *lc, struct lc_element *element);
+
+extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
+extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
+extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
+extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
+extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
+
+struct seq_file;
+extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
+
+extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
+				void (*detail) (struct seq_file *, struct lc_element *));
+
+/**
+ * lc_try_lock - can be used to stop lc_get() from changing the tracked set
+ * @lc: the lru cache to operate on
+ *
+ * Note that the reference counts and order on the active and lru lists may
+ * still change.  Returns true if we acquired the lock.
+ */
+static inline int lc_try_lock(struct lru_cache *lc)
+{
+	return !test_and_set_bit(__LC_DIRTY, &lc->flags);
+}
+
+/**
+ * lc_unlock - unlock @lc, allow lc_get() to change the set again
+ * @lc: the lru cache to operate on
+ */
+static inline void lc_unlock(struct lru_cache *lc)
+{
+	clear_bit(__LC_DIRTY, &lc->flags);
+	smp_mb__after_clear_bit();
+}
+
+static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
+{
+	struct lc_element *e = lc_find(lc, enr);
+	return e && e->refcnt;
+}
+
+#define lc_entry(ptr, type, member) \
+	container_of(ptr, type, member)
+
+extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
+extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
+
+#endif
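
A hedged usage sketch of the lru_cache API above, following the activity-log pattern the header comment describes; the cache requires external locking by the caller, and all names below are illustrative:

#include <linux/lru_cache.h>
#include <linux/slab.h>
#include <linux/stddef.h>

struct my_extent {
	struct lc_element lce;	/* must be embedded in the tracked object */
	int private_state;
};

static struct kmem_cache *ext_cache;
static struct lru_cache *act_set;

static int __init act_set_init(void)
{
	ext_cache = kmem_cache_create("my_extents",
				      sizeof(struct my_extent), 0, 0, NULL);
	if (!ext_cache)
		return -ENOMEM;
	act_set = lc_create("my_act_set", ext_cache, 61,
			    sizeof(struct my_extent),
			    offsetof(struct my_extent, lce));
	return act_set ? 0 : -ENOMEM;
}

static void touch_region(unsigned int enr)
{
	struct lc_element *e = lc_get(act_set, enr);

	if (!e)
		return;		/* starving, or a change is already pending */
	if (e->lc_number != enr)
		/* label change: persist the transaction, then commit it */
		lc_changed(act_set, e);
	/* ... region "enr" is now in the active set ... */
	lc_put(act_set, e);
}
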
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/major.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/major.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/major.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/major.h	2015-01-21 12:02:42.049257568 +0300
@@ -174,4 +174,7 @@
 #define BLOCK_EXT_MAJOR		259
 #define SCSI_OSD_MAJOR		260	/* open-osd's OSD scsi device */
 
+#define UNNAMED_EXTRA_MAJOR		130
+#define UNNAMED_EXTRA_MAJOR_COUNT	120
+
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/memcontrol.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/memcontrol.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/memcontrol.h	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/memcontrol.h	2015-01-21 12:02:58.673816242 +0300
@@ -200,14 +200,16 @@ static inline void mem_cgroup_uncharge_c
 static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 						    struct mem_cgroup *memcg)
 {
-	return &zone->lruvec;
+	return NULL;
+//	return &zone->lruvec;
 }
 
 static inline struct lruvec *mem_cgroup_lru_add_list(struct zone *zone,
 						     struct page *page,
 						     enum lru_list lru)
 {
-	return &zone->lruvec;
+	return NULL;
+//	return &zone->lruvec;
 }
 
 static inline void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
@@ -223,7 +225,8 @@ static inline struct lruvec *mem_cgroup_
 						       enum lru_list from,
 						       enum lru_list to)
 {
-	return &zone->lruvec;
+	return NULL;
+	//return &zone->lruvec;
 }
 
 static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -341,6 +344,7 @@ unsigned long mem_cgroup_soft_limit_recl
 	return 0;
 }
 
+static inline
 void mem_cgroup_split_hugepage_commit(struct page *page, struct page *head)
 {
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/mm.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mm.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/mm.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mm.h	2015-01-21 12:02:58.674816215 +0300
@@ -63,7 +63,9 @@ extern int overcommit_kbytes_handler(str
  * mmap() functions).
  */
 
-extern struct kmem_cache *vm_area_cachep;
+extern struct kmem_cache *__vm_area_cachep;
+#define allocate_vma(mm, gfp_flags)	ub_kmem_alloc((mm)->mm_ub, __vm_area_cachep, gfp_flags)
+#define free_vma(mm, vma)		ub_kmem_free((mm)->mm_ub, __vm_area_cachep, vma)
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
@@ -180,12 +182,12 @@ extern pgprot_t protection_map[16];
  */
 static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
 {
-	return (vma->vm_flags & VM_PFN_AT_MMAP);
+	return !!(vma->vm_flags & VM_PFN_AT_MMAP);
 }
 
 static inline int is_pfn_mapping(struct vm_area_struct *vma)
 {
-	return (vma->vm_flags & VM_PFNMAP);
+	return !!(vma->vm_flags & VM_PFNMAP);
 }
 
 /*
@@ -642,6 +644,37 @@ static inline struct zone *page_zone(str
 	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
 }
 
+#ifndef CONFIG_MEMORY_GANGS
+
+static inline struct gang *zone_init_gang(struct zone *zone)
+{
+       return &zone->init_gang;
+}
+
+static inline struct zone *gang_zone(struct gang *gang)
+{
+	return container_of(gang, struct zone, init_gang);
+}
+
+#else /* CONFIG_MEMORY_GANGS */
+
+static inline struct gang *zone_init_gang(struct zone *zone)
+{
+	return &zone->zone_pgdat->init_gangs[zone_idx(zone)];
+}
+
+static inline struct gang *zone_junk_gang(struct zone *zone)
+{
+	return &zone->zone_pgdat->junk_gangs[zone_idx(zone)];
+}
+
+static inline struct zone *gang_zone(struct gang *gang)
+{
+	return gang->lruvec.zone;
+}
+
+#endif /* CONFIG_MEMORY_GANGS */
+
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 static inline unsigned long page_to_section(struct page *page)
 {
@@ -847,6 +880,7 @@ extern void show_free_areas(void);
 extern void __show_free_areas(unsigned int flags);
 
 int shmem_lock(struct file *file, int lock, struct user_struct *user);
+#define shmem_nopage filemap_nopage
 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags);
 int shmem_zero_setup(struct vm_area_struct *);
 
@@ -859,6 +893,8 @@ extern unsigned long shmem_get_unmapped_
 #endif
 
 extern int can_do_mlock(void);
+extern int __mlock(unsigned long, size_t, bool);
+extern int __munlock(unsigned long, size_t, bool);
 extern int user_shm_lock(size_t, struct user_struct *);
 extern void user_shm_unlock(size_t, struct user_struct *);
 
@@ -874,6 +910,9 @@ struct zap_details {
 	unsigned long truncate_count;		/* Compare vm_truncate_count */
 };
 
+void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+		pte_t pte, struct page *page);
+
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		pte_t pte);
 
@@ -917,9 +956,14 @@ int walk_page_range(unsigned long addr, 
 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
-			struct vm_area_struct *vma);
+		struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
+int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma,
+		      unsigned long addr, size_t size);
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows);
+void zap_mapping_range(struct address_space *mapping,
+		struct zap_details *details);
+void synchronize_mapping_faults(struct address_space *mapping);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 	unsigned long *pfn);
 int follow_phys(struct vm_area_struct *vma, unsigned long address,
@@ -957,6 +1001,10 @@ static inline int handle_mm_fault(struct
 }
 #endif
 
+extern int install_anon_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			     unsigned long addr, struct page *page);
+
+extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma);
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
@@ -981,6 +1029,7 @@ int redirty_page_for_writepage(struct wr
 				struct page *page);
 void account_page_dirtied(struct page *page, struct address_space *mapping);
 int set_page_dirty(struct page *page);
+int set_page_dirty_mm(struct page *page, struct mm_struct *mm);
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
@@ -1027,7 +1076,7 @@ struct shrinker {
 
 	/* These are for internal use */
 	struct list_head list;
-	long nr;	/* objs pending delete */
+	atomic_long_t nr_in_batch; /* objs pending delete */
 };
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 extern void register_shrinker(struct shrinker *);
@@ -1212,7 +1261,6 @@ extern void set_dma_reserve(unsigned lon
 extern void memmap_init_zone(unsigned long, int, unsigned long,
 				unsigned long, enum memmap_context);
 extern void setup_per_zone_wmarks(void);
-extern void calculate_zone_inactive_ratio(struct zone *zone);
 extern void mem_init(void);
 extern void __init mmap_init(void);
 extern void show_mem(unsigned int flags);
@@ -1264,6 +1312,7 @@ extern int insert_vm_struct(struct mm_st
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
 extern void unlink_file_vma(struct vm_area_struct *);
+extern void __vma_link_file(struct vm_area_struct *vma);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff);
 extern void exit_mmap(struct mm_struct *);
@@ -1301,7 +1350,7 @@ extern unsigned long do_mmap_pgoff(struc
 	unsigned long flag, unsigned long pgoff);
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long flags,
-	unsigned int vm_flags, unsigned long pgoff);
+	vm_flags_t vm_flags, unsigned long pgoff);
 
 static inline unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
@@ -1328,6 +1377,8 @@ extern void truncate_inode_pages_range(s
 /* generic vm_area_ops exported for stackable file systems */
 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
 extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+struct page *pick_peer_page(struct inode *inode, struct file_ra_state *ra,
+			    pgoff_t index, unsigned ra_size);
 
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
@@ -1389,6 +1440,7 @@ static inline unsigned long vma_pages(st
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+extern const struct vm_operations_struct special_mapping_vmops;
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
@@ -1467,7 +1519,12 @@ unsigned long shrink_slab(unsigned long 
 #ifndef CONFIG_MMU
 #define randomize_va_space 0
 #else
-extern int randomize_va_space;
+extern int _randomize_va_space;
+#ifndef CONFIG_VE
+#define randomize_va_space _randomize_va_space
+#else
+#define randomize_va_space (get_exec_env()->_randomize_va_space)
+#endif
 #endif
 
 const char * arch_vma_name(struct vm_area_struct *vma);
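
The mm.h changes above route VMA allocation through beancounter-charged allocate_vma()/free_vma() macros instead of a bare vm_area_cachep. A minimal pairing sketch ("mm" is assumed valid; initialisation elided):

#include <linux/mm.h>

static int vma_alloc_sketch(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	vma = allocate_vma(mm, GFP_KERNEL);	/* charged to mm's beancounter */
	if (!vma)
		return -ENOMEM;
	/* ... initialise vma and link it into mm ... */
	free_vma(mm, vma);			/* uncharges on release */
	return 0;
}
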
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/mm_inline.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mm_inline.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/mm_inline.h	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mm_inline.h	2015-01-21 12:02:58.674816215 +0300
@@ -2,6 +2,7 @@
 #define LINUX_MM_INLINE_H
 
 #include <linux/huge_mm.h>
+#include <linux/rcupdate.h>
 
 /**
  * page_is_file_cache - should the page be on a file LRU or anon LRU?
@@ -21,22 +22,115 @@ static inline int page_is_file_cache(str
 	return !PageSwapBacked(page);
 }
 
+static struct zone *lruvec_zone(struct lruvec *lruvec)
+{
+	return lruvec->zone;
+}
+
+static inline struct lruvec *page_lruvec(struct page *page)
+{
+	return rcu_dereference(page->lruvec);
+}
+
+static inline struct lruvec *__page_lruvec(struct page *page)
+{
+	return rcu_access_pointer(page->lruvec);
+}
+
+static inline void set_page_lruvec(struct page *page, struct lruvec *lruvec)
+{
+	rcu_assign_pointer(page->lruvec, lruvec);
+}
+
+static inline struct lruvec *
+relock_lruvec(struct lruvec *locked, struct lruvec *lruvec)
+{
+	if (unlikely(locked != lruvec)) {
+		if (locked)
+			spin_unlock(&locked->lru_lock);
+		spin_lock(&lruvec->lru_lock);
+	}
+	return lruvec;
+}
+
 static inline void
-add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+unlock_lruvec(struct lruvec *lruvec)
+{
+	if (lruvec)
+		spin_unlock(&lruvec->lru_lock);
+}
+
+static inline struct lruvec *lock_page_lru(struct page *page)
 {
 	struct lruvec *lruvec;
 
-	lruvec = mem_cgroup_lru_add_list(zone, page, l);
-	list_add(&page->lru, &lruvec->lists[l]);
-	__mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
+	rcu_read_lock();
+	while (1) {
+		lruvec = page_lruvec(page);
+		spin_lock(&lruvec->lru_lock);
+		if (likely(__page_lruvec(page) == lruvec))
+			break;
+		spin_unlock(&lruvec->lru_lock);
+	}
+	rcu_read_unlock();
+
+	return lruvec;
+}
+
+static inline struct lruvec *
+relock_page_lru(struct lruvec *locked, struct page *page)
+{
+	struct lruvec *lruvec = __page_lruvec(page);
+
+	if (unlikely(locked != lruvec)) {
+		if (locked)
+			spin_unlock(&locked->lru_lock);
+		lruvec = lock_page_lru(page);
+	}
+	return lruvec;
+}
+
+static inline bool
+try_relock_page_lru(struct lruvec **locked, struct page *page)
+{
+	struct lruvec *lruvec;
+
+	while (PageLRU(page)) {
+		rcu_read_lock();
+		lruvec = page_lruvec(page);
+		if (lruvec) {
+			*locked = relock_lruvec(*locked, lruvec);
+			if (__page_lruvec(page) == lruvec) {
+				rcu_read_unlock();
+				return PageLRU(page);
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	return false;
+}
+
+static inline void
+add_page_to_lru_list(struct lruvec *lruvec, struct page *page, enum lru_list l)
+{
+	struct zone *zone = lruvec_zone(lruvec);
+	int numpages = hpage_nr_pages(page);
+
+	list_add(&page->lru, &lruvec->lru_list[l]);
+	lruvec->nr_pages[l] += numpages;
+	__mod_zone_page_state(zone, NR_LRU_BASE + l, numpages);
 }
 
 static inline void
-del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+del_page_from_lru_list(struct lruvec *lruvec, struct page *page, enum lru_list l)
 {
-	mem_cgroup_lru_del_list(page, l);
+	struct zone *zone = lruvec_zone(lruvec);
+	int numpages = hpage_nr_pages(page);
+
 	list_del(&page->lru);
-	__mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
+	lruvec->nr_pages[l] -= numpages;
+	__mod_zone_page_state(zone, NR_LRU_BASE + l, -numpages);
 }
 
 /**
@@ -55,7 +149,7 @@ static inline enum lru_list page_lru_bas
 }
 
 static inline void
-del_page_from_lru(struct zone *zone, struct page *page)
+del_page_from_lru(struct lruvec *lruvec, struct page *page)
 {
 	enum lru_list l;
 
@@ -69,9 +163,7 @@ del_page_from_lru(struct zone *zone, str
 			l += LRU_ACTIVE;
 		}
 	}
-	mem_cgroup_lru_del_list(page, l);
-	list_del(&page->lru);
-	__mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
+	del_page_from_lru_list(lruvec, page, l);
 }
 
 /**
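
The helpers above replace the per-zone lru_lock with per-lruvec locks; the idiom when walking pages from mixed lruvecs is to hold at most one lru_lock and relock only when the lruvec changes. A hedged walker sketch ("pages" is an illustrative list; interrupt disabling is elided):

#include <linux/mm_inline.h>
#include <linux/list.h>

static void walk_pages_sketch(struct list_head *pages)
{
	struct lruvec *locked = NULL;
	struct page *page;

	list_for_each_entry(page, pages, lru) {
		/* drops the old lock and takes the new one only when this
		 * page lives on a different lruvec */
		locked = relock_page_lru(locked, page);
		/* this page's lruvec lru_lock is held here */
	}
	unlock_lruvec(locked);
}
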
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/mm_types.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mm_types.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/mm_types.h	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mm_types.h	2015-01-21 12:02:58.674816215 +0300
@@ -21,6 +21,7 @@
 #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
 
 struct address_space;
+struct lruvec;
 
 #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
 
@@ -52,28 +53,27 @@ struct page {
 		};
 	};
 	union {
-	    struct {
 		unsigned long private;		/* Mapping-private opaque data:
-					 	 * usually used for buffer_heads
+						 * usually used for buffer_heads
 						 * if PagePrivate set; used for
 						 * swp_entry_t if PageSwapCache;
 						 * indicates order in the buddy
 						 * system if PG_buddy is set.
 						 */
-		struct address_space *mapping;	/* If low bit clear, points to
-						 * inode address_space, or NULL.
-						 * If page mapped as anonymous
-						 * memory, low bit is set, and
-						 * it points to anon_vma object:
-						 * see PAGE_MAPPING_ANON below.
-						 */
-	    };
+		atomic_t vswap_count;		/* if PageVSwap() set */
 #if USE_SPLIT_PTLOCKS
-	    spinlock_t ptl;
+		spinlock_t ptl;
 #endif
-	    struct kmem_cache *slab;	/* SLUB: Pointer to slab */
-	    struct page *first_page;	/* Compound tail pages */
+		struct kmem_cache *slab;	/* SLUB: Pointer to slab */
+		struct page *first_page;	/* Compound tail pages */
 	};
+	struct address_space *mapping;	/* If low bit clear, points to
+					 * inode address_space, or NULL.
+					 * If page mapped as anonymous
+					 * memory, low bit is set, and
+					 * it points to anon_vma object:
+					 * see PAGE_MAPPING_ANON below.
+					 */
 	union {
 		pgoff_t index;		/* Our offset within mapping. */
 		void *freelist;		/* SLUB: freelist req. slab lock */
@@ -106,8 +106,17 @@ struct page {
 	 */
 	void *shadow;
 #endif
+	union {
+		struct lruvec *lruvec;
+#ifdef CONFIG_BEANCOUNTERS
+		struct user_beancounter *kmem_ub;
+		struct user_beancounter **slub_ubs;
+#endif
+	};
 };
 
+typedef unsigned long __nocast vm_flags_t;
+
 /*
  * A region containing a mapping of a non-memory backed file under NOMMU
  * conditions.  These are held in a global tree and are pinned by the VMAs that
@@ -115,7 +124,7 @@ struct page {
  */
 struct vm_region {
 	struct rb_node	vm_rb;		/* link in global region tree */
-	unsigned long	vm_flags;	/* VMA vm_flags */
+	vm_flags_t	vm_flags;	/* VMA vm_flags */
 	unsigned long	vm_start;	/* start address of region */
 	unsigned long	vm_end;		/* region initialised to here */
 	unsigned long	vm_top;		/* region allocated to here */
@@ -245,11 +254,13 @@ struct mm_struct {
 	mm_counter_t _anon_rss;
 	mm_counter_t _swap_usage;
 
+	long page_table_precharge;	/* protected by mmap_sem and page_table_lock */
+
 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
 
 	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
-	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
+	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes, nr_ptds;
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
@@ -276,6 +287,13 @@ struct mm_struct {
 
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
+	unsigned int vps_dumpable:2;
+	unsigned int global_oom:1;
+	unsigned int ub_oom:1;
+
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *mm_ub;
+#endif
 	struct core_state *core_state; /* coredumping support */
 #ifdef CONFIG_AIO
 	spinlock_t		ioctx_lock;
@@ -310,17 +328,22 @@ struct mm_struct {
 #ifdef __GENKSYMS__
 	unsigned long rh_reserved[2];
 #else
-	/* How many tasks sharing this mm are OOM_DISABLE */
-	union {
-		unsigned long rh_reserved_aux;
-		atomic_t oom_disable_count;
-	};
-
 	/* base of lib map area (ASCII armour) */
 	unsigned long shlib_base;
 #endif
 };
 
+/* tasks entered to VE from host, no ptrace,
+ * or coredump or licdata access allowed
+ */
+#define VD_VE_ENTER_TASK	0
+/* tasks with ptrace and coredump allowed */
+#define VD_PTRACE_COREDUMP	1
+/* tasks accessed containers license data,
+ *  no ptrace and no coredump allowed
+ */
+#define VD_LICDATA_ACCESS	2
+
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 #define mm_cpumask(mm) (&(mm)->cpu_vm_mask)
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/mman.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mman.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/mman.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mman.h	2015-01-21 12:02:41.645268295 +0300
@@ -88,6 +88,9 @@ static inline unsigned long
 calc_vm_flag_bits(unsigned long flags)
 {
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
+#ifdef MAP_GROWSUP
+	       _calc_vm_trans(flags, MAP_GROWSUP,    VM_GROWSUP ) |
+#endif
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/mmgang.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mmgang.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/mmgang.h	2015-01-21 12:02:43.391221939 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mmgang.h	2015-01-21 12:02:58.939809182 +0300
@@ -0,0 +1,361 @@
+#ifndef _LINIX_MMGANG_H
+#define _LINIX_MMGANG_H
+
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/sched.h>
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+
+void setup_zone_gang(struct gang_set *gs, struct zone *zone, struct gang *gang);
+
+#ifndef CONFIG_BC_RSS_ACCOUNTING
+
+extern struct gang_set init_gang_set;
+
+static inline struct gang_set *get_mapping_gang(struct address_space *mapping)
+{
+	return &init_gang_set;
+}
+
+static inline struct gang_set *get_mm_gang(struct mm_struct *mm)
+{
+	return &init_gang_set;
+}
+
+static inline struct gang_set *get_ub_gs(struct user_beancounter *ub)
+{
+	return &init_gang_set;
+}
+
+static inline struct user_beancounter *get_gangs_ub(struct gang_set *gs)
+{
+	return get_ub0();
+}
+
+static inline struct user_beancounter *get_gang_ub(struct gang *gang)
+{
+	return get_ub0();
+}
+
+#else /* CONFIG_BC_RSS_ACCOUNTING */
+
+#define init_gang_set	(ub0.gang_set)
+
+static inline struct gang_set *get_mapping_gang(struct address_space *mapping)
+{
+	return &get_exec_ub()->gang_set;
+}
+
+static inline struct gang_set *get_mm_gang(struct mm_struct *mm)
+{
+	return &mm_ub(mm)->gang_set;
+}
+
+static inline struct gang_set *get_ub_gs(struct user_beancounter *ub)
+{
+	return &ub->gang_set;
+}
+
+static inline struct user_beancounter *get_gangs_ub(struct gang_set *gs)
+{
+	return container_of(gs, struct user_beancounter, gang_set);
+}
+
+static inline struct user_beancounter *get_gang_ub(struct gang *gang)
+{
+	return get_gangs_ub(gang->set);
+}
+
+#endif /* CONFIG_BC_RSS_ACCOUNTING */
+
+static inline struct gang *lruvec_gang(struct lruvec *lruvec)
+{
+	return container_of(lruvec, struct gang, lruvec);
+}
+
+#ifdef CONFIG_MEMORY_GANGS
+
+static inline struct gang *page_gang(struct page *page)
+{
+	return container_of(rcu_dereference(page->lruvec), struct gang, lruvec);
+}
+
+static inline void set_page_gang(struct page *page, struct gang *gang)
+{
+	set_page_lruvec(page, &gang->lruvec);
+}
+
+static inline struct gang *mem_zone_gang(struct gang_set *gs, struct zone *zone)
+{
+	return &gs->gangs[zone_to_nid(zone)][zone_idx(zone)];
+}
+
+static inline struct gang *mem_page_gang(struct gang_set *gs, struct page *page)
+{
+	return &gs->gangs[page_to_nid(page)][page_zonenum(page)];
+}
+
+static inline bool gang_in_shadow(struct gang *gang)
+{
+	return test_bit(GANG_IN_SHADOW, &gang->flags);
+}
+
+static inline bool gang_of_junk(struct gang *gang)
+{
+	return test_bit(GANG_OF_JUNK, &gang->flags);
+}
+
+static inline struct gang *gang_to_shadow_gang(struct gang *gang)
+{
+	return gang->shadow;
+}
+
+static inline bool page_in_gang(struct page *page, struct gang_set *gs)
+{
+	struct gang *gang;
+	bool ret;
+
+	rcu_read_lock();
+	gang = page_gang(page);
+	ret = (gang->set == gs) && !gang_in_shadow(gang);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+void add_zone_gang(struct zone *zone, struct gang *gang);
+void set_gang_priority(struct gang *gang, int priority);
+void update_vmscan_priority(struct gang *gang);
+void set_gang_limits(struct gang_set *gs, unsigned long *limit, nodemask_t *nodemask);
+static inline int get_zone_nr_gangs(struct zone *zone) { return zone->nr_gangs; }
+int alloc_mem_gangs(struct gang_set *gs);
+void free_mem_gangs(struct gang_set *gs);
+void add_mem_gangs(struct gang_set *gs);
+void del_mem_gangs(struct gang_set *gs);
+void junk_mem_gangs(struct gang_set *gs);
+#define for_each_gang(gang, zone)			\
+	list_for_each_entry_rcu(gang, &zone->gangs, list)
+static inline int pin_mem_gang(struct gang *gang)
+{
+	struct user_beancounter *ub = get_gang_ub(gang);
+	if (!get_beancounter_rcu(ub))
+		return -EBUSY;
+	ub_percpu_inc(ub, pincount);
+	return 0;
+}
+static inline void unpin_mem_gang(struct gang *gang)
+{
+	struct user_beancounter *ub = get_gang_ub(gang);
+	ub_percpu_dec(ub, pincount);
+	put_beancounter(ub);
+}
+
+static inline void gang_add_free_page(struct page *page)
+{
+	set_page_gang(page, NULL);
+}
+static inline int gang_add_user_page(struct page *page,
+		struct gang_set *gs, gfp_t gfp_mask)
+{
+	VM_BUG_ON(page->lruvec);
+	if (ub_phys_charge(get_gangs_ub(gs), hpage_nr_pages(page), gfp_mask))
+		return -ENOMEM;
+	set_page_gang(page, mem_page_gang(gs, page));
+	return 0;
+}
+static inline int gang_mod_user_page(struct page *page,
+		struct gang_set *gs, gfp_t gfp_mask)
+{
+	int numpages = hpage_nr_pages(page);
+	struct gang *gang = page_gang(page);
+	struct user_beancounter *ub = get_gang_ub(gang);
+
+	if (ub_phys_charge(get_gangs_ub(gs), numpages,
+				gfp_mask|__GFP_NORETRY))
+		return -ENOMEM;
+	if (!gang_in_shadow(gang)) {
+		ub_phys_uncharge(ub, numpages);
+	} else {
+		uncharge_beancounter_fast(ub, UB_SHADOWPAGES, numpages);
+		if (PageSwapBacked(page))
+			uncharge_beancounter_fast(ub, UB_SWAPPAGES, numpages);
+	}
+
+	VM_BUG_ON(PageLRU(page));
+	spin_lock_irq(&gang->lruvec.lru_lock);
+	set_page_gang(page, mem_page_gang(gs, page));
+	spin_unlock_irq(&gang->lruvec.lru_lock);
+	return 0;
+}
+static inline int gang_mod_shadow_page(struct page *page)
+{
+	int numpages = hpage_nr_pages(page);
+	struct gang *gang = page_gang(page);
+	struct user_beancounter *ub = get_gang_ub(gang);
+
+	VM_BUG_ON(gang_in_shadow(gang));
+	VM_BUG_ON(PageLRU(page));
+
+	if (PageSwapBacked(page)) {
+		if (charge_beancounter_fast(ub, UB_SWAPPAGES,
+					    numpages, UB_SOFT | UB_TEST))
+			return -ENOMEM;
+	}
+
+	ub_phys_uncharge(ub, numpages);
+	charge_beancounter_fast(ub, UB_SHADOWPAGES, numpages, UB_FORCE);
+	spin_lock_irq(&gang->lruvec.lru_lock);
+	set_page_gang(page, gang_to_shadow_gang(gang));
+	spin_unlock_irq(&gang->lruvec.lru_lock);
+	return 0;
+}
+static inline void gang_del_user_page(struct page *page)
+{
+	struct gang *gang = page_gang(page);
+	int numpages = hpage_nr_pages(page);
+	struct user_beancounter *ub = get_gang_ub(gang);
+
+	if (!gang_in_shadow(gang)) {
+		ub_phys_uncharge(ub, numpages);
+	} else {
+		uncharge_beancounter_fast(ub, UB_SHADOWPAGES, numpages);
+		if (PageSwapBacked(page))
+			uncharge_beancounter_fast(ub, UB_SWAPPAGES, numpages);
+	}
+	set_page_gang(page, NULL);
+}
+
+static inline bool
+is_lru_milestone(struct lruvec *lruvec, struct list_head *list)
+{
+	struct gang *gang = lruvec_gang(lruvec);
+
+	return list >= gang->milestones[0].lru &&
+	       list < gang->milestones[NR_LRU_MILESTONES].lru;
+}
+
+extern bool insert_lru_milestone(struct gang *gang, unsigned long now,
+				 unsigned long *eldest_milestone);
+extern void remove_lru_milestone(struct lruvec *lruvec, enum lru_list lru);
+
+extern struct gang *init_gang_array[];
+
+extern unsigned long total_committed_pages;
+
+#else /* CONFIG_MEMORY_GANGS */
+
+static inline struct gang *page_gang(struct page *page)
+{
+       return zone_init_gang(page_zone(page));
+}
+
+static inline void set_page_gang(struct page *page, struct gang *gang)
+{
+}
+
+static inline struct gang *mem_zone_gang(struct gang_set *gs, struct zone *zone)
+{
+	return &zone->init_gang;
+}
+
+static inline struct gang *mem_page_gang(struct gang_set *gs, struct page *page)
+{
+	return &page_zone(page)->init_gang;
+}
+
+static inline bool gang_in_shadow(struct gang *gang)
+{
+	return false;
+}
+
+static inline bool page_in_gang(struct page *page, struct gang_set *gs)
+{
+	return true;
+}
+
+static inline void add_zone_gang(struct zone *zone, struct gang *gang) { }
+static inline void set_gang_priority(struct gang *gang, int priority) { }
+static inline void update_vmscan_priority(struct gang *gang) { }
+static inline void set_gang_limits(struct gang_set *gs,
+		unsigned long *limit, nodemask_t *nodemask) { }
+static inline int get_zone_nr_gangs(struct zone *zone) { return 1; }
+static inline void free_mem_gangs(struct gang_set *gs) { }
+static inline int alloc_mem_gangs(struct gang_set *gs) { return 0; }
+static inline void add_mem_gangs(struct gang_set *gs) { }
+static inline void del_mem_gangs(struct gang_set *gs) { }
+static inline void junk_mem_gangs(struct gang_set *gs)  { }
+#define for_each_gang(gang, zone)			\
+	for ( gang = &(zone)->init_gang ; gang ; gang = NULL )
+static inline int pin_mem_gang(struct gang *gang) { return 0; }
+static inline void unpin_mem_gang(struct gang *gang) { }
+
+static inline void gang_add_free_page(struct page *page) { }
+static inline int gang_add_user_page(struct page *page,
+		struct gang_set *gs, gfp_t gfp_mask) { return 0; }
+static inline int gang_mod_user_page(struct page *page,
+		struct gang_set *gs, gfp_t gfp_mask) { return 0; }
+static inline int gang_mod_shadow_page(struct page *page) { return 0; }
+static inline void gang_del_user_page(struct page *page) { }
+
+static inline bool
+is_lru_milestone(struct lruvec *lruvec, struct list_head *list)
+{
+	return false;
+}
+static inline bool insert_lru_milestone(struct lruvec *lruvec, unsigned long now,
+					unsigned long *eldest_milestone)
+{
+	return false;
+}
+static inline void remove_lru_milestone(struct lruvec *lruvec, enum lru_list lru)
+{
+}
+
+#endif /* CONFIG_MEMORY_GANGS */
+
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+extern unsigned int gangs_migration_max_isolate;
+extern unsigned int gangs_migration_min_batch;
+extern unsigned int gangs_migration_max_batch;
+extern unsigned int gangs_migration_interval;
+
+extern int schedule_gangs_migration(struct gang_set *gs,
+		const nodemask_t *src_nodes, const nodemask_t *dest_nodes);
+extern int cancel_gangs_migration(struct gang_set *gs);
+extern int gangs_migration_pending(struct gang_set *gs, nodemask_t *pending);
+
+extern int gangs_migration_batch_sysctl_handler(struct ctl_table *table,
+		int write, void __user *buffer, size_t *lenp, loff_t *ppos);
+#else
+static inline int schedule_gangs_migration(struct gang_set *gs,
+		const nodemask_t *src_nodes, const nodemask_t *dest_nodes)
+{
+	return 1;
+}
+static inline int cancel_gangs_migration(struct gang_set *gs)
+{
+	return 0;
+}
+static inline int gangs_migration_pending(struct gang_set *gs,
+					  nodemask_t *pending)
+{
+	if (pending)
+		nodes_clear(*pending);
+	return 0;
+}
+#endif
+
+void gang_page_stat(struct gang_set *gs, nodemask_t *nodemask,
+		    unsigned long *stat, unsigned long *shadow);
+void gang_show_state(struct gang_set *gs);
+
+#ifdef CONFIG_KSTALED
+void gang_idle_page_stat(struct gang_set *gs,
+		nodemask_t *nodemask, struct idle_page_stats *stats);
+#else
+static inline void gang_idle_page_stat(struct gang_set *gs,
+		nodemask_t *nodemask, struct idle_page_stats *stats) { }
+#endif
+
+#endif /* _LINUX_MMGANG_H */
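With CONFIG_MEMORY_GANGS disabled, the stubs above collapse every zone to its single init_gang, so generic reclaim code can walk gangs unconditionally. A minimal sketch of such a walk, assuming the pin/unpin pair guards against concurrently dying gangs (scan_gang_lruvec() is a hypothetical per-gang hook, not part of this patch):

    #include <linux/mmzone.h>
    #include <linux/mmgang.h>

    static void scan_zone_gangs(struct zone *zone)
    {
    	struct gang *gang;

    	for_each_gang(gang, zone) {
    		if (pin_mem_gang(gang))
    			continue;		/* gang is going away */
    		/* scan_gang_lruvec(&gang->lruvec); -- hypothetical */
    		unpin_mem_gang(gang);
    	}
    }

With gangs compiled out this loop visits exactly one gang per zone and both pin calls are no-ops, so the same caller works in both configurations.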
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/mmzone.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mmzone.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/mmzone.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mmzone.h	2015-01-21 12:02:58.930809420 +0300
@@ -16,6 +16,8 @@
 #include <linux/nodemask.h>
 #include <linux/pageblock-flags.h>
 #include <linux/bounds.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
 #include <asm/atomic.h>
 #include <asm/page.h>
 
@@ -104,6 +106,9 @@ enum zone_stat_item {
 	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
 	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
+#ifdef CONFIG_MEMORY_VSWAP
+	NR_VSWAP,
+#endif
 #ifdef CONFIG_NUMA
 	NUMA_HIT,		/* allocated in intended node */
 	NUMA_MISS,		/* allocated in non intended node */
@@ -134,12 +139,14 @@ enum lru_list {
 	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
 	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
 	LRU_UNEVICTABLE,
+	NR_EVICTABLE_LRU_LISTS = LRU_UNEVICTABLE,
 	NR_LRU_LISTS
 };
 
 #define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)
 
-#define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++)
+#define for_each_evictable_lru(l) \
+	for (l = LRU_ACTIVE_FILE; (int)l >= LRU_INACTIVE_ANON; l--)
 
 static inline int is_file_lru(enum lru_list l)
 {
@@ -162,6 +169,8 @@ static inline int is_unevictable_lru(enu
 #define ISOLATE_ACTIVE		((__force isolate_mode_t)0x2)
 /* Isolate clean file */
 #define ISOLATE_CLEAN		((__force isolate_mode_t)0x4)
+/* Isolate unmapped file */
+#define ISOLATE_UNMAPPED	((__force isolate_mode_t)0x8)
 /* Isolate for asynchronous migration */
 #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x10)
 
@@ -169,7 +178,35 @@ static inline int is_unevictable_lru(enu
 typedef unsigned __bitwise__ isolate_mode_t;
 
 struct lruvec {
-	struct list_head lists[NR_LRU_LISTS];
+	spinlock_t		lru_lock;
+
+	struct list_head	lru_list[NR_LRU_LISTS];
+	unsigned long		nr_pages[NR_LRU_LISTS];
+
+	/*
+	 * The pageout code in vmscan.c keeps track of how many of the
+	 * mem/swap backed and file backed pages are referenced.
+	 * The higher the rotated/scanned ratio, the more valuable
+	 * that cache is.
+	 *
+	 * The anon LRU stats live in [0], file LRU stats in [1]
+	 */
+	unsigned long		recent_rotated[2];
+	unsigned long		recent_scanned[2];
+
+	/*
+	 * accumulated for batching
+	 */
+	unsigned long		nr_saved_scan[NR_LRU_LISTS];
+
+	/*
+	 * Progress counter for local reclaimer
+	 */
+	atomic_long_t		pages_scanned;
+
+	unsigned int		priority;
+
+	struct zone		*zone;
 };
 
 enum zone_watermarks {
@@ -282,22 +319,99 @@ enum zone_type {
 #error ZONES_SHIFT -- too many zones configured adjust calculation
 #endif
 
-struct zone_reclaim_stat {
-	/*
-	 * The pageout code in vmscan.c keeps track of how many of the
-	 * mem/swap backed and file backed pages are refeferenced.
-	 * The higher the rotated/scanned ratio, the more valuable
-	 * that cache is.
-	 *
-	 * The anon LRU stats live in [0], file LRU stats in [1]
-	 */
-	unsigned long		recent_rotated[2];
-	unsigned long		recent_scanned[2];
+/*
+ * The "priority" of VM scanning is how much of the queues we will scan in one
+ * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
+ * queues ("queue_length >> 12") during an aging round.
+ */
+#define DEF_PRIORITY		12
 
-	/*
-	 * accumulated for batching
-	 */
-	unsigned long		nr_saved_scan[NR_LRU_LISTS];
+/*
+ * Topmost priority for scanning an LRU vector. At each priority the kernel
+ * scans LRU vectors with this priority or higher. At a given priority, LRU
+ * vectors with higher priority get more pressure; see get_scan_count() for
+ * details.
+ */
+#define MAX_VMSCAN_PRIORITY	41
+
+#define NR_VMSCAN_PRIORITIES	(MAX_VMSCAN_PRIORITY + 1)
+
+struct gang;
+
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+struct gangs_migration_work {
+	struct delayed_work dwork;
+	nodemask_t src_nodes;
+	nodemask_t dest_nodes;
+	int cur_node, preferred_node;
+	unsigned long batch;
+	struct mutex lock;
+};
+#endif
+
+struct gang_set {
+#ifdef CONFIG_MEMORY_GANGS
+	struct gang		**gangs;
+	unsigned long		memory_limit;
+	unsigned long		memory_portion;
+	unsigned long		memory_available;
+	unsigned long		memory_committed;
+	nodemask_t		nodemask;
+#endif
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+	struct gangs_migration_work migration_work;
+#endif
+};
+
+/* bits in gang->flags */
+enum {
+	GANG_IN_SHADOW,
+	GANG_OF_JUNK,
+	GANG_UNHASHED,
+	GANG_NEED_RESCHED,
+};
+
+#define	MIN_MILESTONE_INTERVAL	HZ
+#define	MAX_MILESTONE_INTERVAL	(HZ * 60)
+
+#define NR_LRU_MILESTONES	50
+
+struct lru_milestone {
+	unsigned long		timestamp;
+	struct list_head	lru[NR_EVICTABLE_LRU_LISTS];
+};
+
+struct idle_page_stats {
+#ifdef CONFIG_KSTALED
+	unsigned long idle_clean;
+	unsigned long idle_dirty_file;
+	unsigned long idle_dirty_swap;
+#endif
+};
+
+struct gang {
+	struct lruvec		lruvec;
+
+	struct gang_set		*set;
+#ifdef CONFIG_MEMORY_GANGS
+	struct list_head	list;
+#endif
+	unsigned long		flags;
+	struct list_head	vmscan_list;
+#ifdef CONFIG_MEMORY_GANGS
+	struct lru_milestone	milestones[NR_LRU_MILESTONES];
+	unsigned long		timestamp[NR_EVICTABLE_LRU_LISTS];
+	unsigned int		last_milestone;
+	unsigned long		committed;
+	unsigned long		portion;
+	struct gang		*shadow;
+#endif
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+	unsigned long nr_migratepages; /* number of pages to migrate */
+#endif
+#ifdef CONFIG_KSTALED
+	seqcount_t idle_page_stats_lock;
+	struct idle_page_stats idle_page_stats, idle_scan_stats;
+#endif
 };
 
 struct zone {
@@ -367,45 +481,28 @@ struct zone {
 	ZONE_PADDING(_pad1_)
 
 	/* Fields commonly accessed by the page reclaim scanner */
-	spinlock_t		lru_lock;
-#ifdef __GENKSYMS__
-	struct zone_lru {
-		struct list_head list;
-	} lru[NR_LRU_LISTS];
+#ifndef CONFIG_MEMORY_GANGS
+	struct gang		init_gang;
 #else
-	struct lruvec		lruvec;
-#endif
+	spinlock_t		gangs_lock;
+	int			nr_gangs;
+	struct list_head	gangs;
+	unsigned long		vmscan_mask[BITS_TO_LONGS(NR_VMSCAN_PRIORITIES)];
+	struct list_head	vmscan_prio[NR_VMSCAN_PRIORITIES];
+	struct list_head	*vmscan_iter[NR_VMSCAN_PRIORITIES];
+	atomic_t		vmscan_round[NR_VMSCAN_PRIORITIES];
+	unsigned long		eldest_timestamp;
+	unsigned long		committed;
+	unsigned int		nr_unlimited_gangs;
+	bool			force_scan;
+#endif /* CONFIG_MEMORY_GANGS */
 
-	struct zone_reclaim_stat reclaim_stat;
-
-	unsigned long		pages_scanned;	   /* since last reclaim */
+	atomic_long_t		pages_scanned;     /* since last reclaim */
 	unsigned long		flags;		   /* zone flags, see below */
 
 	/* Zone statistics */
 	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
 
-	/*
-	 * prev_priority holds the scanning priority for this zone.  It is
-	 * defined as the scanning priority at which we achieved our reclaim
-	 * target at the previous try_to_free_pages() or balance_pgdat()
-	 * invokation.
-	 *
-	 * We use prev_priority as a measure of how much stress page reclaim is
-	 * under - it drives the swappiness decision: whether to unmap mapped
-	 * pages.
-	 *
-	 * Access to both this field is quite racy even on uniprocessor.  But
-	 * it is expected to average out OK.
-	 */
-	int prev_priority;
-
-	/*
-	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
-	 * this zone's LRU.  Maintained by the pageout code.
-	 */
-	unsigned int inactive_ratio;
-
-
 	ZONE_PADDING(_pad2_)
 	/* Rarely used or read-mostly fields */
 
@@ -517,13 +614,6 @@ static inline int zone_is_oom_locked(con
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
-/*
- * The "priority" of VM scanning is how much of the queues we will scan in one
- * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
- * queues ("queue_length >> 12") during an aging round.
- */
-#define DEF_PRIORITY 12
-
 /* Maximum number of zones on a zonelist */
 #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
 
@@ -670,6 +760,14 @@ struct bootmem_data;
 typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
 	struct zonelist node_zonelists[MAX_ZONELISTS];
+#ifdef CONFIG_MEMORY_GANGS
+	struct gang init_gangs[MAX_NR_ZONES];
+	struct gang init_shadow_gangs[MAX_NR_ZONES];
+	struct gang junk_gangs[MAX_NR_ZONES];
+	int milestone_interval;
+	unsigned long next_milestone;
+	struct timer_list milestone_timer;
+#endif
 	int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
 	struct page *node_mem_map;
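Two details of the lruvec rework above are easy to miss: the per-list page counts moved into lruvec->nr_pages[], and for_each_evictable_lru() now iterates backwards, from LRU_ACTIVE_FILE down to LRU_INACTIVE_ANON. A short sketch of summing the evictable pages of one lruvec under the new layout (illustrative helper, not from the patch):

    #include <linux/mmzone.h>

    static unsigned long lruvec_evictable_pages(struct lruvec *lruvec)
    {
    	enum lru_list l;
    	unsigned long nr = 0;

    	/* file lists are visited first, anon lists last */
    	for_each_evictable_lru(l)
    		nr += lruvec->nr_pages[l];
    	return nr;
    }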
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/mnt_namespace.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mnt_namespace.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/mnt_namespace.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mnt_namespace.h	2015-01-21 12:02:57.862837766 +0300
@@ -23,8 +23,17 @@ struct proc_mounts {
 	struct mnt_namespace *ns;
 	struct path root;
 	int event;
+	struct list_head *iter;
+	loff_t iter_pos;
+	int iter_advanced;
+	struct list_head reader;
 };
 
+extern unsigned int sysctl_ve_mount_nr;
+
+extern void register_mounts_reader(struct proc_mounts *p);
+extern void unregister_mounts_reader(struct proc_mounts *p);
+
 struct fs_struct;
 
 extern struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt);
@@ -40,5 +49,7 @@ extern const struct seq_operations mount
 extern const struct seq_operations mountinfo_op;
 extern const struct seq_operations mountstats_op;
 
+extern struct rw_semaphore namespace_sem;
+
 #endif
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/module.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/module.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/module.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/module.h	2015-01-21 12:02:41.313277109 +0300
@@ -398,9 +398,6 @@ struct module
 	/* What modules depend on me? */
 	struct list_head modules_which_use_me;
 
-	/* Who is waiting for us to be unloaded */
-	struct task_struct *waiter;
-
 	/* Destruction function. */
 	void (*exit)(void);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/mount.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mount.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/mount.h	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/mount.h	2015-01-21 12:02:57.968834952 +0300
@@ -36,6 +36,18 @@ struct mnt_namespace;
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
 #define MNT_PNODE_MASK	0x3000	/* propagation flag mask */
 
+#define MNT_CPT		0x1000000
+
+#define MNT_BEHAVIOR_FLAGS	(MNT_NOSUID | MNT_NODEV | MNT_NOEXEC |		\
+				MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME |	\
+				MNT_READONLY | MNT_STRICTATIME | MNT_SHRINKABLE)
+
+/*
+ * Remounts, which change any flags except for following ones,
+ * are forbidden inside containers.
+ */
+#define MNT_VE_RMT_MASK	MNT_READONLY
+
 struct vfsmount {
 	struct list_head mnt_hash;
 	struct vfsmount *mnt_parent;	/* fs we are mounted on */
@@ -71,6 +83,7 @@ struct vfsmount {
 #else
 	int mnt_writers;
 #endif
+	unsigned owner;
 };
 
 static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
@@ -91,6 +104,7 @@ static inline struct vfsmount *mntget(st
 
 struct file; /* forward dec */
 
+extern struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root);
 extern int mnt_want_write(struct vfsmount *mnt);
 extern int mnt_want_write_file(struct file *file);
 extern int mnt_clone_write(struct vfsmount *mnt);
@@ -116,6 +130,7 @@ struct file_system_type;
 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				      int flags, const char *name,
 				      void *data);
+extern struct vfsmount *vfs_bind_mount(struct vfsmount *, struct dentry *);
 
 struct nameidata;
 
@@ -125,6 +140,7 @@ extern int do_add_mount(struct vfsmount 
 
 extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
 extern void mark_mounts_for_expiry(struct list_head *mounts);
+extern void replace_mount(struct vfsmount *src_mnt, struct vfsmount *dst_mnt);
 
 extern spinlock_t vfsmount_lock;
 extern dev_t name_to_dev_t(char *name);
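MNT_VE_RMT_MASK above encodes that a container may only toggle MNT_READONLY on remount. A hedged sketch of the corresponding permission check (ve_allow_remount() is an illustrative name; the real enforcement lives in the remount path, not in this header):

    #include <linux/mount.h>

    static bool ve_allow_remount(struct vfsmount *mnt, int new_flags)
    {
    	int changed = (mnt->mnt_flags ^ new_flags) & MNT_BEHAVIOR_FLAGS;

    	/* anything besides MNT_READONLY changing is rejected in a CT */
    	return (changed & ~MNT_VE_RMT_MASK) == 0;
    }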
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/msg.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/msg.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/msg.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/msg.h	2015-01-21 12:02:47.939101197 +0300
@@ -107,6 +107,14 @@ extern long do_msgsnd(int msqid, long mt
 extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
 			size_t msgsz, long msgtyp, int msgflg);
 
+int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg);
+int sysvipc_setup_msg(key_t key, int msqid, int msgflg);
+int sysv_msg_store(struct msg_msg *msg,
+		   int (*store)(void * src, int len, int offset, void * data),
+		   int len, void * data);
+struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset,
+					  void * data), int len, void * data);
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_MSG_H */
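sysv_msg_store()/sysv_msg_load() stream a message body through a caller-supplied callback, which is what lets checkpoint code dump SysV queues without knowing the sink. A sketch of a store callback that flattens a message into a buffer, assuming only the callback signature declared above (struct flat_buf is illustrative):

    #include <linux/errno.h>
    #include <linux/string.h>

    struct flat_buf {
    	char	*data;
    	int	size;
    };

    /* matches int (*store)(void *src, int len, int offset, void *data) */
    static int flat_store(void *src, int len, int offset, void *data)
    {
    	struct flat_buf *buf = data;

    	if (offset + len > buf->size)
    		return -ENOSPC;
    	memcpy(buf->data + offset, src, len);
    	return 0;
    }

A dump path would then call sysv_msg_store(msg, flat_store, len, &buf) and walk the queues via sysvipc_walk_msg().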
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/namei.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/namei.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/namei.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/namei.h	2015-01-21 12:02:52.024992729 +0300
@@ -59,6 +59,9 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA
 #define LOOKUP_CREATE		0x0200
 #define LOOKUP_EXCL		0x0400
 #define LOOKUP_RENAME_TARGET	0x0800
+#define LOOKUP_STRICT		0x2000	/* no symlinks or other filesystems */
+#define LOOKUP_EMPTY		0x4000
+#define LOOKUP_DIVE		0x8000	/* no follow mount */
 
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
@@ -88,6 +91,8 @@ extern int follow_up(struct path *);
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
 
+extern int lookup_flags(unsigned int f);
+
 static inline void nd_set_link(struct nameidata *nd, char *path)
 {
 	nd->saved_names[nd->depth] = path;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/net.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/net.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/net.h	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/net.h	2015-01-21 12:02:49.580057635 +0300
@@ -220,6 +220,7 @@ enum {
 extern int	     sock_wake_async(struct socket *sk, int how, int band);
 extern int	     sock_register(const struct net_proto_family *fam);
 extern void	     sock_unregister(int family);
+extern int	     is_sock_registered(int family);
 extern int	     __sock_create(struct net *net, int family, int type, int proto,
 				 struct socket **res, int kern);
 extern int	     sock_create(int family, int type, int proto,
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netdevice.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netdevice.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netdevice.h	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netdevice.h	2015-01-21 12:02:51.176015266 +0300
@@ -314,6 +314,11 @@ enum netdev_state_t
 	__LINK_STATE_DORMANT,
 };
 
+struct netdev_bc {
+	struct user_beancounter *exec_ub, *owner_ub;
+};
+
+#define netdev_bc(dev)		(&(dev)->dev_bc)
 
 /*
  * This structure holds at boot time configured netdevice settings. They
@@ -658,6 +663,11 @@ struct netdev_tc_txq {
 	u16 offset;
 };
 
+struct cpt_context;
+struct cpt_ops;
+struct rst_ops;
+struct cpt_netdev_image;
+
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 /*
  * This structure is to hold information about the device
@@ -911,6 +921,9 @@ struct net_device_ops {
 	int			(*ndo_fcoe_get_wwn)(struct net_device *dev,
 						    u64 *wwn, int type);
 #endif
+	void			(*ndo_cpt)(struct net_device *dev,
+						struct cpt_ops *,
+						struct cpt_context *);
 };
 
 /**
@@ -939,6 +952,18 @@ struct net_device_ops_ext {
 
 typedef u64 netdev_features_t;
 
+struct netdev_rst {
+	int			cpt_object;
+	int			(*ndo_rst)(loff_t, struct cpt_netdev_image *,
+						struct rst_ops *,
+						struct cpt_context *);
+	struct list_head	list;
+};
+
+void register_netdev_rst(struct netdev_rst *ops);
+void unregister_netdev_rst(struct netdev_rst *ops);
+struct netdev_rst *netdev_find_rst(int cpt_object, struct netdev_rst *ops);
+
 /*
  *	The DEVICE structure.
  *	Actually, this whole structure is a big mistake.  It mixes I/O
@@ -1064,11 +1089,20 @@ struct net_device
 	/* changeable features with no special hardware requirements */
 #define NETIF_F_SOFT_FEATURES	(NETIF_F_GSO | NETIF_F_GRO)
 
+	unsigned long		vz_features;
+/* The 'features' field has no free flag bits left */
+#define NETIF_F_VENET		(1 << 0) /* device is venet device */
+#define NETIF_F_VIRTUAL		(1 << 1) /* can be registered inside VE */
+
 	/* Interface index. Unique device identifier	*/
 	int			ifindex;
 	int			iflink;
 
 	struct net_device_stats	stats;
+	/* Statistics from CT start to last suspend */
+	struct net_device_stats	s_stats;
+	/* Buffer to store summary in dev_get_stats() */
+	struct net_device_stats	b_stats;
 
 #ifdef CONFIG_WIRELESS_EXT
 	/* List of functions to handle Wireless Extensions (instead of ioctl).
@@ -1151,6 +1185,7 @@ struct net_device
 						      hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
+	unsigned char		is_leaked;
 
 	struct netdev_queue	rx_queue;
 
@@ -1222,6 +1257,9 @@ struct net_device
 	/* GARP */
 	struct garp_port	*garp_port;
 
+	struct ve_struct	*owner_env; /* Owner VE of the interface */
+	struct netdev_bc	dev_bc;
+
 	/* class/net/name entry */
 	struct device		dev;
 	/* space for optional statistics and wireless sysfs groups */
@@ -1249,6 +1287,17 @@ struct net_device
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+{
+	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
+}
+
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
+}
+
 #define	NETDEV_ALIGN		32
 #define NET_DEVICE_SIZE \
 	ALIGN(sizeof(struct net_device), NETDEV_ALIGN)
@@ -1650,6 +1699,8 @@ extern rwlock_t				dev_base_lock;		/* De
 
 #define for_each_netdev(net, d)		\
 		list_for_each_entry(d, &(net)->dev_base_head, dev_list)
+#define for_each_netdev_reverse(net, d)	\
+		list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_safe(net, d, n)	\
 		list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_continue(net, d)		\
@@ -2153,6 +2204,8 @@ extern int		dev_ethtool(struct net *net,
 extern unsigned		dev_get_flags(const struct net_device *);
 extern int		dev_change_flags(struct net_device *, unsigned);
 extern int		dev_change_name(struct net_device *, const char *);
+int __dev_change_net_namespace(struct net_device *, struct net *, const char *,
+			struct user_beancounter *exec_ub);
 extern int		dev_set_alias(struct net_device *, const char *, size_t);
 extern int		dev_change_net_namespace(struct net_device *,
 						 struct net *, const char *);
@@ -2619,6 +2672,19 @@ void netif_stacked_transfer_operstate(co
 
 int netif_skb_features(struct sk_buff *skb);
 
+#if defined(CONFIG_VE) && defined(CONFIG_NET)
+static inline int ve_is_dev_movable(struct net_device *dev)
+{
+	return !((dev->vz_features & NETIF_F_VIRTUAL) ||
+		 (dev->features & NETIF_F_NETNS_LOCAL));
+}
+#else
+static inline int ve_is_dev_movable(struct net_device *dev)
+{
+	return 0;
+}
+#endif
+
 static inline int net_gso_ok(int features, int gso_type)
 {
 	int feature = gso_type << NETIF_F_GSO_SHIFT;
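ve_is_dev_movable() marks devices that may leave their namespace: anything flagged NETIF_F_VIRTUAL or NETIF_F_NETNS_LOCAL must stay put. A sketch of a migration loop built on it, assuming rtnl_lock() is held by the caller; the NULL name pattern and the dropped return code are simplifications:

    #include <linux/netdevice.h>

    static void move_movable_devs(struct net *src, struct net *dst,
    			      struct user_beancounter *ub)
    {
    	struct net_device *dev, *next;

    	for_each_netdev_safe(src, dev, next) {
    		if (!ve_is_dev_movable(dev))
    			continue;	/* virtual or netns-local */
    		__dev_change_net_namespace(dev, dst, NULL, ub);
    	}
    }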
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/nfnetlink.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/nfnetlink.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/nfnetlink.h	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/nfnetlink.h	2015-01-21 12:02:42.478246179 +0300
@@ -76,11 +76,11 @@ struct nfnetlink_subsystem
 extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
 extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);
 
-extern int nfnetlink_has_listeners(unsigned int group);
-extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, 
+extern int nfnetlink_has_listeners(struct net *net, unsigned int group);
+extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group,
 			  int echo, gfp_t flags);
-extern void nfnetlink_set_err(u32 pid, u32 group, int error);
-extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags);
+extern void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error);
+extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags);
 
 extern void nfnl_lock(void);
 extern void nfnl_unlock(void);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/x_tables.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/x_tables.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/x_tables.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/x_tables.h	2015-01-21 12:02:45.405168469 +0300
@@ -375,6 +375,7 @@ struct xt_table_info
 {
 	/* Size per table */
 	unsigned int size;
+	unsigned int alloc_size;
 	/* Number of entries: FIXME. --RR */
 	unsigned int number;
 	/* Initial number of entries. Needed for module usage count */
@@ -590,7 +591,7 @@ extern void xt_compat_unlock(u_int8_t af
 
 extern int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta);
 extern void xt_compat_flush_offsets(u_int8_t af);
-extern short xt_compat_calc_jump(u_int8_t af, unsigned int offset);
+extern int xt_compat_calc_jump(u_int8_t af, unsigned int offset);
 
 extern int xt_compat_match_offset(const struct xt_match *match);
 extern int xt_compat_match_from_user(struct xt_entry_match *m,
@@ -605,6 +606,23 @@ extern int xt_compat_target_to_user(stru
 				    void __user **dstptr, unsigned int *size);
 
 #endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_VE
+static inline bool ve_xt_table_forbidden(struct xt_table *xt)
+{
+	/*
+	 * The only purpose of keeping this check in a separate
+	 * helper is "grep"-ability.
+	 *
+	 * If this helper fires, it means that a VE has been
+	 * configured without the particular xt_table support.
+	 */
+	return xt == NULL;
+}
+#else
+static inline bool ve_xt_table_forbidden(struct xt_table *xt) { return true; }
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _X_TABLES_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_CONNMARK.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_CONNMARK.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_CONNMARK.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_CONNMARK.h	2015-01-21 12:02:45.405168469 +0300
@@ -18,6 +18,12 @@ enum {
 	XT_CONNMARK_RESTORE
 };
 
+struct xt_connmark_target_info {
+	unsigned long mark;
+	unsigned long mask;
+	__u8 mode;
+};
+
 struct xt_connmark_tginfo1 {
 	__u32 ctmark, ctmask, nfmask;
 	__u8 mode;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_MARK.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_MARK.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_MARK.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_MARK.h	2015-01-21 12:02:45.406168443 +0300
@@ -3,6 +3,23 @@
 
 #include <linux/types.h>
 
+/* Version 0 */
+struct xt_mark_target_info {
+	unsigned long mark;
+};
+
+/* Version 1 */
+enum {
+	XT_MARK_SET=0,
+	XT_MARK_AND,
+	XT_MARK_OR,
+};
+
+struct xt_mark_target_info_v1 {
+	unsigned long mark;
+	__u8 mode;
+};
+
 struct xt_mark_tginfo2 {
 	__u32 mark, mask;
 };
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_connmark.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_connmark.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_connmark.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_connmark.h	2015-01-21 12:02:45.406168443 +0300
@@ -12,6 +12,11 @@
  * (at your option) any later version.
  */
 
+struct xt_connmark_info {
+	unsigned long mark, mask;
+	__u8 invert;
+};
+
 struct xt_connmark_mtinfo1 {
 	__u32 mark, mask;
 	__u8 invert;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_conntrack.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_conntrack.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_conntrack.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_conntrack.h	2015-01-21 12:02:45.406168443 +0300
@@ -32,6 +32,42 @@ enum {
 	XT_CONNTRACK_DIRECTION    = 1 << 12,
 };
 
+/* This is exposed to userspace, so remains frozen in time. */
+struct ip_conntrack_old_tuple
+{
+	struct {
+		__be32 ip;
+		union {
+			__u16 all;
+		} u;
+	} src;
+
+	struct {
+		__be32 ip;
+		union {
+			__u16 all;
+		} u;
+
+		/* The protocol. */
+		__u16 protonum;
+	} dst;
+};
+
+struct xt_conntrack_info
+{
+	unsigned int statemask, statusmask;
+
+	struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX];
+	struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX];
+
+	unsigned long expires_min, expires_max;
+
+	/* Flags word */
+	__u8 flags;
+	/* Inverse flags */
+	__u8 invflags;
+};
+
 struct xt_conntrack_mtinfo1 {
 	union nf_inet_addr origsrc_addr, origsrc_mask;
 	union nf_inet_addr origdst_addr, origdst_mask;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_hashlimit.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_hashlimit.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_hashlimit.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_hashlimit.h	2015-01-21 12:02:45.406168443 +0300
@@ -65,4 +65,11 @@ struct xt_hashlimit_mtinfo1 {
 	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
 };
 
+#ifdef __KERNEL__
+struct ve_xt_hashlimit {
+	struct hlist_head	hashlimit_htables;
+	struct proc_dir_entry	*hashlimit_procdir4;
+	struct proc_dir_entry	*hashlimit_procdir6;
+};
+#endif
 #endif /*_XT_HASHLIMIT_H*/
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_mark.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_mark.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_mark.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_mark.h	2015-01-21 12:02:45.406168443 +0300
@@ -3,6 +3,11 @@
 
 #include <linux/types.h>
 
+struct xt_mark_info {
+    unsigned long mark, mask;
+    __u8 invert;
+};
+
 struct xt_mark_mtinfo1 {
 	__u32 mark, mask;
 	__u8 invert;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_recent.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_recent.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter/xt_recent.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter/xt_recent.h	2015-01-21 12:02:45.406168443 +0300
@@ -25,4 +25,15 @@ struct xt_recent_mtinfo {
 	__u8 side;
 };
 
+#ifdef __KERNEL__
+struct ve_ipt_recent {
+	struct list_head	tables;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry	*proc_dir;
+#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+	struct proc_dir_entry	*proc_old_dir;
+#endif
+#endif
+};
+#endif
 #endif /* _LINUX_NETFILTER_XT_RECENT_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter.h	2015-01-21 12:02:47.739106507 +0300
@@ -260,7 +260,8 @@ struct nf_afinfo {
 					    unsigned int dataoff,
 					    unsigned int len,
 					    u_int8_t protocol);
-	int		(*route)(struct dst_entry **dst, struct flowi *fl);
+	int		(*route)(struct net *net, struct dst_entry **dst,
+				 struct flowi *fl);
 	void		(*saveroute)(const struct sk_buff *skb,
 				     struct nf_queue_entry *entry);
 	int		(*reroute)(struct sk_buff *skb,
@@ -366,5 +367,33 @@ extern void (*nf_ct_destroy)(struct nf_c
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 #endif
 
+#ifdef CONFIG_VE_IPTABLES
+#include <linux/vziptable_defs.h>
+
+#define net_ipt_permitted(netns, ipt)					\
+	(mask_ipt_allow((netns)->owner_ve->ipt_mask, ipt))
+
+#define net_ipt_module_set(netns, ipt)					\
+	({								\
+		(netns)->owner_ve->_iptables_modules |= ipt##_MOD;	\
+	})
+
+#define net_ipt_module_clear(netns, ipt)				\
+	({								\
+		(netns)->owner_ve->_iptables_modules &= ~ipt##_MOD;	\
+	})
+
+#define net_is_ipt_module_set(netns, ipt)				\
+	((netns)->owner_ve->_iptables_modules & (ipt##_MOD))
+
+#else /* CONFIG_VE_IPTABLES */
+
+#define net_ipt_permitted(netns, ipt)		(1)
+#define net_is_ipt_module_set(netns, ipt)	(1)
+#define net_ipt_module_set(netns, ipt)
+#define net_ipt_module_clear(netns, ipt)
+
+#endif /* CONFIG_VE_IPTABLES */
+
 #endif /*__KERNEL__*/
 #endif /*__LINUX_NETFILTER_H*/
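The net_ipt_* macros gate per-VE iptables functionality: permission comes from the owner VE's ipt_mask, while the *_MOD bits record which modules a VE actually initialized. A sketch of how a per-net table init might use them; the VE_IP_FILTER token is assumed from vziptable_defs.h and the registration step is elided:

    #include <linux/netfilter.h>

    static int example_filter_net_init(struct net *net)
    {
    	if (!net_ipt_permitted(net, VE_IP_FILTER))
    		return 0;	/* table not allowed in this container */

    	/* ... register the table for this netns ... */

    	net_ipt_module_set(net, VE_IP_FILTER);
    	return 0;
    }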
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_arp/arp_tables.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_arp/arp_tables.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_arp/arp_tables.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_arp/arp_tables.h	2015-01-21 12:02:58.554819400 +0300
@@ -67,9 +67,9 @@ struct arpt_arp {
 #define arpt_standard_target xt_standard_target
 
 /* Values for "flag" field in struct arpt_ip (general arp structure).
- * No flags defined yet.
  */
-#define ARPT_F_MASK		0x00	/* All possible flag bits mask. */
+#define ARPT_WDOGTMO		0x80
+#define ARPT_F_MASK		0x80	/* All possible flag bits mask. */
 
 /* Values for "inv" field in struct arpt_arp. */
 #define ARPT_INV_VIA_IN		0x0001	/* Invert the sense of IN IFACE. */
@@ -82,7 +82,8 @@ struct arpt_arp {
 #define ARPT_INV_ARPHRD		0x0080	/* Invert the sense of ARP HRD. */
 #define ARPT_INV_ARPPRO		0x0100	/* Invert the sense of ARP PRO. */
 #define ARPT_INV_ARPHLN		0x0200	/* Invert the sense of ARP HLN. */
-#define ARPT_INV_MASK		0x03FF	/* All possible flag bits mask. */
+#define ARPT_INV_WDOGTMO	0x8000	/* Invert the sense of the ARPT_WDOGTMO flag */
+#define ARPT_INV_MASK		0x83FF	/* All possible flag bits mask. */
 
 /* This structure defines each of the firewall rules.  Consists of 3
    parts which are 1) general ARP header stuff 2) match specific
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv4/ipt_TOS.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv4/ipt_TOS.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv4/ipt_TOS.h	2015-01-21 12:02:45.406168443 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv4/ipt_TOS.h	2015-01-21 12:02:45.406168443 +0300
@@ -0,0 +1,12 @@
+#ifndef _IPT_TOS_H_target
+#define _IPT_TOS_H_target
+
+#ifndef IPTOS_NORMALSVC
+#define IPTOS_NORMALSVC 0
+#endif
+
+struct ipt_tos_target_info {
+        u_int8_t tos;
+};
+
+#endif /*_IPT_TOS_H_target*/
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv4/ipt_iprange.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv4/ipt_iprange.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv4/ipt_iprange.h	2015-01-21 12:02:45.406168443 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv4/ipt_iprange.h	2015-01-21 12:02:45.406168443 +0300
@@ -0,0 +1,23 @@
+#ifndef _IPT_IPRANGE_H
+#define _IPT_IPRANGE_H
+
+#define IPRANGE_SRC             0x01    /* Match source IP address */
+#define IPRANGE_DST             0x02    /* Match destination IP address */
+#define IPRANGE_SRC_INV         0x10    /* Negate the condition */
+#define IPRANGE_DST_INV         0x20    /* Negate the condition */
+
+struct ipt_iprange {
+        /* Inclusive: network order. */
+        u_int32_t min_ip, max_ip;
+};
+
+struct ipt_iprange_info
+{
+        struct ipt_iprange src;
+        struct ipt_iprange dst;
+
+        /* Flags from above */
+        u_int8_t flags;
+};
+
+#endif /* _IPT_IPRANGE_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv4/ipt_owner.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv4/ipt_owner.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv4/ipt_owner.h	2015-01-21 12:02:45.407168417 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv4/ipt_owner.h	2015-01-21 12:02:45.407168417 +0300
@@ -0,0 +1,20 @@
+#ifndef _IPT_OWNER_H
+#define _IPT_OWNER_H
+
+/* match and invert flags */
+#define IPT_OWNER_UID   0x01
+#define IPT_OWNER_GID   0x02
+#define IPT_OWNER_PID   0x04
+#define IPT_OWNER_SID   0x08
+#define IPT_OWNER_COMM  0x10
+
+struct ipt_owner_info {
+    uid_t uid;
+    gid_t gid;
+    pid_t pid;
+    pid_t sid;
+    char comm[16];
+    u_int8_t match, invert;     /* flags */
+};
+
+#endif /*_IPT_OWNER_H*/
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv4/ipt_tos.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv4/ipt_tos.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv4/ipt_tos.h	2015-01-21 12:02:45.407168417 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv4/ipt_tos.h	2015-01-21 12:02:45.407168417 +0300
@@ -0,0 +1,13 @@
+#ifndef _IPT_TOS_H
+#define _IPT_TOS_H
+
+struct ipt_tos_info {
+    u_int8_t tos;
+    u_int8_t invert;
+};
+
+#ifndef IPTOS_NORMALSVC
+#define IPTOS_NORMALSVC 0
+#endif
+
+#endif /*_IPT_TOS_H*/
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv6/ip6t_owner.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv6/ip6t_owner.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/netfilter_ipv6/ip6t_owner.h	2015-01-21 12:02:45.407168417 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/netfilter_ipv6/ip6t_owner.h	2015-01-21 12:02:45.407168417 +0300
@@ -0,0 +1,18 @@
+#ifndef _IP6T_OWNER_H
+#define _IP6T_OWNER_H
+
+/* match and invert flags */
+#define IP6T_OWNER_UID  0x01
+#define IP6T_OWNER_GID  0x02
+#define IP6T_OWNER_PID  0x04
+#define IP6T_OWNER_SID  0x08
+
+struct ip6t_owner_info {
+    uid_t uid;
+    gid_t gid;
+    pid_t pid;
+    pid_t sid;
+    u_int8_t match, invert;     /* flags */
+};
+
+#endif /*_IP6T_OWNER_H*/
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/nfs_fs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfs_fs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/nfs_fs.h	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfs_fs.h	2015-01-21 12:02:53.581951399 +0300
@@ -211,6 +211,11 @@ struct nfs_inode {
 	struct fscache_cookie	*fscache;
 #endif
 	struct inode		vfs_inode;
+#ifdef CONFIG_NFS_QUOTA
+	qsize_t			i_reserved_quota;
+	struct list_head	prealloc;
+	struct mutex		quota_sync;
+#endif
 };
 
 /*
@@ -353,7 +358,7 @@ extern void nfs_zap_mapping(struct inode
 extern void nfs_zap_caches(struct inode *);
 extern void nfs_invalidate_atime(struct inode *);
 extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
-				struct nfs_fattr *);
+				struct nfs_fattr *, struct inode *);
 extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
@@ -425,6 +430,13 @@ extern int  nfs_root_data(char **root_de
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern __be32 root_nfs_parse_addr(char *name); /*__init*/
 
+#ifdef CONFIG_NFS_FSCACHE
+/*
+ * linux/fs/nfs/fscache.c
+ */
+extern void nfs_fscache_dup_uniq_id(char *dst, struct super_block *sb);
+#endif
+
 /*
  * linux/fs/nfs/file.c
  */
@@ -441,7 +453,7 @@ extern const struct address_space_operat
 
 static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
 {
-	return filp->private_data;
+	return file_private(filp);
 }
 
 static inline struct rpc_cred *nfs_file_cred(struct file *file)
@@ -494,7 +506,8 @@ extern const struct file_operations nfs_
 extern const struct dentry_operations nfs_dentry_operations;
 
 extern void nfs_force_lookup_revalidate(struct inode *dir);
-extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr);
+extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh,
+				struct nfs_fattr *fattr, struct inode *inode);
 extern int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags);
 extern void nfs_access_zap_cache(struct inode *inode);
 
@@ -517,6 +530,7 @@ extern void nfs_unregister_sysctl(void);
 /*
  * linux/fs/nfs/namespace.c
  */
+extern struct list_head nfs_automount_list;
 extern const struct inode_operations nfs_mountpoint_inode_operations;
 extern const struct inode_operations nfs_referral_inode_operations;
 extern int nfs_mountpoint_expiry_timeout;
@@ -531,6 +545,15 @@ extern void nfs_block_sillyrename(struct
 extern void nfs_unblock_sillyrename(struct dentry *dentry);
 extern int  nfs_sillyrename(struct inode *dir, struct dentry *dentry);
 
+struct nfs_unlinkdata {
+	struct hlist_node list;
+	struct nfs_removeargs args;
+	struct nfs_removeres res;
+	struct inode *dir;
+	struct rpc_cred	*cred;
+	struct nfs_fattr dir_attr;
+};
+
 /*
  * linux/fs/nfs/write.c
  */
@@ -615,6 +638,7 @@ nfs_fileid_to_ino_t(u64 fileid)
 
 #define NFS_JUKEBOX_RETRY_TIME (5 * HZ)
 extern struct file_system_type nfs_fs_type;
+extern struct file_system_type nfs4_fs_type;
 
 #endif /* __KERNEL__ */
 
@@ -636,6 +660,7 @@ extern struct file_system_type nfs_fs_ty
 #define NFSDBG_PNFS		0x1000
 #define NFSDBG_PNFS_LD		0x2000
 #define NFSDBG_STATE		0x4000
+#define NFSDBG_QUOTA		0x8000
 #define NFSDBG_ALL		0xFFFF
 
 #ifdef __KERNEL__
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/nfs_fs_sb.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfs_fs_sb.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/nfs_fs_sb.h	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfs_fs_sb.h	2015-01-21 12:02:53.564951851 +0300
@@ -87,6 +87,7 @@ struct nfs_client {
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;	/* client index cache cookie */
 #endif
+	struct ve_struct	*owner_env;
 };
 
 /*
@@ -166,6 +167,10 @@ struct nfs_server {
 	u32			mountd_version;
 	unsigned short		mountd_port;
 	unsigned short		mountd_protocol;
+#ifdef CONFIG_NFS_QUOTA
+	struct list_head	prealloc_list;
+	spinlock_t		prealloc_lock;
+#endif
 };
 
 /* Server capabilities */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/nfs_mount.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfs_mount.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/nfs_mount.h	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfs_mount.h	2015-01-21 12:02:50.210040910 +0300
@@ -12,6 +12,7 @@
 #include <linux/nfs.h>
 #include <linux/nfs2.h>
 #include <linux/nfs3.h>
+#include <linux/nfs4.h>
 
 /*
  * WARNING!  Do not delete or change the order of these fields.  If
@@ -45,6 +46,42 @@ struct nfs_mount_data {
 	char		context[NFS_MAX_CONTEXT_LEN + 1];	/* 6 */
 };
 
+struct nfs_mount_data_dump {
+	int			version;
+	int			flags;
+	int			rsize, wsize;
+	int			timeo, retrans;
+	int			acregmin, acregmax,
+				acdirmin, acdirmax;
+	int			namlen;
+	unsigned int		options;
+	unsigned int		bsize;
+	unsigned int		auth_flavors;
+	char			client_address[48];
+	unsigned int		minorversion;
+	char			fscache_uniq[256];
+
+	struct {
+		struct sockaddr_storage	address;
+		size_t			addrlen;
+		char			hostname[NFS4_MAXNAMLEN];
+		u32			version;
+		int			port;
+		unsigned short		protocol;
+	} mount_server;
+
+	struct {
+		struct sockaddr_storage	address;
+		size_t			addrlen;
+		char			hostname[NFS4_MAXNAMLEN];
+		char			export_path[NFS4_MAXPATHLEN];
+		int			port;
+		unsigned short		protocol;
+	} nfs_server;
+
+	struct nfs3_fh		root;
+};
+
 /* bits in the flags field visible to user space */
 
 #define NFS_MOUNT_SOFT		0x0001	/* 1 */
@@ -65,6 +102,8 @@ struct nfs_mount_data {
 #define NFS_MOUNT_UNSHARED	0x8000	/* 5 */
 #define NFS_MOUNT_FLAGMASK	0xFFFF
 
+#define NFS_MOUNT_RESTORE	0x80000000
+
 /* The following are for internal use only */
 #define NFS_MOUNT_LOOKUP_CACHE_NONEG	0x10000
 #define NFS_MOUNT_LOOKUP_CACHE_NONE	0x20000
@@ -73,4 +112,7 @@ struct nfs_mount_data {
 #define NFS_MOUNT_LOCAL_FLOCK	0x100000
 #define NFS_MOUNT_LOCAL_FCNTL	0x200000
 
+/* Special mount options version, used only for migration support */
+#define NFS_MOUNT_MIGRATED	0x80000000
+
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/nfsd/export.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfsd/export.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/nfsd/export.h	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfsd/export.h	2015-01-21 12:02:46.935127851 +0300
@@ -145,12 +145,9 @@ int			exp_rootfh(struct auth_domain *, 
 __be32			exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
 __be32			nfserrno(int errno);
 
-extern struct cache_detail svc_export_cache;
+dev_t exp_get_dev(struct svc_export *ex);
 
-static inline void exp_put(struct svc_export *exp)
-{
-	cache_put(&exp->h, &svc_export_cache);
-}
+extern void exp_put(struct svc_export *exp);
 
 static inline void exp_get(struct svc_export *exp)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/nfsd/stats.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfsd/stats.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/nfsd/stats.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nfsd/stats.h	2015-01-21 12:02:47.672108285 +0300
@@ -40,11 +40,7 @@ struct nfsd_stats {
 
 };
 
-
-extern struct nfsd_stats	nfsdstats;
-extern struct svc_stat		nfsd_svcstats;
-
-void	nfsd_stat_init(void);
+int	nfsd_stat_init(void);
 void	nfsd_stat_shutdown(void);
 
 #endif /* __KERNEL__ */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/nmi.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nmi.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/nmi.h	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nmi.h	2015-01-21 12:02:41.571270260 +0300
@@ -56,4 +56,6 @@ extern int proc_dowatchdog_enabled(struc
 			void __user *, size_t *, loff_t *);
 #endif
 
+extern void nmi_show_regs(struct pt_regs *regs, int in_nmi);
+extern int do_nmi_show_regs(struct pt_regs *regs, int cpu);
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/notifier.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/notifier.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/notifier.h	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/notifier.h	2015-01-21 12:02:43.128228921 +0300
@@ -153,8 +153,9 @@ extern int __srcu_notifier_call_chain(st
 
 #define NOTIFY_DONE		0x0000		/* Don't care */
 #define NOTIFY_OK		0x0001		/* Suits me */
+#define NOTIFY_FAIL		0x0002		/* Reject */
 #define NOTIFY_STOP_MASK	0x8000		/* Don't call further */
-#define NOTIFY_BAD		(NOTIFY_STOP_MASK|0x0002)
+#define NOTIFY_BAD		(NOTIFY_STOP_MASK|NOTIFY_FAIL)
 						/* Bad/Veto action */
 /*
  * Clean way to return from the notifier and stop further calls.
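Splitting NOTIFY_FAIL out of NOTIFY_BAD makes the veto value self-describing: NOTIFY_BAD is now literally "fail and stop the chain". A minimal sketch of a vetoing notifier (the action value checked is illustrative):

    #include <linux/notifier.h>

    static int example_veto_cb(struct notifier_block *nb,
    			   unsigned long action, void *data)
    {
    	if (action == 1 /* illustrative forbidden action */)
    		return NOTIFY_BAD;	/* NOTIFY_STOP_MASK | NOTIFY_FAIL */
    	return NOTIFY_OK;
    }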
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/nsproxy.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nsproxy.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/nsproxy.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/nsproxy.h	2015-01-21 12:02:43.574217080 +0300
@@ -62,10 +62,12 @@ static inline struct nsproxy *task_nspro
 	return rcu_dereference(tsk->nsproxy);
 }
 
-int copy_namespaces(unsigned long flags, struct task_struct *tsk);
+struct nsproxy *duplicate_nsproxy(struct nsproxy *nsproxy);
+int copy_namespaces(unsigned long flags, struct task_struct *tsk, int force_admin);
 void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 void free_nsproxy(struct nsproxy *ns);
+struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk);
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
 	struct fs_struct *);
 int __init nsproxy_cache_init(void);
@@ -77,9 +79,10 @@ static inline void put_nsproxy(struct ns
 	}
 }
 
-static inline void get_nsproxy(struct nsproxy *ns)
+static inline struct nsproxy *get_nsproxy(struct nsproxy *ns)
 {
 	atomic_inc(&ns->count);
+	return ns;
 }
 
 #ifdef CONFIG_CGROUP_NS
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/oom.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/oom.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/oom.h	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/oom.h	2015-01-21 12:02:43.470219841 +0300
@@ -15,12 +15,15 @@
  */
 #define OOM_SCORE_ADJ_MIN	(-1000)
 #define OOM_SCORE_ADJ_MAX	1000
+#define OOM_SCORE_ADJ_UNSET	1001
 
 #ifdef __KERNEL__
 
 #include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/nodemask.h>
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>
 
 struct zonelist;
 struct notifier_block;
@@ -39,8 +42,17 @@ enum oom_constraint {
 
 extern int test_set_oom_score_adj(int new_val);
 
-extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
-			const nodemask_t *nodemask, unsigned long totalpages);
+struct task_struct *select_bad_process(int *ppoints,
+		unsigned long totalpages, struct user_beancounter *ub,
+		struct mem_cgroup *mem, const nodemask_t *nodemask);
+int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+			    int points, unsigned long totalpages,
+			    struct user_beancounter *ub, struct mem_cgroup *mem,
+			    nodemask_t *nodemask, const char *message);
+/* linux/mm/oom_group.c */
+extern int get_task_oom_score_adj(struct task_struct *t);
+
+extern int oom_badness(struct task_struct *p, unsigned long totalpages, long *overdraft);
 extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
@@ -67,5 +79,18 @@ extern unsigned long badness(struct task
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
+struct oom_control {
+	int			generation;
+	int			kill_counter;
+	unsigned long		last_kill;
+	int			oom_rage;
+	spinlock_t		lock;
+	wait_queue_head_t 	wq;
+};
+
+extern struct oom_control global_oom_ctrl;
+
+extern void init_oom_control(struct oom_control *oom_ctrl);
+
 #endif /* __KERNEL__*/
 #endif /* _INCLUDE_LINUX_OOM_H */
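struct oom_control serializes OOM kills within one domain: a generation counter, kill statistics, and a waitqueue for tasks that must wait out an in-flight kill. What init_oom_control() has to do follows directly from the field types; this sketch mirrors the expected setup rather than quoting the real implementation:

    #include <linux/oom.h>
    #include <linux/spinlock.h>
    #include <linux/wait.h>

    static void example_init_oom_control(struct oom_control *ctrl)
    {
    	ctrl->generation = 0;
    	ctrl->kill_counter = 0;
    	ctrl->last_kill = 0;
    	ctrl->oom_rage = 0;
    	spin_lock_init(&ctrl->lock);
    	init_waitqueue_head(&ctrl->wq);
    }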
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/page-flags.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/page-flags.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/page-flags.h	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/page-flags.h	2015-01-21 12:02:58.930809420 +0300
@@ -111,11 +111,21 @@ enum pageflags {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	PG_compound_lock,
 #endif
+#ifdef CONFIG_KSTALED
+	PG_young,		/* kstaled cleared pte young bit */
+	PG_idle,		/* idle since start of kstaled interval */
+#endif
 	__NR_PAGEFLAGS,
 
 	/* Filesystems */
 	PG_checked = PG_owner_priv_1,
 
+	/* VZ checkpointing */
+	PG_checkpointed = PG_owner_priv_1,
+
+	/* Page has vswap ptes */
+	PG_vswap = PG_private_2,
+
 	/* Two page bits are conscripted by FS-Cache to maintain local caching
 	 * state.  These bits are set on pages belonging to the netfs's inodes
 	 * when those inodes are being locally cached.
@@ -212,6 +222,7 @@ __PAGEFLAG(Slab, slab)
 PAGEFLAG(Checked, checked)		/* Used by some filesystems */
 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)	/* Xen */
 PAGEFLAG(SavePinned, savepinned);			/* Xen */
+PAGEFLAG(Checkpointed, checkpointed)			/* VZ checkpointing */
 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
 PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
 
@@ -230,6 +241,12 @@ PAGEFLAG(Private, private) __SETPAGEFLAG
 PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
 PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
 
+#ifdef CONFIG_MEMORY_VSWAP
+PAGEFLAG(VSwap, vswap)
+#else
+PAGEFLAG_FALSE(VSwap)
+#endif
+
 /*
  * Only test-and-set exist for PG_writeback.  The unconditional operators are
  * risky: they bypass page accounting.
@@ -285,6 +302,15 @@ PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
 
+#ifdef CONFIG_KSTALED
+PAGEFLAG(Young, young)
+PAGEFLAG(Idle, idle)
+#else
+PAGEFLAG_FALSE(Young)
+CLEARPAGEFLAG_NOOP(Young)
+CLEARPAGEFLAG_NOOP(Idle)
+#endif
+
 u64 stable_page_flags(struct page *page);
 
 static inline int PageUptodate(struct page *page)
@@ -432,7 +458,7 @@ static inline void ClearPageCompound(str
 	 1 << PG_private | 1 << PG_private_2 | \
 	 1 << PG_buddy	 | 1 << PG_writeback | 1 << PG_reserved | \
 	 1 << PG_slab	 | 1 << PG_swapcache | 1 << PG_active | \
-	 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
+	 1 << PG_unevictable | __PG_MLOCKED  | __PG_HWPOISON | \
 	 __PG_COMPOUND_LOCK)
 
 /*
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/pagemap.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pagemap.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/pagemap.h	2014-12-12 23:29:27.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pagemap.h	2015-01-21 12:02:58.675816188 +0300
@@ -15,6 +15,8 @@
 #include <linux/hardirq.h> /* for in_interrupt() */
 #include <linux/hugetlb_inline.h>
 
+#include <bc/vmpages.h> /* for ub_check_ram_limits() */
+
 /*
  * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
  * allocation mode flags.
@@ -24,6 +26,7 @@ enum mapping_flags {
 	AS_ENOSPC	= __GFP_BITS_SHIFT + 1,	/* ENOSPC on async write */
 	AS_MM_ALL_LOCKS	= __GFP_BITS_SHIFT + 2,	/* under mm_take_all_locks() */
 	AS_UNEVICTABLE	= __GFP_BITS_SHIFT + 3,	/* e.g., ramdisk, SHM_LOCK */
+	AS_CHECKPOINT	= __GFP_BITS_SHIFT + 4,	/* mapping is checkpointed */
 };
 
 static inline void mapping_set_error(struct address_space *mapping, int error)
@@ -134,7 +137,7 @@ static inline int page_cache_get_specula
 	VM_BUG_ON(in_interrupt());
 
 #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
-# ifdef CONFIG_PREEMPT
+# ifdef CONFIG_PREEMPT_COUNT
 	VM_BUG_ON(!in_atomic());
 # endif
 	/*
@@ -172,7 +175,7 @@ static inline int page_cache_add_specula
 	VM_BUG_ON(in_interrupt());
 
 #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
-# ifdef CONFIG_PREEMPT
+# ifdef CONFIG_PREEMPT_COUNT
 	VM_BUG_ON(!in_atomic());
 # endif
 	VM_BUG_ON(page_count(page) == 0);
@@ -205,6 +208,9 @@ extern struct page *__page_cache_alloc(g
 #else
 static inline struct page *__page_cache_alloc(gfp_t gfp)
 {
+	if (unlikely(ub_check_ram_limits(get_exec_ub(), gfp)))
+		return NULL;
+
 	return alloc_pages(gfp, 0);
 }
 #endif
@@ -225,6 +231,12 @@ static inline struct page *page_cache_al
 				  __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN);
 }
 
+static inline void check_pagecache_limits(struct address_space *mapping,
+					  gfp_t gfp_mask)
+{
+	ub_check_ram_limits(get_exec_ub(), gfp_mask);
+}
+
 typedef int filler_t(void *, struct page *);
 
 extern struct page * find_get_page(struct address_space *mapping,
@@ -523,6 +535,8 @@ static inline int fault_in_multipages_re
 	return ret;
 }
 
+int add_to_page_cache_nogang(struct page *page, struct address_space *mapping,
+				pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
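With ub_check_ram_limits() wired into __page_cache_alloc(), page-cache allocations can now fail because of a beancounter limit even when the zone has free pages, so callers must treat NULL as an ordinary allocation failure. A sketch (illustrative wrapper, not from the patch):

    #include <linux/err.h>
    #include <linux/pagemap.h>

    static struct page *cache_page_alloc_checked(struct address_space *mapping)
    {
    	struct page *page = __page_cache_alloc(mapping_gfp_mask(mapping));

    	if (!page)			/* OOM or RAM limit hit */
    		return ERR_PTR(-ENOMEM);
    	return page;
    }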
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/pagevec.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pagevec.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/pagevec.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pagevec.h	2015-01-21 12:02:58.863811197 +0300
@@ -23,7 +23,6 @@ struct pagevec {
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_free(struct pagevec *pvec);
 void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
-void pagevec_strip(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
@@ -60,7 +59,6 @@ static inline unsigned pagevec_add(struc
 	return pagevec_space(pvec);
 }
 
-
 static inline void pagevec_release(struct pagevec *pvec)
 {
 	if (pagevec_count(pvec))
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/pci_ids.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pci_ids.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/pci_ids.h	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pci_ids.h	2015-01-21 12:02:42.688240602 +0300
@@ -2833,6 +2833,7 @@
 #define PCI_DEVICE_ID_NETMOS_9835	0x9835
 #define PCI_DEVICE_ID_NETMOS_9845	0x9845
 #define PCI_DEVICE_ID_NETMOS_9855	0x9855
+#define PCI_DEVICE_ID_NETMOS_9865	0x9865
 #define PCI_DEVICE_ID_NETMOS_9900	0x9900
 #define PCI_DEVICE_ID_NETMOS_9901	0x9901
 #define PCI_DEVICE_ID_NETMOS_9904	0x9904
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/perf_event.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/perf_event.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/perf_event.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/perf_event.h	2015-01-21 12:02:58.429822718 +0300
@@ -1427,6 +1427,11 @@ static inline bool perf_paranoid_kernel(
 	return sysctl_perf_event_paranoid > 1;
 }
 
+static inline bool perf_paranoid_container(void)
+{
+	return sysctl_perf_event_paranoid > -1;
+}
+
 extern void perf_event_init(void);
 extern void perf_tp_event(int event_id, u64 addr, u64 count,
 		void *record, int entry_size);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/pfcache.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pfcache.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/pfcache.h	2015-01-21 12:02:52.152989331 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pfcache.h	2015-01-21 12:02:52.152989331 +0300
@@ -0,0 +1,71 @@
+#ifndef LINUX_PFCACHE_H
+#define LINUX_PFCACHE_H
+
+/**
+ * include/linux/pfcache
+ *
+ * Parallels File Cache
+ *
+ * Copyright (C) 2012. Parallels IP Holdings GmbH.
+ * All rights reserved.
+ *
+ * Author: Konstantin Khlebnikov
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define FS_IOC_PFCACHE_OPEN	_IO('f', 50)
+#define FS_IOC_PFCACHE_CLOSE	_IO('f', 51)
+#define FS_IOC_PFCACHE_DUMP	_IO('f', 52)
+
+#define PFCACHE_CSUM_SIZE	20	/* SHA-1 (FIPS 180-1) */
+
+#define PFCACHE_XATTR_NAME	"trusted.pfcache"
+
+/* extendable FS_IOC_PFCACHE_DUMP argument, must be 32/64-bits compatible */
+struct pfcache_dump_request {
+	__u32	header_size;		/* this struct size */
+	__u32	buffer_size;		/* tail buffer size */
+	__u64	filter;			/* filter flags */
+	__u64	payload;		/* payload flags */
+	__u32	offset;			/* skip inodes, after filtering */
+	__u8	csum_filter[PFCACHE_CSUM_SIZE];
+	/* -- add fields above this line -- */
+	__u8	buffer[0];
+};
+
+/* to check for the presence of new fields */
+#define PFCACHE_DUMP_HAS(req, field)	((req)->header_size >= \
+		offsetof(typeof(*(req)), field) + sizeof((req)->field))
+
+/* filter bits, what to skip */
+#define PFCACHE_FILTER_WITH_CSUM	0x0001ll
+#define PFCACHE_FILTER_WITHOUT_CSUM	0x0002ll
+#define PFCACHE_FILTER_WITH_PEER	0x0004ll
+#define PFCACHE_FILTER_WITHOUT_PEER	0x0008ll
+#define PFCACHE_FILTER_COMPARE_CSUM	0x0010ll /* check csum_filter */
+#define PFCACHE_FILTER_MASK		0x001Fll /* all known filters */
+
+/* payload bits, what to dump */
+#define PFCACHE_PAYLOAD_CSUM		0x0001ll /* u8[EXT4_DATA_CSUM_SIZE] */
+#define PFCACHE_PAYLOAD_FHANDLE		0x0002ll /* struct file_handle */
+#define PFCACHE_PAYLOAD_STATE		0x0004ll /* u64 filter-state */
+#define PFCACHE_PAYLOAD_FSIZE		0x0008ll /* u64 file size */
+#define PFCACHE_PAYLOAD_PAGES		0x0010ll /* u64 page-cache size */
+#define PFCACHE_PAYLOAD_MASK		0x001Fll /* all known payloads */
+
+/* MAX_HANDLE_SZ */
+#define PFCACHE_FHANDLE_MAX		256
+
+/* see fs/fhandle.c */
+#define PFCACHE_FHANDLE_SIZE(ptr)	(*(__u32*)(ptr) + sizeof(__u32) * 2)
+
+/* all payload fields are aligned to an 8-byte boundary */
+#define PFCACHE_PAYLOAD_MAX_SIZE			\
+	(ALIGN(PFCACHE_CSUM_SIZE, sizeof(__u64)) +	\
+	 PFCACHE_FHANDLE_MAX +				\
+	 sizeof(__u64) * 3)
+
+#endif /* LINUX_PFCACHE_H */
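
The header above defines the whole pfcache ioctl ABI, and the header_size/PFCACHE_DUMP_HAS pair is a classic forward-compatibility idiom: user-space fills header_size with the struct size it was built against, and a newer kernel uses PFCACHE_DUMP_HAS() to check whether a given field was actually supplied before touching it. For illustration only (not part of the patch; the ioctl's exact return convention and the <linux/pfcache.h> install path are assumptions), a user-space sketch of requesting a dump:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/pfcache.h>		/* assumed install path of this header */

int main(int argc, char **argv)
{
	size_t buf_size = 64 * 1024;
	struct pfcache_dump_request *req;
	int fd, ret;

	fd = open(argv[1], O_RDONLY);	/* a file on the pfcache-enabled fs */
	if (fd < 0)
		return 1;

	req = calloc(1, sizeof(*req) + buf_size);
	req->header_size = sizeof(*req);	/* tells the kernel which fields we know */
	req->buffer_size = buf_size;
	req->filter = PFCACHE_FILTER_WITHOUT_CSUM;	/* skip inodes lacking a csum */
	req->payload = PFCACHE_PAYLOAD_CSUM | PFCACHE_PAYLOAD_FSIZE;

	ret = ioctl(fd, FS_IOC_PFCACHE_DUMP, req);	/* buffer layout follows the payload flags */
	printf("dump: %d\n", ret);
	free(req);
	close(fd);
	return 0;
}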
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/pid.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pid.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/pid.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pid.h	2015-01-21 12:02:44.714186814 +0300
@@ -60,6 +60,9 @@ struct pid
 	unsigned int level;
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ub;
+#endif
 	struct rcu_head rcu;
 	struct upid numbers[1];
 };
@@ -96,6 +99,12 @@ extern void change_pid(struct task_struc
 			struct pid *pid);
 extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 			 enum pid_type);
+extern void reattach_pid(struct task_struct *, struct pid *);
+extern int alloc_pidmap(struct pid_namespace *pid_ns);
+extern int set_pidmap(struct pid_namespace *pid_ns, pid_t pid);
+extern void free_pidmap(struct upid *upid);
+
+extern spinlock_t pidmap_lock;
 
 struct pid_namespace;
 extern struct pid_namespace init_pid_ns;
@@ -119,8 +128,13 @@ extern struct pid *find_get_pid(int nr);
 extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
 int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
 
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid);
 extern void free_pid(struct pid *pid);
+extern int pid_ns_attach_init(struct pid_namespace *, struct task_struct *);
+extern int pid_ns_attach_task(struct pid_namespace *, struct task_struct *);
+pid_t pid_to_vpid(pid_t nr);
+struct ve_struct;
+pid_t vpid_to_pid_ve(pid_t nr, struct ve_struct *);
 
 /*
  * ns_of_pid() returns the pid namespace in which the specified pid was
@@ -172,6 +186,7 @@ static inline pid_t pid_nr(struct pid *p
 
 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
 pid_t pid_vnr(struct pid *pid);
+pid_t ve_task_ppid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
 
 #define do_each_pid_task(pid, type, task)				\
 	do {								\
@@ -196,7 +211,7 @@ pid_t pid_vnr(struct pid *pid);
 		do {
 
 #define while_each_pid_thread(pid, type, task)				\
-		} while_each_thread(tg___, task);			\
+		} while_each_thread_ve(tg___, task);			\
 		task = tg___;						\
 	} while_each_pid_task(pid, type, task)
 #endif /* _LINUX_PID_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/pid_namespace.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pid_namespace.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/pid_namespace.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pid_namespace.h	2015-01-21 12:02:45.780158514 +0300
@@ -16,14 +16,24 @@ struct pidmap {
 
 struct bsd_acct_struct;
 
+/* pid namespace flags */
+
+/* if set, newly created pid namespaces get the PID_NS_HIDE_CHILD flag */
+#define PID_NS_HIDE_CHILD	0x00000001
+
+/* if set, newly created processes are invisible from the parent ns */
+#define PID_NS_HIDDEN		0x00000002
+
 struct pid_namespace {
 	struct kref kref;
 	struct pidmap pidmap[PIDMAP_ENTRIES];
 	int last_pid;
+	int pid_max;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
 	unsigned int level;
 	struct pid_namespace *parent;
+	unsigned flags;
 #ifdef CONFIG_PROC_FS
 	struct vfsmount *proc_mnt;
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/pipe_fs_i.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pipe_fs_i.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/pipe_fs_i.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pipe_fs_i.h	2015-01-21 12:02:49.386062785 +0300
@@ -145,6 +145,8 @@ void pipe_wait(struct pipe_inode_info *p
 struct pipe_inode_info * alloc_pipe_info(struct inode * inode);
 void free_pipe_info(struct inode * inode);
 void __free_pipe_info(struct pipe_inode_info *);
+int pipe_release(struct inode *inode, int decr, int decw);
+void swap_pipe_info(struct inode *, struct inode *);
 
 /* Generic pipe buffer ops functions */
 void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/compat.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/compat.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/compat.h	2015-01-21 12:02:55.360904177 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/compat.h	2015-01-21 12:02:57.754840634 +0300
@@ -0,0 +1,69 @@
+#ifndef _LINUX_PLOOP_COMPAT_H_
+#define _LINUX_PLOOP_COMPAT_H_
+
+#include <linux/version.h>
+
+/* Macros to provide a compatibility layer for 2.6.18, where the bio
+ * layer was different.
+ */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31)
+#define BIO_RW_SYNC BIO_RW_SYNCIO
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+#define DEFINE_BIO_CB(func) \
+static int func(struct bio *bio, unsigned int bytes_done, int err) { \
+	if (bio->bi_size) return 1;
+
+#define END_BIO_CB(func) return 0; }
+
+
+#define BIO_ENDIO(_bio, _err)  bio_endio((_bio), (_bio)->bi_size, (_err))
+
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
+
+
+#define F_DENTRY(file)	(file)->f_dentry
+#define F_MNT(file)	(file)->f_vfsmnt
+
+#define KOBJECT_INIT(_kobj, _ktype) do { \
+	(_kobj)->ktype = (_ktype); kobject_init(_kobj); } while (0)
+
+#define KOBJECT_ADD(_kobj, _parent, fmt, arg...) ({ \
+	struct kobject * _tmp = (_kobj); \
+	_tmp->parent = _parent; \
+	snprintf(_tmp->name, KOBJ_NAME_LEN, fmt, arg); \
+	kobject_add(_tmp); })
+
+#else
+
+#define DEFINE_BIO_CB(func) \
+static void func(struct bio *bio, int err) {
+
+#define END_BIO_CB(func)  }
+
+#define BIO_ENDIO(_queue, _bio, _err) ({ \
+	trace_block_bio_complete(_queue, _bio); \
+	bio_endio(_bio, _err); })
+
+#define F_DENTRY(file)	(file)->f_path.dentry
+#define F_MNT(file)	(file)->f_path.mnt
+
+#define KOBJECT_INIT(kobj, ktype) kobject_init(kobj, ktype)
+#define KOBJECT_ADD(kobj, parent, fmt, arg...) kobject_add(kobj, parent, fmt, arg)
+
+#endif
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,32)
+#define FOP_FSYNC(file, datasync) fsync(file, datasync)
+#else
+#define FOP_FSYNC(file, datasync) fsync(file, F_DENTRY(file), datasync)
+#endif
+
+#endif
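
To see how the bio-callback macros above are meant to be used: a hypothetical endio routine written once and compiled against either API. On pre-2.6.24 kernels DEFINE_BIO_CB expands to the three-argument form that returns 1 while bi_size is still non-zero; on newer kernels it expands to a plain void callback. Everything named my_* below is illustrative, not from the patch:

DEFINE_BIO_CB(my_read_endio)
	/* the body is shared between both expansions */
	struct my_request *req = bio->bi_private;	/* hypothetical container */

	if (err)
		req->error = err;
	if (atomic_dec_and_test(&req->pending))
		complete(&req->done);
	bio_put(bio);
END_BIO_CB(my_read_endio)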
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/internal.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/internal.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/internal.h	2015-01-21 12:02:55.361904150 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/internal.h	2015-01-21 12:02:55.360904177 +0300
@@ -0,0 +1,63 @@
+#ifndef _BIO_HELP_H_
+#define _BIO_HELP_H_ 1
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)
+
+struct bio_list
+{
+	struct bio *head;
+	struct bio *tail;
+};
+
+static inline int bio_list_empty(const struct bio_list *bl)
+{
+	return bl->head == NULL;
+}
+
+static inline void bio_list_init(struct bio_list *bl)
+{
+	bl->head = bl->tail = NULL;
+}
+
+#define bio_list_for_each(bio, bl) \
+	for (bio = (bl)->head; bio; bio = bio->bi_next)
+
+static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
+{
+	bio->bi_next = NULL;
+
+	if (bl->tail)
+		bl->tail->bi_next = bio;
+	else
+		bl->head = bio;
+
+	bl->tail = bio;
+}
+
+static inline struct bio *bio_list_pop(struct bio_list *bl)
+{
+	struct bio *bio = bl->head;
+
+	if (bio) {
+		bl->head = bl->head->bi_next;
+		if (!bl->head)
+			bl->tail = NULL;
+
+		bio->bi_next = NULL;
+	}
+
+	return bio;
+}
+
+static inline struct bio *bio_list_get(struct bio_list *bl)
+{
+	struct bio *bio = bl->head;
+
+	bl->head = bl->tail = NULL;
+
+	return bio;
+}
+
+#endif
+
+#endif
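
internal.h above backports the bio_list FIFO helpers from 2.6.31 for older kernels. A short sketch of the intended usage pattern, assuming a hypothetical per-bio handler my_handle_bio():

static void my_queue_and_run(struct bio *b1, struct bio *b2)
{
	struct bio_list bl;
	struct bio *bio;

	bio_list_init(&bl);		/* head = tail = NULL */
	bio_list_add(&bl, b1);		/* appended at the tail: FIFO order */
	bio_list_add(&bl, b2);

	while ((bio = bio_list_pop(&bl)) != NULL)
		my_handle_bio(bio);	/* bi_next was cleared by pop */
}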
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/ploop.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/ploop.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/ploop.h	2015-01-21 12:02:55.363904097 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/ploop.h	2015-01-21 12:02:57.809839173 +0300
@@ -0,0 +1,783 @@
+#ifndef _LINUX_PLOOP_H_
+#define _LINUX_PLOOP_H_
+
+#include <linux/rbtree.h>
+#include <linux/timer.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include "ploop_if.h"
+#include "compat.h"
+#include "internal.h"
+
+#define PLOOP_NAME_SIZE		64
+#define PLOOP_MAX_FORMATS	32
+#define PLOOP_DEVICE_MAJOR	182
+#define PLOOP_DEVICE_RANGE	(1UL << MINORBITS)
+#define PLOOP_PART_SHIFT	4
+#define PLOOP_PART_MAX		(1UL << PLOOP_PART_SHIFT)
+
+/* 1. fastpath_reqs is subtracted because they don't consume preq-s
+ * 2. typically, entry_qlen and bio_qlen are close to zero */
+#define PLOOP_CONGESTED(plo)    (plo->entry_qlen + plo->active_reqs - \
+				 plo->fastpath_reqs + plo->bio_qlen)
+/* 32 bits for virtual block. Enough. */
+typedef u32	cluster_t;
+typedef u32	iblock_t;
+
+struct ploop_request;
+struct ploop_delta;
+
+enum {
+	PLOOP_S_RUNNING,	/* Device is active */
+	PLOOP_S_ATTENTION,	/* Device is processing a barrier, everything
+				 * is queued to be totally serialized */
+	PLOOP_S_WAIT_PROCESS,	/* Main thread is waiting for requests */
+	PLOOP_S_EXITING,	/* Exiting */
+	PLOOP_S_ABORT,		/* Device is aborted due to unrecoverable
+				 * error. Reads are still allowed. */
+	PLOOP_S_SYNC,		/* Unplug was requested */
+	PLOOP_S_CHANGED,	/* Media changed */
+	PLOOP_S_WRITE_CONG,	/* Write direction was congested */
+	PLOOP_S_READ_CONG,	/* Read direction was congested */
+	PLOOP_S_TRACK,		/* Write tracker is ON */
+	PLOOP_S_TRACK_ABORT,	/* Write tracker is aborted */
+	PLOOP_S_ENOSPC_EVENT,	/* ENOSPC event happened but was not
+				 * consumed by userspace yet */
+	PLOOP_S_CONGESTED,	/* Too many bios submitted to us */
+	PLOOP_S_DISCARD,	/* ploop is ready to handle discard request */
+	PLOOP_S_DISCARD_LOADED,	/* A discard request was handled and
+				   free blocks loaded */
+	PLOOP_S_LOCKED,	        /* ploop is locked by userspace
+				   (for minor mgmt only) */
+	PLOOP_S_ONCE,	        /* An event (e.g. printk once) happened */
+};
+
+struct ploop_snapdata
+{
+	/* top_delta file reopened read-only. */
+	struct file		*file;
+};
+
+
+
+struct ploop_file
+{
+	struct list_head	list;
+
+	loff_t		vpos;	/* Position of this chunk in virtual map */
+	loff_t		start;	/* Start of data in this file, usually 0 */
+	loff_t		length;	/* Length of data in this file */
+	loff_t		limit;	/* Maximal size of this file. If it is
+				 * exceeded we must switch to the next chunk
+				 */
+	struct file		*file;	/* File */
+	struct address_space	*mapping;
+	struct inode		*inode;
+	struct extent_map_tree	*em_tree;
+	struct block_device	*bdev;
+	int flags; /* file flags */
+};
+
+/* Real functions are hidden deeply. :-)
+ *
+ * This struct describes how we do real IO on particular backing file.
+ */
+
+
+struct ploop_io
+{
+	struct ploop_device	*plo;
+
+	loff_t		       *size_ptr; /* NULL or points to ploop_mapping */
+	loff_t			prealloced_size;
+	struct ploop_request   *prealloc_preq;  /* preq that does the prealloc */
+	loff_t			max_size;	/* Infinity */
+	int			n_chunks;	/* 1. */
+	struct ploop_file	files;		/* Only 1 file is supported */
+
+	iblock_t		alloc_head;
+
+	struct list_head	fsync_queue;
+	struct task_struct	*fsync_thread;
+	int			fsync_qlen;
+	wait_queue_head_t	fsync_waitq;
+	struct timer_list	fsync_timer;
+
+	struct ploop_io_ops	*ops;
+};
+
+struct ploop_io_ops
+{
+	struct list_head	list;
+	unsigned int		id;
+	char			*name;
+	struct module		*owner;
+
+	void		(*unplug)(struct ploop_io *);
+	int		(*congested)(struct ploop_io *, int bits);
+
+	/* Allocate new block, return its index in image.
+	 * Data must be initialized to zeros and committed to disk.
+	 *
+	 * This function is slow and it is used only to allocate
+	 * index tables.
+	 */
+	int	(*alloc)(struct ploop_io *, loff_t pos, loff_t len);
+
+	/* These functions must schedule IO from/to disk.
+	 * If one returns 1, the write is not complete and the
+	 * preq has been added to some internal queue.
+	 *
+	 * submit() makes IO to already allocated space (preq->iblock)
+	 * and must fail when writing to unallocated area.
+	 *
+	 * submit_alloc() assumes that storage is not allocated and allocates
+	 * new area in image.
+	 */
+	void	(*submit)(struct ploop_io *, struct ploop_request *,
+			  unsigned long rw,
+			  struct bio_list *sbl, iblock_t iblk, unsigned int size);
+	void	(*submit_alloc)(struct ploop_io *, struct ploop_request *,
+				struct bio_list *sbl, unsigned int size);
+
+	int	(*disable_merge)(struct ploop_io * io, sector_t isector, unsigned int len);
+	int	(*fastmap)(struct ploop_io * io, struct bio *orig_bio,
+			   struct bio * bio, sector_t isec);
+
+	void	(*read_page)(struct ploop_io * io, struct ploop_request * preq,
+			     struct page * page, sector_t sec);
+	void	(*write_page)(struct ploop_io * io, struct ploop_request * preq,
+			      struct page * page, sector_t sec, int fua);
+
+
+	int	(*sync_read)(struct ploop_io * io, struct page * page,
+			     unsigned int len, unsigned int off, sector_t sec);
+	int	(*sync_write)(struct ploop_io * io, struct page * page,
+			      unsigned int len, unsigned int off, sector_t sec);
+
+
+	int	(*sync_readvec)(struct ploop_io * io, struct page ** pvec,
+				unsigned int nr, sector_t sec);
+	int	(*sync_writevec)(struct ploop_io * io, struct page ** pvec,
+				unsigned int nr, sector_t sec);
+
+	int	(*init)(struct ploop_io * io);
+	void	(*destroy)(struct ploop_io * io);
+	int	(*open)(struct ploop_io * io);
+	int	(*sync)(struct ploop_io * io);
+	int	(*stop)(struct ploop_io * io);
+	int	(*prepare_snapshot)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*complete_snapshot)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*prepare_merge)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*start_merge)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*truncate)(struct ploop_io *, struct file *, __u32 alloc_head);
+	void	(*queue_settings)(struct ploop_io *, struct request_queue *q);
+
+	void	(*issue_flush)(struct ploop_io*, struct ploop_request * preq);
+
+	int	(*dump)(struct ploop_io*);
+
+	loff_t  (*i_size_read)(struct ploop_io*);
+	fmode_t (*f_mode)(struct ploop_io*);
+
+	int     (*autodetect)(struct ploop_io * io);
+};
+
+static inline loff_t generic_i_size_read(struct ploop_io *io)
+{
+	BUG_ON(!io->files.file);
+	BUG_ON(!io->files.inode);
+
+	return i_size_read(io->files.inode);
+}
+static inline fmode_t generic_f_mode(struct ploop_io *io)
+{
+	BUG_ON(!io->files.file);
+
+	return io->files.file->f_mode;
+}
+
+enum {
+	PLOOP_MAP_IDENTICAL,
+	PLOOP_MAP_DEAD,
+};
+
+#define PLOOP_LRU_BUFFER	8
+
+struct ploop_map
+{
+	struct ploop_device	*plo;
+	struct list_head	delta_list;
+
+	struct rb_root		rb_root;
+	unsigned long		flags;
+	unsigned long		last_activity;
+
+	unsigned int		pages;
+	unsigned int		max_index;
+
+	struct map_node		*lru_buffer[PLOOP_LRU_BUFFER];
+	unsigned int		lru_buffer_ptr;
+
+	wait_queue_head_t	destroy_waitq;
+};
+
+#define PLOOP_FMT_CAP_DELTA	1
+#define PLOOP_FMT_CAP_WRITABLE	2
+#define PLOOP_FMT_CAP_IDENTICAL	4
+
+struct ploop_delta_ops
+{
+	struct list_head	list;
+	unsigned int		id;
+	char			*name;
+	struct module		*owner;
+
+	unsigned int		capability;
+
+	/* Return location of index page */
+	int		(*map_index)(struct ploop_delta *, unsigned long index,
+				     sector_t *sec);
+	void		(*read_index)(struct ploop_delta *, struct ploop_request * preq,
+				      struct page * page, sector_t sec);
+
+	/* Allocate new block in delta and write request there.
+	 * If request does not cover whole block, this function
+	 * must pad with zeros
+	 */
+	void		(*allocate)(struct ploop_delta *, struct ploop_request *,
+				    struct bio_list *sbl, unsigned int size);
+	void		(*allocate_complete)(struct ploop_delta *, struct ploop_request *);
+
+	int		(*compose)(struct ploop_delta *, int, struct ploop_ctl_chunk *);
+	int		(*open)(struct ploop_delta *);
+	void		(*destroy)(struct ploop_delta *);
+	int		(*start)(struct ploop_delta *);
+	int		(*stop)(struct ploop_delta *);
+	int		(*refresh)(struct ploop_delta *);
+	int		(*sync)(struct ploop_delta *);
+	int		(*prepare_snapshot)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*complete_snapshot)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*prepare_merge)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*start_merge)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*truncate)(struct ploop_delta *, struct file *, __u32 alloc_head);
+	int		(*prepare_grow)(struct ploop_delta *, u64 *new_size, int *reloc);
+	int		(*complete_grow)(struct ploop_delta *, u64 new_size);
+};
+
+/* Virtual image. */
+struct ploop_delta
+{
+	struct list_head	list;
+
+	int			level;		/* Level of delta. 0 is base image */
+	unsigned int		cluster_log;	/* In 512=1<<9 byte sectors */
+	unsigned int		flags;
+
+	struct ploop_device	*plo;
+
+	struct ploop_io		io;
+
+	void			*priv;
+
+	struct ploop_delta_ops	*ops;
+
+	struct kobject		kobj;
+
+	u64			max_delta_size; /* in sectors */
+};
+
+struct ploop_tunable
+{
+	int	max_requests;
+	int	batch_entry_qlen;
+	int	batch_entry_delay;
+	int	fsync_max;
+	int	fsync_delay;
+	int	min_map_pages;
+	int	max_map_inactivity;
+	int	congestion_high_watermark;
+	int	congestion_low_watermark;
+	int	max_active_requests;
+	unsigned int pass_flushes : 1, pass_fuas : 1,
+		     congestion_detection : 1,
+		     check_zeros : 1,
+		     disable_root_threshold : 1,
+		     disable_user_threshold : 1;
+};
+
+#define DEFAULT_PLOOP_MAXRQ 256
+#define DEFAULT_PLOOP_BATCH_ENTRY_QLEN 32
+
+#define DEFAULT_PLOOP_TUNE \
+(struct ploop_tunable) { \
+.max_requests = DEFAULT_PLOOP_MAXRQ, \
+.batch_entry_qlen = 32, \
+.batch_entry_delay = HZ/20, \
+.fsync_max = DEFAULT_PLOOP_BATCH_ENTRY_QLEN, \
+.fsync_delay = HZ/10, \
+.min_map_pages = 32, \
+.max_map_inactivity = 10*HZ, \
+.congestion_high_watermark = 3*DEFAULT_PLOOP_MAXRQ/4, \
+.congestion_low_watermark = DEFAULT_PLOOP_MAXRQ/2, \
+.pass_flushes = 1, \
+.pass_fuas = 1, \
+.check_zeros = 1, \
+.max_active_requests = DEFAULT_PLOOP_BATCH_ENTRY_QLEN / 2, }
+
+struct ploop_stats
+{
+#define __DO(_at)	__u32	_at;
+#include "ploop_stat.h"
+#undef __DO
+};
+
+struct ploop_freeblks_desc;
+
+struct ploop_device
+{
+	unsigned long		state;
+	spinlock_t		lock;
+
+	struct list_head	free_list;
+	struct list_head	entry_queue;
+	int			entry_qlen;
+	int			read_sync_reqs;
+
+	struct bio		*bio_head;
+	struct bio		*bio_tail;
+	struct bio		*bio_sync;
+	struct bio_list		bio_discard_list;
+	int			bio_discard_qlen;
+	int			bio_qlen;
+	int			bio_total;
+
+	struct rb_root		entry_tree[2];
+
+	struct list_head	ready_queue;
+
+	struct rb_root		lockout_tree;
+
+	int			cluster_log;
+	int			fmt_version;
+
+	int			active_reqs;
+	int			fastpath_reqs;
+	int			barrier_reqs;
+
+	struct bio		*cached_bio;
+
+	struct timer_list	mitigation_timer;
+	struct timer_list	freeze_timer;
+
+	wait_queue_head_t	waitq;
+	wait_queue_head_t	req_waitq;
+	wait_queue_head_t	freeze_waitq;
+	wait_queue_head_t	event_waitq;
+
+	struct ploop_map	map;
+	struct ploop_map	*trans_map;
+
+	struct ploop_tunable	tune;
+
+	int			index;
+	struct mutex		ctl_mutex;
+	atomic_t		open_count;
+	u64			bd_size;
+	struct gendisk		*disk;
+	struct block_device	*bdev;
+	struct request_queue	*queue;
+	struct task_struct	*thread;
+	struct rb_node		link;
+
+	/* whoever wants to quiesce the state machine waits
+	 * here for a signal from the state machine saying that
+	 * processing has reached a PLOOP_REQ_BARRIER request */
+	struct completion	*quiesce_comp;
+
+	/* the state machine in 'quiesce' state waits here until
+	 * someone calls ploop_relax() */
+	struct completion	relax_comp;
+
+	/* whoever calls ploop_relax() waits here to learn
+	 * that the 'relax' really happened and the state machine is
+	 * ready for the next ploop_quiesce(). This is important
+	 * because someone might call ploop_quiesce() immediately
+	 * after ploop_relax() succeeded */
+	struct completion	relaxed_comp;
+
+	spinlock_t		track_lock;
+	struct rb_root		track_tree;
+	sector_t		track_end;
+	u32			track_cluster;
+	u32			track_ptr;
+
+	u32			merge_ptr;
+
+	atomic_t		maintenance_cnt;
+	struct completion	maintenance_comp;
+	int			maintenance_type;
+
+	u32			grow_start;
+	u32			grow_end;
+	u32			grow_relocated;
+	u64			grow_new_size;
+
+	spinlock_t		dummy_lock;
+	struct mutex		sysfs_mutex;
+	struct kobject		kobj;
+	struct kobject		*pstat_dir;
+	struct kobject		*pstate_dir;
+	struct kobject		*ptune_dir;
+
+	struct ploop_stats	st;
+	char                    cookie[PLOOP_COOKIE_SIZE];
+
+	struct ploop_freeblks_desc *fbd;
+
+	unsigned long		locking_state; /* plo locked by userspace */
+};
+
+enum
+{
+	PLOOP_REQ_LOCKOUT,	/* This preq is locking overlapping requests */
+	PLOOP_REQ_SYNC,
+	PLOOP_REQ_BARRIER,
+	PLOOP_REQ_UNSTABLE,
+	PLOOP_REQ_TRACK,
+	PLOOP_REQ_SORTED,
+	PLOOP_REQ_TRANS,
+	PLOOP_REQ_MERGE,
+	PLOOP_REQ_RELOC_A,	/* 'A' stands for allocate() */
+	PLOOP_REQ_RELOC_S,	/* 'S' stands for submit() */
+	PLOOP_REQ_ZERO,
+	PLOOP_REQ_DISCARD,
+	PLOOP_REQ_RSYNC,
+};
+
+enum
+{
+	PLOOP_E_ENTRY,		/* Not yet processed */
+	PLOOP_E_COMPLETE,	/* Complete. Maybe, with an error */
+	PLOOP_E_RELOC_COMPLETE,	/* Reloc complete. Maybe, with an error */
+	PLOOP_E_INDEX_READ,	/* Reading an index page */
+	PLOOP_E_TRANS_INDEX_READ,/* Reading a trans index page */
+	PLOOP_E_DELTA_READ,	/* Write request reads data from previous delta */
+	PLOOP_E_DELTA_COPIED,	/* Data from previous delta was copied */
+	PLOOP_E_TRANS_DELTA_READ,/* Write request reads data from trans delta */
+	PLOOP_E_RELOC_DATA_READ,/* Read user data to relocate */
+	PLOOP_E_RELOC_NULLIFY,  /* Zeroing relocated block is in progress */
+	PLOOP_E_INDEX_DELAY,	/* Index update is blocked by an already
+				 * queued index update.
+				 */
+	PLOOP_E_INDEX_WB,	/* Index writeback is in progress */
+	PLOOP_E_DATA_WBI,	/* Data writeback is in progress and index
+				 * is not updated.
+				 */
+	PLOOP_E_ZERO_INDEX,	/* Zeroing index of free block; original request
+				   can use .submit on completion */
+	PLOOP_E_DELTA_ZERO_INDEX,/* the same but for PLOOP_E_DELTA_READ */
+};
+
+#define BIO_BDEV_REUSED	14	/* io_context is stored in bi_bdev */
+
+struct ploop_request
+{
+	struct list_head	list;	/* List link.
+					 * Req can be on
+					 * - free list
+					 * - entry queue
+					 * - ready queue
+					 * - delay_list of another request
+					 * - nowhere
+					 */
+
+	struct ploop_device	*plo;
+
+	cluster_t		req_cluster;
+	sector_t		req_sector;
+	unsigned int		req_size;
+	unsigned int		req_rw;
+	unsigned long		tstamp;
+	struct io_context	*ioc;
+
+	struct bio_list		bl;
+
+	struct bio		*aux_bio;
+
+	atomic_t		io_count;
+
+	unsigned long		state;
+	unsigned long		eng_state;
+	int			error;
+
+	struct map_node		*map;
+	struct map_node		*trans_map;
+
+	iblock_t		iblock;
+
+	/* relocation info */
+	iblock_t		src_iblock;
+	iblock_t		dst_iblock;
+	cluster_t		dst_cluster;
+	struct rb_node		reloc_link;
+
+	/* State specific information */
+	union {
+		/* E_INDEX_READ */
+		struct {
+			struct page	* tpage;
+			int		level;
+		} ri;
+
+		/* E_INDEX_WB */
+		struct {
+			struct page	* tpage;
+		} wi;
+	} sinfo;
+
+	u64			verf;
+
+	/* List of requests blocked until completion of this request. */
+	struct list_head	delay_list;
+
+	/* Link to tree of "blocking requests". A blocking request
+	 * is a request which triggers a change in the image format
+	 * that does not allow requests to the same area to proceed.
+	 * E.g. when we do not have a mapping in the delta and the request
+	 * requires a copy of a data block from the previous delta, this
+	 * request locks all subsequent requests to the same virtual block
+	 * until we allocate and initialize the block in the delta.
+	 */
+	struct rb_node		lockout_link;
+
+	u32			track_cluster;
+
+	/* # bytes in tail of image file to prealloc on behalf of this preq */
+	loff_t			prealloc_size;
+};
+
+static inline struct ploop_delta * ploop_top_delta(struct ploop_device * plo)
+{
+	return list_empty(&plo->map.delta_list) ? NULL :
+		list_first_entry(&plo->map.delta_list,
+				 struct ploop_delta, list);
+}
+
+static inline struct ploop_delta * map_top_delta(struct ploop_map * map)
+{
+	return list_first_entry(&map->delta_list, struct ploop_delta, list);
+}
+
+void ploop_complete_io_state(struct ploop_request * preq);
+void ploop_fail_request(struct ploop_request * preq, int err);
+void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list,
+		      int keep_locked);
+
+static inline void ploop_set_error(struct ploop_request * preq, int err)
+{
+	if (!preq->error) {
+		preq->error = err;
+		if (!test_bit(PLOOP_S_ABORT, &preq->plo->state)) {
+			if (err != -ENOSPC) {
+				printk("ploop_set_error=%d on ploop%d\n",
+				       err, preq->plo->index);
+				return;
+			}
+			printk("No space left on device! Either free some "
+			       "space on disk or abort ploop%d manually.\n",
+				preq->plo->index);
+		}
+	}
+}
+
+static inline void ploop_prepare_io_request(struct ploop_request * preq)
+{
+	atomic_set(&preq->io_count, 1);
+}
+
+static inline void ploop_complete_io_request(struct ploop_request * preq)
+{
+	if (atomic_dec_and_test(&preq->io_count))
+		ploop_complete_io_state(preq);
+}
+
+static inline void ploop_prepare_tracker(struct ploop_request * preq,
+					 sector_t sec)
+{
+	if (unlikely(test_bit(PLOOP_S_TRACK, &preq->plo->state))) {
+		BUG_ON(test_bit(PLOOP_REQ_TRACK, &preq->state));
+		set_bit(PLOOP_REQ_TRACK, &preq->state);
+		preq->track_cluster = sec >> preq->plo->cluster_log;
+	}
+}
+
+void ploop_tracker_notify(struct ploop_device *, sector_t sec);
+
+static inline void ploop_acc_ff_in_locked(struct ploop_device *plo,
+					  unsigned long rw)
+{
+	if (unlikely(rw & BIO_FLUSH))
+		plo->st.bio_flush_in++;
+	if (unlikely(rw & BIO_FUA))
+		plo->st.bio_fua_in++;
+}
+static inline void ploop_acc_ff_in(struct ploop_device *plo,
+				   unsigned long rw)
+{
+	if (unlikely(rw & BIO_FLUSH)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_flush_in++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+	if (unlikely(rw & BIO_FUA)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_fua_in++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+}
+static inline void ploop_acc_ff_out_locked(struct ploop_device *plo,
+					   unsigned long rw)
+{
+	if (unlikely(rw & BIO_FLUSH))
+		plo->st.bio_flush_out++;
+	if (unlikely(rw & BIO_FUA))
+		plo->st.bio_fua_out++;
+}
+static inline void ploop_acc_ff_out(struct ploop_device *plo,
+				    unsigned long rw)
+{
+	if (unlikely(rw & BIO_FLUSH)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_flush_out++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+	if (unlikely(rw & BIO_FUA)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_fua_out++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+}
+static inline void ploop_acc_flush_skip_locked(struct ploop_device *plo,
+					       unsigned long rw)
+{
+	if (unlikely(rw & BIO_FLUSH))
+		plo->st.bio_flush_skip++;
+}
+
+static inline void ploop_entry_add(struct ploop_device * plo, struct ploop_request * preq)
+{
+	list_add_tail(&preq->list, &plo->entry_queue);
+	plo->entry_qlen++;
+	if (test_bit(PLOOP_REQ_SYNC, &preq->state) && (!(preq->req_rw & WRITE) || (preq->req_rw & (BIO_FLUSH|BIO_FUA)))) {
+		__set_bit(PLOOP_REQ_RSYNC, &preq->state);
+		plo->read_sync_reqs++;
+	}
+}
+
+static inline void ploop_entry_qlen_dec(struct ploop_request * preq)
+{
+	preq->plo->entry_qlen--;
+	if (test_bit(PLOOP_REQ_RSYNC, &preq->state)) {
+		__clear_bit(PLOOP_REQ_RSYNC, &preq->state);
+		preq->plo->read_sync_reqs--;
+	}
+}
+
+static inline int ploop_map_log(struct ploop_device *plo)
+{
+	switch (plo->fmt_version) {
+	case PLOOP_FMT_V1:
+		return plo->cluster_log;
+	case PLOOP_FMT_V2:
+		return 0;
+	default:
+		BUG();
+	}
+
+	return -1;
+}
+
+struct map_node;
+
+int ploop_fastmap(struct ploop_map * map, cluster_t block, iblock_t *result);
+void ploop_update_map(struct ploop_map * map, int level, cluster_t block, iblock_t iblk);
+void ploop_update_map_hdr(struct ploop_map * map, u8 *hdr, int hdr_size);
+void map_release(struct map_node * m);
+int ploop_find_map(struct ploop_map * map, struct ploop_request * preq);
+int ploop_find_trans_map(struct ploop_map * map, struct ploop_request * preq);
+int ploop_check_map(struct ploop_map * map, struct ploop_request * preq);
+cluster_t map_get_mn_end(struct map_node *m);
+int map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result);
+int trans_map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result);
+int map_index_fault(struct ploop_request * preq);
+void map_read_complete(struct ploop_request * preq);
+int map_index(struct ploop_delta * delta, struct ploop_request * preq, unsigned long *sec);
+struct ploop_delta * map_writable_delta(struct ploop_request * preq);
+void map_init(struct ploop_device *, struct ploop_map * map);
+void ploop_map_start(struct ploop_map * map, u64 bd_size);
+void ploop_map_destroy(struct ploop_map * map);
+void ploop_map_remove_delta(struct ploop_map * map, int level);
+void ploop_index_update(struct ploop_request * preq);
+void ploop_index_wb_complete(struct ploop_request * preq);
+int __init ploop_map_init(void);
+void ploop_map_exit(void);
+
+
+void ploop_quiesce(struct ploop_device * plo);
+void ploop_relax(struct ploop_device * plo);
+
+void track_init(struct ploop_device * plo);
+int ploop_tracker_destroy(struct ploop_device *plo, int force);
+int ploop_tracker_stop(struct ploop_device * plo, int force);
+int ploop_tracker_read(struct ploop_device * plo, unsigned long arg);
+int ploop_tracker_setpos(struct ploop_device * plo, unsigned long arg);
+int ploop_tracker_init(struct ploop_device * plo, unsigned long arg);
+
+
+int ploop_add_lockout(struct ploop_request *preq, int try);
+void del_lockout(struct ploop_request *preq);
+
+int ploop_io_init(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc);
+int ploop_io_open(struct ploop_io *);
+void ploop_io_destroy(struct ploop_io * io);
+void ploop_io_report_fn(struct file * file, char * msg);
+
+int ploop_register_format(struct ploop_delta_ops * ops);
+int ploop_register_io(struct ploop_io_ops * ops);
+void ploop_unregister_format(struct ploop_delta_ops * ops);
+void ploop_unregister_io(struct ploop_io_ops * ops);
+void ploop_format_put(struct ploop_delta_ops * ops);
+
+extern struct kobj_type ploop_delta_ktype;
+void ploop_sysfs_init(struct ploop_device * plo);
+void ploop_sysfs_uninit(struct ploop_device * plo);
+
+void ploop_queue_zero_request(struct ploop_device *plo, struct ploop_request *orig_preq, cluster_t clu);
+
+int ploop_maintenance_wait(struct ploop_device * plo);
+
+extern int max_map_pages;
+
+extern void ploop_msg_once(struct ploop_device *plo, const char *, ...)
+	__attribute__ ((format (printf, 2, 3)));
+
+/* Define PLOOP_TRACE to get full trace of ploop state machine.
+ */
+#undef PLOOP_TRACE
+
+
+#ifdef PLOOP_TRACE
+#define __TRACE(a...)  do { printk(a); } while (0)
+#else
+#define __TRACE(a...)  do { } while (0)
+#endif
+
+#endif /* _LINUX_PLOOP_H_ */
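
One detail of the header above worth spelling out is the io_count idiom behind ploop_prepare_io_request()/ploop_complete_io_request(): the submitter holds an initial reference of 1, every in-flight sub-IO holds one more, and whoever drops the count to zero runs ploop_complete_io_state() exactly once. A sketch under those assumptions (my_submit_one is hypothetical; the real io_ops implementations live elsewhere in the patch):

static void my_submit(struct ploop_io *io, struct ploop_request *preq,
		      struct bio_list *sbl)
{
	struct bio *bio;

	ploop_prepare_io_request(preq);		/* io_count = 1, held by us */

	bio_list_for_each(bio, sbl) {
		atomic_inc(&preq->io_count);	/* one reference per sub-IO */
		my_submit_one(io, preq, bio);	/* its completion path calls
						 * ploop_complete_io_request() */
	}

	ploop_complete_io_request(preq);	/* drop the submitter's reference;
						 * completes the preq if all
						 * sub-IOs already finished */
}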
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/ploop_if.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/ploop_if.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/ploop_if.h	2015-01-21 12:02:55.363904097 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/ploop_if.h	2015-01-21 12:02:57.809839173 +0300
@@ -0,0 +1,326 @@
+#ifndef __PLOOP_IF_H__
+#define __PLOOP_IF_H__ 1
+
+#include <linux/ioctl.h>
+
+/* This interface mixes data relevant to the delta layer and the io layer
+ * into one request. It is too simplistic.
+ *
+ * But it allows creating the whole delta atomically and does not
+ * require maintenance of incomplete composition state inside the device.
+ */
+
+/* Formats of deltas. */
+
+#define PLOOP_FMT_RAW		1
+#define PLOOP_FMT_PLOOP1	2
+
+/* PLOOP_FMT_PLOOP1 subversions */
+enum {
+	PLOOP_FMT_UNDEFINED = 0,
+	PLOOP_FMT_V1,
+	PLOOP_FMT_V2,
+};
+
+/* Delta flags. */
+#define PLOOP_FMT_RDONLY	1
+#define PLOOP_FMT_FLAGS		1
+
+#define PLOOP_FLAG_FS_SYNC	0x10000000
+
+#define PLOOP_FMT_PREALLOCATED	2
+
+#define PLOOP_FLAG_COOKIE	4
+#define PLOOP_COOKIE_SIZE	64
+
+#define PLOOP_FLAG_CLUBLKS	8
+
+/* IO types. */
+
+#define PLOOP_IO_AUTO		0
+#define PLOOP_IO_DIRECT		1
+#define PLOOP_IO_NFS		2
+#define PLOOP_IO_RESERVED	3	/* reserved, do not use */
+#define PLOOP_IO_KAIO		4
+
+/*
+ * # slots to skip in the very first page of L2 table
+ * (they are reserved for format-specific header)
+ * Assumptions:
+ * 1) sizeof(map_index_t) == sizeof(u32)
+ * 2) PLOOP_MAP_OFFSET == sizeof(struct ploop_pvd_header) / sizeof(u32)
+ */
+#define PLOOP_MAP_OFFSET	16
+
+/*
+ * The in-kernel ploop implementation assumes that L2[index] can never be
+ * equal to this value (this is guaranteed by the limitation of bdsize).
+ * So, in-kernel ploop may encode L2[index] == 0 by this value and keep
+ * the zero value as a special one meaning "no iblock is allocated yet for
+ * the given index". User-space may use this value to denote uninitialized
+ * slots of the L2[] table.
+ */
+#define PLOOP_ZERO_INDEX	0xFFFFFFFFU
+
+struct ploop_ctl_chunk
+{
+	__s32	pctl_fd;	/* FD of backing file */
+	__u32	pctl_type;	/* IO engine */
+	__u32	pctl_flags;	/* Some modifiers, undefined now */
+	__u32	pctl_offset;	/* Starting cluster of this chunk in image */
+
+	__u64	pctl_start;	/* Position of data in file.  */
+	__u64	pctl_len;	/* Length of data area in file. */
+} __attribute__ ((aligned (8)));
+
+struct ploop_ctl
+{
+	/* Description of delta format */
+	__u32	pctl_format;
+	__u32	pctl_flags;
+	__u32	pctl_cluster_log;
+	__u32	pctl_size;
+
+	/* Description of backing files. */
+	__u16	pctl_chunks;
+	__u8	pctl_level;
+	__u8	__mbz1;
+	__u32	__mbz2;
+	struct ploop_ctl_chunk chunks[0];
+} __attribute__ ((aligned (8)));
+
+/* helper for ADD_DELTA */
+struct ploop_ctl_delta {
+	struct ploop_ctl c;
+	struct ploop_ctl_chunk f;
+};
+
+struct ploop_truncate_ctl
+{
+	int	fd;
+	__u32	alloc_head;
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+} __attribute__ ((aligned (8)));
+
+
+/*
+ * Before relocation l2[req_cluster] == old_iblk.
+ * Then user-space decides to relocate old_iblk to new_iblk.
+ * After relocation is done, we need the kernel's help to update the
+ * map_node structure for req_cluster (if present). Once the kernel
+ * has accomplished this, user-space may safely nullify old_iblk.
+ */
+struct reloc_map
+{
+	__u32 req_cluster;
+	__u32 iblk;
+} __attribute__ ((aligned (8)));
+
+struct ploop_index_update_ctl
+{
+	__u32	n_maps;
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+	struct reloc_map rmap[0];
+} __attribute__ ((aligned (8)));
+
+/*
+ * User-space found out that some blocks are not used
+ * and reports the list of them to the kernel. From then on,
+ * the kernel will use them as free blocks instead of the
+ * alloc_head++ technique.
+ */
+struct ploop_freeblks_ctl_extent
+{
+	__u32 clu;
+	__u32 iblk;
+	__u32 len;
+
+} __attribute__ ((aligned (8)));
+
+struct ploop_freeblks_ctl
+{
+	__u32	n_extents;
+	__u32	alloc_head; /* out */
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+	struct ploop_freeblks_ctl_extent extents[0];
+} __attribute__ ((aligned (8)));
+
+struct ploop_relocblks_ctl_extent
+{
+	__u32 clu;
+	__u32 iblk;
+	__u32 len;
+	__u32 free; /* this extent is also present in freemap */
+} __attribute__ ((aligned (8)));
+
+struct ploop_relocblks_ctl
+{
+	__u32	n_extents;
+	__u32	n_scanned;  /* # bytes scanned */
+	__u32	alloc_head; /* in, for sanity check */
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+	struct ploop_relocblks_ctl_extent extents[0];
+} __attribute__ ((aligned (8)));
+
+struct ploop_balloon_ctl
+{
+	__u32	mntn_type;     /* see enum above */
+	__u32	alloc_head;    /* frozen alloc_head */
+	__u8	level;	       /* top-level of ploop device */
+	__u8	inflate;       /* inflate/truncate flag */
+	__u8	keep_intact;   /* keep mntn state intact */
+	__u8	__mbz;
+} __attribute__ ((aligned (8)));
+
+struct ploop_getdevice_ctl
+{
+	__u32	minor;
+	__u32	__mbz1;
+} __attribute__ ((aligned (8)));
+
+/* maintenance types */
+enum {
+	PLOOP_MNTN_OFF = 0,  /* no maintenance is in progress */
+	PLOOP_MNTN_BALLOON,  /* user-space started ballooning */
+	PLOOP_MNTN_FBLOADED, /* list of free-blocks loaded */
+	PLOOP_MNTN_SNAPSHOT, /* bdev is frozen due to snapshot */
+
+	PLOOP_MNTN_TRACK,    /* tracking is in progress */
+	PLOOP_MNTN_DISCARD,  /* ready to handle discard requests */
+
+	PLOOP_MNTN_NOFAST = 256,
+	/* all types below require fast-path to be disabled! */
+
+	PLOOP_MNTN_MERGE,    /* merge is in progress */
+	PLOOP_MNTN_GROW,     /* grow is in progress */
+	PLOOP_MNTN_RELOC,    /* relocation is in progress */
+};
+
+/*
+ * This define should be in sync with enum above.
+ * NB: PLOOP_MNTN_TRACK is handled separately because
+ * READ-requests may go fast-path even while tracking.
+ */
+#define FAST_PATH_DISABLED(t) (t > PLOOP_MNTN_NOFAST)
+
+#define PLOOPCTLTYPE	'P'
+
+/* Add delta. Device must be offline */
+#define PLOOP_IOC_ADD_DELTA	_IOW(PLOOPCTLTYPE, 0, struct ploop_ctl)
+
+/* Close images, free all data, return the device to initial state  */
+#define PLOOP_IOC_CLEAR		_IO(PLOOPCTLTYPE, 1)
+
+/* Stop/start device. */
+#define PLOOP_IOC_STOP		_IO(PLOOPCTLTYPE, 2)
+#define PLOOP_IOC_START		_IO(PLOOPCTLTYPE, 3)
+
+/* Make new snapshot on running device */
+#define PLOOP_IOC_SNAPSHOT	_IOW(PLOOPCTLTYPE, 4, struct ploop_ctl)
+
+/* Remove delta. Argument is delta level. */
+#define PLOOP_IOC_DEL_DELTA	_IOW(PLOOPCTLTYPE, 5, __u32)
+
+struct ploop_track_extent
+{
+	__u64	start;
+	__u64	end;
+};
+
+/* Start tracking of top delta image. */
+#define PLOOP_IOC_TRACK_INIT	_IOR(PLOOPCTLTYPE, 6, struct ploop_track_extent)
+
+/* Stop tracking of the top delta image. It is the responsibility of the
+ * caller to quiesce the device before stopping tracking. The ioctl will
+ * fail if tracking was aborted or if not all dirty bits have been read.
+ */
+#define PLOOP_IOC_TRACK_STOP	_IO(PLOOPCTLTYPE, 7)
+
+/* Abort tracker, clear the state */
+#define PLOOP_IOC_TRACK_ABORT	_IO(PLOOPCTLTYPE, 8)
+
+/* User -> ploop : transferred up to this position */
+#define PLOOP_IOC_TRACK_SETPOS	_IOW(PLOOPCTLTYPE, 9, __u64)
+
+/* ploop -> user: get modified bits */
+#define PLOOP_IOC_TRACK_READ	_IOR(PLOOPCTLTYPE, 10, struct ploop_track_extent)
+
+/* sync cacheable state of deltas to disk */
+#define PLOOP_IOC_SYNC		_IO(PLOOPCTLTYPE, 11)
+
+/* Merge top delta to lower one and delete it. */
+#define PLOOP_IOC_MERGE		_IO(PLOOPCTLTYPE, 12)
+
+/* Replace alive delta with equivalent one. */
+#define PLOOP_IOC_REPLACE_DELTA	_IOW(PLOOPCTLTYPE, 13, struct ploop_ctl)
+
+/* Truncate a delta file at the given level. */
+#define PLOOP_IOC_TRUNCATE	_IOW(PLOOPCTLTYPE, 14, struct ploop_truncate_ctl)
+
+/* Update in-core copy of L2 table */
+#define PLOOP_IOC_UPDATE_INDEX  _IOW(PLOOPCTLTYPE, 16, struct ploop_index_update_ctl)
+
+/* Increase size of block device */
+#define PLOOP_IOC_GROW		_IOW(PLOOPCTLTYPE, 17, struct ploop_ctl)
+
+/* Inquire current state of free block extents */
+#define PLOOP_IOC_FBGET		_IOW(PLOOPCTLTYPE, 18, struct ploop_freeblks_ctl)
+
+/* Start ballooning, inquire maintenance_type, or flush stale BALLOON state */
+#define PLOOP_IOC_BALLOON	_IOW(PLOOPCTLTYPE, 19, struct ploop_balloon_ctl)
+
+/* Load free blocks to ploop */
+#define PLOOP_IOC_FREEBLKS      _IOW(PLOOPCTLTYPE, 20, struct ploop_freeblks_ctl)
+
+/* Load blocks to relocate and initiate relocation process */
+#define PLOOP_IOC_RELOCBLKS     _IOW(PLOOPCTLTYPE, 21, struct ploop_relocblks_ctl)
+
+/* Search ploop_device global tree for first unused minor number */
+#define PLOOP_IOC_GETDEVICE    _IOW(PLOOPCTLTYPE, 22, struct ploop_getdevice_ctl)
+
+/* Start handling discard requests */
+#define PLOOP_IOC_DISCARD_INIT _IO(PLOOPCTLTYPE, 23)
+/* Stop handling discard requests */
+#define PLOOP_IOC_DISCARD_FINI _IO(PLOOPCTLTYPE, 24)
+/* Wait for a discard request */
+#define PLOOP_IOC_DISCARD_WAIT _IO(PLOOPCTLTYPE, 25)
+
+/* Drop current state of free block extents */
+#define PLOOP_IOC_FBDROP	_IO(PLOOPCTLTYPE, 26)
+
+/* Filter extents with sizes less than arg */
+#define PLOOP_IOC_FBFILTER	_IOR(PLOOPCTLTYPE, 27, unsigned long)
+
+/* Set maximum size for the top delta. */
+#define PLOOP_IOC_MAX_DELTA_SIZE _IOW(PLOOPCTLTYPE, 28, __u64)
+
+/* Events exposed via /sys/block/ploopN/pstate/event */
+#define PLOOP_EVENT_ABORTED	1
+#define PLOOP_EVENT_STOPPED	2
+#define PLOOP_EVENT_ENOSPC	3
+
+#ifdef __KERNEL__
+
+#define PLOOP_INTERNAL_MAGIC	0x284cd32c
+struct ploop_xops
+{
+	__u32		magic;
+
+	int		(*get_extent)(struct inode *inode, sector_t isec,
+				      unsigned int nr, sector_t *start,
+				      sector_t *psec, int creat);
+};
+
+#define PLOOP_IOC_INTERNAL	_IOR(PLOOPCTLTYPE, 15, struct ploop_xops)
+
+#endif
+
+#endif /* __PLOOP_IF_H__ */
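
Most of the PLOOP_IOC_* commands above take one of the control structs defined earlier in the header. A user-space sketch (not part of the patch) of inquiring the maintenance state via PLOOP_IOC_BALLOON; the device path and the keep_intact inquiry convention are assumptions:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ploop/ploop_if.h>	/* assumed install path */

int main(void)
{
	struct ploop_balloon_ctl ctl;
	int fd = open("/dev/ploop0", O_RDONLY);	/* hypothetical device node */

	if (fd < 0)
		return 1;
	memset(&ctl, 0, sizeof(ctl));
	ctl.keep_intact = 1;	/* only inquire, do not change mntn state */
	if (ioctl(fd, PLOOP_IOC_BALLOON, &ctl) == 0)
		printf("maintenance type %u, fast path %s\n", ctl.mntn_type,
		       FAST_PATH_DISABLED(ctl.mntn_type) ? "disabled" : "ok");
	close(fd);
	return 0;
}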
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/ploop_stat.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/ploop_stat.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ploop/ploop_stat.h	2015-01-21 12:02:55.363904097 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ploop/ploop_stat.h	2015-01-21 12:02:55.363904097 +0300
@@ -0,0 +1,48 @@
+__DO(bio_in)
+__DO(bio_fast)
+__DO(bio_full)
+__DO(bio_out)
+__DO(bio_alloc)
+__DO(bio_alloc_whole)
+__DO(bio_splits)
+__DO(coal_back)
+__DO(coal_forw)
+__DO(coal_back2)
+__DO(coal_forw2)
+__DO(coal_oback)
+__DO(coal_oforw)
+__DO(coal_mback)
+__DO(coal_mforw)
+__DO(coal_overlap)
+__DO(coal_flush)
+__DO(bio_barriers)
+__DO(bio_rzero)
+__DO(bio_wzero)
+__DO(bio_syncwait)
+__DO(bio_fsync)
+__DO(bio_cows)
+__DO(bio_whole_cows)
+__DO(merge_neg_cluster)
+__DO(merge_neg_disable)
+__DO(fast_neg_nomap)
+__DO(fast_neg_noem)
+__DO(fast_neg_shortem)
+__DO(fast_neg_backing)
+__DO(bio_lockouts)
+__DO(map_lockouts)
+__DO(merge_lockouts)
+__DO(map_reads)
+__DO(map_merges)
+__DO(map_single_writes)
+__DO(map_multi_writes)
+__DO(map_multi_updates)
+__DO(bio_trans_whole)
+__DO(bio_trans_copy)
+__DO(bio_trans_alloc)
+__DO(bio_trans_index)
+__DO(bio_flush_in)
+__DO(bio_fua_in)
+__DO(bio_flush_out)
+__DO(bio_fua_out)
+__DO(bio_flush_skip)
+
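
ploop_stat.h is an X-macro list: each consumer defines __DO before including it and gets one expansion per counter. struct ploop_stats in ploop.h turns each name into a __u32 field; another plausible expansion (a sketch, not necessarily how the patch's sysfs code does it) builds a parallel name table:

static const char *ploop_stat_names[] = {
#define __DO(_at)	#_at,		/* stringify each counter name */
#include "ploop_stat.h"
#undef __DO
};

/* By construction ploop_stat_names[0] is "bio_in" and the array length
 * equals the number of __u32 fields in struct ploop_stats. */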
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/poll.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/poll.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/poll.h	2014-12-12 23:29:12.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/poll.h	2015-01-21 12:02:57.968834952 +0300
@@ -67,6 +67,7 @@ struct poll_wqueues {
 	struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
 };
 
+extern long do_restart_poll(struct restart_block *restart_block);
 extern void poll_initwait(struct poll_wqueues *pwq);
 extern void poll_freewait(struct poll_wqueues *pwq);
 extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/posix-timers.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/posix-timers.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/posix-timers.h	2014-12-12 23:29:20.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/posix-timers.h	2015-01-21 12:02:50.327037805 +0300
@@ -116,6 +116,7 @@ void posix_cpu_timer_schedule(struct k_i
 void run_posix_cpu_timers(struct task_struct *task);
 void posix_cpu_timers_exit(struct task_struct *task);
 void posix_cpu_timers_exit_group(struct task_struct *task);
+long posix_cpu_nsleep_restart(struct restart_block *restart_block);
 
 void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 			   cputime_t *newval, cputime_t *oldval);
@@ -124,4 +125,10 @@ long clock_nanosleep_restart(struct rest
 
 void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
 
+int timer_create_id(const clockid_t which_clock,
+		    struct sigevent *timer_event_spec, timer_t *timer_id);
+int timer_setup(timer_t timer_id, struct itimerspec *setting,
+		int overrun, int overrun_last, int signal_pending);
+void get_timer_setting(struct k_itimer *timr, struct itimerspec *setting,
+		       int *overrun, int *overrun_last, int *signal_pending);
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/pram.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pram.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/pram.h	2015-01-21 12:02:52.617976988 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pram.h	2015-01-21 12:02:58.675816188 +0300
@@ -0,0 +1,108 @@
+#ifndef _LINUX_PRAM_H
+#define _LINUX_PRAM_H
+/*
+ * Persistent RAM provides a kernel interface to save data so that it can be
+ * loaded and used after a kexec.
+ *
+ * Usage:
+ * 
+ * * To save data to a PRAM storage:
+ *   pram_open(name, PRAM_WRITE, stream);
+ *   pram_push_page(stream, page, &pfn); // and/or ...
+ *   pram_write(stream, buf, count);
+ *   pram_close(stream, 0); // to save the storage or ...
+ *   pram_close(stream, -1); // to discard data written and destroy the storage
+ *
+ * * To load data from a PRAM storage:
+ *   pram_open(name, PRAM_READ, stream);
+ *   page = pram_pop_page(stream); // and/or ...
+ *   pram_read(stream, buf, count);
+ *   pram_close(stream, 0);
+ *
+ * For PRAM to be restored after a kexec, the PRAM pfn has to be passed to the
+ * kernel at boot time in the 'pram' parameter. The PRAM pfn can be read from
+ * /sys/kernel/pram.
+ */
+
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/mm_types.h>
+#include <linux/mm.h>
+
+struct pram_chain;
+struct pram_link;
+
+struct pram_stream {
+	struct pram_chain *chain;
+	struct pram_link *link;
+	unsigned long offset;
+	struct page *data_page;
+	unsigned long data_offset;
+	gfp_t gfp_mask;
+};
+
+#define PRAM_WRITE	1
+#define PRAM_READ	2
+
+extern int __pram_open(const char *name, int mode, gfp_t gfp_mask,
+		       struct pram_stream *stream);
+#define pram_open(name, mode, stream) \
+	__pram_open(name, mode, GFP_KERNEL | __GFP_HIGHMEM, stream)
+extern int pram_push_page(struct pram_stream *stream, struct page *page,
+			  unsigned long *ppfn);
+extern struct page *pram_pop_page(struct pram_stream *stream);
+extern int pram_del_page(struct pram_stream *stream, struct page *page);
+extern ssize_t pram_write(struct pram_stream *stream,
+			  const void *buf, size_t count);
+extern ssize_t pram_read(struct pram_stream *stream,
+			 void *buf, size_t count);
+extern void pram_close(struct pram_stream *stream, int how);
+extern int pram_destroy(const char *name);
+
+extern int __pram_prealloc(gfp_t gfp_mask, int n, ...);
+#define pram_prealloc(gfp, sz) \
+	__pram_prealloc(gfp, 1, (size_t)(sz))
+#define pram_prealloc2(gfp, sz1, sz2) \
+	__pram_prealloc(gfp, 2, (size_t)(sz1), (size_t)(sz2))
+extern void pram_prealloc_end(void);
+
+extern int pram_for_each_page(struct pram_stream *stream,
+		int (*fn)(struct page *page, void *data), void *data);
+extern int pram_del_from_lru(struct pram_stream *stream, int wait);
+
+extern int pram_dirty(struct pram_stream *stream);
+
+#define PRAM_DEL_FROM_LRU_OBSOLETE
+
+#ifdef CONFIG_PRAM
+/*
+ * This function can be used to check if a page extracted from pram is dirty
+ * i.e. it was not relocated on push and the system has not been rebooted
+ * since it was added to pram.
+ *
+ * To mark a page dirty, use the PAGE_MAPPING_ANON bit of its mapping. This
+ * should not conflict with the memory reclaimer because page_mapping won't
+ * return the actual mapping value then. Neither should it cause any trouble
+ * freeing such pages (see free_hot_cold_page).
+ */
+static inline bool pram_page_dirty(struct page *page)
+{
+	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+extern unsigned long long pram_low;
+extern unsigned long pram_reserved_pages;
+extern void pram_reserve(void);
+extern void pram_init(void);
+extern void pram_ban_region(unsigned long start, unsigned long end);
+extern void pram_show_banned(void);
+#else
+static inline bool pram_page_dirty(struct page *page) { return false; }
+#define pram_low 0ULL
+#define pram_reserved_pages 0UL
+static inline void pram_reserve(void) { }
+static inline void pram_init(void) { }
+static inline void pram_ban_region(unsigned long start, unsigned long end) { }
+static inline void pram_show_banned(void) { }
+#endif
+
+#endif /* _LINUX_PRAM_H */
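
A minimal kernel-side sketch following the usage comment at the top of pram.h: persist a buffer under a name before kexec and read it back afterwards. The storage name "my_state" and the error-handling choices are assumptions; only calls declared in the header are used:

#include <linux/errno.h>
#include <linux/pram.h>

static int my_save(const void *buf, size_t count)
{
	struct pram_stream stream;
	int err;

	err = pram_open("my_state", PRAM_WRITE, &stream);
	if (err)
		return err;
	if (pram_write(&stream, buf, count) != (ssize_t)count) {
		pram_close(&stream, -1);	/* discard and destroy */
		return -ENOSPC;
	}
	pram_close(&stream, 0);			/* commit the storage */
	return 0;
}

static ssize_t my_load(void *buf, size_t count)
{
	struct pram_stream stream;
	ssize_t ret;

	if (pram_open("my_state", PRAM_READ, &stream))
		return -ENOENT;
	ret = pram_read(&stream, buf, count);
	pram_close(&stream, 0);
	return ret;
}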
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/pramcache.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pramcache.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/pramcache.h	2015-01-21 12:02:52.679975343 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/pramcache.h	2015-01-21 12:02:53.023966212 +0300
@@ -0,0 +1,19 @@
+#ifndef _LINUX_PRAMCACHE_H
+#define _LINUX_PRAMCACHE_H
+
+struct super_block;
+
+#ifdef CONFIG_PRAMCACHE
+extern void pramcache_load_page_cache(struct super_block *sb);
+extern void pramcache_load_bdev_cache(struct super_block *sb);
+extern void pramcache_save_page_cache(struct super_block *sb, int nosync);
+extern void pramcache_save_bdev_cache(struct super_block *sb);
+#else
+static inline void pramcache_load_page_cache(struct super_block *sb) { }
+static inline void pramcache_load_bdev_cache(struct super_block *sb) { }
+static inline void pramcache_save_page_cache(struct super_block *sb,
+					     int nosync) { }
+static inline void pramcache_save_bdev_cache(struct super_block *sb) { }
+#endif /* CONFIG_PRAMCACHE */
+
+#endif /* _LINUX_PRAMCACHE_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/prctl.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/prctl.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/prctl.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/prctl.h	2015-01-21 12:02:51.933995144 +0300
@@ -102,4 +102,8 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_SET_DATA_CSUM	100500
+# define PR_DATA_CSUM_OFF	0
+# define PR_DATA_CSUM_ON	1
+
 #endif /* _LINUX_PRCTL_H */
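
The prctl hunk above adds a per-process switch (the out-of-tree value 100500 avoids clashing with mainline prctl numbers). A user-space sketch of flipping it on, presumably making the kernel checksum the task's written file data for pfcache; the exact semantics are not visible from this hunk:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_DATA_CSUM		/* present only on patched kernels */
#define PR_SET_DATA_CSUM	100500
#define PR_DATA_CSUM_ON		1
#endif

int main(void)
{
	if (prctl(PR_SET_DATA_CSUM, PR_DATA_CSUM_ON, 0, 0, 0))
		perror("PR_SET_DATA_CSUM");	/* EINVAL on unpatched kernels */
	return 0;
}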
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/preempt.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/preempt.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/preempt.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/preempt.h	2015-01-21 12:02:54.123937014 +0300
@@ -27,6 +27,21 @@
 
 asmlinkage void preempt_schedule(void);
 
+#define preempt_check_resched() \
+do { \
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+		preempt_schedule(); \
+} while (0)
+
+#else /* !CONFIG_PREEMPT */
+
+#define preempt_check_resched()		do { } while (0)
+
+#endif /* CONFIG_PREEMPT */
+
+
+#ifdef CONFIG_PREEMPT_COUNT
+
 #define preempt_disable() \
 do { \
 	inc_preempt_count(); \
@@ -39,12 +54,6 @@ do { \
 	dec_preempt_count(); \
 } while (0)
 
-#define preempt_check_resched() \
-do { \
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
-		preempt_schedule(); \
-} while (0)
-
 #define preempt_enable() \
 do { \
 	preempt_enable_no_resched(); \
@@ -80,18 +89,17 @@ do { \
 	preempt_check_resched(); \
 } while (0)
 
-#else
+#else /* !CONFIG_PREEMPT_COUNT */
 
 #define preempt_disable()		do { } while (0)
 #define preempt_enable_no_resched()	do { } while (0)
 #define preempt_enable()		do { } while (0)
-#define preempt_check_resched()		do { } while (0)
 
 #define preempt_disable_notrace()		do { } while (0)
 #define preempt_enable_no_resched_notrace()	do { } while (0)
 #define preempt_enable_notrace()		do { } while (0)
 
-#endif
+#endif /* CONFIG_PREEMPT_COUNT */
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/proc_fs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/proc_fs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/proc_fs.h	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/proc_fs.h	2015-01-21 12:02:44.387195495 +0300
@@ -107,9 +107,14 @@ struct vmcore {
 #ifdef CONFIG_PROC_FS
 
 extern void proc_root_init(void);
+extern struct file_system_type proc_fs_type;
+extern const struct file_operations proc_kmsg_operations;
 
 void proc_flush_task(struct task_struct *task);
 
+extern int proc_dentry_of_dead_task(struct dentry *dentry);
+extern struct file_operations dummy_proc_pid_file_operations;
+
 extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
 						struct proc_dir_entry *parent);
 struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
@@ -153,6 +158,16 @@ extern struct proc_dir_entry *proc_symli
 extern struct proc_dir_entry *proc_mkdir(const char *,struct proc_dir_entry *);
 extern struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
 			struct proc_dir_entry *parent);
+extern struct proc_dir_entry *create_proc_hardlink(
+			const char *name, mode_t mode, 
+			struct proc_dir_entry *parent,
+			struct proc_dir_entry *link);
+extern struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir,
+		const char *name, int namelen);
+extern struct proc_dir_entry *proc_lookup_entry(const char *name, 
+			struct proc_dir_entry *parent);
+
+extern struct proc_dir_entry glob_proc_root;
 
 static inline struct proc_dir_entry *proc_create(const char *name, mode_t mode,
 	struct proc_dir_entry *parent, const struct file_operations *proc_fops)
@@ -194,6 +209,8 @@ extern void proc_free_inum(unsigned int 
 #define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })
 static inline void proc_net_remove(struct net *net, const char *name) {}
 
+static inline int proc_dentry_of_dead_task(struct dentry *dentry) { return 0; }
+
 static inline void proc_flush_task(struct task_struct *task)
 {
 }
@@ -312,6 +329,9 @@ struct proc_inode {
 	struct proc_dir_entry *pde;
 	struct ctl_table_header *sysctl;
 	struct ctl_table *sysctl_entry;
+#ifdef CONFIG_VE
+	struct proc_dir_entry *lpde;
+#endif
 	struct inode vfs_inode;
 	void *ns;
 	const struct proc_ns_operations *ns_ops;
@@ -327,6 +347,15 @@ static inline struct proc_dir_entry *PDE
 	return PROC_I(inode)->pde;
 }
 
+static inline struct proc_dir_entry *LPDE(const struct inode *inode)
+{
+#ifdef CONFIG_VE
+	return PROC_I(inode)->lpde;
+#else
+	return NULL;
+#endif
+}
+
 static inline void *PDE_DATA(const struct inode *inode)
 {
 	return PROC_I(inode)->pde->data;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ptrace.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ptrace.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ptrace.h	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ptrace.h	2015-01-21 12:02:41.334276552 +0300
@@ -330,6 +330,9 @@ static inline void user_single_step_sigi
  * calling arch_ptrace_stop() when it would be superfluous.  For example,
  * if the thread has not been back to user mode since the last stop, the
  * thread state might indicate that nothing needs to be done.
+ *
+ * This is guaranteed to be invoked once before a task stops for ptrace and
+ * may include arch-specific operations necessary prior to a ptrace stop.
  */
 #define arch_ptrace_stop_needed(code, info)	(0)
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/quota.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/quota.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/quota.h	2014-12-12 23:28:52.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/quota.h	2015-01-21 12:02:53.309958619 +0300
@@ -175,6 +175,10 @@ enum {
 #include <linux/spinlock.h>
 #include <linux/wait.h>
 
+extern spinlock_t dq_data_lock;
+
 #include <linux/dqblk_xfs.h>
 #include <linux/dqblk_v1.h>
 #include <linux/dqblk_v2.h>
@@ -293,6 +297,8 @@ struct quota_format_ops {
 	int (*release_dqblk)(struct dquot *dquot);	/* Called when last reference to dquot is being dropped */
 };
 
+struct inode;
+struct iattr;
 /* Operations working with dquots */
 struct dquot_operations {
 	int (*initialize) (struct inode *, int);
@@ -318,9 +324,15 @@ struct dquot_operations {
 	/* get reserved quota for delayed alloc, value returned is managed by
 	 * quota code only */
 	qsize_t *(*get_reserved_space) (struct inode *);
+	int (*rename) (struct inode *, struct inode *, struct inode *);
+
+	void (*swap_inode) (struct inode *, struct inode *);
+	void (*shutdown) (struct super_block *);
+	unsigned int (*orphan_cookie) (struct inode *i);
 };
 
 /* Operations handling requests from userspace */
+struct v2_disk_dqblk;
 struct quotactl_ops {
 	int (*quota_on)(struct super_block *, int, int, char *, int);
 	int (*quota_off)(struct super_block *, int, int);
@@ -333,6 +345,10 @@ struct quotactl_ops {
 	int (*set_xstate)(struct super_block *, unsigned int, int);
 	int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *);
 	int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *);
+#ifdef CONFIG_QUOTA_COMPAT
+	int (*get_quoti)(struct super_block *, int, unsigned int,
+			struct v2_disk_dqblk __user *);
+#endif
 };
 
 struct quota_format_type {
@@ -398,6 +414,12 @@ struct quota_info {
 	struct inode *files[MAXQUOTAS];		/* inodes of quotafiles */
 	struct mem_dqinfo info[MAXQUOTAS];	/* Information for each quota type */
 	struct quota_format_ops *ops[MAXQUOTAS];	/* Operations for each type */
+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
+	struct vz_quota_master *vzdq_master;
+	const struct dquot_operations	*dq_op_orig;
+	const struct quotactl_ops	*qcop_orig;
+	int vzdq_count;
+#endif
 };
 
 int register_quota_format(struct quota_format_type *fmt);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/quotaops.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/quotaops.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/quotaops.h	2014-12-12 23:28:55.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/quotaops.h	2015-01-21 12:02:53.364957159 +0300
@@ -33,6 +33,9 @@ static inline void writeout_quota_sb(str
 void inode_add_rsv_space(struct inode *inode, qsize_t number);
 void inode_claim_rsv_space(struct inode *inode, qsize_t number);
 void inode_sub_rsv_space(struct inode *inode, qsize_t number);
+qsize_t inode_get_rsv_space(struct inode *inode);
+void inode_incr_space(struct inode *inode, qsize_t number, int reserve);
+void inode_decr_space(struct inode *inode, qsize_t number, int reserve);
 
 int dquot_initialize(struct inode *inode, int type);
 int dquot_drop(struct inode *inode);
@@ -278,6 +281,19 @@ static inline void vfs_dq_free_inode(str
 		inode->i_sb->dq_op->free_inode(inode, 1);
 }
 
+static __inline__ int vfs_dq_rename(struct inode *inode,
+		struct inode *old_dir, struct inode *new_dir)
+{
+	const struct dquot_operations *q_op;
+
+	q_op = inode->i_sb->dq_op;
+	if (q_op && q_op->rename) {
+		if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA)
+			return 1;
+	}
+	return 0;
+}
+
 /* Cannot be called inside a transaction */
 static inline int vfs_dq_off(struct super_block *sb, int remount)
 {
@@ -288,6 +304,43 @@ static inline int vfs_dq_off(struct supe
 	return ret;
 }
 
+static __inline__ void DQUOT_SWAP(struct inode *inode, struct inode *tmpl)
+{
+	if (sb_any_quota_active(tmpl->i_sb) &&
+	    tmpl->i_sb->dq_op->swap_inode)
+		tmpl->i_sb->dq_op->swap_inode(inode, tmpl);
+}
+
+static __inline__ int DQUOT_CHECK_SPACE(struct inode *inode)
+{
+	if (vfs_dq_alloc_space_nodirty(inode, 512))
+		return -EDQUOT;
+	vfs_dq_free_space_nodirty(inode, 512);
+	return 0;
+}
+
+static __inline__ void DQUOT_SYNC_BLOCKS(struct inode *inode, blkcnt_t blocks)
+{
+	if (sb_any_quota_active(inode->i_sb)) {
+		if (blocks > inode->i_blocks)
+			inode->i_sb->dq_op->alloc_space(inode,
+							(qsize_t)(blocks-inode->i_blocks)*512,
+							DQUOT_SPACE_NOFAIL);
+		else if (blocks < inode->i_blocks)
+			inode->i_sb->dq_op->free_space(inode, (qsize_t)(inode->i_blocks-blocks)*512);
+	} else
+		inode->i_blocks = blocks;
+}
+
+static __inline__ unsigned int DQUOT_ORPHAN_COOKIE(struct inode *inode)
+{
+	if (sb_any_quota_active(inode->i_sb) &&
+			inode->i_sb->dq_op->orphan_cookie)
+		return inode->i_sb->dq_op->orphan_cookie(inode);
+	else
+		return 0;
+}
+
 #else
 
 static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type)
@@ -377,6 +430,12 @@ static inline int vfs_dq_transfer(struct
 	return 0;
 }
 
+static inline int vfs_dq_rename(struct inode *inode, struct inode *old_dir,
+		struct inode *new_dir)
+{
+	return 0;
+}
+
 static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr)
 {
 	inode_add_bytes(inode, nr);
@@ -436,6 +495,20 @@ static inline void vfs_dq_free_space(str
 	mark_inode_dirty(inode);
 }	
 
+static inline void DQUOT_SWAP(struct inode *inode, struct inode *tmpl)
+{
+}
+
+static inline void DQUOT_SYNC_BLOCKS(struct inode *inode, blkcnt_t blocks)
+{
+	inode->i_blocks = blocks;
+}
+
+static inline unsigned int DQUOT_ORPHAN_COOKIE(struct inode *inode)
+{
+	return 0;
+}
+
 #endif /* CONFIG_QUOTA */
 
 static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
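
A sketch of how a filesystem rename path might consult the new vfs_dq_rename() hook, which returns 1 when the quota code vetoes the move; the function name and the -EDQUOT choice are illustrative, not taken from the patch:

static int example_fs_rename(struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry)
{
	struct inode *inode = old_dentry->d_inode;

	/* vfs_dq_rename() returns 1 when quota refuses the transfer */
	if (vfs_dq_rename(inode, old_dir, new_dir))
		return -EDQUOT;

	/* ... proceed with the normal rename work ... */
	return 0;
}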
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/radix-tree.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/radix-tree.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/radix-tree.h	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/radix-tree.h	2015-01-21 12:02:43.139228628 +0300
@@ -219,6 +219,7 @@ void *radix_tree_tag_clear(struct radix_
 			unsigned long index, unsigned int tag);
 int radix_tree_tag_get(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
+int radix_tree_prev_tag_get(struct radix_tree_root *root, unsigned int tag);
 unsigned int
 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		unsigned long first_index, unsigned int max_items,
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ramfs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ramfs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ramfs.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ramfs.h	2015-01-21 12:02:52.790972397 +0300
@@ -1,9 +1,24 @@
 #ifndef _LINUX_RAMFS_H
 #define _LINUX_RAMFS_H
 
+struct ramfs_mount_opts {
+	umode_t mode;
+};
+
+struct ramfs_fs_info {
+	struct ramfs_mount_opts mount_opts;
+#ifdef CONFIG_PRAMFS
+	int pram_load;
+	int pram_save;
+#define PRAM_FS_NAME_MAX	256	/* including nul */
+	char pram_name[PRAM_FS_NAME_MAX];
+#endif
+};
+
 struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev);
 extern int ramfs_get_sb(struct file_system_type *fs_type,
 	 int flags, const char *dev_name, void *data, struct vfsmount *mnt);
+extern int ramfs_fill_super(struct super_block * sb, void * data, int silent);
 
 #ifndef CONFIG_MMU
 extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize);
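
Exporting ramfs_fill_super() lets another in-tree filesystem reuse it as its fill_super callback; a sketch using the 2.6.32-era get_sb_nodev() helper, with an illustrative filesystem name:

static int example_get_sb(struct file_system_type *fs_type, int flags,
			  const char *dev_name, void *data,
			  struct vfsmount *mnt)
{
	/* reuse the now-exported ramfs superblock setup */
	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
}

static struct file_system_type example_fs_type = {
	.name	 = "example_ramfs",
	.get_sb	 = example_get_sb,
	.kill_sb = kill_litter_super,
};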
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/rcupdate.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/rcupdate.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/rcupdate.h	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/rcupdate.h	2015-01-21 12:02:42.293251091 +0300
@@ -332,6 +332,19 @@ extern void call_rcu(struct rcu_head *he
 extern void call_rcu_bh(struct rcu_head *head,
 			void (*func)(struct rcu_head *head));
 
+/**
+ * call_rcu_in_process() - Queue an RCU callback for invocation in process context.
+ * Does not guarantee a grace period; combine it with other RCU calls when needed.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked.
+ *
+ * If called in non-interrupt context it does nothing and returns 0;
+ * otherwise it schedules @head for invocation and returns 1.
+ */
+extern int call_rcu_in_process(struct rcu_head *head,
+		void (*func)(struct rcu_head *head));
+
+#define rcu_in_process_barrier		flush_scheduled_work
 
 extern void kfree(const void *);
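
A sketch of a caller honouring the contract documented above: defer from interrupt context, invoke directly otherwise. The example_obj type is illustrative, and per the comment the caller must still arrange the grace period with other RCU primitives before the free is safe:

struct example_obj {
	struct rcu_head rcu;
	/* ... payload ... */
};

static void example_obj_free(struct rcu_head *head)
{
	kfree(container_of(head, struct example_obj, rcu));
}

static void example_obj_release(struct example_obj *obj)
{
	/* returns 0 (and queues nothing) when already in process context */
	if (!call_rcu_in_process(&obj->rcu, example_obj_free))
		example_obj_free(&obj->rcu);
}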
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/rmap.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/rmap.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/rmap.h	2014-12-12 23:29:01.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/rmap.h	2015-01-21 12:02:57.968834952 +0300
@@ -48,6 +48,8 @@ struct anon_vma {
 	 * mm_take_all_locks() (mm_all_locks_mutex).
 	 */
 	struct list_head head;	/* Chain of private "related" vmas */
+
+	struct user_beancounter *anon_vma_ub;
 };
 
 /*
@@ -161,6 +163,8 @@ void drop_anon_vma(struct anon_vma *);
 #define drop_anon_vma(x)	do {} while(0)
 #endif
 
+extern int anon_vma_link(struct vm_area_struct *vma);
+
 static inline void anon_vma_merge(struct vm_area_struct *vma,
 				  struct vm_area_struct *next)
 {
@@ -176,7 +180,7 @@ void page_add_anon_rmap(struct page *, s
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
 			   unsigned long, int);
 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_file_rmap(struct page *);
+void page_add_file_rmap(struct page *, struct mm_struct *);
 void page_remove_rmap(struct page *);
 
 void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
@@ -201,6 +205,7 @@ enum ttu_flags {
 	TTU_UNMAP = 0,			/* unmap mode */
 	TTU_MIGRATION = 1,		/* migration mode */
 	TTU_MUNLOCK = 2,		/* munlock mode */
+	TTU_VSWAP = 3,			/* vswap mode */
 	TTU_ACTION_MASK = 0xff,
 
 	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
@@ -241,8 +246,8 @@ int try_to_munlock(struct page *);
 /*
  * Called by memory-failure.c to kill processes.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page);
-void page_unlock_anon_vma(struct anon_vma *anon_vma);
+extern struct anon_vma *page_lock_anon_vma(struct page *page);
+extern void page_unlock_anon_vma(struct anon_vma *anon_vma);
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
 /*
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sched.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sched.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sched.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sched.h	2015-01-21 12:02:58.309825902 +0300
@@ -94,6 +94,8 @@ struct sched_param {
 
 #include <asm/processor.h>
 
+#include <bc/task.h>
+
 struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
@@ -122,6 +124,8 @@ extern int print_fatal_signals;
  */
 extern unsigned long avenrun[];		/* Load averages */
 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+extern void get_avenrun_ve(unsigned long *loads,
+			unsigned long offset, int shift);
 
 #define FSHIFT		11		/* nr of bits of precision */
 #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
@@ -135,16 +139,28 @@ extern void get_avenrun(unsigned long *l
 	load += n*(FIXED_1-exp); \
 	load >>= FSHIFT;
 
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
 extern unsigned long total_forks;
 extern int nr_threads;
 DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
+extern unsigned long nr_sleeping(void);
+extern unsigned long nr_stopped(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
-extern unsigned long this_cpu_load(void);
+extern unsigned long nr_active_cpu(void);
+extern atomic_t nr_dead;
+extern unsigned long nr_zombie;
 
+#ifdef CONFIG_VE
+extern unsigned long nr_running_ve(void);
+#else
+#define nr_running_ve()				0
+#endif
 
 extern void calc_global_load(void);
 
@@ -196,6 +212,7 @@ extern unsigned long long time_sync_thre
 #define TASK_DEAD		64
 #define TASK_WAKEKILL		128
 #define TASK_WAKING		256
+#define TASK_IOTHROTTLED	512
 
 /* Convenience macros for the sake of set_task_state */
 #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -218,6 +235,7 @@ extern unsigned long long time_sync_thre
 #define task_contributes_to_load(task)	\
 				((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
 				 (task->flags & PF_FREEZING) == 0)
+#define task_iothrottled(task)	((task->state & TASK_IOTHROTTLED) != 0)
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
@@ -279,6 +297,7 @@ static inline void select_nohz_load_bala
 extern void show_state_filter(unsigned long state_filter);
 
 extern void wait_for_rqlock(void);
+void show_sched_debug(void);
 
 static inline void show_state(void)
 {
@@ -330,6 +349,7 @@ extern unsigned int  sysctl_hung_task_pa
 extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern unsigned long sysctl_hung_task_warnings;
+extern int sysctl_hung_task_verbosity;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
@@ -567,6 +587,9 @@ struct thread_group_cputimer {
 #include <linux/rwsem.h>
 struct autogroup;
 
+#include <linux/ve.h>
+#include <linux/ve_task.h>
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -772,6 +795,7 @@ struct user_struct {
 	uid_t uid;
 	struct user_namespace *user_ns;
 
+	struct user_beancounter *user_ub;
 #ifdef CONFIG_USER_SCHED
 	struct task_group *tg;
 #ifdef CONFIG_SYSFS
@@ -866,17 +890,39 @@ enum cpu_idle_type {
 };
 
 /*
- * sched-domains (multiprocessor balancing) declarations:
- */
+ * Increase resolution of nice-level calculations for 64-bit architectures.
+ * The extra resolution improves shares distribution and load balancing of
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * hierarchies, especially on larger systems. This is not a user-visible change
+ * and does not change the user-interface for setting shares/weights.
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
+ * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
+ * increased costs.
+ */
+#if BITS_PER_LONG > 32
+# define SCHED_LOAD_RESOLUTION	10
+# define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
+# define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
+#else
+# define SCHED_LOAD_RESOLUTION	0
+# define scale_load(w)		(w)
+# define scale_load_down(w)	(w)
+#endif
 
-/*
- * Increase resolution of nice-level calculations:
- */
-#define SCHED_LOAD_SHIFT	10
+#define SCHED_LOAD_SHIFT	(10 + SCHED_LOAD_RESOLUTION)
 #define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
 
-#define SCHED_LOAD_SCALE_FUZZ	SCHED_LOAD_SCALE
+/*
+ * Increase resolution of cpu_power calculations
+ */
+#define SCHED_POWER_SHIFT	10
+#define SCHED_POWER_SCALE	(1L << SCHED_POWER_SHIFT)
 
+/*
+ * sched-domains (multiprocessor balancing) declarations:
+ */
 #ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE		0x0001	/* Do load balancing on this domain. */
 #define SD_BALANCE_NEWIDLE	0x0002	/* Balance when about to become idle */
@@ -1128,6 +1174,7 @@ struct sched_domain;
 #define ENQUEUE_WAKEUP		1
 #define ENQUEUE_WAKING		2
 #define ENQUEUE_HEAD		4
+#define ENQUEUE_BOOST		8
 
 #define DEQUEUE_SLEEP		1
 
@@ -1159,7 +1206,7 @@ struct sched_class {
 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
 			struct rq *busiest, unsigned long max_load_move,
 			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *all_pinned, int *this_best_prio);
+			int *all_pinned);
 
 	int (*move_one_task) (struct rq *this_rq, int this_cpu,
 			      struct rq *busiest, struct sched_domain *sd,
@@ -1205,10 +1252,14 @@ struct sched_class {
 	void (*moved_group) (struct task_struct *p);
 #endif
 #endif
+	void (*nr_iowait_inc) (struct task_struct *p);
+	void (*nr_iowait_dec) (struct task_struct *p);
 
 #ifndef __GENKSYMS__
 	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
 #endif
+
+	void (*task_scheduled) (struct rq *rq, struct task_struct *p);
 };
 
 struct load_weight {
@@ -1231,6 +1282,11 @@ struct sched_entity {
 	struct list_head	group_node;
 	unsigned int		on_rq;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	unsigned int		boosted;
+	struct list_head	boost_node;
+#endif
+
 	u64			exec_start;
 	u64			sum_exec_runtime;
 	u64			vruntime;
@@ -1391,11 +1447,15 @@ struct task_struct {
 	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
 				 * execve */
 	unsigned in_iowait:1;
+	unsigned did_ve_enter:1;
 
 
 	/* Revert to default priority/policy when forking */
 	unsigned sched_reset_on_fork:1;
 
+	unsigned woken_while_running:1;
+	unsigned may_throttle:1;
+
 	pid_t pid;
 	pid_t tgid;
 
@@ -1462,6 +1522,7 @@ struct task_struct {
 				     - initialized normally by setup_new_exec */
 /* file system info */
 	int link_count, total_link_count;
+	unsigned int trans_count;
 #ifdef CONFIG_SYSVIPC
 /* ipc stuff */
 	struct sysv_sem sysvsem;
@@ -1556,6 +1617,9 @@ struct task_struct {
 /* journalling filesystem info */
 	void *journal_info;
 
+/* transaction filesystem info */
+	void *transaction_info;
+
 /* stacked block device info */
 	struct bio *bio_list, **bio_tail;
 
@@ -1622,6 +1686,8 @@ struct task_struct {
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
 
+	__u8	 stopped_state:1;
+
 	/*
 	 * cache last used pipe for splice
 	 */
@@ -1645,6 +1711,7 @@ struct task_struct {
 	unsigned long default_timer_slack_ns;
 
 	struct list_head	*scm_work_list;
+	int data_csum_enabled;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	/* Index of current stored address in ret_stack */
 	int curr_ret_stack;
@@ -1666,6 +1733,16 @@ struct task_struct {
 	/* bitmask of trace recursion */
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
+#ifdef CONFIG_BEANCOUNTERS
+	struct task_beancounter task_bc;
+#endif
+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
+	unsigned long	magic;
+	struct inode	*ino;
+#endif
+#ifdef CONFIG_VE
+	struct ve_task_info ve_task_info;
+#endif
 	/* padding reserved by Red Hat for non-kABI breaking extensions */
 	unsigned long rh_reserved[2];
 #ifndef __GENKSYMS__
@@ -1866,6 +1943,33 @@ static inline void put_task_struct(struc
 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
+#ifndef CONFIG_VE
+#define set_pn_state(tsk, state)	do { } while(0)
+#define clear_pn_state(tsk)		do { } while(0)
+#define set_stop_state(tsk)		do { } while(0)
+#define clear_stop_state(tsk)		do { } while(0)
+#else
+#define PN_STOP_TF	1	/* was not in 2.6.8 */
+#define PN_STOP_TF_RT	2	/* was not in 2.6.8 */
+#define PN_STOP_ENTRY	3
+#define PN_STOP_FORK	4
+#define PN_STOP_VFORK	5
+#define PN_STOP_SIGNAL	6
+#define PN_STOP_EXIT	7
+#define PN_STOP_EXEC	8
+#define PN_STOP_LEAVE	9
+
+static inline void set_stop_state(struct task_struct *tsk)
+{
+	tsk->stopped_state = 1;
+}
+
+static inline void clear_stop_state(struct task_struct *tsk)
+{
+	tsk->stopped_state = 0;
+}
+#endif
+
 /*
  * Per process flags
  */
@@ -1875,6 +1979,7 @@ extern void thread_group_times(struct ta
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+#define PF_EXIT_RESTART	0x00000020	/* do_exit() restarted, see do_exit() */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
@@ -2113,6 +2218,13 @@ static inline void sched_autogroup_exit(
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+extern int sched_cgroup_set_rate(struct cgroup *cgrp, unsigned long rate);
+extern unsigned long sched_cgroup_get_rate(struct cgroup *cgrp);
+extern int sched_cgroup_set_nr_cpus(struct cgroup *cgrp, unsigned int nr_cpus);
+extern unsigned int sched_cgroup_get_nr_cpus(struct cgroup *cgrp);
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -2132,6 +2244,7 @@ extern int task_nice(const struct task_s
 extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
+#define __HAVE_HYPERVISOR_SCHED_SETSCHEDULER
 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int,
 				      struct sched_param *);
@@ -2235,12 +2348,14 @@ extern void block_all_signals(int (*noti
 			      sigset_t *mask);
 extern void unblock_all_signals(void);
 extern void release_task(struct task_struct * p);
+extern int reap_zombie(struct task_struct *p);
 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
 extern int force_sigsegv(int, struct task_struct *);
 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
 extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
+extern int recalc_sigpending_tsk(struct task_struct *t);
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
@@ -2251,6 +2366,8 @@ extern void force_sig(int, struct task_s
 extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
 extern void zap_other_threads(struct task_struct *p);
+extern struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
+					 int override_rlimit);
 extern struct sigqueue *sigqueue_alloc(void);
 extern void sigqueue_free(struct sigqueue *);
 extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
@@ -2331,10 +2448,18 @@ extern NORET_TYPE void do_group_exit(int
 
 extern void daemonize(const char *, ...);
 extern int allow_signal(int);
+extern void exit_mm(struct task_struct *);
 extern int disallow_signal(int);
 
 extern int do_execve(const char *, char __user * __user *, char __user * __user *, struct pt_regs *);
 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+extern long do_fork_pid(unsigned long clone_flags,
+			unsigned long stack_start,
+			struct pt_regs *regs,
+			unsigned long stack_size,
+			int __user *parent_tidptr,
+			int __user *child_tidptr,
+			long pid0);
 struct task_struct *fork_idle(int);
 
 extern void set_task_comm(struct task_struct *tsk, char *from);
@@ -2352,11 +2477,11 @@ static inline unsigned long wait_task_in
 }
 #endif
 
-#define next_task(p) \
+#define next_task_all(p) \
 	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
 
-#define for_each_process(p) \
-	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
+#define for_each_process_all(p) \
+	for (p = &init_task ; (p = next_task_all(p)) != &init_task ; )
 
 extern bool current_is_single_threaded(void);
 
@@ -2364,10 +2489,10 @@ extern bool current_is_single_threaded(v
  * Careful: do_each_thread/while_each_thread is a double loop so
  *          'break' will not work as expected - use goto instead.
  */
-#define do_each_thread(g, t) \
-	for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
+#define do_each_thread_all(g, t) \
+	for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do
 
-#define while_each_thread(g, t) \
+#define while_each_thread_all(g, t) \
 	while ((t = next_thread(t)) != g)
 
 extern int get_nr_threads(struct task_struct *tsk);
@@ -2465,6 +2590,97 @@ static inline void threadgroup_fork_writ
 static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {}
 #endif
 
+#ifndef CONFIG_VE
+
+#define for_each_process_ve(p)		for_each_process_all(p)
+#define do_each_thread_ve(g, t)		do_each_thread_all(g, t)
+#define while_each_thread_ve(g, t)	while_each_thread_all(g, t)
+#define first_task_ve()			next_task_ve(&init_task)
+#define __first_task_ve(owner)		next_task_ve(&init_task)
+#define __next_task_ve(owner, p)	next_task_ve(p)
+#define next_task_ve(p)			\
+	(next_task_all(p) != &init_task ? next_task_all(p) : NULL)
+
+#define ve_is_super(env)				1
+#define ve_accessible(target, owner)			1
+#define ve_accessible_strict(target, owner)		1
+#define ve_accessible_veid(target, owner)		1
+#define ve_accessible_strict_veid(target, owner)	1
+
+#define VEID(ve)					0
+
+#else	/* CONFIG_VE */
+
+#include <linux/ve.h>
+
+#define ve_is_super(env)			((env) == get_ve0())
+
+#define ve_accessible_strict(target, owner)	((target) == (owner))
+static inline int ve_accessible(struct ve_struct *target,
+		struct ve_struct *owner)
+{
+	return ve_is_super(owner) || ve_accessible_strict(target, owner);
+}
+
+#define ve_accessible_strict_veid(target, owner) ((target) == (owner))
+static inline int ve_accessible_veid(envid_t target, envid_t owner)
+{
+	return get_ve0()->veid == owner ||
+		ve_accessible_strict_veid(target, owner);
+}
+
+#define VEID(ve)	(ve->veid)
+
+static inline struct task_struct *ve_lh2task(struct ve_struct *ve,
+		struct list_head *lh)
+{
+	return lh == &ve->vetask_lh ? NULL :
+		list_entry(lh, struct task_struct, ve_task_info.vetask_list);
+}
+
+static inline struct task_struct *__first_task_ve(struct ve_struct *ve)
+{
+	struct task_struct *tsk;
+
+	if (unlikely(ve_is_super(ve))) {
+		tsk = next_task_all(&init_task);
+		if (tsk == &init_task)
+			tsk = NULL;
+	} else {
+		tsk = ve_lh2task(ve, ve->vetask_lh.next);
+	}
+	return tsk;
+}
+
+static inline struct task_struct *__next_task_ve(struct ve_struct *ve,
+		struct task_struct *tsk)
+{
+	if (unlikely(ve_is_super(ve))) {
+		tsk = next_task_all(tsk);
+		if (tsk == &init_task)
+			tsk = NULL;
+	} else {
+		BUG_ON(tsk->ve_task_info.owner_env != ve);
+		tsk = ve_lh2task(ve, tsk->ve_task_info.vetask_list.next);
+	}
+	return tsk;
+}
+
+#define first_task_ve()	__first_task_ve(get_exec_env())
+#define next_task_ve(p)	__next_task_ve(get_exec_env(), p)
+/* no one uses prev_task_ve(); copy next_task_ve() if it is ever needed */
+
+#define for_each_process_ve(p) \
+	for (p = first_task_ve(); p != NULL ; p = next_task_ve(p))
+
+#define do_each_thread_ve(g, t) \
+	for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do
+
+#define while_each_thread_ve(g, t) \
+	while ((t = next_thread(t)) != g)
+
+#endif	/* CONFIG_VE */
+
 #ifndef __HAVE_THREAD_FUNCTIONS
 
 #define task_thread_info(task)	((struct thread_info *)(task)->stack)
@@ -2598,9 +2814,17 @@ extern int _cond_resched(void);
 	_cond_resched();			\
 })
 
+#define __HAVE_COND_RESCHED_MAY_THROTTLE
+extern int _cond_resched_may_throttle(void);
+
+#define cond_resched_may_throttle() ({		\
+	__might_sleep(__FILE__, __LINE__, 0);	\
+	_cond_resched_may_throttle();		\
+})
+
 extern int __cond_resched_lock(spinlock_t *lock);
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 #define PREEMPT_LOCK_OFFSET	PREEMPT_OFFSET
 #else
 #define PREEMPT_LOCK_OFFSET	0
@@ -2685,6 +2909,31 @@ static inline void set_task_cpu(struct t
 
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int task_nr_cpus(struct task_struct *p);
+extern unsigned int task_vcpu_id(struct task_struct *p);
+extern unsigned int sysctl_sched_vcpu_hotslice;
+extern unsigned int sysctl_sched_cpulimit_scale_cpufreq;
+extern unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq);
+#else
+static inline unsigned int task_nr_cpus(struct task_struct *p)
+{
+	return num_online_cpus();
+}
+
+static inline unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p);
+}
+
+static inline unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	return freq;
+}
+#endif
+
+#define num_online_vcpus() task_nr_cpus(current)
+
 extern void arch_pick_mmap_layout(struct mm_struct *mm);
 
 #ifdef CONFIG_TRACING
@@ -2716,18 +2965,28 @@ extern struct task_group *sched_create_g
 extern void sched_destroy_group(struct task_group *tg);
 extern void sched_move_task(struct task_struct *tsk);
 extern void __sched_move_task(struct task_struct *tsk);
+extern void sched_group_set_start_time(struct task_struct *tsk,
+				       const struct timespec *ts);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 extern unsigned long sched_group_shares(struct task_group *tg);
+extern int sched_cgroup_set_shares(struct cgroup *cgrp, unsigned long shares);
+unsigned long sched_cgroup_get_shares(struct cgroup *cgrp);
+extern unsigned long sched_cgroup_get_nr_running(struct cgroup *cgrp);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 extern int sched_group_set_rt_runtime(struct task_group *tg,
 				      long rt_runtime_us);
+extern int sched_cgroup_set_rt_runtime(struct cgroup *cgrp,
+				       long rt_runtime_us);
 extern long sched_group_rt_runtime(struct task_group *tg);
 extern int sched_group_set_rt_period(struct task_group *tg,
 				      long rt_period_us);
 extern long sched_group_rt_period(struct task_group *tg);
 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
+#else
+static inline int sched_cgroup_set_rt_runtime(struct cgroup *cgrp,
+					      long rt_runtime_us) { return 0; }
 #endif
 #endif
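
A sketch of the VE-aware task iteration the new macros enable: with CONFIG_VE the walk is confined to the tasks of the calling VE, while without it the macros degenerate to the *_all variants. The counting function is illustrative:

static unsigned long example_count_running_ve(void)
{
	struct task_struct *p;
	unsigned long nr = 0;

	read_lock(&tasklist_lock);
	for_each_process_ve(p) {
		if (p->state == TASK_RUNNING)
			nr++;
	}
	read_unlock(&tasklist_lock);

	return nr;
}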
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sem.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sem.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sem.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sem.h	2015-01-21 12:02:58.124830812 +0300
@@ -19,6 +19,39 @@
 #define SEM_STAT 18
 #define SEM_INFO 19
 
+/* One semaphore structure for each semaphore in the system. */
+struct sem {
+	int	semval;		/* current value */
+	int	sempid;		/* pid of last operation */
+	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
+	struct list_head sem_pending; /* pending single-sop operations */
+};
+
+/* Each task has a list of undo requests. They are executed automatically
+ * when the process exits.
+ */
+struct sem_undo {
+	struct list_head	list_proc;	/* per-process list: *
+						 * all undos from one process
+						 * rcu protected */
+	struct rcu_head		rcu;		/* rcu struct for sem_undo */
+	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
+	struct list_head	list_id;	/* per semaphore array list:
+						 * all undos for one array */
+	int			semid;		/* semaphore set identifier */
+	short			*semadj;	/* array of adjustments */
+						/* one per semaphore */
+};
+
+/* sem_undo_list controls shared access to the list of sem_undo structures
+ * that may be shared by all tasks in a CLONE_SYSVSEM task group.
+ */
+struct sem_undo_list {
+	atomic_t		refcnt;
+	spinlock_t		lock;
+	struct list_head	list_proc;
+};
+
 /* Obsolete, used only for backwards compatibility and libc5 compiles */
 struct semid_ds {
 	struct ipc_perm	sem_perm;		/* permissions .. see ipc.h */
@@ -142,6 +175,9 @@ static inline void exit_sem(struct task_
 }
 #endif
 
+int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg);
+int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg);
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_SEM_H */
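
A sketch of a consumer of the new sysvipc_walk_sem() walker, e.g. for checkpointing; the callback names are illustrative, and it assumes (as with similar kernel walkers) that a non-zero return from the callback aborts the walk:

static int example_count_cb(int id, struct sem_array *sma, void *arg)
{
	(*(int *)arg)++;
	return 0;	/* assumed: non-zero aborts the walk */
}

static int example_nr_sem_arrays(void)
{
	int count = 0;

	sysvipc_walk_sem(example_count_cb, &count);
	return count;
}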
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/shm.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/shm.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/shm.h	2014-12-12 23:29:12.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/shm.h	2015-01-21 12:02:47.940101171 +0300
@@ -83,6 +83,22 @@ struct shm_info {
 };
 
 #ifdef __KERNEL__
+
+#include <linux/ipc_namespace.h>
+
+#define IPC_SEM_IDS	0
+#define IPC_MSG_IDS	1
+#define IPC_SHM_IDS	2
+
+struct shm_file_data {
+	int id;
+	struct ipc_namespace *ns;
+	struct file *file;
+	const struct vm_operations_struct *vm_ops;
+};
+#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
+#define shm_ids(ns)	((ns)->ids[IPC_SHM_IDS])
+
 struct shmid_kernel /* private to the kernel */
 {	
 	struct kern_ipc_perm	shm_perm;
@@ -97,6 +113,23 @@ struct shmid_kernel /* private to the ke
 	struct user_struct	*mlock_user;
 };
 
+/*
+ * shm_lock_(check_) routines are called in the paths where the rw_mutex
+ * is not held.
+ */
+static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
+{
+	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
+
+	if (IS_ERR(ipcp))
+		return (struct shmid_kernel *)ipcp;
+
+	return container_of(ipcp, struct shmid_kernel, shm_perm);
+}
+
+#define shm_unlock(shp)			\
+	ipc_unlock(&(shp)->shm_perm)
+
 /* shm_mode upper byte flags */
 #define	SHM_DEST	01000	/* segment will be destroyed on last detach */
 #define SHM_LOCKED      02000   /* segment will not be swapped */
@@ -122,6 +155,12 @@ static inline void exit_shm(struct task_
 }
 #endif
 
+int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg);
+struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg);
+extern const struct file_operations shmem_file_operations;
+extern const struct file_operations shm_file_operations;
+
+extern struct file_system_type shmem_fs_type;
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_SHM_H_ */
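
A sketch using the now-inline shm_lock()/shm_unlock() pair; note that shm_lock() hands back the ERR_PTR from ipc_lock() on failure. The inspection function is illustrative:

static int example_shm_inspect(struct ipc_namespace *ns, int id)
{
	struct shmid_kernel *shp = shm_lock(ns, id);

	if (IS_ERR(shp))
		return PTR_ERR(shp);

	/* ... examine shp under the IPC lock ... */

	shm_unlock(shp);
	return 0;
}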
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/shmem_fs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/shmem_fs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/shmem_fs.h	2014-12-12 23:29:17.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/shmem_fs.h	2015-01-21 12:02:58.675816188 +0300
@@ -19,6 +19,7 @@ struct shmem_inode_info {
 	swp_entry_t		i_direct[SHMEM_NR_DIRECT]; /* first blocks */
 	struct list_head	swaplist;	/* chain of maybes on swap */
 	struct inode		vfs_inode;
+	struct user_beancounter	*shmi_ub;
 };
 
 struct shmem_sb_info {
@@ -75,4 +76,10 @@ static inline int shmem_acl_init(struct 
 }
 #endif  /* CONFIG_TMPFS_POSIX_ACL */
 
+int shmem_insertpage(struct inode * inode, unsigned long index,
+		     swp_entry_t swap);
+int install_shmem_page(struct vm_area_struct *vma,
+		       unsigned long addr, struct page *page);
+int is_shmem_vma(struct vm_area_struct *vma);
+
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/signalfd.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/signalfd.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/signalfd.h	2014-12-12 23:28:59.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/signalfd.h	2015-01-21 12:02:47.957100721 +0300
@@ -61,6 +61,12 @@ static inline void signalfd_notify(struc
 		wake_up(&tsk->sighand->signalfd_wqh);
 }
 
+struct signalfd_ctx {
+	sigset_t sigmask;
+};
+
+extern long do_signalfd(int ufd, sigset_t *sigmask, int flags);
+
 #else /* CONFIG_SIGNALFD */
 
 static inline void signalfd_notify(struct task_struct *tsk, int sig) { }
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/skbuff.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/skbuff.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/skbuff.h	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/skbuff.h	2015-01-21 12:02:45.759159071 +0300
@@ -362,6 +362,8 @@ typedef unsigned char *sk_buff_data_t;
  *	@vlan_tci: vlan tag control information
  */
 
+#include <bc/sock.h>
+
 struct sk_buff {
 	/* These two members must be first. */
 	struct sk_buff		*next;
@@ -409,6 +411,13 @@ struct sk_buff {
 	__be16			protocol:16;
 	kmemcheck_bitfield_end(flags1);
 
+#ifdef CONFIG_VE
+	unsigned int		accounted:1;
+	unsigned int		redirected:1;
+#endif
+#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
+	__u8			brmark;
+#endif
 	void			(*destructor)(struct sk_buff *skb);
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct nf_conntrack	*nfct;
@@ -482,6 +491,8 @@ struct sk_buff {
 				*data;
 	unsigned int		truesize;
 	atomic_t		users;
+	struct skb_beancounter	skb_bc;
+	struct ve_struct	*owner_env;
 };
 
 #ifdef __KERNEL__
@@ -489,6 +500,7 @@ struct sk_buff {
  *	Handling routines are only of interest to the kernel
  */
 #include <linux/slab.h>
+#include <bc/net.h>
 
 #include <asm/system.h>
 
@@ -1591,7 +1603,7 @@ static inline void pskb_trim_unique(stru
  *	destructor function and make the @skb unowned. The buffer continues
  *	to exist but is no longer charged to its former owner.
  */
-static inline void skb_orphan(struct sk_buff *skb)
+static inline void __skb_orphan(struct sk_buff *skb)
 {
 	if (skb->destructor)
 		skb->destructor(skb);
@@ -1599,6 +1611,14 @@ static inline void skb_orphan(struct sk_
 	skb->sk		= NULL;
 }
 
+static inline void skb_orphan(struct sk_buff *skb)
+{
+	if (skb->sk)
+		ub_skb_uncharge(skb);
+
+	__skb_orphan(skb);
+}
+
 /**
  *	__skb_queue_purge - empty a list
  *	@list: list to empty
@@ -2309,6 +2329,7 @@ static inline void nf_bridge_get(struct 
 		atomic_inc(&nf_bridge->use);
 }
 #endif /* CONFIG_BRIDGE_NETFILTER */
+static inline void skb_init_brmark(struct sk_buff *skb);
 static inline void nf_reset(struct sk_buff *skb)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
@@ -2320,6 +2341,7 @@ static inline void nf_reset(struct sk_bu
 #ifdef CONFIG_BRIDGE_NETFILTER
 	nf_bridge_put(skb->nf_bridge);
 	skb->nf_bridge = NULL;
+	skb_init_brmark(skb);
 #endif
 }
 
@@ -2369,6 +2391,45 @@ static inline void skb_init_secmark(stru
 { }
 #endif
 
+#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
+static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from)
+{
+	to->brmark = from->brmark;
+}
+
+static inline void skb_init_brmark(struct sk_buff *skb)
+{
+	skb->brmark = 0;
+}
+
+static inline u8 skb_get_brmark(struct sk_buff *skb)
+{
+	return skb->brmark;
+}
+
+static inline void skb_set_brmark(struct sk_buff *skb, u8 brmark)
+{
+	skb->brmark = brmark;
+}
+#else
+static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from)
+{
+}
+
+static inline void skb_init_brmark(struct sk_buff *skb)
+{
+}
+
+static inline u8 skb_get_brmark(struct sk_buff *skb)
+{
+	return 0;
+}
+
+static inline void skb_set_brmark(struct sk_buff *skb, u8 brmark)
+{
+}
+#endif
+
 static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
 {
 	skb->queue_mapping = queue_mapping;
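
A sketch of a bridge-side consumer of the new mark; when bridging is compiled out all four helpers collapse to no-ops, so callers need no #ifdefs. The hook below is illustrative:

static int example_bridge_see_once(struct sk_buff *skb)
{
	if (skb_get_brmark(skb))
		return 0;	/* frame already handled by our hook */

	skb_set_brmark(skb, 1);
	return 1;
}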
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/slab.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/slab.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/slab.h	2014-12-12 23:29:19.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/slab.h	2015-01-21 12:02:47.076124108 +0300
@@ -88,6 +88,26 @@
 				(unsigned long)ZERO_SIZE_PTR)
 
 /*
+ * allocation rules:                            __GFP_UBC       0
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *  cache (SLAB_UBC)				charge		charge
+ *				      (usual caches: mm, vma, task_struct, ...)
+ *
+ *  cache (SLAB_UBC | SLAB_NO_CHARGE)		charge		---
+ *					     (ub_kmalloc)    (kmalloc)
+ *
+ *  cache (no UB flags)				BUG()		---
+ *							(nonub caches, mempools)
+ *
+ *  pages					charge		---
+ *					   (ub_vmalloc,	      (vmalloc,
+ *				        poll, fdsets, ...)  non-ub allocs)
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#define SLAB_UBC		0x10000000UL	/* alloc space for ubs ... */
+#define SLAB_NO_CHARGE		0x20000000UL	/* ... but don't charge */
+
+/*
  * struct kmem_cache related prototypes
  */
 void __init kmem_cache_init(void);
@@ -102,7 +122,28 @@ void kmem_cache_free(struct kmem_cache *
 unsigned int kmem_cache_size(struct kmem_cache *);
 const char *kmem_cache_name(struct kmem_cache *);
 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
-
+#ifdef CONFIG_SLABINFO
+extern void show_slab_info(void);
+#else
+#define show_slab_info()	do { } while (0)
+#endif
+int kmem_cache_objuse(struct kmem_cache *cachep);
+int kmem_obj_objuse(void *obj);
+int kmem_dname_objuse(void *obj);
+unsigned long ub_cache_growth(struct kmem_cache *cachep);
+
+struct user_beancounter;
+extern void slab_walk_ub(struct user_beancounter *ub,
+		void (*show)(const char *name, int count, void *v), void *v);
+
+#ifdef CONFIG_BEANCOUNTERS
+void kmem_mark_nocharge(struct kmem_cache *cachep);
+struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj);
+struct user_beancounter *slab_ub(void *obj);
+#else
+static inline void kmem_mark_nocharge(struct kmem_cache *cachep) { }
+static inline struct user_beancounter *slab_ub(void *obj) { return NULL; }
+#endif
 /*
  * Please use this macro to create slab caches. Simply specify the
  * name of the structure and maybe some flags that are listed above.
@@ -338,5 +379,6 @@ static inline void *kzalloc_node(size_t 
 }
 
 void __init kmem_cache_init_late(void);
+void slab_obj_walk(struct kmem_cache *c, void (*f)(void *));
 
 #endif	/* _LINUX_SLAB_H */
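
A sketch of a cache created with the new charging flag, following the rules in the table above (a SLAB_UBC cache charges every allocation to a beancounter); the cache and object names are illustrative:

struct example_item {
	struct list_head list;
};

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
	/* objects are charged to the allocating beancounter */
	example_cachep = kmem_cache_create("example_item",
					   sizeof(struct example_item),
					   0, SLAB_UBC, NULL);
	return example_cachep ? 0 : -ENOMEM;
}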
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/slab_def.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/slab_def.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/slab_def.h	2014-12-12 23:29:22.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/slab_def.h	2015-01-21 12:02:43.393221886 +0300
@@ -28,27 +28,45 @@
 #endif
 
 /*
+ * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
+ *		  0 for faster, smaller code (especially in the critical paths).
+ *
+ * STATS	- 1 to collect stats for /proc/slabinfo.
+ *		  0 for faster, smaller code (especially in the critical paths).
+ *
+ * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
+ */
+
+#ifdef CONFIG_DEBUG_SLAB
+#define	SLAB_DEBUG		1
+#define	SLAB_STATS		1
+#define SLAB_FORCED_DEBUG	1
+#else
+#define	SLAB_DEBUG		0
+#define	SLAB_STATS		0
+#define SLAB_FORCED_DEBUG	0
+#endif
+
+/*
  * struct kmem_cache
  *
  * manages a cache.
  */
 
 struct kmem_cache {
-/* 1) per-cpu data, touched during every alloc/free */
-	struct array_cache *array[NR_CPUS];
-/* 2) Cache tunables. Protected by cache_chain_mutex */
+/* 1) Cache tunables. Protected by cache_chain_mutex */
 	unsigned int batchcount;
 	unsigned int limit;
 	unsigned int shared;
 
 	unsigned int buffer_size;
 	u32 reciprocal_buffer_size;
-/* 3) touched by every alloc & free from the backend */
+/* 2) touched by every alloc & free from the backend */
 
 	unsigned int flags;		/* constant flags */
 	unsigned int num;		/* # of objs per slab */
 
-/* 4) cache_grow/shrink */
+/* 3) cache_grow/shrink */
 	/* order of pgs per slab (2^n) */
 	unsigned int gfporder;
 
@@ -64,17 +82,18 @@ struct kmem_cache {
 	/* constructor func */
 	void (*ctor)(void *obj);
 
-/* 5) cache creation/removal */
+/* 4) cache creation/removal */
 	const char *name;
 	struct list_head next;
 
-/* 6) statistics */
+/* 5) statistics */
 #ifdef CONFIG_DEBUG_SLAB
 	unsigned long num_active;
 	unsigned long num_allocations;
 	unsigned long high_mark;
 	unsigned long grown;
 	unsigned long reaped;
+	unsigned long shrunk;
 	unsigned long errors;
 	unsigned long max_freeable;
 	unsigned long node_allocs;
@@ -94,17 +113,22 @@ struct kmem_cache {
 	int obj_offset;
 	int obj_size;
 #endif /* CONFIG_DEBUG_SLAB */
+#ifdef CONFIG_BEANCOUNTERS
+	int objuse;
+#endif
 
+/* 6) per-cpu/per-node data, touched during every alloc/free */
 	/*
-	 * We put nodelists[] at the end of kmem_cache, because we want to size
-	 * this array to nr_node_ids slots instead of MAX_NUMNODES
+	 * We put array[] at the end of kmem_cache, because we want to size
+	 * this array to nr_cpu_ids slots instead of NR_CPUS
 	 * (see kmem_cache_init())
-	 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
-	 * is statically defined, so we reserve the max number of nodes.
+	 * We still use [NR_CPUS] and not [1] or [0] because cache_cache
+	 * is statically defined, so we reserve the max number of cpus.
 	 */
-	struct kmem_list3 *nodelists[MAX_NUMNODES];
+	struct kmem_list3 **nodelists;
+	struct array_cache *array[NR_CPUS];
 	/*
-	 * Do not add fields after nodelists[]
+	 * Do not add fields after array[]
 	 */
 };
 
@@ -117,6 +141,7 @@ struct cache_sizes {
 #endif
 };
 extern struct cache_sizes malloc_sizes[];
+extern int malloc_cache_num;
 
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
 void *__kmalloc(size_t size, gfp_t flags);
@@ -162,6 +187,8 @@ static __always_inline void *kmalloc(siz
 #undef CACHE
 		return NULL;
 found:
+		if (flags & __GFP_UBC)
+			i += malloc_cache_num;
 #ifdef CONFIG_ZONE_DMA
 		if (flags & GFP_DMA)
 			cachep = malloc_sizes[i].cs_dmacachep;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/slub_def.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/slub_def.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/slub_def.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/slub_def.h	2015-01-21 12:02:43.393221886 +0300
@@ -97,6 +97,10 @@ struct kmem_cache {
 	struct kobject kobj;	/* For sysfs */
 #endif
 
+#ifdef CONFIG_BEANCOUNTERS
+	atomic_t grown;
+	int objuse;
+#endif
 #ifdef CONFIG_NUMA
 	/*
 	 * Defragmentation by allocating from a remote node.
@@ -141,6 +145,19 @@ struct kmem_cache {
  */
 extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT];
 
+#ifdef CONFIG_BEANCOUNTERS
+extern struct kmem_cache ub_kmalloc_caches[SLUB_PAGE_SHIFT];
+static inline struct kmem_cache *__kmalloc_cache(gfp_t f, int idx)
+{
+	return (f & __GFP_UBC) ? &ub_kmalloc_caches[idx] : &kmalloc_caches[idx];
+}
+#else
+static inline struct kmem_cache *__kmalloc_cache(gfp_t flags, int idx)
+{
+	return &kmalloc_caches[idx];
+}
+#endif
+
 /*
  * Sorry that the following has to be that ugly but some versions of GCC
  * have trouble with constant propagation and loops.
@@ -197,14 +214,14 @@ static __always_inline int kmalloc_index
  * This ought to end up with a global pointer to the right cache
  * in kmalloc_caches.
  */
-static __always_inline struct kmem_cache *kmalloc_slab(size_t size)
+static __always_inline struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
 {
 	int index = kmalloc_index(size);
 
 	if (index == 0)
 		return NULL;
 
-	return &kmalloc_caches[index];
+	return __kmalloc_cache(flags, index);
 }
 
 #ifdef CONFIG_ZONE_DMA
@@ -247,7 +264,7 @@ static __always_inline void *kmalloc(siz
 			return kmalloc_large(size, flags);
 
 		if (!(flags & SLUB_DMA)) {
-			struct kmem_cache *s = kmalloc_slab(size);
+			struct kmem_cache *s = kmalloc_slab(size, flags);
 
 			if (!s)
 				return ZERO_SIZE_PTR;
@@ -286,7 +303,7 @@ static __always_inline void *kmalloc_nod
 
 	if (__builtin_constant_p(size) &&
 		size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) {
-			struct kmem_cache *s = kmalloc_slab(size);
+			struct kmem_cache *s = kmalloc_slab(size, flags);
 
 		if (!s)
 			return ZERO_SIZE_PTR;
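
With the change above, a constant-size kmalloc() selects a UB-charged cache purely from the gfp mask; a minimal illustrative caller:

static void *example_ub_alloc(void)
{
	/* routed to ub_kmalloc_caches[] instead of kmalloc_caches[] */
	return kmalloc(256, GFP_KERNEL | __GFP_UBC);
}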
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/socket.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/socket.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/socket.h	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/socket.h	2015-01-21 12:02:45.356169771 +0300
@@ -310,6 +310,16 @@ struct ucred {
 #define IPX_TYPE	1
 
 #ifdef __KERNEL__
+
+#define MAX_SOCK_ADDR	128		/* 108 for Unix domain -
+					   16 for IP, 16 for IPX,
+					   24 for IPv6,
+					   about 80 for AX.25
+					   must be at least one bigger than
+					   the AF_UNIX size (see net/unix/af_unix.c
+					   :unix_mkname()).
+					 */
+
 extern void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred);
 
 extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
@@ -327,6 +337,8 @@ extern int memcpy_toiovecend(const struc
 extern int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, int __user *ulen);
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
+extern int vz_security_family_check(int family);
+extern int vz_security_protocol_check(int protocol);
 
 struct timespec;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/cache.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/cache.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/cache.h	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/cache.h	2015-01-21 12:02:46.935127851 +0300
@@ -200,6 +200,8 @@ extern int cache_register(struct cache_d
 extern int cache_register_net(struct cache_detail *cd, struct net *net);
 extern void cache_unregister(struct cache_detail *cd);
 extern void cache_unregister_net(struct cache_detail *cd, struct net *net);
+extern struct cache_detail *cache_alloc(struct cache_detail *, int);
+extern void cache_free(struct cache_detail *);
 
 extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *,
 					mode_t, struct cache_detail *);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/clnt.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/clnt.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/clnt.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/clnt.h	2015-01-21 12:02:46.935127851 +0300
@@ -167,6 +167,9 @@ char *		rpc_sockaddr2uaddr(const struct 
 size_t		rpc_uaddr2sockaddr(const char *, const size_t,
 				   struct sockaddr *, const size_t);
 
+extern int ve_ip_map_init(void);
+extern void ve_ip_map_exit(void);
+
 static inline unsigned short rpc_get_port(const struct sockaddr *sap)
 {
 	switch (sap->sa_family) {
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/sched.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/sched.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/sched.h	2014-12-12 23:29:30.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/sched.h	2015-01-21 12:02:47.641109109 +0300
@@ -249,9 +249,13 @@ int		__rpc_wait_for_completion_task(stru
 #ifdef RPC_DEBUG
 void		rpc_show_tasks(void);
 #endif
+int		rpciod_start(void);
+void		rpciod_stop(void);
 int		rpc_init_mempool(void);
 void		rpc_destroy_mempool(void);
-extern struct workqueue_struct *rpciod_workqueue;
+#ifndef CONFIG_VE
+extern struct workqueue_struct	*rpciod_workqueue;
+#endif
 void		rpc_prepare_task(struct rpc_task *task);
 
 static inline int rpc_wait_for_completion_task(struct rpc_task *task)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/stats.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/stats.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/stats.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/stats.h	2015-01-21 12:02:47.651108843 +0300
@@ -38,7 +38,7 @@ struct svc_stat {
 				rpcbadclnt;
 };
 
-void			rpc_proc_init(void);
+struct proc_dir_entry * rpc_proc_init(void);
 void			rpc_proc_exit(void);
 #ifdef MODULE
 void			rpc_modcount(struct inode *, int);
@@ -55,7 +55,9 @@ void			svc_proc_unregister(const char *)
 void			svc_seq_show(struct seq_file *,
 				     const struct svc_stat *);
 
+#ifndef CONFIG_VE
 extern struct proc_dir_entry	*proc_net_rpc;
+#endif
 
 #else
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/svc.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/svc.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/svc.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/svc.h	2015-01-21 12:02:46.936127825 +0300
@@ -409,7 +409,8 @@ struct svc_rqst *svc_prepare_thread(stru
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 			void (*shutdown)(struct svc_serv *),
-			svc_thread_fn, struct module *);
+			svc_thread_fn, struct module *,
+			struct svc_stat *);
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void		   svc_destroy(struct svc_serv *);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/xprt.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/xprt.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sunrpc/xprt.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sunrpc/xprt.h	2015-01-21 12:02:46.936127825 +0300
@@ -150,6 +150,7 @@ enum xprt_transports {
 struct rpc_xprt {
 	atomic_t		count;		/* Reference count */
 	struct rpc_xprt_ops *	ops;		/* transport methods */
+	struct ve_struct *	owner_env;	/* VE owner of mount */
 
 	const struct rpc_timeout *timeout;	/* timeout parms */
 	struct sockaddr_storage	addr;		/* server address */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/swap.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/swap.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/swap.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/swap.h	2015-01-21 12:02:58.676816161 +0300
@@ -41,6 +41,14 @@ static inline int current_is_kswapd(void
  * actions on faults.
  */
 
+#ifdef CONFIG_MEMORY_VSWAP
+#define SWP_VSWAP_NUM 2
+#define SWP_VSWAP_READ	(MAX_SWAPFILES + SWP_HWPOISON_NUM + SWP_MIGRATION_NUM)
+#define SWP_VSWAP_WRITE	(SWP_VSWAP_READ + 1)
+#else
+#define SWP_VSWAP_NUM 0
+#endif
+
 /*
  * NUMA node memory migration support
  */
@@ -63,7 +71,8 @@ static inline int current_is_kswapd(void
 #endif
 
 #define MAX_SWAPFILES \
-	((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+	((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM \
+	 - SWP_VSWAP_NUM)
 
 /*
  * Magic header for a swap area. The first part of the union is
@@ -116,6 +125,7 @@ struct address_space;
 struct sysinfo;
 struct writeback_control;
 struct zone;
+struct user_beancounter;
 
 /*
  * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
@@ -193,8 +203,25 @@ struct swap_info_struct {
 	unsigned int max;
 	unsigned int inuse_pages;
 	unsigned int old_block_size;
+	unsigned char uuid[16];
+#ifdef CONFIG_PSWAP
+	signed char pswap_type;
+	unsigned long *pswap_reserved;
+#endif
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+	struct user_beancounter **swap_ubs;
+#endif
 };
 
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+struct user_beancounter *get_swap_ub(swp_entry_t entry);
+#else
+static inline struct user_beancounter *get_swap_ub(swp_entry_t entry)
+{
+	return get_ub0();
+}
+#endif
+
 struct swap_list_t {
 	int head;	/* head of priority-ordered swapfile list */
 	int next;	/* swapfile to be used next */
@@ -218,9 +245,10 @@ extern unsigned long nr_free_pagecache_p
 /* linux/mm/swap.c */
 extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
-extern void lru_add_page_tail(struct zone* zone,
+extern void lru_add_page_tail(struct lruvec *lruvec,
 			      struct page *page, struct page *page_tail);
 extern void activate_page(struct page *);
+extern void deactivate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
@@ -258,6 +286,8 @@ static inline void lru_cache_add_active_
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
+extern unsigned long try_to_free_gang_pages(struct gang_set *gs,
+					gfp_t gfp_mask);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 						  gfp_t gfp_mask, bool noswap,
 						  unsigned int swappiness);
@@ -266,11 +296,13 @@ extern unsigned long mem_cgroup_shrink_n
 						unsigned int swappiness,
 						struct zone *zone,
 						int nid);
-extern int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file);
+extern int __isolate_lru_page(struct page *page, isolate_mode_t mode,
+		int file, struct lruvec **locked);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
+extern int vm_sync_reclaim;
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
@@ -289,6 +321,13 @@ static inline int zone_reclaim(struct zo
 }
 #endif
 
+/*
+ * must be called with vma's mmap_sem held for read or write, and page locked.
+ */
+extern void mlock_vma_page(struct vm_area_struct *vma, struct page *page);
+extern void munlock_vma_page(struct page *page);
+#define MM_HAS_MLOCK_VMA_PAGE
+
 extern int page_evictable(struct page *page, struct vm_area_struct *vma);
 extern void scan_mapping_unevictable_pages(struct address_space *);
 
@@ -302,6 +341,25 @@ extern int kswapd_run(int nid);
 
 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 
+#ifdef CONFIG_PSWAP
+extern int sysctl_prune_pswap;
+extern int prune_pswap_sysctl_handler(ctl_table *table, int write,
+		void __user *buffer, size_t *length, loff_t *ppos);
+extern swp_entry_t pswap_reserve(swp_entry_t);
+extern swp_entry_t pswap_restore(swp_entry_t, struct user_beancounter *);
+#else
+static inline swp_entry_t pswap_reserve(swp_entry_t swp)
+{
+	return (swp_entry_t) {0};
+}
+
+static inline swp_entry_t pswap_restore(swp_entry_t swp,
+					struct user_beancounter *ub)
+{
+	return (swp_entry_t) {0};
+}
+#endif
+
 #ifdef CONFIG_SWAP
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
@@ -312,7 +370,7 @@ extern void end_swap_bio_read(struct bio
 extern struct address_space swapper_space;
 #define total_swapcache_pages  swapper_space.nrpages
 extern void show_swap_cache_info(void);
-extern int add_to_swap(struct page *);
+extern int add_to_swap(struct page *, struct user_beancounter *ub);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
 extern void __delete_from_swap_cache(struct page *);
 extern void delete_from_swap_cache(struct page *);
@@ -328,12 +386,14 @@ extern struct page *swapin_readahead(swp
 extern long nr_swap_pages;
 extern long total_swap_pages;
 extern void si_swapinfo(struct sysinfo *);
-extern swp_entry_t get_swap_page(void);
+extern swp_entry_t get_swap_page(struct user_beancounter *);
 extern swp_entry_t get_swap_page_of_type(int);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
+extern int __swap_duplicate(swp_entry_t, unsigned char);
+extern int swap_convert_to_shmem(swp_entry_t entry);
 extern int swapcache_prepare(swp_entry_t);
 extern void swap_free(swp_entry_t);
 extern void swapcache_free(swp_entry_t, struct page *page);
@@ -346,6 +406,14 @@ extern int reuse_swap_page(struct page *
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+extern void ub_unuse_swap_page(struct page *);
+extern void ub_unuse_swap(struct user_beancounter *);
+#else
+static inline void ub_unuse_swap_page(struct page *pg) { }
+static inline void ub_unuse_swap(struct user_beancounter *ub)  { }
+#endif
+
 /* linux/mm/thrash.c */
 extern struct mm_struct *swap_token_mm;
 extern void grab_swap_token(struct mm_struct *);
@@ -421,6 +489,11 @@ static inline int swap_duplicate(swp_ent
 	return 0;
 }
 
+static inline int swap_convert_to_shmem(swp_entry_t entry)
+{
+	return 0;
+}
+
 static inline void swap_free(swp_entry_t swp)
 {
 }
@@ -445,7 +518,7 @@ static inline struct page *lookup_swap_c
 	return NULL;
 }
 
-static inline int add_to_swap(struct page *page)
+static inline int add_to_swap(struct page *page, struct user_beancounter *ub)
 {
 	return 0;
 }
@@ -471,7 +544,7 @@ static inline int try_to_free_swap(struc
 	return 0;
 }
 
-static inline swp_entry_t get_swap_page(void)
+static inline swp_entry_t get_swap_page(struct user_beancounter *ub)
 {
 	swp_entry_t entry;
 	entry.val = 0;
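
The net effect on the swap-type space is easiest to see with concrete
numbers. A worked example, assuming MAX_SWAPFILES_SHIFT == 5 with both
CONFIG_MIGRATION and CONFIG_MEMORY_FAILURE enabled, as in the mainline
2.6.32 defines this patch builds on:

	/* 5-bit type space: 32 values, reserved from the top down */
	MAX_SWAPFILES = 32 - 2 (migration) - 1 (hwpoison) - 2 (vswap) = 27

	/* type  27     = SWP_HWPOISON
	 * types 28..29 = SWP_MIGRATION_READ / SWP_MIGRATION_WRITE
	 * types 30..31 = SWP_VSWAP_READ / SWP_VSWAP_WRITE
	 * types 0..26  = regular swap devices
	 */

So enabling CONFIG_MEMORY_VSWAP costs two entries of the MAX_SWAPFILES
budget rather than any extra bits in the pte swap encoding.
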
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/swapops.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/swapops.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/swapops.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/swapops.h	2015-01-21 12:02:58.676816161 +0300
@@ -99,6 +99,98 @@ static inline void *swp_to_radix_entry(s
 	return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
+#include <linux/rmap.h>
+
+#ifdef CONFIG_MEMORY_VSWAP
+
+static inline swp_entry_t make_vswap_entry(struct page *page, int write)
+{
+	VM_BUG_ON(!PageLocked(page));
+	return swp_entry(write ? SWP_VSWAP_WRITE : SWP_VSWAP_READ,
+			page_to_pfn(page));
+}
+
+static inline int is_vswap_entry(swp_entry_t entry)
+{
+	return swp_type(entry) == SWP_VSWAP_READ ||
+	       swp_type(entry) == SWP_VSWAP_WRITE;
+}
+
+static inline int is_write_vswap_entry(swp_entry_t entry)
+{
+	return swp_type(entry) == SWP_VSWAP_WRITE;
+}
+
+static inline swp_entry_t wprotect_vswap_entry(swp_entry_t entry)
+{
+	return swp_entry(SWP_VSWAP_READ, swp_offset(entry));
+}
+
+static inline struct page *vswap_entry_to_page(swp_entry_t entry)
+{
+	return pfn_to_page(swp_offset(entry));
+}
+
+extern int add_to_vswap(struct page *page);
+extern void __remove_from_vswap(struct page *page);
+extern int remove_from_vswap(struct page *page);
+
+static inline void get_vswap_page(struct page *page)
+{
+	VM_BUG_ON(!PageVSwap(page));
+	VM_BUG_ON(!atomic_read(&page->vswap_count));
+	atomic_inc(&page->vswap_count);
+}
+
+static inline void put_vswap_page(struct page *page)
+{
+	VM_BUG_ON(!PageVSwap(page));
+	if (atomic_dec_and_test(&page->vswap_count))
+		__remove_from_vswap(page);
+}
+
+static inline int page_vswapcount(struct page *page)
+{
+	return atomic_read(&page->vswap_count);
+}
+
+#else /* CONFIG_MEMORY_VSWAP */
+
+static inline swp_entry_t make_vswap_entry(struct page *page, int write)
+{
+	return swp_entry(0, 0);
+}
+static inline int is_vswap_entry(swp_entry_t entry) {
+	return 0;
+}
+static inline int is_write_vswap_entry(swp_entry_t entry)
+{
+	return 0;
+}
+static inline swp_entry_t wprotect_vswap_entry(swp_entry_t entry)
+{
+	return swp_entry(0, 0);
+}
+static inline struct page *vswap_entry_to_page(swp_entry_t entry)
+{
+	BUG();
+	return NULL;
+}
+static inline int add_to_vswap(struct page *page)
+{
+	return SWAP_FAIL;
+}
+static inline int remove_from_vswap(struct page *page)
+{
+	BUG();
+	return 0;
+}
+static inline void get_vswap_page(struct page *page) { }
+static inline void put_vswap_page(struct page *page) { }
+static inline int page_vswapcount(struct page *page) { return 0; }
+
+#endif /* CONFIG_MEMORY_VSWAP */
+
 #ifdef CONFIG_MIGRATION
 static inline swp_entry_t make_migration_entry(struct page *page, int write)
 {
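
Since a vswap entry is just a reserved swap type with the page frame
number in the offset field, the round trip through the helpers above can
be modelled in plain C. A minimal userspace sketch, assuming a 64-bit
long with the type in the top 5 bits (SWP_TYPE_SHIFT == BITS_PER_LONG -
MAX_SWAPFILES_SHIFT == 59, as in the generic swp_entry_t encoding); the
constants mirror the kernel's, but this is an illustration, not kernel
code:

	#include <assert.h>

	typedef struct { unsigned long val; } swp_entry_t;

	#define SWP_TYPE_SHIFT	59
	#define SWP_VSWAP_READ	30
	#define SWP_VSWAP_WRITE	31

	static swp_entry_t swp_entry(unsigned long type, unsigned long off)
	{
		return (swp_entry_t){ (type << SWP_TYPE_SHIFT) | off };
	}

	static unsigned long swp_type(swp_entry_t e)
	{
		return e.val >> SWP_TYPE_SHIFT;
	}

	static unsigned long swp_offset(swp_entry_t e)
	{
		return e.val & ((1UL << SWP_TYPE_SHIFT) - 1);
	}

	int main(void)
	{
		unsigned long pfn = 0x12345;

		/* make_vswap_entry(page, 1) */
		swp_entry_t e = swp_entry(SWP_VSWAP_WRITE, pfn);
		assert(swp_type(e) == SWP_VSWAP_WRITE);

		/* wprotect_vswap_entry() keeps the pfn, drops write */
		e = swp_entry(SWP_VSWAP_READ, swp_offset(e));
		assert(swp_offset(e) == pfn);	/* vswap_entry_to_page() */
		return 0;
	}
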
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/syscalls.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/syscalls.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/syscalls.h	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/syscalls.h	2015-01-21 12:02:52.024992729 +0300
@@ -57,6 +57,7 @@ struct robust_list_head;
 struct getcpu_cache;
 struct old_linux_dirent;
 struct perf_event_attr;
+struct file_handle;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -155,7 +156,8 @@ static void prof_sysexit_disable_##sname
 
 #define SYSCALL_TRACE_ENTER_EVENT(sname)				\
 	static const struct syscall_metadata __syscall_meta_##sname;	\
-	static struct ftrace_event_call event_enter_##sname;		\
+	static struct ftrace_event_call					\
+	__attribute__((__aligned__(4))) event_enter_##sname;		\
 	struct trace_event enter_syscall_print_##sname = {		\
 		.trace                  = print_syscall_enter,		\
 	};								\
@@ -199,7 +201,8 @@ static void prof_sysexit_disable_##sname
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
 	static const struct syscall_metadata __syscall_meta_##sname;	\
-	static struct ftrace_event_call event_exit_##sname;		\
+	static struct ftrace_event_call					\
+	__attribute__((__aligned__(4))) event_exit_##sname;		\
 	struct trace_event exit_syscall_print_##sname = {		\
 		.trace                  = print_syscall_exit,		\
 	};								\
@@ -924,4 +927,11 @@ asmlinkage long sys_process_vm_writev(pi
 				      unsigned long flags);
 
 asmlinkage long sys_setns(int fd, int nstype);
+
+asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
+				      struct file_handle __user *handle,
+				      int __user *mnt_id, int flag);
+asmlinkage long sys_open_by_handle_at(int mountdirfd,
+				      struct file_handle __user *handle,
+				      int flags);
 #endif
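
These are the file-handle syscalls backported from mainline 2.6.39. For
reference, a sketch of their userspace calling convention (glibc 2.14+
ships the wrappers and MAX_HANDLE_SZ; on older userland the raw
syscall(2) interface would be needed, and open_by_handle_at()
additionally requires CAP_DAC_READ_SEARCH):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		struct file_handle *fh;
		int mount_id, fd;

		fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
		fh->handle_bytes = MAX_HANDLE_SZ;

		if (name_to_handle_at(AT_FDCWD, "/etc/hostname",
				      fh, &mount_id, 0) < 0) {
			perror("name_to_handle_at");
			return 1;
		}

		/* The opaque handle can be stored and used later to
		 * reopen the file without a path lookup. */
		fd = open_by_handle_at(AT_FDCWD, fh, O_RDONLY);
		if (fd < 0)
			perror("open_by_handle_at");

		free(fh);
		return fd < 0;
	}
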
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sysctl.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sysctl.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sysctl.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sysctl.h	2015-01-21 12:02:45.463166929 +0300
@@ -995,6 +995,8 @@ extern int proc_dostring(struct ctl_tabl
 			 void __user *, size_t *, loff_t *);
 extern int proc_dointvec(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
+extern int proc_dointvec_once(struct ctl_table *, int,
+			      void __user *, size_t *, loff_t *);
 extern int proc_dointvec_minmax(struct ctl_table *, int,
 				void __user *, size_t *, loff_t *);
 extern int proc_dointvec_jiffies(struct ctl_table *, int,
@@ -1116,10 +1118,17 @@ struct ctl_table_header *__register_sysc
 struct ctl_table_header *register_sysctl_table(struct ctl_table * table);
 struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 						struct ctl_table *table);
+struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *, int);
+struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *,
+						struct ctl_table *, int);
+struct ctl_table *sysctl_ve_table(struct ctl_table *, struct ctl_table *, int);
 
 void unregister_sysctl_table(struct ctl_table_header * table);
 int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table);
 
+extern int ve_allow_kthreads;
+extern int ve_allow_module_load;
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_SYSCTL_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/sysfs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sysfs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/sysfs.h	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/sysfs.h	2015-01-21 12:02:44.611189548 +0300
@@ -18,8 +18,28 @@
 #include <linux/lockdep.h>
 #include <asm/atomic.h>
 
+#ifdef CONFIG_SYSFS_DEPRECATED_DYN
+extern unsigned _sysfs_deprecated;
+#ifdef CONFIG_VE
+#define sysfs_deprecated (ve_is_super(get_exec_env()) && _sysfs_deprecated)
+#else
+#define sysfs_deprecated (_sysfs_deprecated)
+#endif
+#else
+
+/* static deprecation */
+
+#ifdef CONFIG_SYSFS_DEPRECATED
+#define sysfs_deprecated 1
+#else
+#define sysfs_deprecated 0
+#endif
+
+#endif
+
 struct kobject;
 struct module;
+struct sysfs_open_dirent;
 
 /* FIXME
  * The *owner field is no longer used.
@@ -54,7 +74,8 @@ struct attribute_group {
 	struct attribute	**attrs;
 };
 
-
+#include <linux/fs.h>
+#include <linux/rbtree.h>
 
 /**
  * Use these macros to make defining attributes easier. See include/linux/device.h
@@ -102,6 +123,99 @@ struct sysfs_ops {
 
 struct sysfs_dirent;
 
+/* type-specific structures for sysfs_dirent->s_* union members */
+struct sysfs_elem_dir {
+	struct kobject		*kobj;
+#ifdef __GENKSYMS__
+	struct sysfs_dirent	*children;
+#endif
+
+#ifndef __GENKSYMS__
+	unsigned long           subdirs;
+
+	struct rb_root          inode_tree;
+	struct rb_root          name_tree;
+#endif
+};
+
+struct sysfs_elem_symlink {
+	struct sysfs_dirent	*target_sd;
+};
+
+struct sysfs_elem_attr {
+	struct attribute	*attr;
+	struct sysfs_open_dirent *open;
+};
+
+struct sysfs_elem_bin_attr {
+	struct bin_attribute	*bin_attr;
+	struct hlist_head	buffers;
+};
+
+struct sysfs_elem_dir_link {
+	struct sysfs_dirent	*target_sd;
+};
+
+struct sysfs_inode_attrs {
+	struct iattr	ia_iattr;
+	void		*ia_secdata;
+	u32		ia_secdata_len;
+};
+
+/*
+ * sysfs_dirent - the building block of the sysfs hierarchy.  Each and
+ * every sysfs node is represented by a single sysfs_dirent.
+ *
+ * As long as an s_count reference is held, the sysfs_dirent itself is
+ * accessible.  Dereferencing s_elem or any other outer entity
+ * requires an s_active reference.
+ */
+struct sysfs_dirent {
+	atomic_t		s_count;
+	atomic_t		s_active;
+	struct sysfs_dirent	*s_parent;
+#ifdef __GENKSYMS__
+	struct sysfs_dirent	*s_sibling;
+#endif
+	const char		*s_name;
+
+#ifndef __GENKSYMS__
+	struct rb_node          inode_node;
+	struct rb_node          name_node;
+
+	union {
+		struct completion       *completion;
+		struct sysfs_dirent     *removed_list;
+	} u;
+#endif
+
+	union {
+		struct sysfs_elem_dir		s_dir;
+		struct sysfs_elem_symlink	s_symlink;
+		struct sysfs_elem_attr		s_attr;
+		struct sysfs_elem_bin_attr	s_bin_attr;
+		struct sysfs_elem_dir_link	s_dir_link;
+	};
+
+	unsigned int		s_flags;
+	ino_t			s_ino;
+	umode_t			s_mode;
+	struct sysfs_inode_attrs *s_iattr;
+};
+
+#define SD_DEACTIVATED_BIAS		INT_MIN
+
+#define SYSFS_TYPE_MASK			0x00ff
+#define SYSFS_DIR			0x0001
+#define SYSFS_KOBJ_ATTR			0x0002
+#define SYSFS_KOBJ_BIN_ATTR		0x0004
+#define SYSFS_KOBJ_LINK			0x0008
+#define SYSFS_DIR_LINK			0x0010
+#define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK | SYSFS_DIR_LINK)
+
+#define SYSFS_FLAG_MASK			~SYSFS_TYPE_MASK
+#define SYSFS_FLAG_REMOVED		0x0200
+
 #ifdef CONFIG_SYSFS
 
 int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
@@ -147,6 +261,9 @@ int sysfs_merge_group(struct kobject *ko
 		       const struct attribute_group *grp);
 void sysfs_unmerge_group(struct kobject *kobj,
 		       const struct attribute_group *grp);
+extern struct sysfs_dirent *sysfs_create_dirlink(struct sysfs_dirent *parent_sd,
+			struct kobject *target);
+extern void sysfs_remove_dirlink(struct sysfs_dirent *sd);
 
 void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr);
 void sysfs_notify_dirent(struct sysfs_dirent *sd);
@@ -157,6 +274,10 @@ void sysfs_put(struct sysfs_dirent *sd);
 void sysfs_printk_last_file(void);
 int __must_check sysfs_init(void);
 
+extern int init_ve_sysfs_root(struct ve_struct *ve);
+
+extern struct file_system_type sysfs_fs_type;
+
 #else /* CONFIG_SYSFS */
 
 static inline int sysfs_schedule_callback(struct kobject *kobj,
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/task_io_accounting_ops.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/task_io_accounting_ops.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/task_io_accounting_ops.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/task_io_accounting_ops.h	2015-01-21 12:02:43.346223134 +0300
@@ -5,10 +5,13 @@
 #define __TASK_IO_ACCOUNTING_OPS_INCLUDED
 
 #include <linux/sched.h>
+#include <bc/io_acct.h>
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
+
 static inline void task_io_account_read(size_t bytes)
 {
+	ub_io_account_read(bytes);
 	current->ioac.read_bytes += bytes;
 }
 
@@ -23,6 +26,12 @@ static inline unsigned long task_io_get_
 
 static inline void task_io_account_write(size_t bytes)
 {
+	ub_io_account_write(bytes);
+	current->ioac.write_bytes += bytes;
+}
+
+static inline void task_io_account_dirty(size_t bytes)
+{
 	current->ioac.write_bytes += bytes;
 }
 
@@ -73,6 +82,10 @@ static inline unsigned long task_io_get_
 	return 0;
 }
 
+static inline void task_io_account_dirty(size_t bytes)
+{
+}
+
 static inline void task_io_account_cancelled_write(size_t bytes)
 {
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/tcp.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/tcp.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/tcp.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/tcp.h	2015-01-21 12:02:45.356169771 +0300
@@ -436,6 +436,11 @@ static inline struct tcp_sock *tcp_sk(co
 	return (struct tcp_sock *)sk;
 }
 
+static inline int tcp_urg_mode(const struct tcp_sock *tp)
+{
+	return tp->snd_una != tp->snd_up;
+}
+
 struct tcp_timewait_sock {
 	struct inet_timewait_sock tw_sk;
 	u32			  tw_rcv_nxt;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/threads.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/threads.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/threads.h	2014-12-12 23:28:56.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/threads.h	2015-01-21 12:02:44.527191779 +0300
@@ -24,7 +24,8 @@
 /*
  * This controls the default maximum pid allocated to a process
  */
-#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
+#define PID_MAX_DEFAULT 	(sizeof(long) > 4 ? 1024 * 1024 : 32 * 1024)
+#define PID_MAX_NS_DEFAULT	(32 * 1024)
 
 /*
  * A maximum of 4 million PIDs should be enough for a while.
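
The replacement expression makes the default pid ceiling depend on word
size instead of CONFIG_BASE_SMALL: sizeof(long) > 4 selects 1024 * 1024
== 1048576 on 64-bit kernels, while 32-bit kernels keep the old 0x8000
== 32768; PID_MAX_NS_DEFAULT pins nested pid namespaces to 32768 either
way. A trivial check of the arithmetic:

	#include <stdio.h>

	int main(void)
	{
		long pid_max = sizeof(long) > 4 ? 1024 * 1024 : 32 * 1024;

		printf("PID_MAX_DEFAULT = %ld\n", pid_max);
		return 0;
	}
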
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/time.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/time.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/time.h	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/time.h	2015-01-21 12:02:53.088964485 +0300
@@ -187,6 +187,7 @@ extern unsigned int alarm_setitimer(unsi
 extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
 extern void getrawmonotonic(struct timespec *ts);
+extern void getrealboottime(struct timespec *ts);
 extern void getboottime(struct timespec *ts);
 extern void monotonic_to_bootbased(struct timespec *ts);
 extern void get_monotonic_boottime(struct timespec *ts);
@@ -320,6 +321,7 @@ struct itimerval {
 #define CLOCK_MONOTONIC_RAW		4
 #define CLOCK_REALTIME_COARSE		5
 #define CLOCK_MONOTONIC_COARSE		6
+#define CLOCK_BOOTTIME			7
 
 /*
  * The IDs of various hardware clocks:
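
CLOCK_BOOTTIME behaves like CLOCK_MONOTONIC but keeps advancing across
suspend, presumably backing the getrealboottime()/CLOCK_BOOTTIME pair
added here. Illustrative userspace use (the id value 7 matches the
define above; older libc headers may not know the name yet):

	#include <stdio.h>
	#include <time.h>

	#ifndef CLOCK_BOOTTIME
	#define CLOCK_BOOTTIME 7
	#endif

	int main(void)
	{
		struct timespec ts;

		if (clock_gettime(CLOCK_BOOTTIME, &ts) == 0)
			printf("up %ld.%09ld s (incl. suspend)\n",
			       (long)ts.tv_sec, (long)ts.tv_nsec);
		return 0;
	}
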
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/timerfd.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/timerfd.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/timerfd.h	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/timerfd.h	2015-01-21 12:02:48.073097641 +0300
@@ -29,4 +29,20 @@
 /* Flags for timerfd_settime.  */
 #define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)
 
+struct timerfd_ctx {
+	struct hrtimer tmr;
+	ktime_t tintv;
+	ktime_t moffs;
+	wait_queue_head_t wqh;
+	u64 ticks;
+	int expired;
+	struct rcu_head rcu;
+	struct list_head clist;
+	bool might_cancel;
+	int clockid;
+};
+
+extern const struct file_operations timerfd_fops;
+ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx);
+
 #endif /* _LINUX_TIMERFD_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/tty.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/tty.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/tty.h	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/tty.h	2015-01-21 12:02:49.065071307 +0300
@@ -316,6 +316,14 @@ struct tty_struct {
 	/* If the tty has a pending do_SAK, queue it here - akpm */
 	struct work_struct SAK_work;
 	struct tty_port *port;
+	struct ve_struct *owner_env;
+};
+
+/* Each of a tty's open files has private_data pointing to tty_file_private */
+struct tty_file_private {
+	struct tty_struct *tty;
+	struct file *file;
+	struct list_head list;
 };
 
 /* tty magic number */
@@ -347,6 +355,9 @@ struct tty_struct {
 #define TTY_HUPPED 		18	/* Post driver->hangup() */
 #define TTY_FLUSHING		19	/* Flushing to ldisc in progress */
 #define TTY_FLUSHPENDING	20	/* Queued buffer flush pending */
+#define TTY_CHARGED		21	/* Charged as ub resource */
+#define TTY_EXTRA_REFERENCE	22	/* plus one to ->count */
+#define TTY_HUPPING		23	/* ->hangup() in progress */
 
 #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty))
 
@@ -465,11 +476,12 @@ extern void proc_clear_tty(struct task_s
 extern struct tty_struct *get_current_tty(void);
 extern void tty_default_fops(struct file_operations *fops);
 extern struct tty_struct *alloc_tty_struct(void);
+extern void tty_add_file(struct tty_struct *tty, struct file *file);
 extern void free_tty_struct(struct tty_struct *tty);
 extern void initialize_tty_struct(struct tty_struct *tty,
 		struct tty_driver *driver, int idx);
 extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx,
-								int first_ok);
+		struct tty_struct *i_tty, int first_ok);
 extern void tty_release_dev(struct file *filp);
 extern int tty_init_termios(struct tty_struct *tty);
 
@@ -477,6 +489,7 @@ extern struct tty_struct *tty_pair_get_t
 extern struct tty_struct *tty_pair_get_pty(struct tty_struct *tty);
 
 extern struct mutex tty_mutex;
+extern spinlock_t tty_files_lock;
 
 extern void tty_write_unlock(struct tty_struct *tty);
 extern int tty_write_lock(struct tty_struct *tty, int ndelay);
@@ -571,5 +584,10 @@ extern int vt_ioctl(struct tty_struct *t
 extern long vt_compat_ioctl(struct tty_struct *tty, struct file * file,
 		     unsigned int cmd, unsigned long arg);
 
+static inline struct tty_struct *file_tty(struct file *file)
+{
+	return ((struct tty_file_private *)file->private_data)->tty;
+}
+
 #endif /* __KERNEL__ */
 #endif
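
tty_add_file() pairs each struct file opened on a tty with a
tty_file_private and threads it onto a per-tty list under
tty_files_lock; file_tty() above is the reverse lookup. A sketch of the
shape such an implementation takes (modelled on the equivalent mainline
helper; it assumes tty_struct keeps a tty_files list head, which this
hunk does not show):

	void tty_add_file(struct tty_struct *tty, struct file *file)
	{
		struct tty_file_private *priv;

		priv = kmalloc(sizeof(*priv), GFP_KERNEL | __GFP_NOFAIL);
		priv->tty = tty;
		priv->file = file;
		file->private_data = priv;

		spin_lock(&tty_files_lock);
		list_add(&priv->list, &tty->tty_files);
		spin_unlock(&tty_files_lock);
	}
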
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/tty_driver.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/tty_driver.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/tty_driver.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/tty_driver.h	2015-01-21 12:02:47.706107383 +0300
@@ -309,8 +309,14 @@ struct tty_driver {
 
 	const struct tty_operations *ops;
 	struct list_head tty_drivers;
+	struct ve_struct *owner_env;
 };
 
+#ifdef CONFIG_LEGACY_PTYS
+extern struct tty_driver *pty_driver;
+extern struct tty_driver *pty_slave_driver;
+#endif
+
 extern struct list_head tty_drivers;
 
 extern struct tty_driver *alloc_tty_driver(int lines);
@@ -319,6 +325,9 @@ extern void tty_set_operations(struct tt
 			const struct tty_operations *op);
 extern struct tty_driver *tty_find_polling_driver(char *name, int *line);
 
+int init_ve_tty_class(void);
+void fini_ve_tty_class(void);
+
 extern void tty_driver_kref_put(struct tty_driver *driver);
 
 static inline struct tty_driver *tty_driver_kref_get(struct tty_driver *d)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/types.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/types.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/types.h	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/types.h	2015-01-21 12:02:44.028205026 +0300
@@ -31,6 +31,11 @@ typedef __kernel_timer_t	timer_t;
 typedef __kernel_clockid_t	clockid_t;
 typedef __kernel_mqd_t		mqd_t;
 
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
 typedef _Bool			bool;
 
 typedef __kernel_uid32_t	uid_t;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/utrace.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/utrace.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/utrace.h	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/utrace.h	2015-01-21 12:02:49.108070164 +0300
@@ -136,8 +136,15 @@ static inline void utrace_unfreeze_stop(
 {
 }
 
+static inline int task_utrace_attached(struct task_struct *task)
+{
+	return 0;
+}
+
 #else  /* CONFIG_UTRACE */
 
+int task_utrace_attached(struct task_struct *task);
+
 static inline unsigned long task_utrace_flags(struct task_struct *task)
 {
 	return task->utrace_flags;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/utsname.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/utsname.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/utsname.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/utsname.h	2015-01-21 12:02:48.162095278 +0300
@@ -37,14 +37,33 @@ struct new_utsname {
 #include <linux/nsproxy.h>
 #include <linux/err.h>
 
+#ifdef CONFIG_X86
+struct uts_vdso {
+	void			*addr;
+	struct page		**pages;
+	unsigned int		nr_pages;
+	unsigned int		size;
+	unsigned long		version_off;
+};
+#endif
+
 struct uts_namespace {
 	struct kref kref;
 	struct new_utsname name;
 #ifndef __GENKSYMS__
 	unsigned int proc_inum;
 #endif
+#ifdef CONFIG_X86
+#ifdef CONFIG_X86_64
+	struct uts_vdso vdso;
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+	struct uts_vdso vdso32;
+#endif
+#endif
 };
 extern struct uts_namespace init_uts_ns;
+extern struct new_utsname virt_utsname;
 
 #ifdef CONFIG_UTS_NS
 static inline void get_uts_ns(struct uts_namespace *ns)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ve.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ve.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ve.h	2015-01-21 12:02:44.086203487 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ve.h	2015-01-21 12:02:58.996807668 +0300
@@ -0,0 +1,387 @@
+/*
+ *  include/linux/ve.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _LINUX_VE_H
+#define _LINUX_VE_H
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/sysctl.h>
+#include <linux/net.h>
+#include <linux/vzstat.h>
+#include <linux/kobject.h>
+#include <linux/pid.h>
+#include <linux/socket.h>
+#include <linux/idr.h>
+#include <linux/spinlock.h>
+#include <net/inet_frag.h>
+
+#ifdef VZMON_DEBUG
+#  define VZTRACE(fmt,args...) \
+	printk(KERN_DEBUG fmt, ##args)
+#else
+#  define VZTRACE(fmt,args...)
+#endif /* VZMON_DEBUG */
+
+struct tty_driver;
+struct task_struct;
+struct new_utsname;
+struct file_system_type;
+struct icmp_mib;
+struct ip_mib;
+struct tcp_mib;
+struct udp_mib;
+struct linux_mib;
+struct fib_info;
+struct fib_rule;
+struct veip_struct;
+struct ve_monitor;
+struct nsproxy;
+
+#if defined(CONFIG_VE) && defined(CONFIG_INET)
+struct fib_table;
+#ifdef CONFIG_VE_IPTABLES
+struct xt_table;
+struct nf_conn;
+
+#define FRAG6Q_HASHSZ   64
+
+struct ve_nf_conntrack {
+	struct hlist_head		*_bysource;
+	struct nf_nat_protocol		**_nf_nat_protos;
+	int				_nf_nat_vmalloced;
+	struct xt_table			*_nf_nat_table;
+	struct nf_conntrack_l3proto	*_nf_nat_l3proto;
+	atomic_t			_nf_conntrack_count;
+	int				_nf_conntrack_max;
+	struct hlist_head		*_nf_conntrack_hash;
+	int				_nf_conntrack_checksum;
+	int				_nf_conntrack_vmalloc;
+	struct hlist_head		_unconfirmed;
+	struct hlist_head		*_nf_ct_expect_hash;
+	unsigned int			_nf_ct_expect_vmalloc;
+	unsigned int			_nf_ct_expect_count;
+	unsigned int			_nf_ct_expect_max;
+	struct hlist_head		*_nf_ct_helper_hash;
+	unsigned int			_nf_ct_helper_vmalloc;
+#ifdef CONFIG_SYSCTL
+	/* l4 stuff: */
+	unsigned long			_nf_ct_icmp_timeout;
+	unsigned long			_nf_ct_icmpv6_timeout;
+	unsigned int			_nf_ct_udp_timeout;
+	unsigned int			_nf_ct_udp_timeout_stream;
+	unsigned int			_nf_ct_generic_timeout;
+	unsigned int			_nf_ct_log_invalid;
+	unsigned int			_nf_ct_tcp_timeout_max_retrans;
+	unsigned int			_nf_ct_tcp_timeout_unacknowledged;
+	int				_nf_ct_tcp_be_liberal;
+	int				_nf_ct_tcp_loose;
+	int				_nf_ct_tcp_max_retrans;
+	unsigned int			_nf_ct_tcp_timeouts[10];
+	struct ctl_table_header		*_icmp_sysctl_header;
+	unsigned int			_tcp_sysctl_table_users;
+	struct ctl_table_header		*_tcp_sysctl_header;
+	unsigned int			_udp_sysctl_table_users;
+	struct ctl_table_header		*_udp_sysctl_header;
+	struct ctl_table_header		*_icmpv6_sysctl_header;
+	struct ctl_table_header		*_generic_sysctl_header;
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	struct ctl_table_header		*_icmp_compat_sysctl_header;
+	struct ctl_table_header		*_tcp_compat_sysctl_header;
+	struct ctl_table_header		*_udp_compat_sysctl_header;
+	struct ctl_table_header		*_generic_compat_sysctl_header;
+#endif
+	/* l4 protocols sysctl tables: */
+	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_icmp;
+	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_tcp4;
+	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_icmpv6;
+	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_tcp6;
+	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_udp4;
+	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_udp6;
+	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_generic;
+	struct nf_conntrack_l4proto	**_nf_ct_protos[PF_MAX];
+	/* l3 protocols sysctl tables: */
+	struct nf_conntrack_l3proto	*_nf_conntrack_l3proto_ipv4;
+	struct nf_conntrack_l3proto	*_nf_conntrack_l3proto_ipv6;
+	struct nf_conntrack_l3proto	*_nf_ct_l3protos[AF_MAX];
+	/* sysctl standalone stuff: */
+	struct ctl_table_header		*_nf_ct_sysctl_header;
+	ctl_table			*_nf_ct_sysctl_table;
+	ctl_table			*_nf_ct_netfilter_table;
+	ctl_table			*_nf_ct_net_table;
+	ctl_table			*_ip_ct_netfilter_table;
+	struct ctl_table_header		*_ip_ct_sysctl_header;
+	int				_nf_ct_log_invalid_proto_min;
+	int				_nf_ct_log_invalid_proto_max;
+#endif /* CONFIG_SYSCTL */
+};
+#endif
+#endif
+
+struct ve_ipt_recent;
+struct ve_xt_hashlimit;
+struct svc_rqst;
+
+struct cgroup;
+struct css_set;
+
+struct ve_struct {
+	struct list_head	ve_list;
+	wait_queue_head_t	ve_list_wait;
+
+	envid_t			veid;
+	/*
+	 * this one is NOT rcu-protected
+	 */
+	struct list_head	vetask_lh;
+	/* capability bounding set */
+	kernel_cap_t		ve_cap_bset;
+	unsigned int		pcounter;
+	/* ref counter to ve from ipc */
+	atomic_t		counter;
+	unsigned int		class_id;
+	struct rw_semaphore	op_sem;
+	int			is_running;
+	int			is_locked;
+	atomic_t		suspend;
+	unsigned long		flags;
+	/* see vzcalluser.h for VE_FEATURE_XXX definitions */
+	__u64			features;
+
+/* VE's root */
+	struct path		root_path;
+
+	struct file_system_type *proc_fstype;
+	struct vfsmount		*proc_mnt;
+	struct proc_dir_entry	*proc_root;
+
+/* BSD pty's */
+#ifdef CONFIG_LEGACY_PTYS
+	struct tty_driver       *pty_driver;
+	struct tty_driver       *pty_slave_driver;
+#endif
+#ifdef CONFIG_UNIX98_PTYS
+	struct vfsmount		*devpts_mnt;
+#endif
+
+#define	MAX_NR_VTTY		12
+	struct tty_struct	*vtty[MAX_NR_VTTY];
+
+	struct file_system_type *shmem_fstype;
+	struct vfsmount		*shmem_mnt;
+#ifdef CONFIG_DEVTMPFS
+	struct file_system_type	*devtmpfs_fstype;
+	struct vfsmount		*devtmpfs_mnt;
+#endif
+#ifdef CONFIG_SYSFS
+	struct file_system_type *sysfs_fstype;
+	struct vfsmount		*sysfs_mnt;
+	struct super_block	*sysfs_sb;
+	struct sysfs_dirent	*_sysfs_root;
+	struct kobject		*fs_kobj;
+	struct kobject		*cgroup_kobj;
+#if defined(CONFIG_HOTPLUG)
+	struct kobject		*kernel_kobj;
+#endif
+#endif
+	struct kobject		*_virtual_dir;
+	struct kobject		*_system_dir;
+	struct kset		*cpu_kset;
+	struct kset		*class_kset;
+	struct kset		*devices_kset;
+	struct kobject		*dev_kobj;
+	struct kobject		*dev_char_kobj;
+	struct kobject		*dev_block_kobj;
+	struct kobject		*block_kobj;
+	struct class		*tty_class;
+	struct class		*mem_class;
+	struct list_head	devices;
+
+#ifdef CONFIG_NET
+	struct class		*net_class;
+#ifdef CONFIG_INET
+ 	unsigned long		rt_flush_required;
+#endif
+#endif
+#if defined(CONFIG_VE_NETDEV) || defined (CONFIG_VE_NETDEV_MODULE)
+	struct veip_struct	*veip;
+	struct net_device	*_venet_dev;
+#endif
+
+/* per VE CPU stats*/
+	struct timespec		start_timespec;		/* monotonic time */
+	struct timespec		real_start_timespec;	/* boot based time */
+	u64			start_jiffies;	/* Deprecated */
+
+	struct kstat_lat_pcpu_struct	sched_lat_ve;
+
+#ifdef CONFIG_INET
+	struct venet_stat       *stat;
+#ifdef CONFIG_VE_IPTABLES
+/* core/netfilter.c virtualization */
+	__u64			ipt_mask;
+	__u64			_iptables_modules;
+	struct ve_ipt_recent	*_ipt_recent;
+	struct ve_xt_hashlimit	*_xt_hashlimit;
+#endif /* CONFIG_VE_IPTABLES */
+#endif
+	wait_queue_head_t	*_log_wait;
+	unsigned		*_log_start;
+	unsigned		*_log_end;
+	unsigned		*_logged_chars;
+	char			*log_buf;
+#define VE_DEFAULT_LOG_BUF_LEN	4096
+
+	unsigned long		down_at;
+	struct list_head	cleanup_list;
+#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE)
+	struct list_head	_fuse_conn_list;
+	struct super_block	*_fuse_control_sb;
+
+	struct file_system_type	*fuse_fs_type;
+	struct file_system_type	*fuse_ctl_fs_type;
+#endif
+	unsigned long		jiffies_fixup;
+	unsigned char		disable_net;
+	struct ve_monitor	*monitor;
+	struct proc_dir_entry	*monitor_proc;
+	unsigned long		meminfo_val;
+	int _randomize_va_space;
+
+	int 			odirect_enable;
+	int			fsync_enable;
+
+#if defined(CONFIG_LOCKD) || defined(CONFIG_LOCKD_MODULE)
+	struct ve_nlm_data	*nlm_data;
+#endif
+#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE)
+	struct ve_nfs_data	*nfs_data;
+#endif
+#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
+	struct ve_nfsd_data	*nfsd_data;
+#endif
+#if defined(CONFIG_SUNRPC) || defined(CONFIG_SUNRPC_MODULE)
+	struct ve_rpc_data	*ve_rpc_data;
+	struct work_struct	rpc_destroy_work;
+#endif
+#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
+	struct file_system_type	*bm_fs_type;
+	struct vfsmount		*bm_mnt;
+	int			bm_enabled;
+	int			bm_entry_count;
+	struct list_head	bm_entries;
+#endif
+
+	struct nsproxy		*ve_ns;
+	struct user_namespace	*user_ns;
+	struct cred		*init_cred;
+	struct net		*ve_netns;
+	struct cgroup		*ve_cgroup;
+	struct list_head	vetask_auxlist;
+#if defined(CONFIG_HOTPLUG)
+	u64 _uevent_seqnum;
+#endif
+	struct list_head	_kthread_create_list;
+	struct task_struct	*_kthreadd_task;
+	struct workqueue_struct *khelper_wq;
+	struct mutex		sync_mutex;
+
+	struct idr		_posix_timers_id;
+	spinlock_t		posix_timers_lock;
+
+	struct list_head	devmnt_list;
+	struct mutex		devmnt_mutex;
+
+	atomic_t		mnt_nr;
+
+	void			*lve;
+
+	spinlock_t		aio_nr_lock;
+	unsigned long		aio_nr;
+	unsigned long		aio_max_nr;
+};
+
+#define VE_MEMINFO_NR_SPECIAL	3	/* if above or equal, treat as nr_pages */
+#define VE_MEMINFO_COMPLETE	2	/* show complete information */
+#define VE_MEMINFO_DEFAULT      1       /* default behaviour */
+#define VE_MEMINFO_SYSTEM       0       /* disable meminfo virtualization */
+
+enum {
+	VE_REBOOT,
+	VE_RESTORE,
+};
+
+extern int nr_ve;
+extern struct proc_dir_entry *proc_vz_dir;
+extern struct proc_dir_entry *glob_proc_vz_dir;
+
+#ifdef CONFIG_VE
+
+/*
+ * Each host block device visible from CT can have no more than one struct
+ * ve_devmnt linked in ve->devmnt_list. If a ve_devmnt is present, it can be
+ * looked up by its 'dev' field.
+ */
+struct ve_devmnt {
+	struct list_head	link;
+
+	dev_t	                dev;
+	char	               *allowed_options;
+	char	               *hidden_options; /* balloon_ino, etc. */
+};
+
+void do_update_load_avg_ve(void);
+void do_env_free(struct ve_struct *ptr);
+
+static inline struct ve_struct *get_ve(struct ve_struct *ptr)
+{
+	if (ptr != NULL)
+		atomic_inc(&ptr->counter);
+	return ptr;
+}
+
+static inline void put_ve(struct ve_struct *ptr)
+{
+	if (ptr && atomic_dec_and_test(&ptr->counter))
+		do_env_free(ptr);
+}
+
+void ve_cleanup_schedule(struct ve_struct *);
+
+extern spinlock_t ve_cleanup_lock;
+extern struct list_head ve_cleanup_list;
+extern struct task_struct *ve_cleanup_thread;
+
+extern int (*do_ve_enter_hook)(struct ve_struct *ve, unsigned int flags);
+extern void (*do_env_free_hook)(struct ve_struct *ve);
+
+extern unsigned long long ve_relative_clock(struct timespec * ts);
+extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp);
+extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp);
+
+#ifdef CONFIG_VTTYS
+extern int vtty_open_master(int veid, int idx);
+extern struct tty_driver *vtty_driver;
+#else
+static inline int vtty_open_master(int veid, int idx) { return -ENODEV; }
+#endif
+
+#define restoring_ve(ve)	test_bit(VE_RESTORE, &(ve)->flags)
+
+#else	/* CONFIG_VE */
+#define ve_utsname	system_utsname
+#define get_ve(ve)	(NULL)
+#define put_ve(ve)	do { } while (0)
+#define pget_ve(ve)	do { } while (0)
+#define pput_ve(ve)	do { } while (0)
+#define restoring_ve(ve) (0)
+#endif	/* CONFIG_VE */
+
+#endif /* _LINUX_VE_H */
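
get_ve()/put_ve() follow the usual refcounting discipline: take a
reference before stashing a ve pointer anywhere, drop it when done, and
do_env_free() runs only when the last reference disappears. A minimal
usage sketch (the helper names are hypothetical):

	static struct ve_struct *cached_ve;

	static void cache_ve(struct ve_struct *ve)
	{
		cached_ve = get_ve(ve);		/* atomic_inc(&ve->counter) */
	}

	static void uncache_ve(void)
	{
		put_ve(cached_ve);		/* last ref -> do_env_free() */
		cached_ve = NULL;
	}
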
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ve_nfs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ve_nfs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ve_nfs.h	2015-01-21 12:02:44.086203487 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ve_nfs.h	2015-01-21 12:02:50.656029070 +0300
@@ -0,0 +1,13 @@
+/*
+ * linux/include/ve_nfs.h
+ *
+ * VE context for NFS
+ *
+ * Copyright (C) 2007 SWsoft
+ */
+
+extern int ve_nfs_sync(struct ve_struct *env, int wait);
+extern void nfs_change_server_params(void *data, int timeo, int retrans);
+extern int is_nfs_automount(struct vfsmount *mnt);
+
+extern int nfs_enable_v4_in_ct;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ve_proto.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ve_proto.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ve_proto.h	2015-01-21 12:02:44.086203487 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ve_proto.h	2015-01-21 12:02:58.131830626 +0300
@@ -0,0 +1,103 @@
+/*
+ *  include/linux/ve_proto.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __VE_H__
+#define __VE_H__
+
+#ifdef CONFIG_VE
+
+struct ve_struct;
+
+struct seq_file;
+
+typedef void (*ve_seq_print_t)(struct seq_file *, struct ve_struct *);
+
+void vzmon_register_veaddr_print_cb(ve_seq_print_t);
+void vzmon_unregister_veaddr_print_cb(ve_seq_print_t);
+
+#ifdef CONFIG_INET
+void tcp_v4_kill_ve_sockets(struct ve_struct *envid);
+#ifdef CONFIG_VE_NETDEV
+int venet_init(void);
+#endif
+#endif
+
+extern struct list_head ve_list_head;
+#define for_each_ve(ve)	list_for_each_entry((ve), &ve_list_head, ve_list)
+extern struct mutex ve_list_lock;
+extern struct ve_struct *get_ve_by_id(envid_t);
+extern struct ve_struct *__find_ve_by_id(envid_t);
+
+struct env_create_param3;
+extern int real_env_create(envid_t veid, unsigned flags, u32 class_id,
+			   struct env_create_param3 *data, int datalen);
+
+extern int ve_freeze(struct ve_struct *env);
+extern void ve_thaw(struct ve_struct *env);
+
+extern int ve_prep_devcgroup(struct ve_struct *ve);
+int set_device_perms_ve(struct ve_struct *, unsigned, dev_t, unsigned);
+int get_device_perms_ve(int dev_type, dev_t dev, int access_mode);
+int devperms_seq_show(struct seq_file *m, void *v);
+
+enum {
+	VE_SS_CHAIN,
+	VE_INIT_EXIT_CHAIN,
+	VE_CLEANUP_CHAIN,
+
+	VE_MAX_CHAINS
+};
+
+typedef int ve_hook_init_fn(void *data);
+typedef void ve_hook_fini_fn(void *data);
+
+struct ve_hook
+{
+	ve_hook_init_fn *init;
+	ve_hook_fini_fn *fini;
+	struct module *owner;
+
+	/* Functions are called in ascending priority */
+	int priority;
+
+	/* Private part */
+	struct list_head list;
+};
+
+enum {
+	HOOK_PRIO_DEFAULT = 0,
+
+	HOOK_PRIO_FS = HOOK_PRIO_DEFAULT,
+
+	HOOK_PRIO_NET_PRE,
+	HOOK_PRIO_NET,
+	HOOK_PRIO_NET_POST,
+	HOOK_PRIO_NET_ACCT = 100,
+	HOOK_PRIO_NET_ACCT_V6,
+
+	HOOK_PRIO_AFTERALL = INT_MAX
+};
+
+void *ve_seq_start(struct seq_file *m, loff_t *pos);
+void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos);
+void ve_seq_stop(struct seq_file *m, void *v);
+
+extern int ve_hook_iterate_init(int chain, void *data);
+extern void ve_hook_iterate_fini(int chain, void *data);
+
+extern void ve_hook_register(int chain, struct ve_hook *vh);
+extern void ve_hook_unregister(struct ve_hook *vh);
+#else /* CONFIG_VE */
+#define ve_hook_register(ch, vh)	do { } while (0)
+#define ve_hook_unregister(ve)		do { } while (0)
+
+#define get_device_perms_ve(t, d, a)	(0)
+#endif /* CONFIG_VE */
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/ve_task.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ve_task.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/ve_task.h	2015-01-21 12:02:44.086203487 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/ve_task.h	2015-01-21 12:02:53.920942400 +0300
@@ -0,0 +1,71 @@
+/*
+ *  include/linux/ve_task.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __VE_TASK_H__
+#define __VE_TASK_H__
+
+#include <linux/seqlock.h>
+#include <asm/timex.h>
+
+struct ve_task_info {
+/* virtualization */
+	struct ve_struct *owner_env;
+	struct ve_struct *exec_env;
+	struct ve_struct *saved_env;
+	struct list_head vetask_list; /* ve->vetask_lh */
+	struct list_head aux_list;
+/* statistics: scheduling latency */
+	u64 sleep_time;
+	u64 sched_time;
+	u64 sleep_stamp;
+	u64 wakeup_stamp;
+	seqcount_t wakeup_lock;
+};
+
+#define VE_TASK_INFO(task)	(&(task)->ve_task_info)
+#define VE_TASK_LIST_2_TASK(lh)	\
+	list_entry(lh, struct task_struct, ve_task_info.vetask_list)
+
+#ifdef CONFIG_VE
+extern struct ve_struct ve0;
+#define get_ve0()	(&ve0)
+
+#define ve_save_context(t)	do {				\
+		t->ve_task_info.saved_env = 			\
+				t->ve_task_info.exec_env;	\
+		t->ve_task_info.exec_env = get_ve0();		\
+	} while (0)
+#define ve_restore_context(t)	do {				\
+		t->ve_task_info.exec_env = 			\
+				t->ve_task_info.saved_env;	\
+	} while (0)
+
+#define get_exec_env()	(current->ve_task_info.exec_env)
+#define set_exec_env(ve)	({		\
+		struct ve_struct *__old;	\
+		__old = current->ve_task_info.exec_env;	\
+		current->ve_task_info.exec_env = ve;	\
+		__old;				\
+	})
+#define get_env_init(ve)	(ve->ve_ns->pid_ns->child_reaper)
+#define get_exec_env_init()	get_env_init(get_exec_env())
+#define task_veid(t)		((t)->ve_task_info.owner_env->veid)
+#else
+#define get_ve0()		(NULL)
+#define get_exec_env()		(NULL)
+#define set_exec_env(new_env)	(NULL)
+#define ve_save_context(t)	do { } while (0)
+#define ve_restore_context(t)	do { } while (0)
+#define get_env_init(ve)	(&init_task)
+#define get_exec_env_init()	(&init_task)
+#define task_veid(t)		(0)
+#endif
+
+#endif /* __VE_TASK_H__ */
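
set_exec_env() returns the previous execution environment, so the
canonical call-site pattern is a save/switch/restore bracket around any
code that must observe a target VE's virtualized state. A sketch of the
shape (the function name is hypothetical):

	static void do_work_in_ve(struct ve_struct *ve)
	{
		struct ve_struct *old_ve;

		old_ve = set_exec_env(ve);
		/* ... runs with get_exec_env() == ve ... */
		set_exec_env(old_ve);
	}
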
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/veip.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/veip.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/veip.h	2015-01-21 12:02:44.086203487 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/veip.h	2015-01-21 12:02:44.086203487 +0300
@@ -0,0 +1,15 @@
+#ifndef __VE_IP_H_
+#define __VE_IP_H_
+
+struct ve_addr_struct {
+	int family;
+	__u32 key[4];
+};
+
+struct sockaddr;
+
+extern void veaddr_print(char *, int, struct ve_addr_struct *);
+extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
+		struct ve_addr_struct *veaddr);
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/venet.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/venet.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/venet.h	2015-01-21 12:02:44.086203487 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/venet.h	2015-01-21 12:02:45.655161833 +0300
@@ -0,0 +1,95 @@
+/*
+ *  include/linux/venet.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _VENET_H
+#define _VENET_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/vzcalluser.h>
+#include <linux/veip.h>
+#include <linux/netdevice.h>
+
+#define VEIP_HASH_SZ 512
+
+struct ve_struct;
+struct venet_stat;
+struct venet_stats {
+	struct net_device_stats	stats;
+	struct net_device_stats	*real_stats;
+};
+
+struct ip_entry_struct
+{
+	struct ve_addr_struct	addr;
+	struct ve_struct	*active_env;
+	struct veip_struct	*tgt_veip;
+	struct hlist_node 	ip_hash;
+	union {
+		struct list_head 	ve_list;
+		struct rcu_head		rcu;
+	};
+};
+
+struct ext_entry_struct
+{
+	struct list_head	list;
+	struct ve_addr_struct	addr;
+	struct rcu_head		rcu;
+};
+
+struct veip_struct
+{
+	struct list_head	src_lh;
+	struct list_head	dst_lh;
+	struct list_head	ip_lh;
+	struct list_head	list;
+	struct list_head	ext_lh;
+	envid_t			veid;
+	struct venet_stat	*stat;
+	struct rcu_head		rcu;
+};
+
+struct veip_pool_ops {
+	int (*veip_create)(struct ve_struct *);
+	void (*veip_release)(struct ve_struct *);
+	void (*veip_free)(struct veip_struct *);
+	struct ve_struct *(*veip_lookup)(struct sk_buff *);
+};
+
+extern struct veip_pool_ops *veip_pool_ops;
+
+static inline struct net_device_stats *
+venet_stats(struct net_device *dev, int cpu)
+{
+	struct venet_stats *stats;
+	stats = (struct venet_stats*)dev->ml_priv;
+	return per_cpu_ptr(stats->real_stats, cpu);
+}
+
+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip);
+void ip_entry_unhash(struct ip_entry_struct *entry);
+struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *);
+
+struct veip_struct *veip_find(envid_t veid);
+struct veip_struct *veip_findcreate(envid_t veid);
+int veip_put(struct veip_struct *veip);
+void veip_cleanup(void);
+
+extern struct list_head veip_lh;
+
+struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve,
+		struct ve_addr_struct *addr);
+
+extern struct hlist_head ip_entry_hash_table[];
+extern spinlock_t veip_lock;
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/veprintk.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/veprintk.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/veprintk.h	2015-01-21 12:02:44.087203460 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/veprintk.h	2015-01-21 12:02:44.087203460 +0300
@@ -0,0 +1,38 @@
+/*
+ *  include/linux/veprintk.h
+ *
+ *  Copyright (C) 2006  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __VE_PRINTK_H__
+#define __VE_PRINTK_H__
+
+#ifdef CONFIG_VE
+
+#define ve_log_wait		(*(get_exec_env()->_log_wait))
+#define ve_log_start		(*(get_exec_env()->_log_start))
+#define ve_log_end		(*(get_exec_env()->_log_end))
+#define ve_logged_chars		(*(get_exec_env()->_logged_chars))
+#define ve_log_buf		(get_exec_env()->log_buf)
+#define ve_log_buf_len		(ve_is_super(get_exec_env()) ? \
+				log_buf_len : VE_DEFAULT_LOG_BUF_LEN)
+#define VE_LOG_BUF_MASK		(ve_log_buf_len - 1)
+#define VE_LOG_BUF(idx)		(ve_log_buf[(idx) & VE_LOG_BUF_MASK])
+
+#else
+
+#define ve_log_wait		log_wait
+#define ve_log_start		log_start
+#define ve_log_end		log_end
+#define ve_logged_chars		logged_chars
+#define ve_log_buf		log_buf
+#define ve_log_buf_len		log_buf_len
+#define VE_LOG_BUF_MASK		LOG_BUF_MASK
+#define VE_LOG_BUF(idx)		LOG_BUF(idx)
+
+#endif /* CONFIG_VE */
+#endif /* __VE_PRINTK_H__ */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/veth.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/veth.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/veth.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/veth.h	2015-01-21 12:02:45.356169771 +0300
@@ -1,3 +1,12 @@
+/*
+ *  include/linux/veth.h
+ *
+ *  Copyright (C) 2007  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
 #ifndef __NET_VETH_H_
 #define __NET_VETH_H_
 
@@ -9,4 +18,29 @@ enum {
 #define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)
 };
 
+#ifdef __KERNEL__
+struct veth_struct
+{
+	struct net_device_stats stats;
+	struct net_device	*me;
+	struct net_device	*pair;
+	struct list_head	hwaddr_list;
+	struct net_device_stats	*real_stats;
+	int			allow_mac_change;
+};
+
+#define veth_from_netdev(dev) \
+	((struct veth_struct *)(netdev_priv(dev)))
+static inline struct net_device * veth_to_netdev(struct veth_struct *veth)
+{
+	return veth->me;
+}
+#endif
+
+static inline struct net_device_stats *
+veth_stats(struct net_device *dev, int cpuid)
+{
+	return per_cpu_ptr(veth_from_netdev(dev)->real_stats, cpuid);
+}
+
 #endif
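
veth_stats() hands back one per-cpu net_device_stats slot, so a
->get_stats() handler is expected to fold the slots into the single
struct reported to userspace. A sketch of that aggregation, assuming the
standard for_each_possible_cpu() idiom (the real driver may sum more
fields):

	static struct net_device_stats *veth_get_stats(struct net_device *dev)
	{
		struct net_device_stats *stats = &veth_from_netdev(dev)->stats;
		int cpu;

		memset(stats, 0, sizeof(*stats));
		for_each_possible_cpu(cpu) {
			struct net_device_stats *s = veth_stats(dev, cpu);

			stats->rx_packets += s->rx_packets;
			stats->tx_packets += s->tx_packets;
			stats->rx_bytes   += s->rx_bytes;
			stats->tx_bytes   += s->tx_bytes;
		}
		return stats;
	}
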
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/virtinfo.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/virtinfo.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/virtinfo.h	2015-01-21 12:02:43.128228921 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/virtinfo.h	2015-01-21 12:02:58.939809182 +0300
@@ -0,0 +1,86 @@
+/*
+ *  include/linux/virtinfo.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __LINUX_VIRTINFO_H
+#define __LINUX_VIRTINFO_H
+
+#include <linux/kernel.h>
+#include <linux/page-flags.h>
+#include <linux/notifier.h>
+#include <linux/mmzone.h>
+
+struct vnotifier_block
+{
+	int (*notifier_call)(struct vnotifier_block *self,
+			unsigned long, void *, int);
+	struct vnotifier_block *next;
+	int priority;
+};
+
+extern struct semaphore virtinfo_sem;
+void __virtinfo_notifier_register(int type, struct vnotifier_block *nb);
+void virtinfo_notifier_register(int type, struct vnotifier_block *nb);
+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb);
+int virtinfo_notifier_call(int type, unsigned long n, void *data);
+int virtinfo_notifier_call_irq(int type, unsigned long n, void *data);
+
+struct page_info {
+	unsigned long nr_file_dirty;
+	unsigned long nr_writeback;
+	unsigned long nr_anon_pages;
+	unsigned long nr_file_mapped;
+	unsigned long nr_slab_rec;
+	unsigned long nr_slab_unrec;
+	unsigned long nr_pagetable;
+	unsigned long nr_unstable_nfs;
+	unsigned long nr_bounce;
+	unsigned long nr_writeback_temp;
+};
+
+struct sysinfo;
+struct user_beancounter;
+
+struct meminfo {
+	struct sysinfo *si;
+	struct user_beancounter *ub;
+	unsigned long meminfo_val;
+	unsigned long pages[NR_LRU_LISTS];
+	unsigned long shadow[NR_LRU_LISTS];
+	unsigned long cached, dirty_pages, writeback_pages, locked, shmem;
+	unsigned long slab_reclaimable, slab_unreclaimable;
+	struct idle_page_stats idle_page_stats;
+};
+
+int meminfo_proc_show_ub(struct seq_file *m, void *v,
+		struct user_beancounter *ub, unsigned long meminfo_val);
+
+#define VIRTINFO_MEMINFO	0
+#define VIRTINFO_SYSINFO	2
+#define VIRTINFO_VMSTAT		3
+#define VIRTINFO_OOMKILL	4
+
+#define VIRTINFO_IO_ACCOUNT	0
+#define VIRTINFO_IO_PREPARE	1
+#define VIRTINFO_IO_JOURNAL	2
+#define VIRTINFO_IO_READAHEAD	3
+#define VIRTINFO_IO_CONGESTION	4
+#define VIRTINFO_IO_OP_ACCOUNT	5
+#define VIRTINFO_IO_BALANCE_DIRTY	6
+#define VIRTINFO_IO_FUSE_REQ	7
+
+enum virt_info_types {
+	VITYPE_GENERAL,
+	VITYPE_QUOTA,
+	VITYPE_IO,
+
+	VIRT_TYPES
+};
+
+#endif /* __LINUX_VIRTINFO_H */
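
Consumers hook these events by registering a vnotifier_block on one of
the VITYPE_* chains; the extra int argument carries the previous
handler's return value down the chain. A sketch of a VIRTINFO_MEMINFO
hook (handler and variable names hypothetical):

	static int my_meminfo_hook(struct vnotifier_block *self,
				   unsigned long event, void *data,
				   int old_ret)
	{
		if (event == VIRTINFO_MEMINFO) {
			struct meminfo *mi = data;
			/* adjust mi->si / mi->pages[] here */
		}
		return old_ret;
	}

	static struct vnotifier_block my_nb = {
		.notifier_call	= my_meminfo_hook,
		.priority	= 0,
	};

	/* virtinfo_notifier_register(VITYPE_GENERAL, &my_nb); */
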
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vmalloc.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vmalloc.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vmalloc.h	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vmalloc.h	2015-01-21 12:02:43.394221859 +0300
@@ -53,13 +53,17 @@ static inline void vmalloc_init(void)
 
 extern void *vmalloc(unsigned long size);
 extern void *vzalloc(unsigned long size);
+extern void *ub_vmalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
+extern void *ub_vmalloc_node(unsigned long size, int node);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
+extern void *vmalloc_best(unsigned long size);
+extern void *ub_vmalloc_best(unsigned long size);
 extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
 				pgprot_t prot);
 extern void vfree(const void *addr);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vmstat.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vmstat.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vmstat.h	2014-12-12 23:29:08.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vmstat.h	2015-01-21 12:02:43.394221859 +0300
@@ -29,6 +29,12 @@
 #define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE
 
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
+#ifdef CONFIG_MEMORY_VSWAP
+		VSWPIN, VSWPOUT,
+#else
+# define	VSWPIN -1
+# define	VSWPOUT -1
+#endif
 		FOR_ALL_ZONES(PGALLOC),
 		PGFREE, PGACTIVATE, PGDEACTIVATE,
 		PGFAULT, PGMAJFAULT,
@@ -118,6 +124,7 @@ static inline void vm_events_fold_cpu(in
 }
 #endif
 
+extern unsigned long vm_events(enum vm_event_item i);
 #else
 
 /* Disable counters */
@@ -140,6 +147,7 @@ static inline void vm_events_fold_cpu(in
 {
 }
 
+static inline unsigned long vm_events(enum vm_event_item i) { return 0; }
 #endif /* CONFIG_VM_EVENT_COUNTERS */
 
 #define __count_zone_vm_events(item, zone, delta) \
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzcalluser.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzcalluser.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzcalluser.h	2015-01-21 12:02:44.087203460 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzcalluser.h	2015-01-21 12:02:47.768105737 +0300
@@ -0,0 +1,214 @@
+/*
+ *  include/linux/vzcalluser.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _LINUX_VZCALLUSER_H
+#define _LINUX_VZCALLUSER_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <linux/vziptable_defs.h>
+
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+/*
+ * VE management ioctls
+ */
+
+struct vzctl_old_env_create {
+	envid_t veid;
+	unsigned flags;
+#define VE_CREATE 	1	/* Create VE, VE_ENTER added automatically */
+#define VE_EXCLUSIVE	2	/* Fail if exists */
+#define VE_ENTER	4	/* Enter existing VE */
+#define VE_TEST		8	/* Test if VE exists */
+#define VE_LOCK		16	/* Do not allow entering created VE */
+#define VE_SKIPLOCK	32	/* Allow entering embryonic VE */
+	__u32 addr;
+};
+
+struct vzctl_mark_env_to_down {
+	envid_t veid;
+};
+
+struct vzctl_setdevperms {
+	envid_t veid;
+	unsigned type;
+#define VE_USE_MAJOR	010	/* Test MAJOR supplied in rule */
+#define VE_USE_MINOR	030	/* Test MINOR supplied in rule */
+#define VE_USE_MASK	030	/* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */
+	unsigned dev;
+	unsigned mask;
+};
+
+struct vzctl_ve_netdev {
+	envid_t veid;
+	int op;
+#define VE_NETDEV_ADD  1
+#define VE_NETDEV_DEL  2
+	char __user *dev_name;
+};
+
+struct vzctl_ve_configure {
+	unsigned int veid;
+	unsigned int key;
+#define VE_CONFIGURE_OS_RELEASE		2
+#define VE_CONFIGURE_CREATE_PROC_LINK	4
+#define VE_CONFIGURE_OPEN_TTY		5
+#define VE_CONFIGURE_MOUNT_OPTIONS	7
+	unsigned int val;
+	unsigned int size;
+	char data[0];
+};
+
+struct vzctl_ve_meminfo {
+	envid_t veid;
+	unsigned long val;
+};
+
+struct vzctl_env_create_cid {
+	envid_t veid;
+	unsigned flags;
+	__u32 class_id;
+};
+
+struct vzctl_env_create {
+	envid_t veid;
+	unsigned flags;
+	__u32 class_id;
+};
+
+struct env_create_param {
+	__u64 iptables_mask;
+};
+
+#define VZCTL_ENV_CREATE_DATA_MINLEN	sizeof(struct env_create_param)
+
+struct env_create_param2 {
+	__u64 iptables_mask;
+	__u64 feature_mask;
+	__u32 total_vcpus;	/* 0 - don't care, same as in host */
+};
+
+struct env_create_param3 {
+	__u64 iptables_mask;
+	__u64 feature_mask;
+	__u32 total_vcpus;
+	__u32 pad;
+	__u64 known_features;
+};
+
+#define VE_FEATURE_SYSFS	(1ULL << 0)
+#define VE_FEATURE_NFS		(1ULL << 1)
+#define VE_FEATURE_DEF_PERMS	(1ULL << 2)
+#define VE_FEATURE_SIT          (1ULL << 3)
+#define VE_FEATURE_IPIP         (1ULL << 4)
+#define VE_FEATURE_PPP		(1ULL << 5)
+#define VE_FEATURE_IPGRE	(1ULL << 6)
+#define VE_FEATURE_BRIDGE	(1ULL << 7)
+#define VE_FEATURE_NFSD		(1ULL << 8)
+
+#define VE_FEATURES_OLD		(VE_FEATURE_SYSFS)
+#define VE_FEATURES_DEF		(VE_FEATURE_SYSFS | \
+				 VE_FEATURE_DEF_PERMS)
+
+typedef struct env_create_param3 env_create_param_t;
+#define VZCTL_ENV_CREATE_DATA_MAXLEN	sizeof(env_create_param_t)
+
+struct vzctl_env_create_data {
+	envid_t veid;
+	unsigned flags;
+	__u32 class_id;
+	env_create_param_t __user *data;
+	int datalen;
+};
+
+struct vz_load_avg {
+	int val_int;
+	int val_frac;
+};
+
+struct vz_cpu_stat {
+	unsigned long user_jif;
+	unsigned long nice_jif;
+	unsigned long system_jif; 
+	unsigned long uptime_jif;
+	__u64 idle_clk;
+	__u64 strv_clk;
+	__u64 uptime_clk;
+	struct vz_load_avg avenrun[3];	/* loadavg data */
+};
+
+struct vzctl_cpustatctl {
+	envid_t veid;
+	struct vz_cpu_stat __user *cpustat;
+};
+
+#define VZCTLTYPE '.'
+#define VZCTL_OLD_ENV_CREATE	_IOW(VZCTLTYPE, 0,			\
+					struct vzctl_old_env_create)
+#define VZCTL_MARK_ENV_TO_DOWN	_IOW(VZCTLTYPE, 1,			\
+					struct vzctl_mark_env_to_down)
+#define VZCTL_SETDEVPERMS	_IOW(VZCTLTYPE, 2,			\
+					struct vzctl_setdevperms)
+#define VZCTL_ENV_CREATE_CID	_IOW(VZCTLTYPE, 4,			\
+					struct vzctl_env_create_cid)
+#define VZCTL_ENV_CREATE	_IOW(VZCTLTYPE, 5,			\
+					struct vzctl_env_create)
+#define VZCTL_GET_CPU_STAT	_IOW(VZCTLTYPE, 6,			\
+					struct vzctl_cpustatctl)
+#define VZCTL_ENV_CREATE_DATA	_IOW(VZCTLTYPE, 10,			\
+					struct vzctl_env_create_data)
+#define VZCTL_VE_NETDEV		_IOW(VZCTLTYPE, 11,			\
+					struct vzctl_ve_netdev)
+#define VZCTL_VE_MEMINFO	_IOW(VZCTLTYPE, 13,                     \
+					struct vzctl_ve_meminfo)
+#define VZCTL_VE_CONFIGURE	_IOW(VZCTLTYPE, 15,			\
+					struct vzctl_ve_configure)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct compat_vzctl_ve_netdev {
+	envid_t veid;
+	int op;
+	compat_uptr_t dev_name;
+};
+
+struct compat_vzctl_ve_meminfo {
+	envid_t veid;
+	compat_ulong_t val;
+};
+
+struct compat_vzctl_env_create_data {
+	envid_t veid;
+	unsigned flags;
+	__u32 class_id;
+	compat_uptr_t data;
+	int datalen;
+};
+
+#define VZCTL_COMPAT_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10,		\
+					struct compat_vzctl_env_create_data)
+#define VZCTL_COMPAT_VE_NETDEV	_IOW(VZCTLTYPE, 11,			\
+					struct compat_vzctl_ve_netdev)
+#define VZCTL_COMPAT_VE_MEMINFO	_IOW(VZCTLTYPE, 13,                     \
+					struct compat_vzctl_ve_meminfo)
+#endif
+#endif
+
+#endif
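
The ioctl numbers above make up the control ABI that the vzctl userspace tool drives through the vzctl character device. Below is a minimal hedged sketch of one such call, VZCTL_VE_MEMINFO; the /dev/vzctl node and the <linux/vzcalluser.h> header name are assumptions from OpenVZ convention, not something this hunk establishes.

	/* Hedged sketch: set the meminfo value reported inside a container.
	 * /dev/vzctl and <linux/vzcalluser.h> are assumed, see note above. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vzcalluser.h>

	int main(void)
	{
		struct vzctl_ve_meminfo req = {
			.veid = 101,	/* example container id */
			.val  = 262144,	/* example value, in pages */
		};
		int fd = open("/dev/vzctl", O_RDWR);

		if (fd < 0) {
			perror("open /dev/vzctl");
			return 1;
		}
		if (ioctl(fd, VZCTL_VE_MEMINFO, &req) < 0)
			perror("VZCTL_VE_MEMINFO");
		close(fd);
		return 0;
	}
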
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzctl.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzctl.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzctl.h	2015-01-21 12:02:44.087203460 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzctl.h	2015-01-21 12:02:44.087203460 +0300
@@ -0,0 +1,30 @@
+/*
+ *  include/linux/vzctl.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _LINUX_VZCTL_H
+#define _LINUX_VZCTL_H
+
+#include <linux/list.h>
+
+struct module;
+struct inode;
+struct file;
+struct vzioctlinfo {
+	unsigned type;
+	int (*ioctl)(struct file *, unsigned int, unsigned long);
+	int (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+	struct module *owner;
+	struct list_head list;
+};
+
+extern void vzioctl_register(struct vzioctlinfo *inf);
+extern void vzioctl_unregister(struct vzioctlinfo *inf);
+
+#endif
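
vzioctl_register() lets a module claim an ioctl type letter on the shared vzctl device; requests are then dispatched to the matching vzioctlinfo entry. A hedged sketch of a registration, using a hypothetical type letter and a stub handler:

	/* Hedged sketch: the '!' type letter and demo handler are
	 * hypothetical, chosen only to illustrate the interface above. */
	#include <linux/module.h>
	#include <linux/fs.h>
	#include <linux/vzctl.h>

	static int demo_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
	{
		return -ENOTTY;		/* no commands implemented in this sketch */
	}

	static struct vzioctlinfo demo_ioctl_info = {
		.type  = '!',		/* hypothetical, unused type letter */
		.ioctl = demo_ioctl,
		.owner = THIS_MODULE,
	};

	static int __init demo_init(void)
	{
		vzioctl_register(&demo_ioctl_info);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		vzioctl_unregister(&demo_ioctl_info);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
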
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzctl_quota.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzctl_quota.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzctl_quota.h	2015-01-21 12:02:53.120963637 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzctl_quota.h	2015-01-21 12:02:53.120963637 +0300
@@ -0,0 +1,74 @@
+/*
+ *  include/linux/vzctl_quota.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __LINUX_VZCTL_QUOTA_H__
+#define __LINUX_VZCTL_QUOTA_H__
+
+#include <linux/compat.h>
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+/*
+ * Quota management ioctl
+ */
+
+struct vz_quota_stat;
+struct vzctl_quotactl {
+	int cmd;
+	unsigned int quota_id;
+	struct vz_quota_stat __user *qstat;
+	char __user *ve_root;
+};
+
+struct vzctl_quotaugidctl {
+	int cmd;		/* subcommand */
+	unsigned int quota_id;	/* quota id this applies to */
+	unsigned int ugid_index;/* for reading statistics: index of the
+				    first uid/gid record to read */
+	unsigned int ugid_size;	/* size of ugid_buf array */
+	void *addr; 		/* user-level buffer */
+};
+
+#define VZDQCTLTYPE '+'
+#define VZCTL_QUOTA_DEPR_CTL	_IOWR(VZDQCTLTYPE, 1,			\
+					struct vzctl_quotactl)
+#define VZCTL_QUOTA_NEW_CTL	_IOWR(VZDQCTLTYPE, 2,			\
+					struct vzctl_quotactl)
+#define VZCTL_QUOTA_UGID_CTL	_IOWR(VZDQCTLTYPE, 3,			\
+					struct vzctl_quotaugidctl)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+struct compat_vzctl_quotactl {
+	int cmd;
+	unsigned int quota_id;
+	compat_uptr_t qstat;
+	compat_uptr_t ve_root;
+};
+
+struct compat_vzctl_quotaugidctl {
+	int cmd;		/* subcommand */
+	unsigned int quota_id;	/* quota id this applies to */
+	unsigned int ugid_index;/* for reading statistics: index of the
+				    first uid/gid record to read */
+	unsigned int ugid_size;	/* size of ugid_buf array */
+	compat_uptr_t addr; 	/* user-level buffer */
+};
+
+#define VZCTL_COMPAT_QUOTA_CTL	_IOWR(VZDQCTLTYPE, 2,			\
+					struct compat_vzctl_quotactl)
+#define VZCTL_COMPAT_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3,		\
+					struct compat_vzctl_quotaugidctl)
+#endif
+#endif
+
+#endif /* __LINUX_VZCTL_QUOTA_H__ */
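
Both quota ioctls take the same struct vzctl_quotactl, with cmd selecting a VZ_DQ_* subcommand. A hedged userspace sketch of reading usage with VZ_DQ_GETSTAT; the subcommand values and struct vz_quota_stat come from linux/vzquota.h, and /dev/vzctl is assumed per OpenVZ convention:

	/* Hedged sketch: query per-VE quota usage; device node and header
	 * availability in userspace are assumptions, see note above. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vzctl_quota.h>
	#include <linux/vzquota.h>

	int main(void)
	{
		struct vz_quota_stat qstat;
		struct vzctl_quotactl op = {
			.cmd      = VZ_DQ_GETSTAT,
			.quota_id = 101,	/* example quota (VE) id */
			.qstat    = &qstat,
			.ve_root  = NULL,	/* path; not needed for GETSTAT */
		};
		int fd = open("/dev/vzctl", O_RDWR);

		if (fd < 0 || ioctl(fd, VZCTL_QUOTA_NEW_CTL, &op) < 0) {
			perror("VZCTL_QUOTA_NEW_CTL");
			return 1;
		}
		printf("bytes used: %llu\n",
		       (unsigned long long)qstat.dq_stat.bcurrent);
		close(fd);
		return 0;
	}
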
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzctl_venet.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzctl_venet.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzctl_venet.h	2015-01-21 12:02:44.087203460 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzctl_venet.h	2015-01-21 12:02:44.087203460 +0300
@@ -0,0 +1,53 @@
+/*
+ *  include/linux/vzctl_venet.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _VZCTL_VENET_H
+#define _VZCTL_VENET_H
+
+#include <linux/types.h>
+#include <linux/compat.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
+struct vzctl_ve_ip_map {
+	envid_t veid;
+	int op;
+#define VE_IP_ADD	1
+#define VE_IP_DEL	2
+#define VE_IP_EXT_ADD	3
+#define VE_IP_EXT_DEL	4
+	struct sockaddr *addr;
+	int addrlen;
+};
+
+#define VENETCTLTYPE '('
+
+#define VENETCTL_VE_IP_MAP	_IOW(VENETCTLTYPE, 3,			\
+					struct vzctl_ve_ip_map)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+struct compat_vzctl_ve_ip_map {
+	envid_t veid;
+	int op;
+	compat_uptr_t addr;
+	int addrlen;
+};
+
+#define VENETCTL_COMPAT_VE_IP_MAP _IOW(VENETCTLTYPE, 3,			\
+					struct compat_vzctl_ve_ip_map)
+#endif
+#endif
+
+#endif
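
The address for VE_IP_ADD and friends is passed as a generic sockaddr plus length, so the same ioctl covers IPv4 and IPv6. A hedged sketch of mapping an IPv4 address into a container; /dev/vzctl and the example values are assumptions:

	/* Hedged sketch: add 192.0.2.10 to container 101's venet mapping. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <linux/vzctl_venet.h>

	int main(void)
	{
		struct sockaddr_in sin = { .sin_family = AF_INET };
		struct vzctl_ve_ip_map map = {
			.veid    = 101,		/* example container id */
			.op      = VE_IP_ADD,
			.addr    = (struct sockaddr *)&sin,
			.addrlen = sizeof(sin),
		};
		int fd;

		inet_pton(AF_INET, "192.0.2.10", &sin.sin_addr);
		fd = open("/dev/vzctl", O_RDWR);
		if (fd < 0 || ioctl(fd, VENETCTL_VE_IP_MAP, &map) < 0) {
			perror("VENETCTL_VE_IP_MAP");
			return 1;
		}
		close(fd);
		return 0;
	}
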
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzctl_veth.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzctl_veth.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzctl_veth.h	2015-01-21 12:02:44.087203460 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzctl_veth.h	2015-01-21 12:02:44.087203460 +0300
@@ -0,0 +1,42 @@
+/*
+ *  include/linux/vzctl_veth.h
+ *
+ *  Copyright (C) 2006  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _VZCTL_VETH_H
+#define _VZCTL_VETH_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
+struct vzctl_ve_hwaddr {
+	envid_t veid;
+	int op;
+#define VE_ETH_ADD			1
+#define VE_ETH_DEL			2
+#define VE_ETH_ALLOW_MAC_CHANGE		3
+#define VE_ETH_DENY_MAC_CHANGE		4
+	unsigned char	dev_addr[6];
+	int addrlen;
+	char		dev_name[16];
+	unsigned char	dev_addr_ve[6];
+	int addrlen_ve;
+	char		dev_name_ve[16];
+};
+
+#define VETHCTLTYPE '['
+
+#define VETHCTL_VE_HWADDR	_IOW(VETHCTLTYPE, 3,			\
+					struct vzctl_ve_hwaddr)
+
+#endif
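
One VETHCTL_VE_HWADDR call describes both ends of the pair: dev_name/dev_addr for the host side, dev_name_ve/dev_addr_ve for the device visible inside the container. A hedged sketch; /dev/vzctl, the device names, and the MAC values are example assumptions:

	/* Hedged sketch: create a veth pair for container 101. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vzctl_veth.h>

	int main(void)
	{
		unsigned char host_mac[6] = { 0x00, 0x18, 0x51, 0x00, 0x00, 0x01 };
		unsigned char ve_mac[6]   = { 0x00, 0x18, 0x51, 0x00, 0x00, 0x02 };
		struct vzctl_ve_hwaddr req;
		int fd;

		memset(&req, 0, sizeof(req));
		req.veid = 101;				/* example container id */
		req.op = VE_ETH_ADD;
		memcpy(req.dev_addr, host_mac, 6);
		req.addrlen = 6;
		strcpy(req.dev_name, "veth101.0");	/* host-side device name */
		memcpy(req.dev_addr_ve, ve_mac, 6);
		req.addrlen_ve = 6;
		strcpy(req.dev_name_ve, "eth0");	/* name inside the container */

		fd = open("/dev/vzctl", O_RDWR);
		if (fd < 0 || ioctl(fd, VETHCTL_VE_HWADDR, &req) < 0) {
			perror("VETHCTL_VE_HWADDR");
			return 1;
		}
		close(fd);
		return 0;
	}
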
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzdq_tree.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzdq_tree.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzdq_tree.h	2015-01-21 12:02:53.120963637 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzdq_tree.h	2015-01-21 12:02:53.120963637 +0300
@@ -0,0 +1,99 @@
+/*
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ * 
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo disk quota tree definition
+ */
+
+#ifndef _VZDQ_TREE_H
+#define _VZDQ_TREE_H
+
+#include <linux/list.h>
+#include <asm/string.h>
+
+typedef unsigned int quotaid_t;
+#define QUOTAID_BITS		32
+#define QUOTAID_BBITS		4
+#define QUOTAID_EBITS		8
+
+#if QUOTAID_EBITS % QUOTAID_BBITS
+#error Quota bit assumption failure
+#endif
+
+#define QUOTATREE_BSIZE		(1 << QUOTAID_BBITS)
+#define QUOTATREE_BMASK		(QUOTATREE_BSIZE - 1)
+#define QUOTATREE_DEPTH		((QUOTAID_BITS + QUOTAID_BBITS - 1) \
+							/ QUOTAID_BBITS)
+#define QUOTATREE_EDEPTH	((QUOTAID_BITS + QUOTAID_EBITS - 1) \
+							/ QUOTAID_EBITS)
+#define QUOTATREE_BSHIFT(lvl)	((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS)
+
+/*
+ * Depth up to which unused nodes are kept (not inclusive).
+ * 0 means release all nodes including root,
+ * QUOTATREE_DEPTH means never release nodes.
+ * Current value: release all nodes strictly after QUOTATREE_EDEPTH 
+ * (measured in external shift units).
+ */
+#define QUOTATREE_CDEPTH	(QUOTATREE_DEPTH \
+				- 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \
+				+ 1)
+
+/*
+ * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes.
+ * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS),
+ * and each node contains 2^QUOTAID_BBITS pointers.
+ * Level 0 is a (single) tree root node.
+ *
+ * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data.
+ * Nodes of lower levels contain pointers to nodes.
+ *
+ * A double pointer in an i-level node's array, pointing to an (i+1)-level
+ * node (such as inside quotatree_find_state), is marked with level (i+1),
+ * not i. The level 0 double pointer is the root pointer inside the tree struct.
+ *
+ * The tree is permanent, i.e. all index blocks ever allocated are kept
+ * alive, so that block numbers in the quota file tree stay stable and
+ * its changes remain local.
+ */
+struct quotatree_node {
+	struct list_head list;
+	quotaid_t num;
+	void *blocks[QUOTATREE_BSIZE];
+};
+
+struct quotatree_level {
+	struct list_head usedlh, freelh;
+	quotaid_t freenum;
+};
+
+struct quotatree_tree {
+	struct quotatree_level levels[QUOTATREE_DEPTH];
+	struct quotatree_node *root;
+	unsigned int leaf_num;
+};
+
+struct quotatree_find_state {
+	void **block;
+	int level;
+};
+
+/* number of leaves (objects) and leaf level of the tree */
+#define QTREE_LEAFNUM(tree)	((tree)->leaf_num)
+#define QTREE_LEAFLVL(tree)	(&(tree)->levels[QUOTATREE_DEPTH - 1])
+
+struct quotatree_tree *quotatree_alloc(void);
+void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
+		struct quotatree_find_state *st);
+int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
+		struct quotatree_find_state *st, void *data);
+void quotatree_remove(struct quotatree_tree *tree, quotaid_t id);
+void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *));
+void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id);
+void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index);
+
+#endif /* _VZDQ_TREE_H */
+
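quotatree_find() and quotatree_insert() form a two-step protocol: the find state remembers the block and level where the lookup stopped, so a following insert does not re-walk the path. A hedged kernel-side sketch, assuming the caller supplies its own record type and serializes find and insert under one lock:

	/* Hedged sketch of the lookup-then-insert protocol; my_record and
	 * the caller-held serialization are assumptions, not this header's. */
	#include <linux/slab.h>
	#include <linux/vzdq_tree.h>

	struct my_record {
		unsigned long bytes;
	};

	static struct my_record *lookup_or_create(struct quotatree_tree *tree,
						  quotaid_t id)
	{
		struct quotatree_find_state st;
		struct my_record *rec;

		rec = quotatree_find(tree, id, &st);
		if (rec != NULL)
			return rec;		/* leaf already present */

		rec = kzalloc(sizeof(*rec), GFP_KERNEL);
		if (rec == NULL)
			return NULL;
		/* st still points at the spot the failed lookup reached */
		if (quotatree_insert(tree, id, &st, rec) != 0) {
			kfree(rec);
			return NULL;
		}
		return rec;
	}
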
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzevent.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzevent.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzevent.h	2015-01-21 12:02:44.087203460 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzevent.h	2015-01-21 12:02:44.175201124 +0300
@@ -0,0 +1,21 @@
+#ifndef __LINUX_VZ_EVENT_H__
+#define __LINUX_VZ_EVENT_H__
+
+#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE)
+extern int vzevent_send(int msg, const char *attrs_fmt, ...);
+#else
+static inline int vzevent_send(int msg, const char *attrs_fmt, ...)
+{
+	return 0;
+}
+#endif
+
+enum {
+	VE_EVENT_MOUNT,
+	VE_EVENT_UMOUNT,
+	VE_EVENT_START,
+	VE_EVENT_STOP,
+	VE_EVENT_REBOOT,
+};
+
+#endif /* __LINUX_VZ_EVENT_H__ */
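
vzevent_send() compiles to a stub returning 0 when the event module is not configured, so callers need no ifdefs of their own. A hedged sketch of announcing a container start; the "%d" attribute encoding is an assumption for illustration:

	/* Hedged sketch; the attribute format string is an assumption. */
	#include <linux/vzevent.h>

	static void announce_ve_start(unsigned int veid)
	{
		(void)vzevent_send(VE_EVENT_START, "%d", veid);
	}
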
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vziolimit.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vziolimit.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vziolimit.h	2015-01-21 12:02:58.949808915 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vziolimit.h	2015-01-21 12:02:58.973808277 +0300
@@ -0,0 +1,29 @@
+/*
+ *  include/linux/vziolimit.h
+ *
+ *  Copyright (C) 2010, Parallels inc.
+ *  All rights reserved.
+ *
+ */
+
+#ifndef _LINUX_VZIOLIMIT_H
+#define _LINUX_VZIOLIMIT_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define VZIOLIMITTYPE 'I'
+
+struct iolimit_state {
+	unsigned int id;
+	unsigned int speed;
+	unsigned int burst;
+	unsigned int latency;
+};
+
+#define VZCTL_SET_IOLIMIT	_IOW(VZIOLIMITTYPE, 0, struct iolimit_state)
+#define VZCTL_GET_IOLIMIT	_IOR(VZIOLIMITTYPE, 1, struct iolimit_state)
+#define VZCTL_SET_IOPSLIMIT	_IOW(VZIOLIMITTYPE, 2, struct iolimit_state)
+#define VZCTL_GET_IOPSLIMIT	_IOR(VZIOLIMITTYPE, 3, struct iolimit_state)
+
+#endif /* _LINUX_VZIOLIMIT_H */
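
Each of the four ioctls moves one struct iolimit_state keyed by container id; the speed/burst/latency triple describes a token-bucket style throttle. A hedged sketch of installing a bandwidth limit; /dev/vzctl is assumed, and the units (bytes per second, bytes, milliseconds) are an assumption from OpenVZ usage:

	/* Hedged sketch: throttle container 101 to ~10 MB/s; units assumed. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vziolimit.h>

	int main(void)
	{
		struct iolimit_state lim = {
			.id      = 101,		/* example container id */
			.speed   = 10 << 20,	/* sustained rate */
			.burst   = 30 << 20,	/* burst allowance */
			.latency = 10000,	/* max throttle delay */
		};
		int fd = open("/dev/vzctl", O_RDWR);

		if (fd < 0 || ioctl(fd, VZCTL_SET_IOLIMIT, &lim) < 0) {
			perror("VZCTL_SET_IOLIMIT");
			return 1;
		}
		close(fd);
		return 0;
	}
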
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vziptable_defs.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vziptable_defs.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vziptable_defs.h	2015-01-21 12:02:44.087203460 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vziptable_defs.h	2015-01-21 12:02:47.739106507 +0300
@@ -0,0 +1,80 @@
+#ifndef _LINUX_VZIPTABLE_DEFS_H
+#define _LINUX_VZIPTABLE_DEFS_H
+
+#include <linux/types.h>
+#include <linux/sched.h>
+
+/*
+ * These masks represent modules.
+ *
+ * Strictly speaking we use only a small subset
+ * of these bits nowadays, but we MUST RESERVE all
+ * the bits that were ever used, for the sake of ABI
+ * compatibility (i.e. with the vzctl user-space utility).
+ *
+ * DON'T EVER DELETE/MODIFY THESE BITS
+ */
+#define VE_IPT_GENERATE(name, shift)	name = (1U << shift)
+
+enum ve_ipt_mods {
+	VE_IPT_GENERATE(VE_IP_IPTABLES_MOD,		0),
+	VE_IPT_GENERATE(VE_IP_FILTER_MOD,		1),
+	VE_IPT_GENERATE(VE_IP_MANGLE_MOD,		2),
+	VE_IPT_GENERATE(VE_IP_MATCH_LIMIT_MOD,		3),
+	VE_IPT_GENERATE(VE_IP_MATCH_MULTIPORT_MOD,	4),
+	VE_IPT_GENERATE(VE_IP_MATCH_TOS_MOD,		5),
+	VE_IPT_GENERATE(VE_IP_TARGET_TOS_MOD,		6),
+	VE_IPT_GENERATE(VE_IP_TARGET_REJECT_MOD,	7),
+	VE_IPT_GENERATE(VE_IP_TARGET_TCPMSS_MOD,	8),
+	VE_IPT_GENERATE(VE_IP_MATCH_TCPMSS_MOD,		9),
+	VE_IPT_GENERATE(VE_IP_MATCH_TTL_MOD,		10),
+	VE_IPT_GENERATE(VE_IP_TARGET_LOG_MOD,		11),
+	VE_IPT_GENERATE(VE_IP_MATCH_LENGTH_MOD,		12),
+	VE_IPT_GENERATE(VE_IP_CONNTRACK_MOD,		14),
+	VE_IPT_GENERATE(VE_IP_CONNTRACK_FTP_MOD,	15),
+	VE_IPT_GENERATE(VE_IP_CONNTRACK_IRC_MOD,	16),
+	VE_IPT_GENERATE(VE_IP_MATCH_CONNTRACK_MOD,	17),
+	VE_IPT_GENERATE(VE_IP_MATCH_STATE_MOD,		18),
+	VE_IPT_GENERATE(VE_IP_MATCH_HELPER_MOD,		19),
+	VE_IPT_GENERATE(VE_IP_NAT_MOD,			20),
+	VE_IPT_GENERATE(VE_IP_NAT_FTP_MOD,		21),
+	VE_IPT_GENERATE(VE_IP_NAT_IRC_MOD,		22),
+	VE_IPT_GENERATE(VE_IP_TARGET_REDIRECT_MOD,	23),
+	VE_IPT_GENERATE(VE_IP_MATCH_OWNER_MOD,		24),
+	VE_IPT_GENERATE(VE_IP_MATCH_MAC_MOD,		25),
+	VE_IPT_GENERATE(VE_IP_IPTABLES6_MOD,		26),
+	VE_IPT_GENERATE(VE_IP_FILTER6_MOD,		27),
+	VE_IPT_GENERATE(VE_IP_MANGLE6_MOD,		28),
+	VE_IPT_GENERATE(VE_IP_IPTABLE_NAT_MOD,		29),
+	VE_IPT_GENERATE(VE_NF_CONNTRACK_MOD,		30),
+};
+
+/* these masks represent modules with their dependences */
+#define VE_IP_IPTABLES		(VE_IP_IPTABLES_MOD)
+#define VE_IP_FILTER		(VE_IP_FILTER_MOD | VE_IP_IPTABLES)
+#define VE_IP_MANGLE		(VE_IP_MANGLE_MOD | VE_IP_IPTABLES)
+#define VE_IP_IPTABLES6		(VE_IP_IPTABLES6_MOD)
+#define VE_IP_FILTER6		(VE_IP_FILTER6_MOD | VE_IP_IPTABLES6)
+#define VE_IP_MANGLE6		(VE_IP_MANGLE6_MOD | VE_IP_IPTABLES6)
+#define VE_NF_CONNTRACK		(VE_NF_CONNTRACK_MOD | VE_IP_IPTABLES)
+#define VE_IP_CONNTRACK		(VE_IP_CONNTRACK_MOD | VE_IP_IPTABLES)
+#define VE_IP_CONNTRACK_FTP	(VE_IP_CONNTRACK_FTP_MOD | VE_IP_CONNTRACK)
+#define VE_IP_CONNTRACK_IRC	(VE_IP_CONNTRACK_IRC_MOD | VE_IP_CONNTRACK)
+#define VE_IP_NAT		(VE_IP_NAT_MOD | VE_IP_CONNTRACK)
+#define VE_IP_NAT_FTP		(VE_IP_NAT_FTP_MOD | VE_IP_NAT | VE_IP_CONNTRACK_FTP)
+#define VE_IP_NAT_IRC		(VE_IP_NAT_IRC_MOD | VE_IP_NAT | VE_IP_CONNTRACK_IRC)
+#define VE_IP_IPTABLE_NAT	(VE_IP_IPTABLE_NAT_MOD | VE_IP_CONNTRACK)
+
+/* safe iptables mask to be used by default */
+#define VE_IP_DEFAULT		(VE_IP_IPTABLES  | VE_IP_FILTER  | VE_IP_MANGLE | \
+				 VE_IP_IPTABLES6 | VE_IP_FILTER6 | VE_IP_MANGLE6)
+
+#define VE_IP_ALL		(~0ULL)
+#define VE_IP_NONE		(0ULL)
+
+static inline bool mask_ipt_allow(__u64 permitted, __u64 mask)
+{
+	return (permitted & mask) == mask;
+}
+
+#endif /* _LINUX_VZIPTABLE_DEFS_H */
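
Because each composite mask already ORs in its dependencies, a single mask_ipt_allow() call answers whether a module and everything it needs are permitted. A short kernel-side sketch:

	/* Sketch: VE_IP_NAT folds in VE_IP_CONNTRACK and VE_IP_IPTABLES,
	 * so one check covers the whole dependency chain. */
	#include <linux/vziptable_defs.h>

	static bool ve_may_use_nat(__u64 permitted)
	{
		return mask_ipt_allow(permitted, VE_IP_NAT);
	}
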
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzquota.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzquota.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzquota.h	2015-01-21 12:02:53.121963610 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzquota.h	2015-01-21 12:02:53.396956311 +0300
@@ -0,0 +1,642 @@
+/*
+ *
+ * Copyright (C) 2001-2005 SWsoft
+ * All rights reserved.
+ * 
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo disk quota implementation
+ */
+
+#ifndef _VZDQUOTA_H
+#define _VZDQUOTA_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+#include <linux/sched.h>
+
+/* vzquotactl syscall commands */
+#define VZ_DQ_CREATE		5 /* create quota master block */
+#define VZ_DQ_DESTROY		6 /* destroy qmblk */
+#define VZ_DQ_ON		7 /* mark dentry with already created qmblk */
+#define VZ_DQ_OFF		8 /* remove mark, don't destroy qmblk */
+#define VZ_DQ_SETLIMIT		9 /* set new limits */
+#define VZ_DQ_GETSTAT		10 /* get usage statistic */
+#define VZ_DQ_OFF_FORCED	11 /* forced off */
+#define VZ_DQ_ON_FILE		12 /* on with data in file */
+#define VZ_DQ_OFF_FILE		13 /* off and sync data to file */
+#define VZ_DQ_STATUS		14 /* report general info (see VZDQ_XXX below) */
+
+/* set of syscalls to maintain UGID quotas */
+#define VZ_DQ_UGID_GETSTAT	1 /* get usage/limits for ugid(s) */
+#define VZ_DQ_UGID_ADDSTAT	2 /* set usage/limits statistic for ugid(s) */
+#define VZ_DQ_UGID_GETGRACE	3 /* get expire times */
+#define VZ_DQ_UGID_SETGRACE	4 /* set expire times */
+#define VZ_DQ_UGID_GETCONFIG	5 /* get ugid_max limit, cnt, flags of qmblk */
+#define VZ_DQ_UGID_SETCONFIG	6 /* set ugid_max limit, flags of qmblk */
+#define VZ_DQ_UGID_SETLIMIT	7 /* set ugid B/I limits */
+#define VZ_DQ_UGID_SETINFO	8 /* set ugid info */
+
+/* common structure for vz and ugid quota */
+struct dq_stat {
+	/* blocks limits */
+	__u64	bhardlimit;	/* absolute limit in bytes */
+	__u64	bsoftlimit;	/* preferred limit in bytes */
+	time_t	btime;		/* time limit for excessive disk use */
+	__u64	bcurrent;	/* current bytes count */
+	/* inodes limits */
+	__u32	ihardlimit;	/* absolute limit on allocated inodes */
+	__u32	isoftlimit;	/* preferred inode limit */
+	time_t	itime;		/* time limit for excessive inode use */
+	__u32	icurrent;	/* current # allocated inodes */
+};
+
+/* One second resolution for grace times */
+#define CURRENT_TIME_SECONDS	(get_seconds())
+
+/* Values for dq_info->flags */
+#define VZ_QUOTA_INODES 0x01       /* inodes limit warning printed */
+#define VZ_QUOTA_SPACE  0x02       /* space limit warning printed */
+
+struct dq_info {
+	time_t		bexpire;   /* expire timeout for excessive disk use */
+	time_t		iexpire;   /* expire timeout for excessive inode use */
+	unsigned	flags;	   /* see previous defines */
+};
+
+struct vz_quota_stat  {
+	struct dq_stat dq_stat;
+	struct dq_info dq_info;
+};
+
+struct vz_quota_hdr {
+	__le32	magic;
+	__le32	version;
+};
+
+#define VZQUOTA_MAGIC		0x31031982
+#define VZQUOTA_VERSION_0	0
+#define VZQUOTA_STAT_OFF	sizeof(struct vz_quota_hdr)
+#define VZQUOTA_ROOT_FILE	".vzdq.%d"
+
+struct vz_quota_stat_img {
+	__le64	btime;
+	__le64	bexpire;
+	__le64	itime;
+	__le64	iexpire;
+
+	__le64	bhardlimit;
+	__le64	bsoftlimit;
+	__le64	bcurrent;
+
+	__le32	ihardlimit;
+	__le32	isoftlimit;
+	__le32	icurrent;
+	__le32	flags;
+};
+
+#define VZQUOTA_UGINFO_OFF	(VZQUOTA_STAT_OFF + \
+				sizeof(struct vz_quota_stat_img))
+
+struct vz_quota_uginfo_img {
+	__le32	ugid_max;
+	__le32	user_flags;
+	__le64	uid_bexpire;
+	__le64	uid_iexpire;
+	__le64	gid_bexpire;
+	__le64	gid_iexpire;
+};
+
+#define VZQUOTA_UGID_OFF	4096
+
+struct vz_quota_ugid_stat_img {
+	__le32	flags;
+
+	__le32	ihardlimit;
+	__le32	isoftlimit;
+	__le32	icurrent;
+
+	__le64	bhardlimit;
+	__le64	bsoftlimit;
+	__le64	bcurrent;
+
+	__le64	btime;
+	__le64	itime;
+};
+
+#define VZQUOTA_UGID_ITEM_BITS	6
+#define VZQUOTA_UGID_ITEM_SIZE	(1 << VZQUOTA_UGID_ITEM_BITS)
+#define VZQUOTA_UGID_BITS	23
+#define VZQUOTA_UGID_SIZE	(1 << VZQUOTA_UGID_BITS)
+#define VZQUOTA_MAX_UGID	0xffff
+#define VZQUOTA_UGID_PRESENT	0x1
+
+/* UID/GID interface record - for user-kernel level exchange */
+struct vz_quota_iface {
+	unsigned int	qi_id;	   /* UID/GID this applies to */
+	unsigned int	qi_type;   /* USRQUOTA|GRPQUOTA */
+	struct dq_stat	qi_stat;   /* limits, options, usage stats */
+};
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+struct compat_dq_stat {
+	/* blocks limits */
+	__u64	bhardlimit;	/* absolute limit in bytes */
+	__u64	bsoftlimit;	/* preferred limit in bytes */
+	compat_time_t btime;	/* time limit for excessive disk use */
+	__u64	bcurrent;	/* current bytes count */
+	/* inodes limits */
+	__u32	ihardlimit;	/* absolute limit on allocated inodes */
+	__u32	isoftlimit;	/* preferred inode limit */
+	compat_time_t itime;	/* time limit for excessive inode use */
+	__u32	icurrent;	/* current # allocated inodes */
+};
+
+struct compat_dq_info {
+	compat_time_t	bexpire;   /* expire timeout for excessive disk use */
+	compat_time_t	iexpire;   /* expire timeout for excessive inode use */
+	unsigned	flags;	   /* see previous defines */
+};
+
+struct compat_vz_quota_stat  {
+	struct compat_dq_stat dq_stat;
+	struct compat_dq_info dq_info;
+};
+
+struct compat_vz_quota_iface {
+	unsigned int	qi_id;	   /* UID/GID this applies to */
+	unsigned int	qi_type;   /* USRQUOTA|GRPQUOTA */
+	struct compat_dq_stat qi_stat;   /* limits, options, usage stats */
+};
+
+static inline void compat_dqstat2dqstat(struct compat_dq_stat *odqs,
+				struct dq_stat *dqs)
+{
+	dqs->bhardlimit = odqs->bhardlimit;
+	dqs->bsoftlimit = odqs->bsoftlimit;
+	dqs->bcurrent = odqs->bcurrent;
+	dqs->btime = odqs->btime;
+
+	dqs->ihardlimit = odqs->ihardlimit;
+	dqs->isoftlimit = odqs->isoftlimit;
+	dqs->icurrent = odqs->icurrent;
+	dqs->itime = odqs->itime;
+}
+
+static inline void compat_dqinfo2dqinfo(struct compat_dq_info *odqi,
+				struct dq_info *dqi)
+{
+	dqi->bexpire = odqi->bexpire;
+	dqi->iexpire = odqi->iexpire;
+	dqi->flags = odqi->flags;
+}
+
+static inline void dqstat2compat_dqstat(struct dq_stat *dqs,
+				struct compat_dq_stat *odqs)
+{
+	odqs->bhardlimit = dqs->bhardlimit;
+	odqs->bsoftlimit = dqs->bsoftlimit;
+	odqs->bcurrent = dqs->bcurrent;
+	odqs->btime = (compat_time_t)dqs->btime;
+
+	odqs->ihardlimit = dqs->ihardlimit;
+	odqs->isoftlimit = dqs->isoftlimit;
+	odqs->icurrent = dqs->icurrent;
+	odqs->itime = (compat_time_t)dqs->itime;
+}
+
+static inline void dqinfo2compat_dqinfo(struct dq_info *dqi,
+				struct compat_dq_info *odqi)
+{
+	odqi->bexpire = (compat_time_t)dqi->bexpire;
+	odqi->iexpire = (compat_time_t)dqi->iexpire;
+	odqi->flags = dqi->flags;
+}
+#endif
+
+/* values for flags and dq_flags */
+/* this flag is set if userspace has been unable to provide usage
+ * information about all ugids;
+ * if the flag is set, we don't allocate new UG quota blocks (their
+ * current usage is unknown) or free existing UG quota blocks (so as
+ * not to lose the information that a block is OK) */
+#define VZDQUG_FIXED_SET	0x01
+/* permit to use ugid quota */
+#define VZDQUG_ON		0x02
+#define VZDQ_USRQUOTA		0x10
+#define VZDQ_GRPQUOTA		0x20
+#define VZDQ_NOACT		0x1000	/* not actual */
+#define VZDQ_NOQUOT		0x2000	/* not under quota tree */
+#define VZDQF_USER_MASK		0xFFFF0000 /* for user_flags above */
+
+struct vz_quota_ugid_stat {
+	unsigned int	limit;	/* max amount of ugid records */
+	unsigned int	count;	/* amount of ugid records */
+	unsigned int	flags;	
+};
+
+struct vz_quota_ugid_setlimit {
+	unsigned int	type;	/* quota type (USR/GRP) */
+	unsigned int	id;	/* ugid */
+	struct if_dqblk dqb;	/* limits info */
+};
+
+struct vz_quota_ugid_setinfo {
+	unsigned int	type;	/* quota type (USR/GRP) */
+	struct if_dqinfo dqi;	/* grace info */
+};
+
+/* values for dq_state */
+#define VZDQ_STARTING		0 /* created, not turned on yet */
+#define VZDQ_WORKING		1 /* quota created, turned on */
+#define VZDQ_STOPING		2 /* created, turned on and off */
+#define VZDQ_ORPHAN_CLEANUP	3 /* cleaning out orphans */
+#define VZDQ_WORKING_JOURNAL	4 /* quota created, turned on with journal */
+
+#ifdef __KERNEL__
+#include <linux/list.h>
+#include <asm/atomic.h>
+#include <linux/time.h>
+#include <linux/vzquota_qlnk.h>
+#include <linux/vzdq_tree.h>
+#include <linux/semaphore.h>
+
+/* Values for dq_info flags */
+#define VZ_QUOTA_INODES	0x01	   /* inodes limit warning printed */
+#define VZ_QUOTA_SPACE	0x02	   /* space limit warning printed */
+
+/* Kernel space data structures */
+struct dq_kstat {
+	/* blocks limits */
+	__u64   bhardlimit;     /* absolute limit in bytes */
+	__u64   bsoftlimit;     /* preferred limit in bytes */
+	time_t  btime;	  /* time limit for excessive disk use */
+	__u64   bcurrent;       /* current bytes count */
+	__u64   breserved;      /* reserved bytes count */
+	/* inodes limits */
+	__u32   ihardlimit;     /* absolute limit on allocated inodes */
+	__u32   isoftlimit;     /* preferred inode limit */
+	time_t  itime;	  /* time limit for excessive inode use */
+	__u32   icurrent;       /* current # allocated inodes */
+};
+
+struct dq_kinfo {
+	time_t	  bexpire;   /* expire timeout for excessive disk use */
+	time_t	  iexpire;   /* expire timeout for excessive inode use */
+	unsigned	flags;     /* see previous defines */
+};
+
+struct vz_quota_kstat {
+	struct dq_kstat dq_stat;
+	struct dq_kinfo dq_info;
+};
+
+static inline void user_dqstat2dqstat(struct dq_stat *odqs,
+				struct dq_kstat *dqs)
+{
+	dqs->bhardlimit = odqs->bhardlimit;
+	dqs->bsoftlimit = odqs->bsoftlimit;
+	dqs->bcurrent = odqs->bcurrent;
+	dqs->breserved = 0;
+	dqs->btime = odqs->btime;
+
+	dqs->ihardlimit = odqs->ihardlimit;
+	dqs->isoftlimit = odqs->isoftlimit;
+	dqs->icurrent = odqs->icurrent;
+	dqs->itime = odqs->itime;
+}
+
+static inline void user_dqinfo2dqinfo(struct dq_info *odqi,
+				struct dq_kinfo *dqi)
+{
+	dqi->bexpire = odqi->bexpire;
+	dqi->iexpire = odqi->iexpire;
+	dqi->flags = odqi->flags;
+}
+
+static inline void dqstat2user_dqstat(struct dq_kstat *dqs,
+				struct dq_stat *odqs)
+{
+	odqs->bhardlimit = dqs->bhardlimit;
+	odqs->bsoftlimit = dqs->bsoftlimit;
+	odqs->bcurrent = dqs->bcurrent;
+	odqs->btime = dqs->btime;
+
+	odqs->ihardlimit = dqs->ihardlimit;
+	odqs->isoftlimit = dqs->isoftlimit;
+	odqs->icurrent = dqs->icurrent;
+	odqs->itime = dqs->itime;
+}
+
+static inline void dqinfo2user_dqinfo(struct dq_kinfo *dqi,
+				struct dq_info *odqi)
+{
+	odqi->bexpire = dqi->bexpire;
+	odqi->iexpire = dqi->iexpire;
+	odqi->flags = dqi->flags;
+}
+
+/* master quota record - one per veid */
+struct vz_quota_master {
+	struct list_head	dq_hash;	/* next quota in hash list */
+	atomic_t		dq_count;	/* inode reference count */
+	unsigned int		dq_flags;	/* see VZDQUG_FIXED_SET */
+	unsigned int		dq_state;	/* see values above */
+	unsigned int		dq_id;		/* VEID this applies to */
+	struct dq_kstat	 dq_stat;	/* limits, grace, usage stats */
+	struct dq_kinfo	 dq_info;	/* grace times and flags */
+	spinlock_t		dq_data_lock;	/* for dq_stat */
+
+	struct mutex		dq_mutex;	/* mutex to protect
+						   ugid tree */
+
+	struct list_head	dq_ilink_list;	/* list of vz_quota_ilink */
+	struct quotatree_tree	*dq_uid_tree;	/* vz_quota_ugid tree for UIDs */
+	struct quotatree_tree	*dq_gid_tree;	/* vz_quota_ugid tree for GIDs */
+	unsigned int		dq_ugid_count;	/* amount of ugid records */
+	unsigned int		dq_ugid_max;	/* max amount of ugid records */
+	struct dq_kinfo	 dq_ugid_info[MAXQUOTAS]; /* ugid grace times */
+
+	struct inode		*qfile;
+	struct mutex		dq_write_lock;
+
+	struct path		dq_root_path;	/* path of fs tree */
+	struct super_block	*dq_sb;	      /* superblock of our quota root */
+	void			*dq_snap;       /* pointer to vzsnap struct */
+};
+
+/* UID/GID quota record - one per pair (quota_master, uid or gid) */
+struct vz_quota_ugid {
+	unsigned int		qugid_id;     /* UID/GID this applies to */
+	struct dq_kstat	 qugid_stat;   /* limits, options, usage stats */
+	int			qugid_type;   /* USRQUOTA|GRPQUOTA */
+	atomic_t		qugid_count;  /* reference count */
+};
+
+#define VZ_QUOTA_UGBAD		((struct vz_quota_ugid *)0xfeafea11)
+
+struct vz_quota_datast {
+	struct vz_quota_ilink qlnk;
+};
+
+#define VIRTINFO_QUOTA_GETSTAT	0
+#define VIRTINFO_QUOTA_ON	1
+#define VIRTINFO_QUOTA_OFF	2
+#define VIRTINFO_QUOTA_DISABLE	3
+#define VIRTINFO_ORPHAN_CLEAN	4
+#define VIRTINFO_ORPHAN_DONE	5
+
+struct virt_info_quota {
+	struct super_block *super;
+	struct inode *inode;
+	struct dq_kstat *qstat;
+};
+
+struct virt_info_orphan {
+	struct super_block *super;
+	unsigned int cookie;
+};
+
+void __vzquota_mark_dirty(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid **ugid);
+void vzquota_cur_qmblk_orphan_set(struct vz_quota_master *qmblk);
+int vzquota_on_cookie(struct super_block *sb, unsigned int cookie);
+void vzquota_off_cookies(struct super_block *sb);
+int vzquota_read_ugid(struct vz_quota_master *qmblk, struct inode *ino);
+void vzquota_ugid_dump(struct vz_quota_ugid *ugid,
+		struct vz_quota_ugid_stat_img *img);
+int vzquota_ugid_write(struct inode *ino, struct vz_quota_ugid_stat_img *img,
+		int id, int type);
+int vzquota_read_uginfo(struct vz_quota_master *, struct inode *);
+int vzquota_uginfo_write(struct inode *ino, struct vz_quota_uginfo_img *img);
+void vzquota_uginfo_dump(struct vz_quota_master *qmblk,
+		struct vz_quota_uginfo_img *img);
+
+static inline void vzquota_mark_dirty(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid **ugid)
+{
+	/* FIXME - race with vzquota_off */
+	if (qmblk->qfile != NULL)
+		__vzquota_mark_dirty(qmblk, ugid);
+}
+
+void __vzquota_mark_dirty_ugids(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid **dirty);
+
+static inline struct vz_quota_ugid *__vzquota_get_ugid(struct vz_quota_ugid *qugid)
+{
+	atomic_inc(&qugid->qugid_count);
+	return qugid;
+}
+
+/*
+ * Interface to VZ quota core
+ */
+#define INODE_QLNK(inode)	(&(inode)->i_qlnk)
+#define QLNK_INODE(qlnk)	container_of((qlnk), struct inode, i_qlnk)
+
+#define VZ_QUOTA_BAD		((struct vz_quota_master *)0xefefefef)
+
+#define VZ_QUOTAO_SETE		1
+#define VZ_QUOTAO_INIT		2
+#define VZ_QUOTAO_DESTR		3
+#define VZ_QUOTAO_SWAP		4
+#define VZ_QUOTAO_INICAL	5
+#define VZ_QUOTAO_DRCAL		6
+#define VZ_QUOTAO_QSET		7
+#define VZ_QUOTAO_TRANS		8
+#define VZ_QUOTAO_ACT		9
+#define VZ_QUOTAO_DTREE		10
+#define VZ_QUOTAO_DET		11
+#define VZ_QUOTAO_ON		12
+#define VZ_QUOTAO_RE_LOCK	13
+
+extern struct mutex vz_quota_mutex;
+
+void inode_qmblk_lock(struct super_block *sb);
+void inode_qmblk_unlock(struct super_block *sb);
+void qmblk_data_read_lock(struct vz_quota_master *qmblk);
+void qmblk_data_read_unlock(struct vz_quota_master *qmblk);
+void qmblk_data_write_lock(struct vz_quota_master *qmblk);
+void qmblk_data_write_unlock(struct vz_quota_master *qmblk);
+
+/* for quota operations */
+void vzquota_inode_init_call(struct inode *inode);
+void vzquota_inode_swap_call(struct inode *, struct inode *);
+void vzquota_inode_drop_call(struct inode *inode);
+int vzquota_inode_transfer_call(struct inode *, struct iattr *);
+struct vz_quota_master *vzquota_inode_data(struct inode *inode,
+		struct vz_quota_datast *);
+void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *);
+int vzquota_rename_check(struct inode *inode,
+		struct inode *old_dir, struct inode *new_dir);
+struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode);
+/* for second-level quota */
+struct vz_quota_master *vzquota_find_qmblk(struct super_block *);
+/* for management operations */
+struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id,
+		struct vz_quota_kstat *qstat);
+void vzquota_free_master(struct vz_quota_master *);
+struct vz_quota_master *vzquota_find_master(unsigned int quota_id);
+int vzquota_on_qmblk(struct super_block *sb, struct inode *inode,
+		struct vz_quota_master *qmblk, char __user *buf);
+int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk,
+		char __user *buf, int force);
+int vzquota_get_super(struct super_block *sb);
+void vzquota_put_super(struct super_block *sb);
+
+/* ----------------------------------------------------------------------
+ *
+ * Passing quota information through current
+ *
+ * Used in inode -> qmblk lookup at inode creation stage (since at that
+ * time there are no links between the inode being created and its parent
+ * directory).
+ *
+ * Also used in NFS: when an inode is opened by its i_ino, it is
+ * effectively detached and vzquota can't find a qmblk for it. However,
+ * the export's root is a good candidate.
+ *
+ * --------------------------------------------------------------------- */
+
+#define VZDQ_CUR_MAGIC		0x57d0fee2
+#define VZDQ_CUR_CLEANUP	0x56d2def4
+
+static inline void vzquota_cur_qmblk_set(struct inode *data)
+{
+	struct task_struct *tsk;
+
+	tsk = current;
+
+	WARN_ON(tsk->magic == VZDQ_CUR_CLEANUP);
+	tsk->magic = VZDQ_CUR_MAGIC;
+	tsk->ino = data;
+}
+
+static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk)
+{
+	if (!atomic_read(&qmblk->dq_count))
+		BUG();
+	atomic_inc(&qmblk->dq_count);
+	return qmblk;
+}
+
+static inline void __qmblk_put(struct vz_quota_master *qmblk)
+{
+	atomic_dec(&qmblk->dq_count);
+}
+
+static inline void qmblk_put(struct vz_quota_master *qmblk)
+{
+	if (!atomic_dec_and_test(&qmblk->dq_count))
+		return;
+	vzquota_free_master(qmblk);
+}
+
+extern struct list_head vzquota_hash_table[];
+extern int vzquota_hash_size;
+
+/*
+ * Interface to VZ UGID quota
+ */
+extern struct quotactl_ops vz_quotactl_operations;
+extern struct dquot_operations vz_quota_operations2;
+extern struct dquot_operations vz_quota_operations2_rsv;
+extern struct quota_format_type vz_quota_empty_v2_format;
+
+#define QUGID_TREE(qmblk, type)	(((type) == USRQUOTA) ?		\
+					qmblk->dq_uid_tree :	\
+					qmblk->dq_gid_tree)
+
+#define VZDQUG_FIND_DONT_ALLOC	1
+#define VZDQUG_FIND_FAKE	2
+struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
+		unsigned int quota_id, int type, int flags);
+struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
+		unsigned int quota_id, int type, int flags);
+struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid);
+void vzquota_put_ugid(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid);
+void vzquota_kill_ugid(struct vz_quota_master *qmblk);
+int vzquota_ugid_init(void);
+void vzquota_ugid_release(void);
+int vzquota_transfer_usage(struct inode *inode, int mask,
+		struct vz_quota_ilink *qlnk, struct vz_quota_ugid **dirty);
+void vzquota_inode_off(struct inode *inode);
+
+long do_vzquotaugidctl(int cmd, unsigned int quota_id,
+		unsigned int ugid_index, unsigned int ugid_size,
+		void *addr, int compat);
+
+/*
+ * Other VZ quota parts
+ */
+extern struct dquot_operations vz_quota_operations;
+extern struct dquot_operations vz_quota_operations_rsv;
+
+#define IS_VZ_QUOTA(sb) ((sb)->dq_op == &vz_quota_operations ||		\
+				(sb)->dq_op == &vz_quota_operations_rsv)
+
+long do_vzquotactl(int cmd, unsigned int quota_id,
+		struct vz_quota_stat __user *qstat, const char __user *ve_root,
+		int compat);
+int vzquota_proc_init(void);
+void vzquota_proc_release(void);
+struct vz_quota_master *vzquota_find_qmblk(struct super_block *);
+
+void vzaquota_init(void);
+void vzaquota_fini(void);
+
+struct vzsnap_struct;
+extern int vzquota_snap_init(struct super_block *, void *, struct path *);
+extern int vzquota_snap_stop(struct super_block *, void *);
+
+
+/* This is the ugliest hack of the release: we have to fix up the
+ * filesystem type in order to support the quota tools.
+ */
+static inline int vzquota_fake_fstype(const struct task_struct *tsk)
+{
+	const char **p;
+	const char *comm;
+	const char *comm_list[] = {
+		"convertquota",
+		"edquota",
+		"quota",
+		"quot",
+		"quotacheck",
+		"quotadebug",
+		"quotaon",
+		"quotaoff",
+		"quotastats",
+		"quota_nld",
+		"repquota",
+		"rpc.rquotad",
+		"setquota",
+		"setup_quota_group",
+		"xqmstats"
+		"warnquota",
+		NULL,
+	};
+	comm = strrchr(tsk->comm, '/');
+	if (comm)
+		comm++;
+	else
+		comm = tsk->comm;
+
+	p = comm_list;
+	while (*p != NULL) {
+		if (!strcmp(*p, comm))
+			return 1;
+		p++;
+	}
+	return 0;
+}
+
+/* quotacheck uses direct scan mode for ext2/ext3 */
+#define VZQUOTA_FAKE_FSTYPE "reiserfs"
+
+#endif /* __KERNEL__ */
+
+#endif /* _VZDQUOTA_H */
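
The qmblk lifetime rules implied above: qmblk_get() may only be used on a block that already holds a reference (it BUGs on a zero count), and the final qmblk_put() frees the master via vzquota_free_master(). A hedged sketch follows; whether vzquota_find_master() returns a referenced block and which lock guards the hash are assumptions here:

	/* Hedged sketch of the reference discipline; the locking around
	 * vzquota_find_master() is an assumption, not defined by this header. */
	#include <linux/mutex.h>
	#include <linux/vzquota.h>

	static __u64 quota_bytes_used(unsigned int quota_id)
	{
		struct vz_quota_master *qmblk;
		__u64 used;

		mutex_lock(&vz_quota_mutex);
		qmblk = vzquota_find_master(quota_id);
		if (qmblk != NULL)
			qmblk_get(qmblk);	/* keep it past the mutex */
		mutex_unlock(&vz_quota_mutex);

		if (qmblk == NULL)
			return 0;
		qmblk_data_read_lock(qmblk);
		used = qmblk->dq_stat.bcurrent;
		qmblk_data_read_unlock(qmblk);
		qmblk_put(qmblk);
		return used;
	}
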
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzquota_qlnk.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzquota_qlnk.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzquota_qlnk.h	2015-01-21 12:02:53.121963610 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzquota_qlnk.h	2015-01-21 12:02:53.121963610 +0300
@@ -0,0 +1,25 @@
+/*
+ *  include/linux/vzquota_qlnk.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _VZDQUOTA_QLNK_H
+#define _VZDQUOTA_QLNK_H
+
+struct vz_quota_master;
+struct vz_quota_ugid;
+
+/* inode link, used to track inodes using quota via dq_ilink_list */
+struct vz_quota_ilink {
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid *qugid[MAXQUOTAS];
+	struct list_head list;
+	unsigned char origin[2];
+};
+
+#endif /* _VZDQUOTA_QLNK_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzsnap.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzsnap.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzsnap.h	2015-01-21 12:02:53.584951321 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzsnap.h	2015-01-21 12:02:53.597950975 +0300
@@ -0,0 +1,156 @@
+/*
+ *
+ * Copyright (C) 2007-2011 SWsoft
+ * All rights reserved.
+ * 
+ */
+
+#ifndef _VZSNAP_H
+#define _VZSNAP_H
+
+#define VZSNAPCTLTYPE ';'
+
+#define VZCTL_VZSNAP_NEW_CTL	_IO(VZSNAPCTLTYPE, 1)
+
+
+#define VZSNAPCTL_SET_ID	0
+#define VZSNAPCTL_BIND_VZFS	1
+#define VZSNAPCTL_BIND_VE	2
+#define VZSNAPCTL_PREPARE_DIR	3
+#define VZSNAPCTL_SCAN_FD	4
+#define VZSNAPCTL_RESCAN_FD	5
+#define VZSNAPCTL_SCAN_NAME	6
+#define VZSNAPCTL_START		7
+#define VZSNAPCTL_STOP		8
+#define VZSNAPCTL_GETROOT	9
+
+#define VZSNAPCTL_GETBMAPSIZE	10
+#define VZSNAPCTL_GETIMAPSIZE	11
+#define VZSNAPCTL_GETBMAPMAP	12
+#define VZSNAPCTL_GETIMAPMAP	13
+
+#define VZSNAPCTL_SCAN_INODE	14
+#define VZSNAPCTL_MERGEMAP	15
+#define VZSNAPCTL_SCAN_INODE2	16
+#define VZSNAPCTL_GETBMAPSIZE2	17
+#define VZSNAPCTL_GETIMAPSIZE2	18
+#define VZSNAPCTL_GETBMAPMAP2	19
+#define VZSNAPCTL_GETIMAPMAP2	20
+#define VZSNAPCTL_GETROOT_TMPL	21
+#define VZSNAPCTL_SUBTREE_FD	22
+
+/* ioctl request structure for VZSNAPCTL_SCAN_NAME. The "novel idea" is to
+ * use a 64-bit interface even on 32-bit hosts. I know, I know... */
+
+struct vzsnap_name_req
+{
+	__s32	dirfd;
+	__s32	pad;
+	__u64	ptr;
+} __attribute__((aligned (8)));
+
+struct vzsnap_scan_inode_req
+{
+	__s32	root_fd;
+	__s32	inode;
+} __attribute__((aligned (8)));
+
+/* Offsets on vzsnap "bus". */
+#define VZSNAP_BMAP_PGOFF	0
+#define VZSNAP_IMAP_PGOFF	0x20000000
+
+#define VZSNAP_PRIVATE_PGOFF	0
+#define VZSNAP_TEMPLATE_PGOFF	0x10000
+
+#define VZSNAP_BMAP_PR_PGOFF    (VZSNAP_BMAP_PGOFF|VZSNAP_PRIVATE_PGOFF) /* Block map for private root */
+#define VZSNAP_IMAP_PR_PGOFF	(VZSNAP_IMAP_PGOFF|VZSNAP_PRIVATE_PGOFF) /* Inode map for private root */
+#define VZSNAP_BMAP_TMPL_PGOFF  (VZSNAP_BMAP_PGOFF|VZSNAP_TEMPLATE_PGOFF) /* Block map for template root */
+#define VZSNAP_IMAP_TMPL_PGOFF  (VZSNAP_IMAP_PGOFF|VZSNAP_TEMPLATE_PGOFF) /* Inode map for template root */
+
+enum
+{
+	IS_NONE		= 0,	/* Not scanned or not within our tree */
+	IS_SCANNED	= 1,	/* Inode is ours, scan is started */
+	IS_RESCAN	= 3	/* Inode is ours, needs rescan */
+};
+
+#ifdef __KERNEL__
+
+struct vzsnap_struct;
+struct vzsnap_iterate_ops;
+struct vzsnap_map
+{
+	struct page		**inode_map;
+	struct page		**block_map;
+	ino_t			inode_max;
+	sector_t		block_max;
+	struct super_block	*sb;
+	const struct vzsnap_iterate_ops *ops;
+	struct vzsnap_struct	*vzs;
+};
+
+struct vzsnap_struct
+{
+	atomic_t		refcnt;
+	unsigned long		dead;
+	unsigned long		state;
+	struct list_head	list;
+	int			id;
+
+	struct vzsnap_ops	*ops;
+
+	int			error;
+
+	int			ve_frozen;
+	struct ve_struct	*ve;
+	struct vzsnap_map	*pmap;
+	struct vzsnap_map	*tmap;
+
+	struct vfsmount		*vzfs_mnt;
+	struct dentry		*vzfs_root;
+	unsigned long		priv_ino;
+	unsigned long		cow_ino;
+
+	struct vfsmount		*vzdq_mnt;
+	struct dentry		*vzdq_root;
+
+	struct vfsmount		*vzfs_tmpl_mnt;
+	struct dentry		*vzfs_tmpl_root;
+
+	struct super_block	*psb;
+	struct super_block	*tsb;
+
+	spinlock_t		lock;	/* Protects bitmap operations */
+	struct mutex		mutex;	/* ioctl serialization */
+};
+
+struct vzsnap_ops
+{
+	void (*addblock)(struct vzsnap_struct *vzs, struct inode * inode);
+	void (*create)(struct vzsnap_struct *vzs, struct inode *dir, struct dentry *de);
+	void (*unlink)(struct vzsnap_struct *vzs, struct inode *dir, struct dentry *de);
+	void (*rename)(struct vzsnap_struct *vzs, struct inode *ndir,
+		   struct dentry *nde, struct inode *odir, struct dentry *ode);
+	void (*truncate)(struct vzsnap_struct *vzs, struct inode *dir, size_t len);
+};
+
+/* Should be protected by a caller-provided serializer */
+
+static inline struct vzsnap_struct *vzsnap_get(struct vzsnap_struct * vzs)
+{
+	atomic_inc(&vzs->refcnt);
+	return vzs;
+}
+
+static inline void __vzsnap_put(struct vzsnap_struct * vzs)
+{
+	atomic_dec(&vzs->refcnt);
+}
+
+extern int vzsnap_release_map(struct vzsnap_struct *vzs);
+extern struct vzsnap_map * vzsnap_get_map(int id, struct block_device *bdev);
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _VZSNAP_H */
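
The refcounting helpers must run under the caller's serializer, as the comment above says. A hedged sketch of briefly pinning a snapshot to read its private map; whatever finally destroys the object lives outside this header and is assumed:

	/* Hedged sketch: the get/put pair only moves the refcount. */
	#include <linux/kernel.h>
	#include <linux/vzsnap.h>

	static void peek_snapshot(struct vzsnap_struct *vzs)
	{
		vzsnap_get(vzs);
		if (!vzs->dead && vzs->pmap != NULL)
			pr_debug("vzsnap %d: inode_max %lu\n", vzs->id,
				 (unsigned long)vzs->pmap->inode_max);
		__vzsnap_put(vzs);
	}
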
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/vzstat.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzstat.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/vzstat.h	2015-01-21 12:02:44.088203434 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/vzstat.h	2015-01-21 12:02:53.939941897 +0300
@@ -0,0 +1,226 @@
+/*
+ *  include/linux/vzstat.h
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __VZSTAT_H__
+#define __VZSTAT_H__
+
+#include <linux/mmzone.h>
+
+struct swap_cache_info_struct {
+	unsigned long add_total;
+	unsigned long del_total;
+	unsigned long find_success;
+	unsigned long find_total;
+};
+
+struct kstat_lat_snap_struct {
+	u64 maxlat, totlat;
+	unsigned long count;
+};
+struct kstat_lat_pcpu_snap_struct {
+	u64 maxlat, totlat;
+	unsigned long count;
+	seqcount_t lock;
+} ____cacheline_aligned_in_smp;
+
+struct kstat_lat_struct {
+	struct kstat_lat_snap_struct cur, last;
+	u64 avg[3];
+};
+struct kstat_lat_pcpu_struct {
+	struct kstat_lat_pcpu_snap_struct *cur;
+	u64 max_snap;
+	struct kstat_lat_snap_struct last;
+	u64 avg[3];
+};
+
+struct kstat_perf_snap_struct {
+	u64 wall_tottime, cpu_tottime;
+	u64 wall_maxdur, cpu_maxdur;
+	unsigned long count;
+};
+
+struct kstat_perf_pcpu_snap_struct {
+	u64 wall_tottime, cpu_tottime;
+	u64 wall_maxdur, cpu_maxdur;
+	unsigned long count;
+	seqcount_t lock;
+};
+
+struct kstat_perf_pcpu_struct {
+	struct kstat_perf_pcpu_snap_struct *cur;
+	struct kstat_perf_snap_struct last;
+};
+
+struct kstat_zone_avg {
+	unsigned long		free_pages_avg[3],
+				nr_active_avg[3],
+				nr_inactive_avg[3];
+};
+
+enum {
+	KSTAT_ALLOCSTAT_ATOMIC,
+	KSTAT_ALLOCSTAT_LOW,
+	KSTAT_ALLOCSTAT_HIGH,
+	KSTAT_ALLOCSTAT_LOW_MP,
+	KSTAT_ALLOCSTAT_HIGH_MP,
+	KSTAT_ALLOCSTAT_NR,
+};
+
+struct kernel_stat_glob {
+	unsigned long nr_unint_avg[3];
+
+	unsigned long alloc_fails[NR_CPUS][KSTAT_ALLOCSTAT_NR];
+	struct kstat_lat_pcpu_struct alloc_lat[KSTAT_ALLOCSTAT_NR];
+	struct kstat_lat_pcpu_struct sched_lat;
+	struct kstat_lat_pcpu_struct page_in;
+	struct kstat_lat_struct swap_in;
+
+	struct kstat_perf_pcpu_struct ttfp, cache_reap,
+			refill_inact, shrink_icache, shrink_dcache;
+
+	struct kstat_zone_avg zone_avg[MAX_NR_ZONES];
+} ____cacheline_aligned;
+
+extern struct kernel_stat_glob kstat_glob ____cacheline_aligned;
+extern spinlock_t kstat_glb_lock;
+
+extern void kstat_init(void);
+
+static inline void
+KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time, u64 cpu_time)
+{
+	struct kstat_perf_pcpu_snap_struct *cur = get_cpu_ptr(ptr->cur);
+
+	write_seqcount_begin(&cur->lock);
+	cur->count++;
+	if (cur->wall_maxdur < real_time)
+		cur->wall_maxdur = real_time;
+	cur->wall_tottime += real_time;
+	if (cur->cpu_maxdur < cpu_time)
+		cur->cpu_maxdur = cpu_time;
+	cur->cpu_tottime += cpu_time;
+	write_seqcount_end(&cur->lock);
+	put_cpu_ptr(cur);
+}
+
+#ifdef CONFIG_VE
+#define KSTAT_PERF_ENTER(name)				\
+	u64 start, sleep_time;				\
+							\
+	start = ktime_to_ns(ktime_get());		\
+	sleep_time = VE_TASK_INFO(current)->sleep_time;	\
+
+#define KSTAT_PERF_LEAVE(name)				\
+	start = ktime_to_ns(ktime_get()) - start;	\
+	sleep_time = VE_TASK_INFO(current)->sleep_time - sleep_time; \
+	KSTAT_PERF_ADD(&kstat_glob.name, start, start - sleep_time);
+
+#else
+#define KSTAT_PERF_ENTER(name)
+#define KSTAT_PERF_LEAVE(name)
+#endif
+
+/*
+ * Add another statistics reading.
+ * Serialization is the caller's responsibility.
+ */
+static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p,
+		u64 dur)
+{
+	p->cur.count++;
+	if (p->cur.maxlat < dur)
+		p->cur.maxlat = dur;
+	p->cur.totlat += dur;
+}
+
+/*
+ * Must be called with interrupts disabled, so that no lock or seqcount
+ * is taken under the write-side seqcount; this avoids the 3-way deadlock:
+ *
+ * timer interrupt:
+ *	write_seqlock(&xtime_lock);
+ *	 spin_lock_irqsave(&kstat_glb_lock);
+ *
+ * update_schedule_latency():
+ *	spin_lock_irq(&kstat_glb_lock);
+ *	 read_seqcount_begin(&cur->lock)
+ *
+ * some-interrupt during KSTAT_LAT_PCPU_ADD()
+ *   KSTAT_LAT_PCPU_ADD()
+ *    write_seqcount_begin(&cur->lock);
+ *     <interrupt>
+ *      ktime_get()
+ *       read_seqcount_begin(&xtime_lock);
+ */
+static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu,
+		u64 dur)
+{
+	struct kstat_lat_pcpu_snap_struct *cur;
+
+	cur = per_cpu_ptr(p->cur, cpu);
+	write_seqcount_begin(&cur->lock);
+	cur->count++;
+	if (cur->maxlat < dur)
+		cur->maxlat = dur;
+	cur->totlat += dur;
+	write_seqcount_end(&cur->lock);
+}
+
+/*
+ * Move current statistics to last and reset the current maximum.
+ * Serialization is the caller's responsibility.
+ */
+static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p)
+{
+	u64 m;
+	memcpy(&p->last, &p->cur, sizeof(p->last));
+	p->cur.maxlat = 0;
+	m = p->last.maxlat;
+	CALC_LOAD(p->avg[0], EXP_1, m)
+	CALC_LOAD(p->avg[1], EXP_5, m)
+	CALC_LOAD(p->avg[2], EXP_15, m)
+}
+
+static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p)
+{
+	unsigned i, cpu;
+	struct kstat_lat_pcpu_snap_struct snap, *cur;
+	u64 m;
+
+	memset(&p->last, 0, sizeof(p->last));
+	for_each_online_cpu(cpu) {
+		cur = per_cpu_ptr(p->cur, cpu);
+		do {
+			i = read_seqcount_begin(&cur->lock);
+			memcpy(&snap, cur, sizeof(snap));
+		} while (read_seqcount_retry(&cur->lock, i));
+		/*
+		 * The read above and this update of maxlat are not atomic,
+		 * but that is OK: races are rare and losing a couple of
+		 * peaks is not essential. -- xemul
+		 */
+		cur->maxlat = 0;
+
+		p->last.count += snap.count;
+		p->last.totlat += snap.totlat;
+		if (p->last.maxlat < snap.maxlat)
+			p->last.maxlat = snap.maxlat;
+	}
+
+	m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap);
+	CALC_LOAD(p->avg[0], EXP_1, m);
+	CALC_LOAD(p->avg[1], EXP_5, m);
+	CALC_LOAD(p->avg[2], EXP_15, m);
+	/* reset max_snap to calculate it correctly next time */
+	p->max_snap = 0;
+}
+
+#endif /* __VZSTAT_H__ */
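
KSTAT_PERF_ENTER() opens with variable declarations, so it must come first in its block, and the name argument must be one of the kstat_perf fields of kstat_glob (ttfp, cache_reap, refill_inact, shrink_icache, shrink_dcache). A sketch of instrumenting a section:

	/* Sketch: account wall time and (wall minus sleep) time for a
	 * section into kstat_glob.shrink_dcache; no-op without CONFIG_VE. */
	#include <linux/vzstat.h>

	static void timed_shrink(void)
	{
		KSTAT_PERF_ENTER(shrink_dcache)
		/* ... the work being measured ... */
		KSTAT_PERF_LEAVE(shrink_dcache)
	}
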
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/workqueue.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/workqueue.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/workqueue.h	2014-12-12 23:29:28.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/workqueue.h	2015-01-21 12:02:46.936127825 +0300
@@ -173,10 +173,10 @@ struct execute_work {
 extern struct workqueue_struct *
 __create_workqueue_key(const char *name, int singlethread,
 		       int freezeable, int rt, struct lock_class_key *key,
-		       const char *lock_name);
+		       const char *lock_name, void *ve);
 
 #ifdef CONFIG_LOCKDEP
-#define __create_workqueue(name, singlethread, freezeable, rt)	\
+#define __create_workqueue(name, singlethread, freezeable, rt, ve)	\
 ({								\
 	static struct lock_class_key __key;			\
 	const char *__lock_name;				\
@@ -188,22 +188,27 @@ __create_workqueue_key(const char *name,
 								\
 	__create_workqueue_key((name), (singlethread),		\
 			       (freezeable), (rt), &__key,	\
-			       __lock_name);			\
+			       __lock_name, (ve));		\
 })
 #else
-#define __create_workqueue(name, singlethread, freezeable, rt)	\
+#define __create_workqueue(name, singlethread, freezeable, rt, ve)	\
 	__create_workqueue_key((name), (singlethread), (freezeable), (rt), \
-			       NULL, NULL)
+			       NULL, NULL, ve)
 #endif
 
-#define create_workqueue(name) __create_workqueue((name), 0, 0, 0)
-#define create_rt_workqueue(name) __create_workqueue((name), 0, 0, 1)
-#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0)
+#define create_workqueue(name) __create_workqueue((name), 0, 0, 0, NULL)
+#define create_rt_workqueue(name) __create_workqueue((name), 0, 0, 1, NULL)
+#define create_freezeable_workqueue(name)	\
+				__create_workqueue((name), 1, 1, 0, NULL)
 #define create_freezable_workqueue(name) create_freezeable_workqueue(name)
-#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0)
+#define create_singlethread_workqueue(name)	\
+				__create_workqueue((name), 1, 0, 0, NULL)
 
 #define alloc_ordered_workqueue(name, flags) create_singlethread_workqueue(name)
 
+#define create_workqueue_ve(name, ve) __create_workqueue((name), 0, 0, 0, ve)
+#define create_singlethread_workqueue_ve(name, ve) __create_workqueue((name), 1, 0, 0, ve)
+
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
 extern int queue_work(struct workqueue_struct *wq, struct work_struct *work);
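
With the extra ve argument, a workqueue can be created inside a container's context, while the unchanged wrappers pass NULL and keep the host-wide behaviour. A hedged sketch of the new helper; demo_start() and its caller are hypothetical:

	/* Hedged sketch: NULL ve means the host (ve0) behaviour. */
	#include <linux/errno.h>
	#include <linux/workqueue.h>

	struct ve_struct;

	static struct workqueue_struct *demo_wq;

	static int demo_start(struct ve_struct *ve)
	{
		demo_wq = create_singlethread_workqueue_ve("demo_wq", ve);
		return demo_wq != NULL ? 0 : -ENOMEM;
	}
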
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/writeback.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/writeback.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/writeback.h	2014-12-12 23:29:03.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/writeback.h	2015-01-21 12:02:52.555978634 +0300
@@ -52,25 +52,33 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
+	unsigned for_sync:1;		/* sync_fs() will be executed after this work is done */
+
+	struct user_beancounter *wb_ub;	/* only for this beancounter */
 
 	/* reserved for Red Hat */
 	unsigned long rh_reserved[5];
+
+	void *fsdata; /* Private fs data */
 };
 
 /*
  * fs/fs-writeback.c
  */	
 struct bdi_writeback;
+struct user_beancounter;
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *);
 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
+void writeback_inodes_sb_ub(struct super_block *, struct user_beancounter *);
 int writeback_inodes_sb_if_idle(struct super_block *);
 int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
 void sync_inodes_sb(struct super_block *);
+void sync_inodes_sb_ub(struct super_block *, struct user_beancounter *);
 void writeback_inodes_wb(struct bdi_writeback *wb,
 		struct writeback_control *wbc);
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
-void wakeup_flusher_threads(long nr_pages);
+void wakeup_flusher_threads(struct user_beancounter *, long nr_pages);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/linux/xattr.h linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/xattr.h
--- linux-2.6.32-504.3.3.el6.orig/include/linux/xattr.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/linux/xattr.h	2015-01-21 12:02:43.677214346 +0300
@@ -10,6 +10,13 @@
 #ifndef _LINUX_XATTR_H
 #define _LINUX_XATTR_H
 
+#ifdef CONFIG_VE
+extern int ve_xattr_policy;
+#define VE_XATTR_POLICY_ACCEPT	0
+#define VE_XATTR_POLICY_IGNORE	1
+#define VE_XATTR_POLICY_REJECT	2
+#endif
+
 #define XATTR_CREATE	0x1	/* set value, fail if attr already exists */
 #define XATTR_REPLACE	0x2	/* set value, fail if attr does not exist */
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/addrconf.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/addrconf.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/addrconf.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/addrconf.h	2015-01-21 12:02:45.356169771 +0300
@@ -295,5 +295,9 @@ extern int if6_proc_init(void);
 extern void if6_proc_exit(void);
 #endif
 
+int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
+		unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
+		__u32 valid_lft);
+
 #endif
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/af_unix.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/af_unix.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/af_unix.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/af_unix.h	2015-01-21 12:02:51.381009825 +0300
@@ -11,6 +11,7 @@ extern void unix_notinflight(struct file
 extern void unix_gc(void);
 extern void wait_for_unix_gc(void);
 extern struct sock *unix_get_socket(struct file *filp);
+extern void unix_destruct_scm(struct sk_buff *skb);
 
 #define UNIX_HASH_SIZE	256
 
@@ -23,6 +24,9 @@ struct unix_address {
 	struct sockaddr_un name[0];
 };
 
+int unix_bind_path(struct sock *, struct dentry *, struct vfsmount *);
+int unix_attach_addr(struct sock *, struct sockaddr_un *, int);
+
 struct unix_skb_parms {
 	struct pid		*pid;		/* Skb credentials	*/
 	const struct cred	*cred;
@@ -30,6 +34,7 @@ struct unix_skb_parms {
 #ifdef CONFIG_SECURITY_NETWORK
 	u32			secid;		/* Security ID		*/
 #endif
+	u32			consumed;
 };
 
 #define UNIXCB(skb) 	(*(struct unix_skb_parms*)&((skb)->cb))
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/dst.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/dst.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/dst.h	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/dst.h	2015-01-21 12:02:47.076124108 +0300
@@ -48,6 +48,7 @@ struct dst_entry
 #define DST_NOXFRM		2
 #define DST_NOPOLICY		4
 #define DST_NOHASH		8
+#define DST_FREE		32
 #define DST_FAKE_RTABLE		0x0080
 	unsigned long		expires;
 
@@ -116,6 +117,8 @@ struct dst_entry
 	atomic_t		__refcnt;	/* client references	*/
 	int			__use;
 	unsigned long		lastuse;
+	unsigned int		privnet_mark;
+
 	union {
 		struct dst_entry *next;
 		struct rtable    *rt_next;
@@ -125,6 +128,10 @@ struct dst_entry
 };
 
 #ifdef __KERNEL__
+void dst_dump_one(struct dst_entry *d);
+void ip_rt_dump_dsts(void);
+void dst_cache_dump(void);
+extern void (*ip6_rt_dump_dsts)(void);
 
 static inline u32
 dst_metric(const struct dst_entry *dst, int metric)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/fib_rules.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/fib_rules.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/fib_rules.h	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/fib_rules.h	2015-01-21 12:02:42.445247053 +0300
@@ -67,6 +67,7 @@ struct fib_rules_ops
 	struct list_head	rules_list;
 	struct module		*owner;
 	struct net		*fro_net;
+	struct rcu_head		rcu;
 };
 
 #define FRA_GENERIC_POLICY \
@@ -102,7 +103,7 @@ static inline u32 frh_get_table(struct f
 	return frh->table;
 }
 
-extern int fib_rules_register(struct fib_rules_ops *);
+extern struct fib_rules_ops *fib_rules_register(struct fib_rules_ops *, struct net *);
 extern void fib_rules_unregister(struct fib_rules_ops *);
 extern void                     fib_rules_cleanup_ops(struct fib_rules_ops *);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/flow.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/flow.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/flow.h	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/flow.h	2015-01-21 12:02:45.356169771 +0300
@@ -10,6 +10,7 @@
 #include <linux/in6.h>
 #include <asm/atomic.h>
 
+struct ve_struct;
 struct flowi {
 	int	oif;
 	int	iif;
@@ -77,6 +78,9 @@ struct flowi {
 #define fl_icmp_code	uli_u.icmpt.code
 #define fl_ipsec_spi	uli_u.spi
 #define fl_mh_type	uli_u.mht.type
+#ifdef CONFIG_VE
+	struct ve_struct *owner_env;
+#endif
 	__u32           secid;	/* used by xfrm; see secid.txt */
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/inet_frag.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/inet_frag.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/inet_frag.h	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/inet_frag.h	2015-01-21 12:02:45.357169744 +0300
@@ -58,6 +58,9 @@ struct inet_frag_queue {
 	u16			max_size;
 
 	struct netns_frags	*net;
+#ifdef CONFIG_VE
+	struct ve_struct	*owner_ve;
+#endif
 };
 
 #define INETFRAGS_HASHSZ	1024
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/inet_timewait_sock.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/inet_timewait_sock.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/inet_timewait_sock.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/inet_timewait_sock.h	2015-01-21 12:02:51.272012718 +0300
@@ -82,6 +82,7 @@ struct inet_timewait_death_row {
 	struct inet_hashinfo 	*hashinfo;
 	int			sysctl_tw_recycle;
 	int			sysctl_max_tw_buckets;
+	int			ub_managed;
 };
 
 extern void inet_twdr_hangman(unsigned long data);
@@ -138,6 +139,7 @@ struct inet_timewait_sock {
 	unsigned long		tw_ttd;
 	struct inet_bind_bucket	*tw_tb;
 	struct hlist_node	tw_death_node;
+	envid_t			tw_owner_env;
 };
 
 static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
@@ -217,14 +219,14 @@ extern void inet_twsk_schedule(struct in
 extern void inet_twsk_deschedule(struct inet_timewait_sock *tw,
 				 struct inet_timewait_death_row *twdr);
 
-extern void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
+extern void inet_twsk_purge(struct inet_hashinfo *hashinfo,
 			    struct inet_timewait_death_row *twdr, int family);
 
 static inline
 struct net *twsk_net(const struct inet_timewait_sock *twsk)
 {
 #ifdef CONFIG_NET_NS
-	return twsk->tw_net;
+	return rcu_dereference(twsk->tw_net);
 #else
 	return &init_net;
 #endif
@@ -234,7 +236,7 @@ static inline
 void twsk_net_set(struct inet_timewait_sock *twsk, struct net *net)
 {
 #ifdef CONFIG_NET_NS
-	twsk->tw_net = net;
+	rcu_assign_pointer(twsk->tw_net, net);
 #endif
 }
 #endif	/* _INET_TIMEWAIT_SOCK_ */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/ip6_fib.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ip6_fib.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/ip6_fib.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ip6_fib.h	2015-01-21 12:02:45.357169744 +0300
@@ -166,6 +166,7 @@ struct fib6_table {
 	u32			tb6_id;
 	rwlock_t		tb6_lock;
 	struct fib6_node	tb6_root;
+	struct ve_struct	*owner_env;
 };
 
 #define RT6_TABLE_UNSPEC	RT_TABLE_UNSPEC
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/ip6_route.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ip6_route.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/ip6_route.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ip6_route.h	2015-01-21 12:02:58.036833147 +0300
@@ -40,6 +40,7 @@ struct route_info {
 
 
 extern void			ip6_route_input(struct sk_buff *skb);
+extern void			__ip6_route_input(struct sk_buff *skb, struct in6_addr *daddr);
 
 extern struct dst_entry *	ip6_route_output(struct net *net,
 						 struct sock *sk,
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/ip_tunnels.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ip_tunnels.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/ip_tunnels.h	2014-12-12 23:29:29.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ip_tunnels.h	2015-01-21 12:02:51.143016142 +0300
@@ -95,7 +95,7 @@ struct ip_tunnel_net {
 
 int ip_tunnel_init(struct net_device *dev);
 void ip_tunnel_uninit(struct net_device *dev);
-void  ip_tunnel_dellink(struct net_device *dev);
+void  ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
 
@@ -119,6 +119,10 @@ int ip_tunnel_newlink(struct net_device 
 		      struct ip_tunnel_parm *p);
 void ip_tunnel_setup(struct net_device *dev, int net_id);
 
+struct cpt_tunnel_image;
+int ip_tunnel_rst(struct net *net, struct ip_tunnel_net *itn,
+		  const struct cpt_tunnel_image *v, const char *name);
+
 /* Extract dsfield from inner protocol */
 static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
 				       const struct sk_buff *skb)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/ipv6.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ipv6.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/ipv6.h	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ipv6.h	2015-01-21 12:02:50.571031327 +0300
@@ -702,5 +702,7 @@ extern int ipv6_static_sysctl_register(v
 extern void ipv6_static_sysctl_unregister(void);
 #endif
 
+extern bool ipv6_is_enabled(void);
+
 #endif /* __KERNEL__ */
 #endif /* _NET_IPV6_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/ndisc.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ndisc.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/ndisc.h	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/ndisc.h	2015-01-21 12:02:42.497245673 +0300
@@ -84,6 +84,15 @@ struct nd_opt_hdr {
 	__u8		nd_opt_len;
 } __attribute__((__packed__));
 
+static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd)
+{
+	const u32 *p32 = pkey;
+
+	return (((p32[0] ^ dev->ifindex) * hash_rnd[0]) +
+		(p32[1] * hash_rnd[1]) +
+		(p32[2] * hash_rnd[2]) +
+		(p32[3] * hash_rnd[3]));
+}
 
 extern int			ndisc_init(void);
 extern int			ndisc_late_init(void);
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/neighbour.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/neighbour.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/neighbour.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/neighbour.h	2015-01-21 12:02:42.497245673 +0300
@@ -212,7 +212,7 @@ struct pneigh_entry
 /*
  *	neighbour table manipulation
  */
-
+#define NEIGH_NUM_HASH_RND	4
 
 struct neigh_table
 {
@@ -243,7 +243,7 @@ struct neigh_table
 	struct neigh_statistics	*stats;
 	struct neighbour	**hash_buckets;
 	unsigned int		hash_mask;
-	__u32			hash_rnd;
+	__u32			hash_rnd[NEIGH_NUM_HASH_RND];
 	struct pneigh_entry	**phash_buckets;
 };
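
Read together with the ndisc_hashfn() hunk above, widening hash_rnd to an
array of NEIGH_NUM_HASH_RND words lets each 32-bit chunk of an IPv6 address
be mixed with its own random multiplier. A hedged sketch of the lookup-side
use (first_in_bucket is illustrative; a real lookup would also take the
table lock and compare keys):

	#include <net/ndisc.h>
	#include <net/neighbour.h>

	static struct neighbour *first_in_bucket(struct neigh_table *tbl,
						 const struct in6_addr *daddr,
						 struct net_device *dev)
	{
		/* Each 32-bit address word is multiplied by its own
		 * hash_rnd word; the device ifindex is folded into the
		 * first word (see ndisc_hashfn above). */
		u32 hval = ndisc_hashfn(daddr, dev, tbl->hash_rnd);

		return tbl->hash_buckets[hval & tbl->hash_mask];
	}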
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/net_namespace.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/net_namespace.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/net_namespace.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/net_namespace.h	2015-01-21 12:02:51.373010036 +0300
@@ -43,7 +43,8 @@ struct net {
 						 */
 #endif
 	struct list_head	list;		/* list of network namespaces */
-	struct work_struct	work;		/* work struct for freeing */
+	struct list_head	cleanup_list;	/* namespaces on death row */
+	struct list_head	exit_list;	/* Use only net_mutex */
 
 	struct proc_dir_entry 	*proc_net;
 	struct proc_dir_entry 	*proc_net_stat;
@@ -57,6 +58,14 @@ struct net {
 	struct list_head 	dev_base_head;
 	struct hlist_head 	*dev_name_head;
 	struct hlist_head	*dev_index_head;
+	unsigned int		dev_base_seq;   /* protected by rtnl_mutex */
+
+	int			ifindex;
+
+#ifdef CONFIG_VE
+	struct completion	*sysfs_completion;
+	struct ve_struct	*owner_ve;
+#endif
 
 	/* core fib_rules */
 	struct list_head	rules_ops;
@@ -64,6 +73,8 @@ struct net {
 
 	struct sock 		*rtnl;			/* rtnetlink socket */
 	struct sock		*genl_sock;
+	struct sock 		*_audit_sock;		/* audit socket */
+	struct sock 		*uevent_sock;		/* kobject uevent socket */
 
 	struct netns_core	core;
 	struct netns_mib	mib;
@@ -81,6 +92,8 @@ struct net {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct netns_ct		ct;
 #endif
+	struct sock		*nfnl;
+	struct sock		*nfnl_stash;
 #endif
 #ifdef CONFIG_XFRM
 	struct netns_xfrm	xfrm;
@@ -162,6 +175,12 @@ int net_eq(const struct net *net1, const
 {
 	return net1 == net2;
 }
+
+/* Returns whether curr can mess with net's objects */
+static inline int net_access_allowed(const struct net *net, const struct net *curr)
+{
+	return net_eq(curr, &init_net) || net_eq(curr, net);
+}
 #else
 
 static inline struct net *get_net(struct net *net)
@@ -183,6 +202,11 @@ int net_eq(const struct net *net1, const
 {
 	return 1;
 }
+
+static inline int net_access_allowed(const struct net *net, const struct net *curr)
+{
+	return 1;
+}
 #endif
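
net_access_allowed() above encodes a simple containment rule: the host
namespace (init_net) may touch any object, while every other namespace may
touch only its own. A hedged usage sketch (the helper name and the -EPERM
policy are illustrative):

	#include <net/net_namespace.h>
	#include <net/sock.h>

	static int check_obj_visible(struct net *obj_net, struct sk_buff *skb)
	{
		/* init_net passes for any object; a container passes only
		 * for objects owned by its own namespace. */
		if (!net_access_allowed(obj_net, sock_net(skb->sk)))
			return -EPERM;
		return 0;
	}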
 
 
@@ -249,6 +273,9 @@ struct pernet_operations {
 	struct list_head list;
 	int (*init)(struct net *net);
 	void (*exit)(struct net *net);
+	void (*exit_batch)(struct list_head *net_exit_list);
+	int *id;
+	size_t size;
 };
 
 /*
@@ -272,12 +299,8 @@ struct pernet_operations {
  */
 extern int register_pernet_subsys(struct pernet_operations *);
 extern void unregister_pernet_subsys(struct pernet_operations *);
-extern int register_pernet_gen_subsys(int *id, struct pernet_operations *);
-extern void unregister_pernet_gen_subsys(int id, struct pernet_operations *);
 extern int register_pernet_device(struct pernet_operations *);
 extern void unregister_pernet_device(struct pernet_operations *);
-extern int register_pernet_gen_device(int *id, struct pernet_operations *);
-extern void unregister_pernet_gen_device(int id, struct pernet_operations *);
 
 struct ctl_path;
 struct ctl_table;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_conntrack.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_conntrack.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_conntrack.h	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_conntrack.h	2015-01-21 12:02:46.980126657 +0300
@@ -282,6 +282,7 @@ extern struct nf_conn *
 nf_conntrack_alloc(struct net *net,
 		   const struct nf_conntrack_tuple *orig,
 		   const struct nf_conntrack_tuple *repl,
+		   struct user_beancounter *,
 		   gfp_t gfp);
 
 /* It's confirmed if it is, or has been in the hash table. */
@@ -302,7 +303,7 @@ static inline int nf_ct_is_untracked(con
 
 extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
 extern unsigned int nf_conntrack_htable_size;
-extern unsigned int nf_conntrack_max;
+extern int ip_conntrack_disable_ve0 /* XXX: unused */;
 
 #define NF_CT_STAT_INC(net, count)	\
 	(per_cpu_ptr((net)->ct.stat, raw_smp_processor_id())->count++)
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_conntrack_core.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_conntrack_core.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_conntrack_core.h	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_conntrack_core.h	2015-01-21 12:02:51.291012214 +0300
@@ -26,7 +26,15 @@ extern unsigned int nf_conntrack_in(stru
 				    struct sk_buff *skb);
 
 extern int nf_conntrack_init(struct net *net);
-extern void nf_conntrack_cleanup(struct net *net);
+extern void nf_conntrack_cleanup_list(struct list_head *net_exit_list);
+
+static inline void nf_conntrack_cleanup(struct net *net)
+{
+	LIST_HEAD(single);
+
+	list_add(&net->exit_list, &single);
+	nf_conntrack_cleanup_list(&single);
+}
 
 extern int nf_conntrack_proto_init(void);
 extern void nf_conntrack_proto_fini(void);
@@ -62,7 +70,7 @@ static inline int nf_conntrack_confirm(s
 	int ret = NF_ACCEPT;
 
 	if (ct && ct != &nf_conntrack_untracked) {
-		if (!nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct))
+		if (!nf_ct_is_confirmed(ct))
 			ret = __nf_conntrack_confirm(skb);
 		if (likely(ret == NF_ACCEPT))
 			nf_ct_deliver_cached_events(ct);
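
The nf_conntrack_cleanup_list() introduced above pairs naturally with the
exit_batch hook added to struct pernet_operations earlier in this patch:
all namespaces on one death row can be flushed in a single pass. A sketch
under that assumption (the ops name is illustrative):

	static struct pernet_operations nf_conntrack_net_ops = {
		/* Assumed pairing: one batched teardown for the whole
		 * net_exit_list instead of one cleanup per namespace. */
		.exit_batch	= nf_conntrack_cleanup_list,
	};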
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_conntrack_ecache.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_conntrack_ecache.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_conntrack_ecache.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_conntrack_ecache.h	2015-01-21 12:02:42.536244638 +0300
@@ -65,18 +65,18 @@ struct nf_ct_event_notifier {
 	int (*fcn)(unsigned int events, struct nf_ct_event *item);
 };
 
-extern struct nf_ct_event_notifier *nf_conntrack_event_cb;
-extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb);
-extern void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb);
+extern int nf_conntrack_register_notifier(struct net *net, struct nf_ct_event_notifier *nb);
+extern void nf_conntrack_unregister_notifier(struct net *net, struct nf_ct_event_notifier *nb);
 
 extern void nf_ct_deliver_cached_events(struct nf_conn *ct);
 
 static inline void
 nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_ecache *e;
 
-	if (nf_conntrack_event_cb == NULL)
+	if (net->ct.nf_conntrack_event_cb == NULL)
 		return;
 
 	e = nf_ct_ecache_find(ct);
@@ -98,7 +98,7 @@ nf_conntrack_eventmask_report(unsigned i
 	struct nf_conntrack_ecache *e;
 
 	rcu_read_lock();
-	notify = rcu_dereference(nf_conntrack_event_cb);
+	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
 	if (notify == NULL)
 		goto out_unlock;
 
@@ -163,9 +163,8 @@ struct nf_exp_event_notifier {
 	int (*fcn)(unsigned int events, struct nf_exp_event *item);
 };
 
-extern struct nf_exp_event_notifier *nf_expect_event_cb;
-extern int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *nb);
-extern void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *nb);
+extern int nf_ct_expect_register_notifier(struct net *net, struct nf_exp_event_notifier *nb);
+extern void nf_ct_expect_unregister_notifier(struct net *net, struct nf_exp_event_notifier *nb);
 
 static inline void
 nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
@@ -177,7 +176,7 @@ nf_ct_expect_event_report(enum ip_conntr
 	struct nf_exp_event_notifier *notify;
 
 	rcu_read_lock();
-	notify = rcu_dereference(nf_expect_event_cb);
+	notify = rcu_dereference(net->ct.nf_expect_event_cb);
 	if (notify == NULL)
 		goto out_unlock;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_conntrack_expect.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_conntrack_expect.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_conntrack_expect.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_conntrack_expect.h	2015-01-21 12:02:50.360036929 +0300
@@ -7,7 +7,6 @@
 #include <net/netfilter/nf_conntrack.h>
 
 extern unsigned int nf_ct_expect_hsize;
-extern unsigned int nf_ct_expect_max;
 
 struct nf_conntrack_expect
 {
@@ -81,6 +80,7 @@ void nf_conntrack_expect_fini(struct net
 struct nf_conntrack_expect *
 __nf_ct_expect_find(struct net *net, const struct nf_conntrack_tuple *tuple);
 
+
 struct nf_conntrack_expect *
 nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_tuple *tuple);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_nat.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_nat.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netfilter/nf_nat.h	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netfilter/nf_nat.h	2015-01-21 12:02:51.072018028 +0300
@@ -78,6 +78,8 @@ struct nf_conn_nat
 #endif
 };
 
+void nf_nat_hash_conntrack(struct net *net, struct nf_conn *ct);
+
 /* Set up the info structure to map into this range. */
 extern unsigned int nf_nat_setup_info(struct nf_conn *ct,
 				      const struct nf_nat_range *range,
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netlink_sock.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netlink_sock.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netlink_sock.h	2015-01-21 12:02:45.357169744 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netlink_sock.h	2015-01-21 12:02:45.357169744 +0300
@@ -0,0 +1,23 @@
+#ifndef __NET_NETLINK_SOCK_H
+#define __NET_NETLINK_SOCK_H
+
+struct netlink_sock {
+	/* struct sock has to be the first member of netlink_sock */
+	struct sock		sk;
+	u32			pid;
+	u32			dst_pid;
+	u32			dst_group;
+	u32			flags;
+	u32			subscriptions;
+	u32			ngroups;
+	unsigned long		*groups;
+	unsigned long		state;
+	wait_queue_head_t	wait;
+	struct netlink_callback	*cb;
+	struct mutex		*cb_mutex;
+	struct mutex		cb_def_mutex;
+	void			(*netlink_rcv)(struct sk_buff *skb);
+	struct module		*module;
+};
+
+#endif /* __NET_NETLINK_SOCK_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netns/conntrack.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netns/conntrack.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netns/conntrack.h	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netns/conntrack.h	2015-01-21 12:02:46.987126472 +0300
@@ -10,7 +10,9 @@ struct nf_conntrack_ecache;
 
 struct netns_ct {
 	atomic_t		count;
+	unsigned int		max;
 	unsigned int		expect_count;
+	unsigned int		expect_max;
 	unsigned int		htable_size;
 	struct kmem_cache	*nf_conntrack_cachep;
 	struct hlist_nulls_head	*hash;
@@ -18,6 +20,8 @@ struct netns_ct {
 	struct hlist_nulls_head	unconfirmed;
 	struct hlist_nulls_head	dying;
 	struct ip_conntrack_stat *stat;
+	struct nf_ct_event_notifier *nf_conntrack_event_cb;
+	struct nf_exp_event_notifier *nf_expect_event_cb;
 	int			sysctl_events;
 	unsigned int		sysctl_events_retry_timeout;
 	int			sysctl_acct;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netns/generic.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netns/generic.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netns/generic.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netns/generic.h	2015-01-21 12:02:51.237013648 +0300
@@ -12,9 +12,11 @@
  * stuff on the struct net without explicit struct net modification
  *
  * The rules are simple:
- * 1. register the ops with register_pernet_gen_device to get the id
- *    of your private pointer;
- * 2. call net_assign_generic() to put the private data on the struct
+ * 1. Set pernet_operations->id.  After register_pernet_device you
+ *    will have the id of your private pointer.
+ * 2. Either set pernet_operations->size (to have the code allocate and
+ *    free a private structure pointed to from struct net) or
+ *    call net_assign_generic() to put the private data on the struct
  *    net (most preferably this should be done in the ->init callback
  *    of the ops registered);
  * 3. do not change this pointer while the net is alive;
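
A minimal sketch of rules 1-2 as updated above (struct foo_net, foo_net_id
and foo_net_ops are illustrative names; net_generic() is the accessor the
generic storage is assumed to expose):

	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	struct foo_net {
		int sequence;			/* per-namespace state */
	};

	static int foo_net_id;

	static struct pernet_operations foo_net_ops = {
		.id	= &foo_net_id,		  /* rule 1: filled in at register time */
		.size	= sizeof(struct foo_net), /* rule 2: core allocates/frees it */
	};

	static int __init foo_init(void)
	{
		return register_pernet_device(&foo_net_ops);
	}

	/* elsewhere: struct foo_net *fn = net_generic(net, foo_net_id); */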
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netns/ipv6.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netns/ipv6.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netns/ipv6.h	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netns/ipv6.h	2015-01-21 12:02:45.357169744 +0300
@@ -14,6 +14,7 @@ struct netns_sysctl_ipv6 {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header *table;
 	struct ctl_table_header *frags_hdr;
+	struct ctl_table_header *nf_frags_hdr;
 #endif
 	int bindv6only;
 	int flush_delay;
@@ -32,6 +33,7 @@ struct netns_ipv6 {
 	struct ipv6_devconf	*devconf_all;
 	struct ipv6_devconf	*devconf_dflt;
 	struct netns_frags	frags;
+	struct netns_frags	ct_frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*ip6table_filter;
 	struct xt_table		*ip6table_mangle;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/netns/xfrm.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netns/xfrm.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/netns/xfrm.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/netns/xfrm.h	2015-01-21 12:02:51.257013118 +0300
@@ -43,6 +43,7 @@ struct netns_xfrm {
 	struct work_struct	policy_hash_work;
 
 	struct sock		*nlsk;
+	struct sock		*nlsk_stash;
 
 	u32			sysctl_aevent_etime;
 	u32			sysctl_aevent_rseqth;
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/route.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/route.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/route.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/route.h	2015-01-21 12:02:45.357169744 +0300
@@ -141,6 +141,7 @@ static inline void ip_rt_put(struct rtab
 #define IPTOS_RT_MASK	(IPTOS_TOS_MASK & ~3)
 
 extern const __u8 ip_tos2prio[16];
+extern int ip_rt_src_check;
 
 static inline char rt_tos2priority(u8 tos)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/rtnetlink.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/rtnetlink.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/rtnetlink.h	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/rtnetlink.h	2015-01-21 12:02:51.144016116 +0300
@@ -64,7 +64,8 @@ struct rtnl_link_ops {
 	int			(*changelink)(struct net_device *dev,
 					      struct nlattr *tb[],
 					      struct nlattr *data[]);
-	void			(*dellink)(struct net_device *dev);
+	void			(*dellink)(struct net_device *dev,
+					   struct list_head *head);
 
 	size_t			(*get_size)(const struct net_device *dev);
 	int			(*fill_info)(struct sk_buff *skb,
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/sock.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/sock.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/sock.h	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/sock.h	2015-01-21 12:02:51.389009613 +0300
@@ -60,6 +60,8 @@
 #include <net/dst.h>
 #include <net/checksum.h>
 
+#include <bc/net.h>
+
 /*
  * This structure really needs to be cleaned up.
  * Most of it is for TCP, and not used by any of
@@ -308,6 +310,8 @@ struct sock {
   	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);  
 	void                    (*sk_destruct)(struct sock *sk);
+	struct sock_beancounter sk_bc;
+	struct ve_struct	*owner_env;
 };
 
 struct inet_cork_extended {
@@ -727,6 +731,8 @@ static inline void sock_rps_save_rxhash(
 	})
 
 extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
+extern int __sk_stream_wait_memory(struct sock *sk, long *timeo_p,
+				unsigned long amount);
 extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
 extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
 extern int sk_stream_error(struct sock *sk, int flags, int err);
@@ -1030,12 +1036,15 @@ static inline int sk_wmem_schedule(struc
 		__sk_mem_schedule(sk, size, SK_MEM_SEND);
 }
 
-static inline int sk_rmem_schedule(struct sock *sk, int size)
+static inline int sk_rmem_schedule(struct sock *sk,  struct sk_buff *skb)
 {
 	if (!sk_has_account(sk))
 		return 1;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_RECV);
+	if (!(skb->truesize <= sk->sk_forward_alloc ||
+	      __sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV)))
+		return 0;
+
+	return !ub_sockrcvbuf_charge(sk, skb);
 }
 
 static inline void sk_mem_reclaim(struct sock *sk)
@@ -1159,6 +1168,12 @@ extern struct sk_buff 		*sock_alloc_send
 						      unsigned long data_len,
 						      int noblock,
 						      int *errcode);
+extern struct sk_buff 		*sock_alloc_send_skb2(struct sock *sk,
+						     unsigned long head_len,
+						     unsigned long data_len,
+						     unsigned long min_size,
+						     int noblock,
+						     int *errcode);
 extern void *sock_kmalloc(struct sock *sk, int size,
 			  gfp_t priority);
 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
@@ -1547,7 +1562,8 @@ static inline void sock_poll_wait(struct
 
 static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 {
-	skb_orphan(skb);
+	WARN_ON(skb->destructor);
+	__skb_orphan(skb);
 	skb->sk = sk;
 	skb->destructor = sock_wfree;
 	/*
@@ -1560,7 +1576,8 @@ static inline void skb_set_owner_w(struc
 
 static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
-	skb_orphan(skb);
+	WARN_ON(skb->destructor);
+	__skb_orphan(skb);
 	skb->sk = sk;
 	skb->destructor = sock_rfree;
 	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
@@ -1775,6 +1792,13 @@ static inline void sk_change_net(struct 
 	sock_net_set(sk, hold_net(net));
 }
 
+static inline void sk_change_net_get(struct sock *sk, struct net *net)
+{
+	struct net *old_net = sock_net(sk);
+	sock_net_set(sk, get_net(net));
+	put_net(old_net);
+}
+
 static inline struct sock *skb_steal_sock(struct sk_buff *skb)
 {
 	if (unlikely(skb->sk)) {
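
sock_alloc_send_skb2() above extends sock_alloc_send_skb() with a min_size
argument; a plausible reading (assumed from the signature alone, not
confirmed by this hunk) is that under buffer-space pressure the allocator
may satisfy the request with anything down to min_size:

	#include <net/sock.h>

	static struct sk_buff *xmit_alloc(struct sock *sk, unsigned long size,
					  unsigned long min_size, int noblock,
					  int *err)
	{
		/* Assumed semantics: try for "size" bytes, accept as little
		 * as min_size when memory is tight. */
		return sock_alloc_send_skb2(sk, size, 0, min_size, noblock, err);
	}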
diff -upr linux-2.6.32-504.3.3.el6.orig/include/net/tcp.h linux-2.6.32-504.3.3.el6-042stab103_6/include/net/tcp.h
--- linux-2.6.32-504.3.3.el6.orig/include/net/tcp.h	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/net/tcp.h	2015-01-21 12:02:58.104831344 +0300
@@ -44,6 +44,13 @@
 #include <net/dst.h>
 
 #include <linux/seq_file.h>
+#include <bc/net.h>
+
+#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
+#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
+
+#define TW_WSCALE_MASK		0x0f
+#define TW_WSCALE_SPEC		0x10
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -237,7 +244,9 @@ extern int sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
+#ifndef sysctl_tcp_adv_win_scale
 extern int sysctl_tcp_adv_win_scale;
+#endif
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_frto_response;
@@ -257,6 +266,10 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern int sysctl_tcp_min_tso_segs;
+extern int sysctl_tcp_use_sg;
+extern int sysctl_tcp_max_tw_kmem_fraction;
+extern int sysctl_tcp_max_tw_buckets_ub;
+
 
 extern atomic_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -646,7 +659,11 @@ extern u32	__tcp_select_window(struct so
  * to use only the low 32-bits of jiffies and hide the ugly
  * casts with the following macro.
  */
+#ifdef CONFIG_VE
+#define tcp_time_stamp		((__u32)(jiffies + get_exec_env()->jiffies_fixup))
+#else
 #define tcp_time_stamp		((__u32)(jiffies))
+#endif
 
 /* This is what the send packet queuing engine uses to pass
  * TCP per-packet control information to the transmission
@@ -993,7 +1010,13 @@ static inline int tcp_prequeue(struct so
 
 		tp->ucopy.memory = 0;
 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
-		wake_up_interruptible_poll(sk->sk_sleep,
+		/*
+		 * It's unclear how sk->sk_sleep might be NULL here,
+		 * so rather than panicking, warn once. psbm-27040.
+		 * 	-- cyrillos
+		 */
+		WARN_ON_ONCE(!sk->sk_sleep);
+		wake_up_interruptible_sync_poll(sk->sk_sleep,
 					   POLLIN | POLLRDNORM | POLLRDBAND);
 		if (!inet_csk_ack_scheduled(sk))
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@ -1562,6 +1585,11 @@ struct tcp_request_sock_ops {
 #endif
 };
 
+#ifdef CONFIG_TCP_MD5SIG
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
+#endif
+
 extern void tcp_v4_init(void);
 extern void tcp_init(void);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/include/sound/core.h linux-2.6.32-504.3.3.el6-042stab103_6/include/sound/core.h
--- linux-2.6.32-504.3.3.el6.orig/include/sound/core.h	2014-12-12 23:29:20.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/sound/core.h	2015-01-21 12:02:41.798264233 +0300
@@ -29,6 +29,7 @@
 #include <linux/pm.h>			/* pm_message_t */
 #include <linux/device.h>
 #include <linux/stringify.h>
+#include <linux/sysfs.h>
 
 /* number of supported soundcards */
 #ifdef CONFIG_SND_DYNAMIC_MINORS
@@ -133,9 +134,7 @@ struct snd_card {
 	int free_on_last_close;		/* free in context of file_release */
 	wait_queue_head_t shutdown_sleep;
 	struct device *dev;		/* device assigned to this card */
-#ifndef CONFIG_SYSFS_DEPRECATED
 	struct device *card_dev;	/* cardX object for sysfs */
-#endif
 
 #ifdef CONFIG_PM
 	unsigned int power_state;	/* power state */
@@ -199,11 +198,10 @@ struct snd_minor {
 /* return a device pointer linked to each sound device as a parent */
 static inline struct device *snd_card_get_device_link(struct snd_card *card)
 {
-#ifdef CONFIG_SYSFS_DEPRECATED
-	return card ? card->dev : NULL;
-#else
-	return card ? card->card_dev : NULL;
-#endif
+	if (sysfs_deprecated)
+		return card ? card->dev : NULL;
+	else
+		return card ? card->card_dev : NULL;
 }
 
 /* sound.c */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/trace/events/ext4.h linux-2.6.32-504.3.3.el6-042stab103_6/include/trace/events/ext4.h
--- linux-2.6.32-504.3.3.el6.orig/include/trace/events/ext4.h	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/trace/events/ext4.h	2015-01-21 12:02:52.555978634 +0300
@@ -639,6 +639,61 @@ TRACE_EVENT(ext4_sync_file,
 		  (unsigned long) __entry->parent, __entry->datasync)
 );
 
+TRACE_EVENT(ext4_sync_files_iterate,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int datasync),
+
+	TP_ARGS(dentry, tid, datasync),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	parent			)
+		__field(	int,	datasync		)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->datasync	= datasync;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+	),
+
+	TP_printk("dev %s ino %ld parent %ld datasync %d tid %u",
+		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->datasync,
+		  __entry->tid)
+);
+
+TRACE_EVENT(ext4_sync_files_exit,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int barrier),
+
+	TP_ARGS(dentry, tid, barrier),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	parent			)
+		__field(	int,	barrier			)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+		__entry->barrier	= barrier;
+	),
+
+	TP_printk("dev %s ino %ld parent %ld explicit_barrier %d tid %u",
+		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->barrier,
+		  __entry->tid)
+);
+
+
 TRACE_EVENT(ext4_sync_fs,
 	TP_PROTO(struct super_block *sb, int wait),
 
@@ -924,6 +979,59 @@ TRACE_EVENT(ext4_update_reserve_space,
 		 __entry->allocated, __entry->reserved)
 );
 
+DECLARE_EVENT_CLASS(ext4__data_csum,
+
+	TP_PROTO(struct inode *inode, loff_t pos),
+
+	TP_ARGS(inode, pos),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	loff_t,	end			)
+		__field(	loff_t,	pos			)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->end	= EXT4_I(inode)->i_data_csum_end;
+		__entry->pos	= pos;
+	),
+
+	TP_printk("dev %d,%d ino %lu end %lld pos %lld",
+		MAJOR(__entry->dev), MINOR(__entry->dev),
+		(unsigned long) __entry->ino, __entry->end, __entry->pos)
+);
+
+DEFINE_EVENT(ext4__data_csum, ext4_start_data_csum,
+
+	TP_PROTO(struct inode *inode, loff_t pos),
+
+	TP_ARGS(inode, pos)
+);
+
+DEFINE_EVENT(ext4__data_csum, ext4_update_data_csum,
+
+	TP_PROTO(struct inode *inode, loff_t pos),
+
+	TP_ARGS(inode, pos)
+);
+
+DEFINE_EVENT(ext4__data_csum, ext4_save_data_csum,
+
+	TP_PROTO(struct inode *inode, loff_t pos),
+
+	TP_ARGS(inode, pos)
+);
+
+DEFINE_EVENT(ext4__data_csum, ext4_truncate_data_csum,
+
+	TP_PROTO(struct inode *inode, loff_t pos),
+
+	TP_ARGS(inode, pos)
+);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
diff -upr linux-2.6.32-504.3.3.el6.orig/include/trace/events/kmem.h linux-2.6.32-504.3.3.el6-042stab103_6/include/trace/events/kmem.h
--- linux-2.6.32-504.3.3.el6.orig/include/trace/events/kmem.h	2014-12-12 23:29:22.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/trace/events/kmem.h	2015-01-21 12:02:43.394221859 +0300
@@ -683,26 +683,103 @@ TRACE_EVENT(mm_directreclaim_reclaimzone
 	TP_printk("node = %d reclaimed=%ld, priority=%ld",
 			__entry->node, __entry->reclaimed, __entry->priority)
 	);
+
 TRACE_EVENT(mm_pagereclaim_shrinkzone,
 
-	TP_PROTO(unsigned long reclaimed, unsigned long priority),
+	TP_PROTO(int nid, int zid, int priority, unsigned long reclaimed),
 
-	TP_ARGS(reclaimed, priority),
+	TP_ARGS(nid, zid, priority, reclaimed),
 
 	TP_STRUCT__entry(
+		__field(int, nid)
+		__field(int, zid)
+		__field(int, priority)
 		__field(unsigned long, reclaimed)
-		__field(unsigned long, priority)
 	),
 
 	TP_fast_assign(
-		__entry->reclaimed = reclaimed;
+		__entry->nid = nid;
+		__entry->zid = zid;
 		__entry->priority = priority;
+		__entry->reclaimed = reclaimed;
 	),
 
-	TP_printk("reclaimed=%ld priority=%ld",
-			__entry->reclaimed, __entry->priority)
+	TP_printk("nid=%d zid=%d priority=%d reclaimed=%ld",
+			__entry->nid, __entry->zid,
+			__entry->priority, __entry->reclaimed)
 	);
 
+TRACE_EVENT(mm_pagereclaim_shrinkgang,
+
+	TP_PROTO(int nid, int zid, int gid, int type,
+		 int priority, unsigned long reclaimed),
+
+	TP_ARGS(nid, zid, gid, type, priority, reclaimed),
+
+	TP_STRUCT__entry(
+		__field(int, nid)
+		__field(int, zid)
+		__field(int, gid)
+		__field(int, type)
+		__field(int, priority)
+		__field(unsigned long, reclaimed)
+	),
+
+	TP_fast_assign(
+		__entry->nid = nid;
+		__entry->zid = zid;
+		__entry->gid = gid;
+		__entry->type = type;
+		__entry->priority = priority;
+		__entry->reclaimed = reclaimed;
+	),
+
+	TP_printk("nid=%d zid=%d gid=%u type=%d priority=%d reclaimed=%ld",
+			__entry->nid, __entry->zid, __entry->gid, __entry->type,
+			__entry->priority, __entry->reclaimed)
+	);
+
+TRACE_EVENT(mm_pagereclaim_reschedule,
+
+       TP_PROTO(int nid, int zid, int gid, int type,
+		unsigned long usage, unsigned long shadow,
+		unsigned long limit, unsigned long age,
+		unsigned long max_age, int priority),
+
+       TP_ARGS(nid, zid, gid, type, usage, shadow, limit, age, max_age, priority),
+
+       TP_STRUCT__entry(
+	       __field(int, nid)
+	       __field(int, zid)
+	       __field(int, gid)
+	       __field(int, type)
+	       __field(unsigned long, usage)
+	       __field(unsigned long, shadow)
+	       __field(unsigned long, limit)
+	       __field(unsigned long, age)
+	       __field(unsigned long, max_age)
+	       __field(int, priority)
+       ),
+
+       TP_fast_assign(
+	       __entry->nid = nid;
+	       __entry->zid = zid;
+	       __entry->gid = gid;
+	       __entry->type = type;
+	       __entry->usage = usage;
+	       __entry->shadow = shadow;
+	       __entry->limit = limit;
+	       __entry->age = age;
+	       __entry->max_age = max_age;
+	       __entry->priority = priority;
+       ),
+       TP_printk("nid=%d zid=%d gid=%u type=%d usage=%ld "
+		 "shadow=%ld limit=%ld age=%ld max_age=%ld priority=%d",
+		 __entry->nid, __entry->zid, __entry->gid, __entry->type,
+		 __entry->usage, __entry->shadow, __entry->limit,
+		 __entry->age, __entry->max_age, __entry->priority)
+       );
+
 TRACE_EVENT(mm_pagereclaim_shrinkactive,
 
 	TP_PROTO(unsigned long scanned, int file, int priority),
diff -upr linux-2.6.32-504.3.3.el6.orig/include/trace/ftrace.h linux-2.6.32-504.3.3.el6-042stab103_6/include/trace/ftrace.h
--- linux-2.6.32-504.3.3.el6.orig/include/trace/ftrace.h	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/include/trace/ftrace.h	2015-01-21 12:02:58.401823460 +0300
@@ -65,7 +65,8 @@
 	};
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)	\
-	static struct ftrace_event_call event_##name
+	static struct ftrace_event_call			\
+	__attribute__((__aligned__(4))) event_##name
 
 #undef DEFINE_EVENT_FN
 #define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)	\
@@ -804,6 +805,9 @@ __attribute__((section("_ftrace_events_p
 #undef __perf_count
 #define __perf_count(c) __count = (c)
 
+#undef TP_perf_assign
+#define TP_perf_assign(args...) args
+
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace void							\
diff -upr linux-2.6.32-504.3.3.el6.orig/init/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/init/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/init/Kconfig	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/init/Kconfig	2015-01-21 12:02:53.982940756 +0300
@@ -251,6 +251,18 @@ config BSD_PROCESS_ACCT_V3
 	  for processing it. A preliminary version of these tools is available
 	  at <http://www.gnu.org/software/acct/>.
 
+config FHANDLE
+	bool "open by fhandle syscalls"
+	select EXPORTFS
+	help
+	  If you say Y here, a user level program will be able to map
+	  file names to handle and then later use the handle for
+	  different file system operations. This is useful in implementing
+	  userspace file servers, which now track files using handles instead
+	  of names. The handle would remain the same even if file names
+	  get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
+	  syscalls.
+
 config TASKSTATS
 	bool "Export task/process statistics through netlink (EXPERIMENTAL)"
 	depends on NET
@@ -286,7 +298,7 @@ config TASK_XACCT
 
 config TASK_IO_ACCOUNTING
 	bool "Enable per-task storage I/O accounting (EXPERIMENTAL)"
-	depends on TASK_XACCT
+	depends on TASK_XACCT && BEANCOUNTERS
 	help
 	  Collect information on the number of bytes of storage I/O which this
 	  task has caused.
@@ -470,6 +482,10 @@ config CFS_BANDWIDTH
 	  restriction.
 	  See tip/Documentation/scheduler/sched-bwc.txt for more information.
 
+config CFS_CPULIMIT
+	bool
+	depends on CFS_BANDWIDTH
+
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"
 	depends on EXPERIMENTAL
@@ -506,6 +522,19 @@ config CGROUP_SCHED
 
 endchoice
 
+config VZ_FAIRSCHED
+	bool "OpenVZ fairsched compat"
+	select CPUSETS
+	select CGROUP_CPUACCT
+	select CGROUP_SCHED
+	select FAIR_GROUP_SCHED
+	select CFS_BANDWIDTH
+	select CFS_CPULIMIT
+	default y
+	help
+	  This option adds task CPU cgroup control with an OpenVZ-compatible
+	  syscall and procfs interface.
+
 menuconfig CGROUPS
 	boolean "Control Group support"
 	depends on EVENTFD
@@ -535,7 +564,7 @@ config CGROUP_DEBUG
 
 config CGROUP_NS
 	bool "Namespace cgroup subsystem"
-	depends on CGROUPS
+	depends on CGROUPS && !VE
 	help
 	  Provides a simple namespace cgroup subsystem to
 	  provide hierarchical naming of sets of namespaces,
@@ -581,6 +610,7 @@ config CGROUP_CPUACCT
 
 config RESOURCE_COUNTERS
 	bool "Resource counters"
+	depends on !BEANCOUNTERS
 	help
 	  This option enables controller independent resource accounting
 	  infrastructure that works with cgroups.
@@ -721,6 +751,16 @@ config SYSFS_DEPRECATED_V2
 	  if the original kernel, that came with your distribution, has
 	  this option set to N.
 
+config SYSFS_DEPRECATED_DYN
+	bool "make deprecated sysfs layout dynamically"
+	depends on SYSFS
+	default y
+	select SYSFS_DEPRECATED_V2
+	help
+	  This option works like the DEPRECATED_V2 but allows selecting the
+	  sysfs layout dynamically, i.e. on boot. To select the old
+	  (deprecated) layout, supply old_sysfs kernel boot parameter.
+
 config RELAY
 	bool "Kernel->user space relay support (formerly relayfs)"
 	help
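
The FHANDLE option added earlier in this hunk can be made concrete with a
small userspace sketch (error handling elided; name_to_handle_at(2) and
open_by_handle_at(2) are the syscalls the option enables, and mount_fd is
a descriptor for the containing mount):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>

	/* Resolve a name to a handle now; reopen it later even if the
	 * file has been renamed in the meantime. */
	static int reopen_by_handle(int mount_fd, const char *path)
	{
		struct file_handle *fh;
		int mnt_id, fd;

		fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
		fh->handle_bytes = MAX_HANDLE_SZ;
		if (name_to_handle_at(AT_FDCWD, path, fh, &mnt_id, 0) < 0) {
			free(fh);
			return -1;
		}

		fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
		free(fh);
		return fd;
	}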
diff -upr linux-2.6.32-504.3.3.el6.orig/init/main.c linux-2.6.32-504.3.3.el6-042stab103_6/init/main.c
--- linux-2.6.32-504.3.3.el6.orig/init/main.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/init/main.c	2015-01-21 12:02:58.451822134 +0300
@@ -70,8 +70,12 @@
 #include <linux/sfi.h>
 #include <linux/shmem_fs.h>
 #include <linux/perf_event.h>
+#include <linux/pram.h>
+#include <linux/kexec.h>
 #include <trace/boot.h>
 
+#include <bc/beancounter.h>
+
 #include <asm/io.h>
 #include <asm/bugs.h>
 #include <asm/setup.h>
@@ -102,6 +106,14 @@ extern void tc_init(void);
 enum system_states system_state __read_mostly;
 EXPORT_SYMBOL(system_state);
 
+#ifdef CONFIG_VE
+extern void init_ve_system(void);
+extern void init_ve0(void);
+#else
+#define init_ve_system()		do { } while (0)
+#define init_ve0()			do { } while (0)
+#endif
+
 /*
  * Boot command-line arguments
  */
@@ -601,6 +613,9 @@ asmlinkage void __init start_kernel(void
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
+	init_ve0();
+	ub_init_early();
+	kstat_init();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
 	build_all_zonelists(NULL);
@@ -621,6 +636,8 @@ asmlinkage void __init start_kernel(void
 	sort_main_extable();
 	trap_init();
 	mm_init();
+	pram_init();
+	kexec_crash_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -693,8 +710,8 @@ asmlinkage void __init start_kernel(void
 	page_cgroup_init();
 	enable_debug_pagealloc();
 	kmemtrace_init();
-	kmemleak_init();
 	debug_objects_mem_init();
+	kmemleak_init();
 	setup_per_cpu_pageset();
 	numa_policy_init();
 	if (late_time_init)
@@ -715,6 +732,7 @@ asmlinkage void __init start_kernel(void
 	cred_init();
 	fork_init(totalram_pages);
 	proc_caches_init();
+	ub_init_late();
 	buffer_init();
 	key_init();
 	radix_tree_init();
@@ -828,6 +846,7 @@ static void __init do_initcalls(void)
  */
 static void __init do_basic_setup(void)
 {
+	init_ve_system();
 	init_workqueues();
 	cgroup_wq_init();
 	cpuset_init_smp();
diff -upr linux-2.6.32-504.3.3.el6.orig/init/version.c linux-2.6.32-504.3.3.el6-042stab103_6/init/version.c
--- linux-2.6.32-504.3.3.el6.orig/init/version.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/init/version.c	2015-01-21 12:02:43.606216232 +0300
@@ -38,6 +38,12 @@ struct uts_namespace init_uts_ns = {
 };
 EXPORT_SYMBOL_GPL(init_uts_ns);
 
+struct new_utsname virt_utsname = {
+	/* we need only this field */
+	.release        = UTS_RELEASE,
+};
+EXPORT_SYMBOL(virt_utsname);
+
 /* FIXED STRINGS! Don't touch! */
 const char linux_banner[] =
 	"Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"
diff -upr linux-2.6.32-504.3.3.el6.orig/ipc/ipc_sysctl.c linux-2.6.32-504.3.3.el6-042stab103_6/ipc/ipc_sysctl.c
--- linux-2.6.32-504.3.3.el6.orig/ipc/ipc_sysctl.c	2014-12-12 23:29:12.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/ipc/ipc_sysctl.c	2015-01-21 12:02:44.037204788 +0300
@@ -306,19 +306,14 @@ static struct ctl_table ipc_kern_table[]
 	{}
 };
 
-static struct ctl_table ipc_root_table[] = {
-	{
-		.ctl_name	= CTL_KERN,
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= ipc_kern_table,
-	},
+static struct ctl_path ipc_path[] = {
+	{ .ctl_name = CTL_KERN, .procname = "kernel", },
 	{}
 };
 
 static int __init ipc_sysctl_init(void)
 {
-	register_sysctl_table(ipc_root_table);
+	register_sysctl_glob_paths(ipc_path, ipc_kern_table, 1);
 	return 0;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/ipc/mqueue.c linux-2.6.32-504.3.3.el6-042stab103_6/ipc/mqueue.c
--- linux-2.6.32-504.3.3.el6.orig/ipc/mqueue.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/ipc/mqueue.c	2015-01-21 12:02:44.739186151 +0300
@@ -730,7 +730,6 @@ static struct file *do_create(struct ipc
 			struct mq_attr *attr)
 {
 	const struct cred *cred = current_cred();
-	struct file *result;
 	int ret;
 
 	if (attr) {
@@ -752,24 +751,11 @@ static struct file *do_create(struct ipc
 	}
 
 	mode &= ~current_umask();
-	ret = mnt_want_write(ipc_ns->mq_mnt);
-	if (ret)
-		goto out;
 	ret = vfs_create(dir->d_inode, dentry, mode, NULL);
 	dentry->d_fsdata = NULL;
 	if (ret)
-		goto out_drop_write;
-
-	result = dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);
-	/*
-	 * dentry_open() took a persistent mnt_want_write(),
-	 * so we can now drop this one.
-	 */
-	mnt_drop_write(ipc_ns->mq_mnt);
-	return result;
-
-out_drop_write:
-	mnt_drop_write(ipc_ns->mq_mnt);
+		goto out;
+	return dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);
 out:
 	dput(dentry);
 	mntput(ipc_ns->mq_mnt);
@@ -809,6 +795,9 @@ SYSCALL_DEFINE4(mq_open, const char __us
 	struct mq_attr attr;
 	int fd, error;
 	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	struct vfsmount *mnt = ipc_ns->mq_mnt;
+	struct dentry *root = mnt->mnt_root;
+	int ro;
 
 	if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
 		return -EFAULT;
@@ -822,14 +811,14 @@ SYSCALL_DEFINE4(mq_open, const char __us
 	if (fd < 0)
 		goto out_putname;
 
-	mutex_lock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
-	dentry = lookup_one_len(name->name, ipc_ns->mq_mnt->mnt_root,
-				strlen(name->name));
+	ro = mnt_want_write(mnt);	/* we'll drop it in any case */
+	mutex_lock(&root->d_inode->i_mutex);
+	dentry = lookup_one_len(name->name, root, strlen(name->name));
 	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
 		goto out_putfd;
 	}
-	mntget(ipc_ns->mq_mnt);
+	mntget(mnt);
 
 	if (oflag & O_CREAT) {
 		if (dentry->d_inode) {	/* entry already exists */
@@ -841,7 +830,11 @@ SYSCALL_DEFINE4(mq_open, const char __us
 			filp = do_open(ipc_ns, dentry, oflag);
 		} else {
 			audit_inode_parent_hidden(name, ipc_ns->mq_mnt->mnt_root);
-			filp = do_create(ipc_ns, ipc_ns->mq_mnt->mnt_root,
+			if (ro) {
+				error = ro;
+				goto out;
+			}
+			filp = do_create(ipc_ns, root,
 						dentry, oflag, mode,
 						u_attr ? &attr : NULL);
 		}
@@ -864,12 +857,14 @@ SYSCALL_DEFINE4(mq_open, const char __us
 
 out:
 	dput(dentry);
-	mntput(ipc_ns->mq_mnt);
+	mntput(mnt);
 out_putfd:
 	put_unused_fd(fd);
 	fd = error;
 out_upsem:
-	mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
+	mutex_unlock(&root->d_inode->i_mutex);
+	if (!ro)
+		mnt_drop_write(mnt);
 out_putname:
 	putname(name);
 	return fd;
@@ -882,42 +877,39 @@ SYSCALL_DEFINE1(mq_unlink, const char __
 	struct dentry *dentry;
 	struct inode *inode = NULL;
 	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	struct vfsmount *mnt = ipc_ns->mq_mnt;
 
 	name = getname(u_name);
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 
 	audit_inode_parent_hidden(name, ipc_ns->mq_mnt->mnt_root);
-	mutex_lock_nested(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex,
-			I_MUTEX_PARENT);
-	dentry = lookup_one_len(name->name, ipc_ns->mq_mnt->mnt_root,
-				strlen(name->name));
+	err = mnt_want_write(mnt);
+	if (err)
+		goto out_name;
+	mutex_lock_nested(&mnt->mnt_root->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name->name, mnt->mnt_root, strlen(name->name));
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
 		goto out_unlock;
 	}
 
-	if (!dentry->d_inode) {
-		err = -ENOENT;
-		goto out_err;
-	}
-
 	inode = dentry->d_inode;
-	if (inode)
+	if (!inode) {
+		err = -ENOENT;
+	} else {
 		atomic_inc(&inode->i_count);
-	err = mnt_want_write(ipc_ns->mq_mnt);
-	if (err)
-		goto out_err;
-	err = vfs_unlink(dentry->d_parent->d_inode, dentry);
-	mnt_drop_write(ipc_ns->mq_mnt);
-out_err:
+		err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+	}
 	dput(dentry);
 
 out_unlock:
-	mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
-	putname(name);
+	mutex_unlock(&mnt->mnt_root->d_inode->i_mutex);
 	if (inode)
 		iput(inode);
+	mnt_drop_write(mnt);
+out_name:
+	putname(name);
 
 	return err;
 }
@@ -1425,6 +1417,7 @@ static struct file_system_type mqueue_fs
 	.name = "mqueue",
 	.get_sb = mqueue_get_sb,
 	.kill_sb = kill_litter_super,
+	.fs_flags = FS_VIRTUALIZED,
 };
 
 int mq_init_ns(struct ipc_namespace *ns)
diff -upr linux-2.6.32-504.3.3.el6.orig/ipc/msg.c linux-2.6.32-504.3.3.el6-042stab103_6/ipc/msg.c
--- linux-2.6.32-504.3.3.el6.orig/ipc/msg.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/ipc/msg.c	2015-01-21 12:02:47.940101171 +0300
@@ -193,6 +193,7 @@ static int newque(struct ipc_namespace *
 	int id, retval;
 	key_t key = params->key;
 	int msgflg = params->flg;
+	int msqid = params->id;
 
 	msq = ipc_rcu_alloc(sizeof(*msq));
 	if (!msq)
@@ -211,7 +212,7 @@ static int newque(struct ipc_namespace *
 	/*
 	 * ipc_addid() locks msq
 	 */
-	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
+	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, msqid);
 	if (id < 0) {
 		ipc_rcu_putref(msq, ipc_rcu_free);
 		return id;
@@ -331,6 +332,7 @@ SYSCALL_DEFINE2(msgget, key_t, key, int,
 
 	msg_params.key = key;
 	msg_params.flg = msgflg;
+	msg_params.id = -1;
 
 	return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
 }
@@ -701,7 +703,8 @@ long do_msgsnd(int msqid, long mtype, vo
 
 		ipc_lock_by_ptr(&msq->q_perm);
 		ipc_rcu_putref(msq, ipc_rcu_free);
-		if (msq->q_perm.deleted) {
+		/* raced with RMID? */
+		if (!ipc_valid_object(&msq->q_perm)) {
 			err = -EIDRM;
 			goto out_unlock_free;
 		}
@@ -889,6 +892,12 @@ long do_msgrcv(int msqid, long *pmtype, 
 		ipc_lock_by_ptr(&msq->q_perm);
 		rcu_read_unlock();
 
+		/* raced with RMID? */
+		if (!ipc_valid_object(&msq->q_perm)) {
+			msg = ERR_PTR(-EIDRM);
+			goto out_unlock;
+		}
+
 		/* Lockless receive, part 4:
 		 * Repeat test after acquiring the spinlock.
 		 */
@@ -955,3 +964,55 @@ static int sysvipc_msg_proc_show(struct 
 			msq->q_ctime);
 }
 #endif
+
+#ifdef CONFIG_VE
+#include <linux/module.h>
+
+int sysvipc_setup_msg(key_t key, int msqid, int msgflg)
+{
+	struct ipc_namespace *ns;
+	struct ipc_ops msg_ops;
+	struct ipc_params msg_params;
+
+	ns = current->nsproxy->ipc_ns;
+
+	msg_ops.getnew = newque;
+	msg_ops.associate = msg_security;
+	msg_ops.more_checks = NULL;
+
+	msg_params.key = key;
+	msg_params.flg = msgflg | IPC_CREAT;
+	msg_params.id = msqid;
+
+	return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
+}
+EXPORT_SYMBOL_GPL(sysvipc_setup_msg);
+
+int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg)
+{
+	int err = 0;
+	struct msg_queue * msq;
+	struct ipc_namespace *ns;
+	int next_id;
+	int total, in_use;
+
+	ns = current->nsproxy->ipc_ns;
+
+	down_write(&msg_ids(ns).rw_mutex);
+	in_use = msg_ids(ns).in_use;
+	for (total = 0, next_id = 0; total < in_use; next_id++) {
+		msq = idr_find(&msg_ids(ns).ipcs_idr, next_id);
+		if (msq == NULL)
+			continue;
+		ipc_lock_by_ptr(&msq->q_perm);
+		err = func(ipc_buildid(next_id, msq->q_perm.seq), msq, arg);
+		msg_unlock(msq);
+		if (err)
+			break;
+		total++;
+	}
+	up_write(&msg_ids(ns).rw_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(sysvipc_walk_msg);
+#endif
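
A hedged sketch of how a checkpointing module might consume the walker
exported above (count_msq is an illustrative callback; the id argument is
the rebuilt msqid):

	static int count_msq(int id, struct msg_queue *msq, void *arg)
	{
		(*(int *)arg)++;	/* the queue is locked while we look */
		return 0;		/* non-zero would abort the walk */
	}

	static int count_message_queues(void)
	{
		int n = 0;

		sysvipc_walk_msg(count_msq, &n);
		return n;
	}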
diff -upr linux-2.6.32-504.3.3.el6.orig/ipc/msgutil.c linux-2.6.32-504.3.3.el6-042stab103_6/ipc/msgutil.c
--- linux-2.6.32-504.3.3.el6.orig/ipc/msgutil.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/ipc/msgutil.c	2015-01-21 12:02:47.940101171 +0300
@@ -8,6 +8,7 @@
  * See the file COPYING for more details.
  */
 
+#include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/init.h>
 #include <linux/security.h>
@@ -19,6 +20,8 @@
 
 #include "util.h"
 
+#include <bc/kmem.h>
+
 DEFINE_SPINLOCK(mq_lock);
 
 /*
@@ -48,52 +51,53 @@ struct msg_msgseg {
 #define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
 #define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
 
-struct msg_msg *load_msg(const void __user *src, int len)
+struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset,
+					  void * data), int len, void * data)
 {
 	struct msg_msg *msg;
 	struct msg_msgseg **pseg;
 	int err;
 	int alen;
+	int offset = 0;
 
 	alen = len;
 	if (alen > DATALEN_MSG)
 		alen = DATALEN_MSG;
 
-	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
+	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_UBC);
 	if (msg == NULL)
 		return ERR_PTR(-ENOMEM);
 
 	msg->next = NULL;
 	msg->security = NULL;
 
-	if (copy_from_user(msg + 1, src, alen)) {
+	if (load(msg + 1, alen, offset, data)) {
 		err = -EFAULT;
 		goto out_err;
 	}
 
 	len -= alen;
-	src = ((char __user *)src) + alen;
+	offset += alen;
 	pseg = &msg->next;
 	while (len > 0) {
 		struct msg_msgseg *seg;
 		alen = len;
 		if (alen > DATALEN_SEG)
 			alen = DATALEN_SEG;
-		seg = kmalloc(sizeof(*seg) + alen,
-						 GFP_KERNEL);
+		seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_UBC);
 		if (seg == NULL) {
 			err = -ENOMEM;
 			goto out_err;
 		}
 		*pseg = seg;
 		seg->next = NULL;
-		if (copy_from_user(seg + 1, src, alen)) {
+		if (load(seg + 1, alen, offset, data)) {
 			err = -EFAULT;
 			goto out_err;
 		}
 		pseg = &seg->next;
 		len -= alen;
-		src = ((char __user *)src) + alen;
+		offset += alen;
 	}
 
 	err = security_msg_msg_alloc(msg);
@@ -106,33 +110,58 @@ out_err:
 	free_msg(msg);
 	return ERR_PTR(err);
 }
+EXPORT_SYMBOL_GPL(sysv_msg_load);
 
-int store_msg(void __user *dest, struct msg_msg *msg, int len)
+static int do_load_msg(void * dst, int len, int offset, void * data)
+{
+	return copy_from_user(dst, data + offset, len);
+}
+
+struct msg_msg *load_msg(const void __user *src, int len)
+{
+	return sysv_msg_load(do_load_msg, len, (void*)src);
+}
+
+int sysv_msg_store(struct msg_msg *msg,
+		   int (*store)(void * src, int len, int offset, void * data),
+		   int len, void * data)
 {
 	int alen;
+	int offset = 0;
 	struct msg_msgseg *seg;
-
+	
 	alen = len;
 	if (alen > DATALEN_MSG)
 		alen = DATALEN_MSG;
-	if (copy_to_user(dest, msg + 1, alen))
+	if (store(msg + 1, alen, offset, data))
 		return -1;
 
 	len -= alen;
-	dest = ((char __user *)dest) + alen;
+	offset += alen;
 	seg = msg->next;
 	while (len > 0) {
 		alen = len;
 		if (alen > DATALEN_SEG)
 			alen = DATALEN_SEG;
-		if (copy_to_user(dest, seg + 1, alen))
+		if (store(seg + 1, alen, offset, data))
 			return -1;
 		len -= alen;
-		dest = ((char __user *)dest) + alen;
+		offset += alen;
 		seg = seg->next;
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(sysv_msg_store);
+
+static int do_store_msg(void * src, int len, int offset, void * data)
+{
+	return copy_to_user(data + offset, src, len);
+}
+
+int store_msg(void __user *dest, struct msg_msg *msg, int len)
+{
+	return sysv_msg_store(msg, do_store_msg, len, dest);
+}
 
 void free_msg(struct msg_msg *msg)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/ipc/sem.c linux-2.6.32-504.3.3.el6-042stab103_6/ipc/sem.c
--- linux-2.6.32-504.3.3.el6.orig/ipc/sem.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/ipc/sem.c	2015-01-21 12:02:58.125830786 +0300
@@ -90,13 +90,7 @@
 #include <asm/uaccess.h>
 #include "util.h"
 
-/* One semaphore structure for each semaphore in the system. */
-struct sem {
-	int	semval;		/* current value */
-	int	sempid;		/* pid of last operation */
-	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
-	struct list_head sem_pending; /* pending single-sop operations */
-};
+#include <bc/kmem.h>
 
 /* One queue for each sleeping process in the system. */
 struct sem_queue {
@@ -110,31 +104,6 @@ struct sem_queue {
 	int			alter;	 /* does *sops alter the array? */
 };
 
-/* Each task has a list of undo requests. They are executed automatically
- * when the process exits.
- */
-struct sem_undo {
-	struct list_head	list_proc;	/* per-process list: *
-						 * all undos from one process
-						 * rcu protected */
-	struct rcu_head		rcu;		/* rcu struct for sem_undo */
-	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
-	struct list_head	list_id;	/* per semaphore array list:
-						 * all undos for one array */
-	int			semid;		/* semaphore set identifier */
-	short			*semadj;	/* array of adjustments */
-						/* one per semaphore */
-};
-
-/* sem_undo_list controls shared access to the list of sem_undo structures
- * that may be shared among all a CLONE_SYSVSEM task group.
- */
-struct sem_undo_list {
-	atomic_t		refcnt;
-	spinlock_t		lock;
-	struct list_head	list_proc;
-};
-
 
 #define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])
 
@@ -421,6 +390,7 @@ static int newary(struct ipc_namespace *
 	key_t key = params->key;
 	int nsems = params->u.nsems;
 	int semflg = params->flg;
+	int semid = params->id;
 	int i;
 
 	if (!nsems)
@@ -445,7 +415,7 @@ static int newary(struct ipc_namespace *
 		return retval;
 	}
 
-	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
+	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, semid);
 	if (id < 0) {
 		ipc_rcu_putref(sma, sem_rcu_free);
 		return id;
@@ -515,6 +485,7 @@ SYSCALL_DEFINE3(semget, key_t, key, int,
 	sem_params.key = key;
 	sem_params.flg = semflg;
 	sem_params.u.nsems = nsems;
+	sem_params.id = -1;
 
 	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
 }
@@ -1416,7 +1387,7 @@ static inline int get_undo_list(struct s
 
 	undo_list = current->sysvsem.undo_list;
 	if (!undo_list) {
-		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
+		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_UBC);
 		if (undo_list == NULL)
 			return -ENOMEM;
 		spin_lock_init(&undo_list->lock);
@@ -1500,7 +1471,8 @@ static struct sem_undo *find_alloc_undo(
 	rcu_read_unlock();
 
 	/* step 2: allocate new undo structure */
-	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
+	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
+			GFP_KERNEL_UBC);
 	if (!new) {
 		ipc_rcu_putref(sma, ipc_rcu_free);
 		return ERR_PTR(-ENOMEM);
@@ -1591,7 +1563,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, 
 	if (nsops > ns->sc_semopm)
 		return -E2BIG;
 	if(nsops > SEMOPM_FAST) {
-		sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
+		sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL_UBC);
 		if(sops==NULL)
 			return -ENOMEM;
 	}
@@ -1933,3 +1905,58 @@ static int sysvipc_sem_proc_show(struct 
 			  sma->sem_ctime);
 }
 #endif
+
+#ifdef CONFIG_VE
+#include <linux/module.h>
+
+int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg)
+{
+	struct ipc_namespace *ns;
+	struct ipc_ops sem_ops;
+	struct ipc_params sem_params;
+
+	ns = current->nsproxy->ipc_ns;
+
+	sem_ops.getnew = newary;
+	sem_ops.associate = sem_security;
+	sem_ops.more_checks = sem_more_checks;
+
+	sem_params.key = key;
+	sem_params.flg = semflg | IPC_CREAT;
+	sem_params.u.nsems = size;
+	sem_params.id = semid;
+
+	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
+}
+EXPORT_SYMBOL_GPL(sysvipc_setup_sem);
+
+int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg)
+{
+	int err = 0;
+	struct sem_array *sma;
+	struct ipc_namespace *ns;
+	int next_id;
+	int total, in_use;
+
+	ns = current->nsproxy->ipc_ns;
+
+	down_write(&sem_ids(ns).rw_mutex);
+	in_use = sem_ids(ns).in_use;
+	for (total = 0, next_id = 0; total < in_use; next_id++) {
+		sma = idr_find(&sem_ids(ns).ipcs_idr, next_id);
+		if (sma == NULL)
+			continue;
+		ipc_lock_by_ptr(&sma->sem_perm);
+		err = func(ipc_buildid(next_id, sma->sem_perm.seq), sma, arg);
+		sem_unlock(sma, -1);
+		rcu_read_unlock();
+		if (err)
+			break;
+		total++;
+	}
+	up_write(&sem_ids(ns).rw_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(sysvipc_walk_sem);
+EXPORT_SYMBOL_GPL(exit_sem);
+#endif
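sysvipc_walk_sem() gives the checkpoint/restore modules a locked walk over every semaphore set in the caller's IPC namespace. A minimal sketch of a callback, using hypothetical names (sem_counts, count_sema) that are not part of the patch:

	/* Illustrative walker: tally semaphore sets and semaphores. */
	struct sem_counts {
		int sets;
		int sems;
	};

	static int count_sema(int id, struct sem_array *sma, void *arg)
	{
		struct sem_counts *c = arg;

		c->sets++;
		c->sems += sma->sem_nsems;
		return 0;		/* a non-zero return aborts the walk */
	}

	/* usage: struct sem_counts c = {}; sysvipc_walk_sem(count_sema, &c); */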
diff -upr linux-2.6.32-504.3.3.el6.orig/ipc/shm.c linux-2.6.32-504.3.3.el6-042stab103_6/ipc/shm.c
--- linux-2.6.32-504.3.3.el6.orig/ipc/shm.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/ipc/shm.c	2015-01-21 12:02:57.969834926 +0300
@@ -39,27 +39,15 @@
 #include <linux/nsproxy.h>
 #include <linux/mount.h>
 #include <linux/ipc_namespace.h>
+#include <linux/shmem_fs.h>
 
 #include <asm/uaccess.h>
 
-#include "util.h"
-
-struct shm_file_data {
-	int id;
-	struct ipc_namespace *ns;
-	struct file *file;
-	const struct vm_operations_struct *vm_ops;
-};
 
-#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
+#include "util.h"
 
-static const struct file_operations shm_file_operations;
 static const struct vm_operations_struct shm_vm_ops;
 
-#define shm_ids(ns)	((ns)->ids[IPC_SHM_IDS])
-
-#define shm_unlock(shp)			\
-	ipc_unlock(&(shp)->shm_perm)
 
 static int newseg(struct ipc_namespace *, struct ipc_params *);
 static void shm_open(struct vm_area_struct *vma);
@@ -117,20 +105,6 @@ void __init shm_init (void)
 				IPC_SHM_IDS, sysvipc_shm_proc_show);
 }
 
-/*
- * shm_lock_(check_) routines are called in the paths where the rw_mutex
- * is not necessarily held.
- */
-static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
-{
-	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
-
-	if (IS_ERR(ipcp))
-		return (struct shmid_kernel *)ipcp;
-
-	return container_of(ipcp, struct shmid_kernel, shm_perm);
-}
-
 static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
 						int id)
 {
@@ -397,11 +371,12 @@ static unsigned long shm_get_unmapped_ar
 						pgoff, flags);
 }
 
-static const struct file_operations shm_file_operations = {
+const struct file_operations shm_file_operations = {
 	.mmap		= shm_mmap,
 	.fsync		= shm_fsync,
 	.release	= shm_release,
 };
+EXPORT_SYMBOL(shm_file_operations);
 
 static const struct file_operations shm_file_operations_huge = {
 	.mmap		= shm_mmap,
@@ -438,13 +413,14 @@ static int newseg(struct ipc_namespace *
 	key_t key = params->key;
 	int shmflg = params->flg;
 	size_t size = params->u.size;
+	int shmid = params->id;
 	int error;
 	struct shmid_kernel *shp;
 	int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
 	struct file * file;
-	char name[13];
+	char name[64];
 	int id;
-	int acctflag = 0;
+	vm_flags_t acctflag = 0;
 
 	if (size < SHMMIN || size > ns->shm_ctlmax)
 		return -EINVAL;
@@ -467,7 +443,7 @@ static int newseg(struct ipc_namespace *
 		return error;
 	}
 
-	sprintf (name, "SYSV%08x", key);
+	snprintf (name, sizeof(name), "VE%d-SYSV%08x", VEID(get_exec_env()), key);
 	if (shmflg & SHM_HUGETLB) {
 		/* hugetlb_file_setup applies strict accounting */
 		if (shmflg & SHM_NORESERVE)
@@ -488,7 +464,7 @@ static int newseg(struct ipc_namespace *
 	if (IS_ERR(file))
 		goto no_file;
 
-	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
+	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, shmid);
 	if (id < 0) {
 		error = id;
 		goto no_id;
@@ -562,6 +538,7 @@ SYSCALL_DEFINE3(shmget, key_t, key, size
 	shm_params.key = key;
 	shm_params.flg = shmflg;
 	shm_params.u.size = size;
+	shm_params.id = -1;
 
 	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
 }
@@ -1216,3 +1193,67 @@ static int sysvipc_shm_proc_show(struct 
 			  swp * PAGE_SIZE);
 }
 #endif
+
+#ifdef CONFIG_VE
+#include <linux/module.h>
+
+struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg)
+{
+	struct ipc_namespace *ns;
+	struct ipc_ops shm_ops;
+	struct ipc_params shm_params;
+	struct shmid_kernel *shp;
+	struct file *file;
+	int rv;
+
+	ns = current->nsproxy->ipc_ns;
+
+	shm_ops.getnew = newseg;
+	shm_ops.associate = shm_security;
+	shm_ops.more_checks = shm_more_checks;
+
+	shm_params.key = key;
+	shm_params.flg = shmflg | IPC_CREAT;
+	shm_params.u.size = size;
+	shm_params.id = shmid;
+
+	rv = ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
+	if (rv < 0)
+		return ERR_PTR(rv);
+	shp = shm_lock(ns, rv);
+	BUG_ON(IS_ERR(shp));
+	file = shp->shm_file;
+	get_file(file);
+	shm_unlock(shp);
+	return file;
+}
+EXPORT_SYMBOL_GPL(sysvipc_setup_shm);
+
+int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg)
+{
+	int err = 0;
+	struct shmid_kernel* shp;
+	struct ipc_namespace *ns;
+	int next_id;
+	int total, in_use;
+
+	ns = current->nsproxy->ipc_ns;
+
+	down_write(&shm_ids(ns).rw_mutex);
+	in_use = shm_ids(ns).in_use;
+	for (total = 0, next_id = 0; total < in_use; next_id++) {
+		shp = idr_find(&shm_ids(ns).ipcs_idr, next_id);
+		if (shp == NULL)
+			continue;
+		ipc_lock_by_ptr(&shp->shm_perm);
+		err = func(shp, arg);
+		shm_unlock(shp);
+		if (err)
+			break;
+		total++;
+	}
+	up_write(&shm_ids(ns).rw_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(sysvipc_walk_shm);
+#endif
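sysvipc_setup_shm() is the restore-side counterpart of shmget(): it recreates a segment under a fixed shmid and returns the backing file with an extra reference taken by get_file(). A hedged sketch of a restore-path caller (restore_shm_segment is hypothetical; error handling trimmed):

	static int restore_shm_segment(key_t key, int shmid, size_t size, int shmflg)
	{
		struct file *file;

		file = sysvipc_setup_shm(key, shmid, size, shmflg);
		if (IS_ERR(file))
			return PTR_ERR(file);

		/* ... populate or map the segment through 'file' ... */

		fput(file);	/* drop the reference taken above */
		return 0;
	}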
diff -upr linux-2.6.32-504.3.3.el6.orig/ipc/util.c linux-2.6.32-504.3.3.el6-042stab103_6/ipc/util.c
--- linux-2.6.32-504.3.3.el6.orig/ipc/util.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/ipc/util.c	2015-01-21 12:02:57.969834926 +0300
@@ -38,6 +38,8 @@
 
 #include <asm/unistd.h>
 
+#include <bc/kmem.h>
+
 #include "util.h"
 
 struct ipc_proc_iface {
@@ -157,8 +159,8 @@ void __init ipc_init_proc_interface(cons
 	iface->show	= show;
 
 	pde = proc_create_data(path,
-			       S_IRUGO,        /* world readable */
-			       NULL,           /* parent dir */
+			       S_IRUGO,		/* world readable */
+			       &glob_proc_root,	/* parent dir */
 			       &sysvipc_proc_fops,
 			       iface);
 	if (!pde) {
@@ -238,6 +240,7 @@ int ipc_get_maxid(struct ipc_ids *ids)
  *	@ids: IPC identifier set
  *	@new: new IPC permission set
  *	@size: limit for the number of used ids
+ *	@reqid: if >= 0, allocate exactly this id; pass -1 for any free id.
  *
  *	Add an entry 'new' to the IPC ids idr. The permissions object is
  *	initialised and the first free entry is set up and the id assigned
@@ -247,7 +250,7 @@ int ipc_get_maxid(struct ipc_ids *ids)
  *	Called with ipc_ids.rw_mutex held as a writer.
  */
  
-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
+int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid)
 {
 	uid_t euid;
 	gid_t egid;
@@ -264,7 +267,16 @@ int ipc_addid(struct ipc_ids* ids, struc
 	rcu_read_lock();
 	spin_lock(&new->lock);
 
-	err = idr_get_new(&ids->ipcs_idr, new, &id);
+	if (reqid >= 0) {
+		id = reqid % SEQ_MULTIPLIER;
+		err = idr_get_new_above(&ids->ipcs_idr, new, id, &id);
+		if (!err && id != (reqid % SEQ_MULTIPLIER)) {
+			idr_remove(&ids->ipcs_idr, id);
+			err = -EEXIST;
+		}
+	} else
+		err = idr_get_new(&ids->ipcs_idr, new, &id);
+
 	if (err) {
 		spin_unlock(&new->lock);
 		rcu_read_unlock();
@@ -277,9 +289,13 @@ int ipc_addid(struct ipc_ids* ids, struc
 	new->cuid = new->uid = euid;
 	new->gid = new->cgid = egid;
 
-	new->seq = ids->seq++;
-	if(ids->seq > ids->seq_max)
-		ids->seq = 0;
+	if (reqid >= 0) {
+		new->seq = reqid/SEQ_MULTIPLIER;
+	} else {
+		new->seq = ids->seq++;
+		if(ids->seq > ids->seq_max)
+			ids->seq = 0;
+	}
 
 	new->id = ipc_buildid(id, new->seq);
 	return id;
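The reqid handling above relies on the standard IPC id encoding, id = seq * SEQ_MULTIPLIER + slot: a requested id fixes both the idr slot and the sequence counter, and ipc_buildid() then reproduces exactly the id that was asked for. A worked example, assuming the usual SEQ_MULTIPLIER of IPCMNI (32768):

	int reqid = 98305;		/* id a restored object must receive */
	int slot  = reqid % 32768;	/* 1: target for idr_get_new_above() */
	int seq   = reqid / 32768;	/* 3: stored in new->seq */
	/* ipc_buildid(slot, seq) == 3 * 32768 + 1 == 98305 == reqid */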
@@ -443,9 +459,9 @@ void *ipc_alloc(int size)
 {
 	void *out;
 	if(size > PAGE_SIZE)
-		out = vmalloc(size);
+		out = ub_vmalloc(size);
 	else
-		out = kmalloc(size, GFP_KERNEL);
+		out = kmalloc(size, GFP_KERNEL_UBC);
 	return out;
 }
 
@@ -642,6 +658,7 @@ err1:
 	rcu_read_unlock();
 	return out;
 }
+EXPORT_SYMBOL(ipc_lock);
 
 /**
  * ipc_obtain_object_check
@@ -769,7 +786,7 @@ struct kern_ipc_perm *ipcctl_pre_down_no
 
 	euid = current_euid();
 	if (euid == ipcp->cuid ||
-	    euid == ipcp->uid  || capable(CAP_SYS_ADMIN))
+	    euid == ipcp->uid  || capable(CAP_VE_SYS_ADMIN))
 		return ipcp;
 
 out_up:
diff -upr linux-2.6.32-504.3.3.el6.orig/ipc/util.h linux-2.6.32-504.3.3.el6-042stab103_6/ipc/util.h
--- linux-2.6.32-504.3.3.el6.orig/ipc/util.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/ipc/util.h	2015-01-21 12:02:47.941101145 +0300
@@ -65,6 +65,7 @@ struct ipc_params {
 		size_t size;	/* for shared memories */
 		int nsems;	/* for semaphores */
 	} u;			/* holds the getnew() specific param */
+	int id;
 };
 
 /*
@@ -94,14 +95,10 @@ void __init ipc_init_proc_interface(cons
 #define ipc_init_proc_interface(path, header, ids, show) do {} while (0)
 #endif
 
-#define IPC_SEM_IDS	0
-#define IPC_MSG_IDS	1
-#define IPC_SHM_IDS	2
-
 #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
 
 /* must be called with ids->rw_mutex acquired for writing */
-int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
+int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int, int);
 
 /* must be called with ids->rw_mutex acquired for reading */
 int ipc_get_maxid(struct ipc_ids *);
@@ -129,7 +126,6 @@ int ipc_rcu_getref(void *ptr);
 void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head));
 void ipc_rcu_free(struct rcu_head *head);
 
-struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
 struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id);
 
 void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
@@ -170,15 +166,14 @@ static inline void ipc_lock_by_ptr(struc
 	spin_lock(&perm->lock);
 }
 
-static inline void ipc_unlock(struct kern_ipc_perm *perm)
+static inline void ipc_lock_object(struct kern_ipc_perm *perm)
 {
-	spin_unlock(&perm->lock);
-	rcu_read_unlock();
+	spin_lock(&perm->lock);
 }
 
-static inline void ipc_lock_object(struct kern_ipc_perm *perm)
+static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
 {
-	spin_lock(&perm->lock);
+	return !perm->deleted;
 }
 
 struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id);
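ipc_valid_object() replaces open-coded perm->deleted tests: after taking the object lock, a caller must confirm the object was not torn down while it waited. A minimal sketch of the intended pattern (the error path is conventional, not quoted from this patch):

	ipc_lock_object(&sma->sem_perm);
	if (!ipc_valid_object(&sma->sem_perm)) {
		/* the set was removed while we slept on the lock */
		sem_unlock(sma, -1);
		err = -EIDRM;
		goto out;
	}
	/* ... the object is live; safe to operate on it ... */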
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/Kconfig.openvz linux-2.6.32-504.3.3.el6-042stab103_6/kernel/Kconfig.openvz
--- linux-2.6.32-504.3.3.el6.orig/kernel/Kconfig.openvz	2015-01-21 12:02:44.085203514 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/Kconfig.openvz	2015-01-21 12:02:58.949808915 +0300
@@ -0,0 +1,126 @@
+# Copyright (C) 2005  SWsoft
+# All rights reserved.
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+menu "OpenVZ"
+
+config VE
+	bool "Virtual Environment support"
+	default y
+	select NAMESPACES
+	select PID_NS
+	select IPC_NS
+	select UTS_NS
+	select NET_NS
+	select USER_NS
+	select CGROUPS
+	select CGROUP_DEVICE
+	select CGROUP_FREEZER
+	help
+	  This option adds support for virtual Linux instances running on the
+	  original box, with a fully supported virtual network driver, tty
+	  subsystem and configurable access to hardware and other resources.
+
+config VE_CALLS
+	tristate "VE calls interface"
+	depends on VE
+	select VZ_DEV
+	default m
+	help
+	  This option controls how to build the vzmon code containing VE calls.
+	  By default it is built as the vzmon.o module.
+
+config VZ_GENCALLS
+	bool
+	default y
+
+config VE_NETDEV
+	tristate "VE network device"
+	depends on VE_CALLS && NET
+	select VZ_DEV
+	default m
+	help
+	  This option controls whether to build the venet device, which is a
+	  common interface for networking in a VE.
+
+config VE_ETHDEV
+	tristate "Virtual ethernet device"
+	depends on VE_CALLS && NET
+	select VZ_DEV
+	default m
+	help
+	  This option controls whether to build the virtual ethernet device.
+
+config VZ_DEV
+	tristate "VE device"
+	default m
+	help
+	  This option adds support for the vzdev device, which is used by
+	  user-space applications to control Virtual Environments.
+
+config VE_IPTABLES
+	bool "VE netfiltering"
+	depends on VE && VE_NETDEV && INET && NETFILTER
+	default y
+	help
+	  This option controls whether to build VE netfiltering code.
+
+config VZ_WDOG
+	tristate "VE watchdog module"
+	depends on VE_CALLS
+	default m
+	help
+	  This option controls building of the vzwdog module, which
+	  periodically dumps useful system information to the console.
+
+config VZ_CHECKPOINT
+	tristate "Checkpointing & restoring Virtual Environments"
+	depends on X86 || IA64
+	depends on VE_CALLS
+	select PM
+	select PM_SLEEP
+	select TUN
+	select VE_ETHDEV
+	select VE_NETDEV
+	default m
+	help
+	  This option adds two modules, "cpt" and "rst", which allow saving
+	  a running Virtual Environment and restoring it on another host
+	  (live migration) or on the same host (checkpointing).
+
+config VZ_CHECKPOINT_ITER
+	bool "Iterative migration support"
+	depends on VZ_CHECKPOINT
+	default y
+	help
+	  This option turns on iterative migration support.
+
+config VZ_EVENT
+	tristate "Enable sending notifications of the VE status change through the netlink socket"
+	depends on VE && VE_CALLS && NET
+	default m
+	help
+	  This option enables sending notifications of VE events to
+	  interested user-space applications through a netlink socket,
+	  just like the core kernel networking code does. Currently
+	  only notifications of essential VE status changes are sent.
+
+config VTTYS
+	bool "Virtual tty device"
+	depends on VE && VZ_DEV
+	default y
+
+config FENCE_WATCHDOG
+	bool "Fencing watchdog for HA cluster support"
+	depends on X86_64
+	default n
+
+config VZ_IOLIMIT
+	tristate "Container IO-limiting"
+	depends on VE && VE_CALLS && BC_IO_ACCOUNTING
+	default m
+	help
+	  This option provides the I/O-limiting module.
+endmenu
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/Kconfig.preempt linux-2.6.32-504.3.3.el6-042stab103_6/kernel/Kconfig.preempt
--- linux-2.6.32-504.3.3.el6.orig/kernel/Kconfig.preempt	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/Kconfig.preempt	2015-01-21 12:02:54.124936987 +0300
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
 
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
+	select PREEMPT_COUNT
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
 
 endchoice
 
+config PREEMPT_COUNT
+       bool
\ No newline at end of file
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/kernel/Makefile
--- linux-2.6.32-504.3.3.el6.orig/kernel/Makefile	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/Makefile	2015-01-21 12:02:58.505820701 +0300
@@ -29,6 +29,10 @@ obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
+obj-$(CONFIG_BEANCOUNTERS) += bc/
+obj-y += ve/
+obj-$(CONFIG_VZ_CHECKPOINT) += cpt/
+
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_LOCKDEP) += lockdep.o
 ifeq ($(CONFIG_PROC_FS),y)
@@ -94,6 +98,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayac
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_VZ_FAIRSCHED) += fairsched.o
 obj-$(CONFIG_BINFMT_ELF) += elfcore.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
@@ -107,6 +112,7 @@ obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += events/
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_FENCE_WATCHDOG) += fence-watchdog.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/acct.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/acct.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/acct.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/acct.c	2015-01-21 12:02:47.194120976 +0300
@@ -360,8 +360,6 @@ void acct_exit_ns(struct pid_namespace *
 	if (acct != NULL) {
 		if (acct->file != NULL)
 			acct_file_reopen(acct, NULL, NULL);
-
-		kfree(acct);
 	}
 	spin_unlock(&acct_lock);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/audit.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/audit.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/audit.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/audit.c	2015-01-21 12:02:45.730159842 +0300
@@ -113,8 +113,7 @@ u32		audit_sig_sid = 0;
 */
 static atomic_t    audit_lost = ATOMIC_INIT(0);
 
-/* The netlink socket. */
-static struct sock *audit_sock;
+#define audit_sock	(get_exec_env()->ve_netns->_audit_sock)
 
 /* Hash for inode-based rules */
 struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -656,6 +655,9 @@ static int audit_receive_msg(struct sk_b
 	char			*ctx = NULL;
 	u32			len;
 
+	if (!ve_is_super(skb->owner_env))
+		return -ECONNREFUSED;
+
 	err = audit_netlink_ok(skb, msg_type);
 	if (err)
 		return err;
@@ -960,22 +962,49 @@ static void audit_receive(struct sk_buff
 	mutex_unlock(&audit_cmd_mutex);
 }
 
+static int __net_init audit_net_init(struct net *net)
+{
+	struct sock *sk;
+
+	sk = netlink_kernel_create(net, NETLINK_AUDIT, 0,
+					   audit_receive, NULL, THIS_MODULE);
+	if (!sk) {
+		audit_panic("cannot initialize netlink socket");
+		return -ENODEV;
+	}
+
+	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+	net->_audit_sock = sk;
+
+	return 0;
+}
+
+static void __net_exit audit_net_exit(struct net *net)
+{
+	netlink_kernel_release(net->_audit_sock);
+	net->_audit_sock = NULL;
+}
+
+
+static struct pernet_operations audit_net_ops = {
+	.init = audit_net_init,
+	.exit = audit_net_exit,
+};
+
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
-	int i;
+	int i, res;
 
 	if (audit_initialized == AUDIT_DISABLED)
 		return 0;
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
-					   audit_receive, NULL, THIS_MODULE);
-	if (!audit_sock)
-		audit_panic("cannot initialize netlink socket");
-	else
-		audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+
+	res = register_pernet_subsys(&audit_net_ops);
+	if (res < 0)
+		return res;
 
 	skb_queue_head_init(&audit_skb_queue);
 	skb_queue_head_init(&audit_skb_hold_queue);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/auditsc.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/auditsc.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/auditsc.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/auditsc.c	2015-01-21 12:02:45.875155992 +0300
@@ -2146,13 +2146,8 @@ void __audit_getname(struct filename *na
 	n->name_put = true;
 	name->aname = n;
 
-	if (!context->pwd.dentry) {
-		read_lock(&current->fs->lock);
-		context->pwd = current->fs->pwd;
-		path_get(&current->fs->pwd);
-		read_unlock(&current->fs->lock);
-	}
-
+	if (!context->pwd.dentry)
+		get_fs_pwd(current->fs, &context->pwd);
 }
 
 /* audit_putname - intercept a putname request
@@ -2468,6 +2463,7 @@ int audit_set_loginuid(struct task_struc
 	task->loginuid = loginuid;
 	return 0;
 }
+EXPORT_SYMBOL(audit_set_loginuid);
 
 /**
  * __audit_mq_open - record audit data for a POSIX MQ open
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/Kconfig	2015-01-21 12:02:43.396221806 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/Kconfig	2015-01-21 12:02:43.396221806 +0300
@@ -0,0 +1,100 @@
+#
+# User resources part (UBC)
+#
+# Copyright (C) 2005  SWsoft
+# All rights reserved.
+#
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+menu "User resources"
+
+config BEANCOUNTERS
+	bool "Enable user resource accounting"
+	default y
+	help
+	  This option provides accounting and allows configuring limits for
+	  a user's consumption of exhaustible system resources. The most
+	  important resource controlled this way is unswappable memory
+	  (either mlock'ed or used by internal kernel structures and
+	  buffers). The main goal is to protect processes from running short
+	  of important resources because of accidental misbehavior of
+	  processes or malicious activity aiming to ``kill'' the system.
+	  It is worth mentioning that resource limits configured by
+	  setrlimit(2) do not give an acceptable level of protection,
+	  because they cover only a small fraction of resources and work on
+	  a per-process basis. Per-process accounting does not prevent
+	  malicious users from spawning a lot of resource-consuming processes.
+
+config BC_RSS_ACCOUNTING
+	bool "Account physical memory usage"
+	default y
+	depends on BEANCOUNTERS
+	select MEMORY_GANGS
+	select MEMORY_VSWAP
+	help
+	  This allows estimating per-beancounter physical memory usage.
+	  The implemented algorithm accounts shared pages as well, dividing
+	  their cost among the beancounters that use each page.
+
+config BC_IO_ACCOUNTING
+	bool "Account file I/O"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  This option allows seeing the I/O activity caused by tasks in each UB.
+
+config BC_IO_PRIORITY
+	bool "Disk I/O priority"
+	default y
+	depends on BEANCOUNTERS
+	select BLK_CGROUP
+	help
+	  This option adds a compatibility layer on top of the blkio cgroup
+	  for grouping and prioritizing disk access.
+
+config BC_SWAP_ACCOUNTING
+	bool "Account swap usage"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  This allows accounting of swap usage.
+
+config BC_PROC
+	bool "Report resource usage in /proc"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  Allows a system administrator to inspect resource accounts and limits.
+
+config BC_DEBUG
+	bool "User resources debug features"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  Enables debug features for user resource accounting.
+
+config BC_DEBUG_KMEM
+	bool "Debug kmemsize with cache counters"
+	default n
+	depends on BC_DEBUG
+	help
+	  Adds a /proc/user_beancounters_debug entry to get statistics
+	  about the cache usage of each beancounter.
+
+config BC_KEEP_UNUSED
+	bool "Keep unused beancounter alive"
+	default n
+	depends on BC_DEBUG
+	help
+	  If enabled, unused beancounters are kept in the hash so that their
+	  maxheld values can still be inspected.
+
+config BC_DEBUG_ITEMS
+	bool "Account resources in items rather than in bytes"
+	default n
+	depends on BC_DEBUG
+	help
+	  When enabled, some of the resources (e.g. kmemsize) are accounted
+	  in items instead of bytes.
+
+endmenu
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/Makefile
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/Makefile	2015-01-21 12:02:43.396221806 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/Makefile	2015-01-21 12:02:43.396221806 +0300
@@ -0,0 +1,17 @@
+#
+# User resources part (UBC)
+#
+# Copyright (C) 2005  SWsoft
+# All rights reserved.
+#
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+obj-y := sys.o beancounter.o kmem.o misc.o \
+	 vm_pages.o statd.o oom_kill.o dcache.o
+
+obj-$(CONFIG_NET) += net.o
+obj-$(CONFIG_BC_PROC)  += proc.o
+obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o
+
+CFLAGS_io_prio.o := -Iblock
+obj-$(CONFIG_BC_IO_PRIORITY) += io_prio.o
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/beancounter.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/beancounter.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/beancounter.c	2015-01-21 12:02:43.396221806 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/beancounter.c	2015-01-21 12:02:58.798812923 +0300
@@ -0,0 +1,1128 @@
+/*
+ *  linux/kernel/bc/beancounter.c
+ *
+ *  Copyright (C) 1998  Alan Cox
+ *                1998-2000  Andrey V. Savochkin <saw@saw.sw.com.sg>
+ *  Copyright (C) 2000-2005 SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * TODO:
+ *   - more intelligent limit check in mremap(): currently the new size is
+ *     charged and _then_ old size is uncharged
+ *     (almost done: !move_vma case is completely done,
+ *      move_vma in its current implementation requires too many conditions to
+ *      do things right, because it may be not only expansion, but shrinking
+ *      also, plus do_munmap will require an additional parameter...)
+ *   - problem: bad pmd page handling
+ *   - consider /proc redesign
+ *   - TCP/UDP ports
+ *   + consider whether __charge_beancounter_locked should be inline
+ *
+ * Changes:
+ *   1999/08/17  Marcelo Tosatti <marcelo@conectiva.com.br>
+ *	- Set "barrier" and "limit" parts of limits atomically.
+ *   1999/10/06  Marcelo Tosatti <marcelo@conectiva.com.br>
+ *	- setublimit system call.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/mmgang.h>
+#include <linux/swap.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+#include <linux/cgroup.h>
+#include <linux/pid_namespace.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/vmpages.h>
+#include <bc/dcache.h>
+#include <bc/proc.h>
+
+static struct kmem_cache *ub_cachep;
+
+struct user_beancounter ub0 = {
+#ifdef CONFIG_BC_RSS_ACCOUNTING
+	.gang_set.gangs = init_gang_array,
+#endif
+};
+EXPORT_SYMBOL(ub0);
+
+static struct workqueue_struct *ub_clean_wq;
+
+const char *ub_rnames[] = {
+	"kmemsize",	/* 0 */
+	"lockedpages",
+	"privvmpages",
+	"shmpages",
+	"dummy",
+	"numproc",	/* 5 */
+	"physpages",
+	"vmguarpages",
+	"oomguarpages",
+	"numtcpsock",
+	"numflock",	/* 10 */
+	"numpty",
+	"numsiginfo",
+	"tcpsndbuf",
+	"tcprcvbuf",
+	"othersockbuf",	/* 15 */
+	"dgramrcvbuf",
+	"numothersock",
+	"dcachesize",
+	"numfile",
+	"dummy",	/* 20 */
+	"dummy",
+	"dummy",
+	"numiptent",
+	"swappages",
+};
+
+unsigned int ub_dcache_thres_ratio __read_mostly = 2; /* percent */
+unsigned int ub_dcache_lru_popup __read_mostly = 1;
+unsigned int ub_dcache_time_thresh __read_mostly = 5;
+unsigned int ub_dcache_no_vzfs_cache __read_mostly = 0;
+EXPORT_SYMBOL(ub_dcache_no_vzfs_cache);
+
+static int ubc_ioprio = 1;
+
+static int ubc_pagecache_isolation = 0;
+
+/* default maximum perpcu resources precharge */
+int ub_resource_precharge[UB_RESOURCES] = {
+	[UB_KMEMSIZE]	= 32 * PAGE_SIZE,
+	[UB_PRIVVMPAGES] = 256,
+	[UB_NUMPROC]	= 4,
+	[UB_PHYSPAGES]	= 512,	/* up to 2Mb, 1 huge page */
+	[UB_NUMSIGINFO]	= 4,
+	[UB_DCACHESIZE] = 4 * PAGE_SIZE,
+	[UB_NUMFILE]	= 8,
+	[UB_SWAPPAGES]	= 256,
+	[UB_SHADOWPAGES] = 256,
+};
+
+/* natural limits for percpu precharge bounds */
+static int resource_precharge_min = 0;
+static int resource_precharge_max = INT_MAX / NR_CPUS;
+
+void init_beancounter_precharge(struct user_beancounter *ub, int resource)
+{
+	if (!atomic_read(&ub->ub_refcount))
+		return;
+
+	/* limit maximum precharge with one half of current resource excess */
+	ub->ub_parms[resource].max_precharge = min_t(long,
+			ub_resource_precharge[resource],
+			ub_resource_excess(ub, resource, UB_SOFT) /
+			(2 * num_possible_cpus()));
+}
+
+static void init_beancounter_precharges(struct user_beancounter *ub)
+{
+	int resource;
+
+	for ( resource = 0 ; resource < UB_RESOURCES ; resource++ )
+		init_beancounter_precharge(ub, resource);
+}
+
+static void __init init_beancounter_precharges_early(struct user_beancounter *ub)
+{
+	int resource;
+
+	for ( resource = 0 ; resource < UB_RESOURCES ; resource++ ) {
+
+		/* DEBUG: sanity checks for initial precharge bounds */
+		BUG_ON(ub_resource_precharge[resource] < resource_precharge_min);
+		BUG_ON(ub_resource_precharge[resource] > resource_precharge_max);
+
+		ub->ub_parms[resource].max_precharge =
+			ub_resource_precharge[resource];
+	}
+}
+
+void ub_precharge_snapshot(struct user_beancounter *ub, int *precharge)
+{
+	int cpu, resource;
+
+	memset(precharge, 0, sizeof(int) * UB_RESOURCES);
+	for_each_possible_cpu(cpu) {
+		struct ub_percpu_struct *pcpu = ub_percpu(ub, cpu);
+		for ( resource = 0 ; resource < UB_RESOURCES ; resource++ )
+			precharge[resource] += pcpu->precharge[resource];
+	}
+	precharge[UB_PHYSPAGES] += precharge[UB_KMEMSIZE] >> PAGE_SHIFT;
+	precharge[UB_OOMGUARPAGES] = precharge[UB_SWAPPAGES];
+}
+
+static void forbid_beancounter_precharge(struct user_beancounter *ub, int val)
+{
+	int resource;
+
+	for ( resource = 0 ; resource < UB_RESOURCES ; resource++ )
+		ub->ub_parms[resource].max_precharge = val;
+}
+
+static void init_beancounter_struct(struct user_beancounter *ub);
+static void init_beancounter_nolimits(struct user_beancounter *ub);
+
+#define UB_HASH_SIZE 256
+#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
+static struct hlist_head ub_hash[UB_HASH_SIZE];
+static DEFINE_SPINLOCK(ub_hash_lock);
+LIST_HEAD(ub_list_head); /* protected by ub_hash_lock */
+EXPORT_SYMBOL(ub_list_head);
+LIST_HEAD(ub_leaked_list);
+
+static struct cgroup *ub_cgroup_root;
+
+int set_task_exec_ub(struct task_struct *tsk, struct user_beancounter *ub)
+{
+	int err;
+
+	if (ub->ub_cgroup) {
+		err = cgroup_kernel_attach(ub->ub_cgroup, tsk);
+		if (err)
+			return err;
+	}
+
+	put_beancounter_longterm(tsk->task_bc.exec_ub);
+	tsk->task_bc.exec_ub = get_beancounter_longterm(ub);
+
+	return 0;
+}
+EXPORT_SYMBOL(set_task_exec_ub);
+
+/*
+ *	Per user resource beancounting. Resources are tied to their luid.
+ *	The resource structure itself is tagged both to the process and
+ *	the charging resources (a socket doesn't want to have to search for
+ *	things at irq time for example). Reference counters keep things in
+ *	hand.
+ *
+ *	The case where a user creates resource, kills all his processes and
+ *	then starts new ones is correctly handled this way. The refcounters
+ *	will mean the old entry is still around with resource tied to it.
+ */
+
+static struct user_beancounter *alloc_ub(uid_t uid)
+{
+	struct user_beancounter *new_ub;
+	char name[16];
+
+	new_ub = kmem_cache_zalloc(ub_cachep, GFP_KERNEL);
+	if (new_ub == NULL)
+		return NULL;
+
+	ub_debug(UBD_ALLOC, "Creating ub %p\n", new_ub);
+
+	init_beancounter_nolimits(new_ub);
+	init_beancounter_struct(new_ub);
+
+	init_beancounter_precharges(new_ub);
+
+	if (ubc_pagecache_isolation)
+		set_bit(UB_PAGECACHE_ISOLATION, &new_ub->ub_flags);
+
+	if (ubc_ioprio) {
+		snprintf(name, sizeof(name), "%u", uid);
+		new_ub->ub_cgroup = cgroup_kernel_open(ub_cgroup_root,
+				CGRP_CREAT|CGRP_WEAK, name);
+		if (IS_ERR(new_ub->ub_cgroup))
+			goto fail_cgroup;
+		ub_init_ioprio(new_ub);
+	}
+
+	if (alloc_mem_gangs(get_ub_gs(new_ub)))
+		goto fail_gangs;
+
+	if (percpu_counter_init(&new_ub->ub_orphan_count, 0))
+		goto fail_pcpu;
+
+	new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct);
+	if (new_ub->ub_percpu == NULL)
+		goto fail_free;
+
+	new_ub->ub_uid = uid;
+	return new_ub;
+
+fail_free:
+	percpu_counter_destroy(&new_ub->ub_orphan_count);
+fail_pcpu:
+	free_mem_gangs(get_ub_gs(new_ub));
+fail_gangs:
+	if (new_ub->ub_cgroup) {
+		ub_fini_ioprio(new_ub);
+		cgroup_kernel_close(new_ub->ub_cgroup);
+	}
+fail_cgroup:
+	kmem_cache_free(ub_cachep, new_ub);
+	return NULL;
+}
+
+static inline void __free_ub(struct user_beancounter *ub)
+{
+	free_percpu(ub->ub_percpu);
+	kfree(ub->ub_store);
+	free_mem_gangs(get_ub_gs(ub));
+	kfree(ub->private_data2);
+	kmem_cache_free(ub_cachep, ub);
+}
+
+static inline void free_ub(struct user_beancounter *ub)
+{
+	percpu_counter_destroy(&ub->ub_orphan_count);
+	if (ub->ub_cgroup) {
+		ub_fini_ioprio(ub);
+		cgroup_kernel_close(ub->ub_cgroup);
+	}
+	__free_ub(ub);
+}
+
+int ub_count;
+
+struct user_beancounter *get_beancounter_byuid(uid_t uid, int create)
+{
+	struct user_beancounter *new_ub, *ub;
+	unsigned long flags;
+	struct hlist_head *hash;
+	struct hlist_node *ptr;
+
+	hash = &ub_hash[ub_hash_fun(uid)];
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(ub, ptr, hash, ub_hash) {
+		if (ub->ub_uid != uid)
+			continue;
+
+		if (get_beancounter_rcu(ub)) {
+			rcu_read_unlock();
+			return ub;
+		}
+
+		spin_lock_irqsave(&ub_hash_lock, flags);
+		if (!hlist_unhashed(&ub->ub_hash)) {
+			get_beancounter(ub);
+			spin_unlock_irqrestore(&ub_hash_lock, flags);
+			rcu_read_unlock();
+			cancel_work_sync(&ub->work);
+			return ub;
+		}
+		spin_unlock_irqrestore(&ub_hash_lock, flags);
+	}
+	rcu_read_unlock();
+
+	if (!create)
+		return NULL;
+
+	new_ub = alloc_ub(uid);
+	if (new_ub == NULL)
+		return NULL;
+
+	spin_lock_irqsave(&ub_hash_lock, flags);
+
+	hlist_for_each_entry(ub, ptr, hash, ub_hash) {
+		if (ub->ub_uid != uid)
+			continue;
+
+		get_beancounter(ub);
+		spin_unlock_irqrestore(&ub_hash_lock, flags);
+		free_ub(new_ub);
+		cancel_work_sync(&ub->work);
+		return ub;
+	}
+
+	ub_count++;
+	new_ub->dc_time = 0;
+	new_ub->dc_shrink_ts = 0;
+	rb_init_node(&new_ub->dc_node);
+	list_add_rcu(&new_ub->ub_list, &ub_list_head);
+	hlist_add_head_rcu(&new_ub->ub_hash, hash);
+	add_mem_gangs(get_ub_gs(new_ub));
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+	ub_update_threshold();
+	set_gang_limits(get_ub_gs(new_ub),
+			&new_ub->ub_parms[UB_PHYSPAGES].limit,
+			&node_states[N_HIGH_MEMORY]);
+
+	return new_ub;
+}
+EXPORT_SYMBOL(get_beancounter_byuid);
+
+#ifdef CONFIG_BC_KEEP_UNUSED
+
+void release_beancounter(struct user_beancounter *ub)
+{
+}
+
+#else
+
+static int verify_res(struct user_beancounter *ub, const char *name,
+		unsigned long held)
+{
+	if (likely(held == 0))
+		return 1;
+
+	printk(KERN_WARNING "Ub %u helds %ld in %s on put\n",
+			ub->ub_uid, held, name);
+	return 0;
+}
+
+static inline int bc_verify_held(struct user_beancounter *ub)
+{
+	int i, clean;
+
+	ub_update_resources_locked(ub);
+
+	clean = 1;
+	for (i = 0; i < UB_RESOURCES; i++)
+		clean &= verify_res(ub, ub_rnames[i],
+				__get_beancounter_usage_percpu(ub, i));
+
+	clean &= verify_res(ub, "dirty_pages",
+			__ub_stat_get_exact(ub, dirty_pages));
+	clean &= verify_res(ub, "writeback_pages",
+			__ub_stat_get_exact(ub, writeback_pages));
+	clean &= verify_res(ub, "shadow_pages",
+			__get_beancounter_usage_percpu(ub, UB_SHADOWPAGES));
+	clean &= verify_res(ub, "swap_entries", ub->ub_swapentries);
+	clean &= verify_res(ub, "hugetlb_pages", ub->ub_hugetlb_pages);
+	clean &= verify_res(ub, "tmpfs_respages", ub->ub_tmpfs_respages);
+
+	clean &= verify_res(ub, "refcount", atomic_read(&ub->ub_refcount));
+
+	clean &= verify_res(ub, "pincount", __ub_percpu_sum(ub, pincount));
+
+	clean &= verify_res(ub, "dcache", !list_empty(&ub->ub_dentry_lru));
+
+	clean &= verify_res(ub, "underflow",
+			test_bit(UB_UNDERFLOW, &ub->ub_flags));
+
+	ub_debug_trace(!clean, 5, 60*HZ);
+
+	return clean;
+}
+
+static void bc_free_rcu(struct rcu_head *rcu)
+{
+	struct user_beancounter *ub;
+
+	ub = container_of(rcu, struct user_beancounter, rcu);
+	__free_ub(ub);
+}
+
+static void leak_beancounter(struct user_beancounter *ub)
+{
+	atomic_add(INT_MIN/2, &ub->ub_refcount);
+
+	spin_lock_irq(&ub_hash_lock);
+	list_add_tail_rcu(&ub->ub_leaked_list, &ub_leaked_list);
+	spin_unlock_irq(&ub_hash_lock);
+
+	printk(KERN_ERR "UB: leaked beancounter %u (%p)\n",
+			ub->ub_uid, ub);
+	add_taint(TAINT_CRAP);
+}
+
+static void ub_synchronize_sched(struct rcu_head *rcu);
+static void delayed_cleanup_beancounter(struct work_struct *w);
+
+static void delayed_release_beancounter(struct work_struct *w)
+{
+	struct user_beancounter *ub;
+	unsigned long zero_limit = 0;
+	unsigned long flags;
+	int refcount;
+
+	ub = container_of(w, struct user_beancounter, work);
+
+	spin_lock_irqsave(&ub_hash_lock, flags);
+
+	refcount = atomic_read(&ub->ub_refcount);
+	if (refcount > 0)
+		/* raced with get_beancounter_byuid */
+		goto out;
+
+	if (WARN_ON((ub == get_ub0()))) {
+		printk(KERN_ERR "UB: Trying to put ub0\n");
+		goto out;
+	}
+
+	if (hlist_unhashed(&ub->ub_hash)) {
+		printk(KERN_ERR "UB: Trying to put unhashed ub %u (%p)\n",
+				ub->ub_uid, ub);
+		goto out;
+	}
+
+	ub_count--;
+	hlist_del_init_rcu(&ub->ub_hash);
+	list_del_rcu(&ub->ub_list);
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+	if (WARN_ON(refcount < 0))
+		printk(KERN_ERR "UB: Bad refcount (%d) on put of %u (%p)\n",
+				refcount, ub->ub_uid, ub);
+
+	ub_update_threshold();
+
+	/* reset commitment */
+	set_gang_limits(get_ub_gs(ub), &zero_limit, NULL);
+
+	ub_dcache_unuse(ub);
+
+	if (!verify_res(ub, ub_rnames[UB_KMEMSIZE],
+		       __get_beancounter_usage_percpu(ub, UB_KMEMSIZE)) ||
+	    refcount)
+		return leak_beancounter(ub);
+
+	forbid_beancounter_precharge(ub, 0);
+	/* synchronize with __try_charge_beancounter_percpu() */
+	call_rcu_sched(&ub->rcu, ub_synchronize_sched);
+	return;
+
+out:
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+}
+
+static void ub_synchronize_sched(struct rcu_head *rcu)
+{
+	struct user_beancounter *ub = container_of(rcu,
+			struct user_beancounter, rcu);
+
+	INIT_DELAYED_WORK(&ub->dwork, delayed_cleanup_beancounter);
+	queue_delayed_work(ub_clean_wq, &ub->dwork, 0);
+}
+
+static void delayed_cleanup_beancounter(struct work_struct *w)
+{
+	struct user_beancounter *ub;
+	long pages;
+
+	ub = container_of(w, struct user_beancounter, dwork.work);
+
+	junk_mem_gangs(get_ub_gs(ub));
+
+	pages = __get_beancounter_usage_percpu(ub, UB_SHADOWPAGES);
+	pages += __get_beancounter_usage_percpu(ub, UB_PHYSPAGES);
+
+	/*
+	 * Here we wait for all isolated pages. No new charges at this point,
+	 * so the per-cpu summing above is safe. The memory reclaimer cannot
+	 * peel pages from semi-dead beancounters, so we must not block here
+	 * because ubcleand is single-threaded. This function queues cleanup
+	 * again and again until all pages are moved to the junkyard.
+	 */
+	if (pages) {
+		queue_delayed_work(ub_clean_wq, &ub->dwork, 1);
+		return;
+	}
+
+	ub_unuse_swap(ub);
+
+	if (!bc_verify_held(ub))
+		return leak_beancounter(ub);
+
+	/* DEBUG: to trigger BUG_ON in precharge/charge/uncharge */
+	forbid_beancounter_precharge(ub, -1);
+	del_mem_gangs(get_ub_gs(ub));
+	ub_free_counters(ub);
+	percpu_counter_destroy(&ub->ub_orphan_count);
+	if (ub->ub_cgroup) {
+		ub_fini_ioprio(ub);
+		cgroup_kernel_close(ub->ub_cgroup);
+	}
+
+	call_rcu(&ub->rcu, bc_free_rcu);
+}
+
+void release_beancounter(struct user_beancounter *ub)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ub_hash_lock, flags);
+	if (!atomic_read(&ub->ub_refcount))
+		queue_work(ub_clean_wq, &ub->work);
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+}
+
+#endif /* CONFIG_BC_KEEP_UNUSED */
+
+EXPORT_SYMBOL(release_beancounter);
+
+/*
+ *	Generic resource charging stuff
+ */
+
+int __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n",
+			val, resource, ub, ub->ub_parms[resource].held);
+	/*
+	 * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition
+	 * at a time is possible, so an overflow is impossible.
+	 */
+	ub->ub_parms[resource].held += val;
+
+	switch (strict & ~UB_SEV_FLAGS) {
+		case UB_HARD:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].barrier)
+				break;
+		case UB_SOFT:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].limit)
+				break;
+		case UB_FORCE:
+			ub_adjust_maxheld(ub, resource);
+			return 0;
+		default:
+			BUG();
+	}
+
+	if (!(strict & UB_TEST)) {
+		if (strict == UB_SOFT && __ratelimit(&ub->ub_ratelimit))
+			printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n",
+			       ub_rnames[resource], ub->ub_uid);
+		ub->ub_parms[resource].failcnt++;
+	}
+	ub->ub_parms[resource].held -= val;
+	return -ENOMEM;
+}
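+
+/*
+ * Illustrative severity semantics for the switch above (the fall-through
+ * is intentional). With barrier=100, limit=200 and held=150 before the
+ * call: charging val=60 fails for UB_HARD (210 > barrier) and for UB_SOFT
+ * (210 > limit); val=20 fails for UB_HARD (170 > barrier) but succeeds
+ * for UB_SOFT (170 <= limit). UB_FORCE always succeeds, and UB_TEST
+ * suppresses the failcnt bump and the log message on failure.
+ */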
+
+int charge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	int retval;
+	unsigned long flags;
+
+	retval = -EINVAL;
+	if (val > UB_MAXVALUE)
+		goto out;
+
+	if (ub) {
+		spin_lock_irqsave(&ub->ub_lock, flags);
+		retval = __charge_beancounter_locked(ub, resource, val, strict);
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+	}
+out:
+	return retval;
+}
+
+EXPORT_SYMBOL(charge_beancounter);
+
+void uncharge_warn(struct user_beancounter *ub, const char *resource,
+		unsigned long val, unsigned long held)
+{
+	set_bit(UB_UNDERFLOW, &ub->ub_flags);
+	add_taint(TAINT_CRAP);
+	printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %u\n",
+			val, held, resource, ub->ub_uid);
+	ub_debug_trace(1, 10, 10*HZ);
+}
+
+void __uncharge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n",
+			val, resource, ub, ub->ub_parms[resource].held);
+	if (ub->ub_parms[resource].held < val) {
+		uncharge_warn(ub, ub_rnames[resource],
+				val, ub->ub_parms[resource].held);
+		val = ub->ub_parms[resource].held;
+	}
+	ub->ub_parms[resource].held -= val;
+}
+
+void uncharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	unsigned long flags;
+
+	if (ub) {
+		spin_lock_irqsave(&ub->ub_lock, flags);
+		__uncharge_beancounter_locked(ub, resource, val);
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+	}
+}
+
+EXPORT_SYMBOL(uncharge_beancounter);
+
+/* called with disabled interrupts */
+static int __precharge_beancounter_percpu(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct ub_percpu_struct *ub_pcpu = ub_percpu(ub, smp_processor_id());
+	int charge, retval;
+
+	BUG_ON(ub->ub_parms[resource].max_precharge < 0);
+
+	if (likely(ub_pcpu->precharge[resource] >= val))
+		return 0;
+
+	spin_lock(&ub->ub_lock);
+	charge = max((int)val, ub->ub_parms[resource].max_precharge >> 1) -
+		ub_pcpu->precharge[resource];
+	retval = __charge_beancounter_locked(ub, resource,
+			charge, UB_SOFT | UB_TEST);
+	if (!retval)
+		ub_pcpu->precharge[resource] += charge;
+	spin_unlock(&ub->ub_lock);
+
+	return retval;
+}
+
+/* called with disabled interrupts */
+int __charge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	int retval, precharge;
+
+	spin_lock(&ub->ub_lock);
+	precharge = max(0, (ub->ub_parms[resource].max_precharge >> 1) -
+			ub_pcpu->precharge[resource]);
+	retval = __charge_beancounter_locked(ub, resource,
+			val + precharge, UB_SOFT | UB_TEST);
+	if (!retval)
+		ub_pcpu->precharge[resource] += precharge;
+	else {
+		init_beancounter_precharge(ub, resource);
+		retval = __charge_beancounter_locked(ub, resource,
+				val, strict);
+	}
+	spin_unlock(&ub->ub_lock);
+
+	return retval;
+}
+EXPORT_SYMBOL(__charge_beancounter_percpu);
+
+/* called with disabled interrupts */
+void __uncharge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val)
+{
+	int uncharge;
+
+	spin_lock(&ub->ub_lock);
+	if (ub->ub_parms[resource].max_precharge !=
+			ub_resource_precharge[resource])
+		init_beancounter_precharge(ub, resource);
+	uncharge = max(0, ub_pcpu->precharge[resource] -
+			(ub->ub_parms[resource].max_precharge >> 1));
+	ub_pcpu->precharge[resource] -= uncharge;
+	smp_wmb();
+	__uncharge_beancounter_locked(ub, resource, val + uncharge);
+	spin_unlock(&ub->ub_lock);
+}
+EXPORT_SYMBOL(__uncharge_beancounter_percpu);
+
+unsigned long __get_beancounter_usage_percpu(struct user_beancounter *ub,
+		int resource)
+{
+	long held, precharge;
+
+	held = ub->ub_parms[resource].held;
+	smp_rmb();
+	precharge = __ub_percpu_sum(ub, precharge[resource]);
+
+	switch (resource) {
+	case UB_PHYSPAGES:
+		/* kmemsize precharge already charged into physpages  */
+		precharge += __ub_percpu_sum(ub, precharge[UB_KMEMSIZE]) >> PAGE_SHIFT;
+		break;
+	case UB_OOMGUARPAGES:
+		/* oomguarpages contains swappages and its precharge too */
+		precharge = __ub_percpu_sum(ub, precharge[UB_SWAPPAGES]);
+		break;
+	}
+
+	return held - precharge;
+}
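+
+/*
+ * Example of the correction above: if held is 1000 pages and the per-cpu
+ * precharge slots sum to 300, the externally visible usage is 700, since
+ * precharged amounts are reserved but not yet consumed. UB_PHYSPAGES also
+ * folds in the kmemsize precharge (converted to pages) and UB_OOMGUARPAGES
+ * mirrors the swappages precharge, matching ub_precharge_snapshot() above.
+ */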
+
+unsigned long get_beancounter_usage_percpu(struct user_beancounter *ub, int res)
+{
+	return max_t(long, 0, __get_beancounter_usage_percpu(ub, res));
+}
+
+int precharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	unsigned long flags;
+	int retval;
+
+	retval = -EINVAL;
+	if (val > UB_MAXVALUE)
+		goto out;
+
+	local_irq_save(flags);
+	if (ub)
+		retval = __precharge_beancounter_percpu(ub, resource, val);
+	local_irq_restore(flags);
+out:
+	return retval;
+}
+EXPORT_SYMBOL(precharge_beancounter);
+
+void ub_reclaim_rate_limit(struct user_beancounter *ub, int wait, unsigned count)
+{
+	ktime_t wall;
+	u64 step;
+
+	if (!ub->rl_step)
+		return;
+
+	spin_lock(&ub->rl_lock);
+	step = (u64)ub->rl_step * count;
+	wall = ktime_add_ns(ktime_get(), step);
+	if (wall.tv64 < ub->rl_wall.tv64)
+		wall = ktime_add_ns(ub->rl_wall, step);
+	ub->rl_wall = wall;
+	spin_unlock(&ub->rl_lock);
+
+	if (wait && get_exec_ub() == ub && !test_thread_flag(TIF_MEMDIE)) {
+		set_current_state(TASK_KILLABLE | TASK_IOTHROTTLED);
+		schedule_hrtimeout(&wall, HRTIMER_MODE_ABS);
+	}
+}
+
+/*
+ *	Initialization
+ *
+ *	struct user_beancounter contains
+ *	 - limits and other configuration settings,
+ *	   with a copy stored for accounting purposes,
+ *	 - structural fields: lists, spinlocks and so on.
+ *
+ *	Before these parts are initialized, the structure should be memset
+ *	to 0 or copied from a known clean structure.  That takes care of a lot
+ *	of fields not initialized explicitly.
+ */
+
+static void init_beancounter_struct(struct user_beancounter *ub)
+{
+	ub->ub_magic = UB_MAGIC;
+	atomic_set(&ub->ub_refcount, 1);
+	spin_lock_init(&ub->ub_lock);
+	INIT_LIST_HEAD(&ub->ub_tcp_sk_list);
+	INIT_LIST_HEAD(&ub->ub_other_sk_list);
+#ifdef CONFIG_BC_DEBUG_KMEM
+	INIT_LIST_HEAD(&ub->ub_cclist);
+#endif
+	INIT_LIST_HEAD(&ub->ub_dentry_lru);
+#ifndef CONFIG_BC_KEEP_UNUSED
+	INIT_WORK(&ub->work, delayed_release_beancounter);
+#endif
+	INIT_LIST_HEAD(&ub->ub_dentry_top);
+	init_oom_control(&ub->oom_ctrl);
+	spin_lock_init(&ub->rl_lock);
+	ub->rl_wall.tv64 = LLONG_MIN;
+}
+
+static void init_beancounter_nolimits(struct user_beancounter *ub)
+{
+	int k;
+
+	for (k = 0; k < UB_RESOURCES; k++) {
+		ub->ub_parms[k].limit = UB_MAXVALUE;
+		ub->ub_parms[k].barrier = UB_MAXVALUE;
+	}
+
+	/*
+	 * Unlimited vmguarpages gives immunity against the systemwide overcommit
+	 * policy. It makes sense in some cases, but by default we must obey it.
+	 */
+	ub->ub_parms[UB_VMGUARPAGES].barrier = 0;
+
+	/*
+	 * Unlimited oomguarpages makes a container or host mostly immune
+	 * to the OOM-killer while other containers exist. Yet we cannot set
+	 * it to zero, otherwise a single unconfigured container would be the
+	 * first target for the OOM-killer. 75% of RAM looks like a sane default.
+	 */
+	ub->ub_parms[UB_OOMGUARPAGES].barrier = totalram_pages * 3 / 4;
+
+	/* Ratelimit for messages in the kernel log */
+	ub->ub_ratelimit.burst = 4;
+	ub->ub_ratelimit.interval = 300*HZ;
+
+	/* VSwap ratelimit. Safe for ub0, its physpages are unlimited */
+	ub->rl_step = NSEC_PER_SEC / 25600; /* 100 MB/s with 4K pages */
+}
+
+static DEFINE_PER_CPU(struct ub_percpu_struct, ub0_percpu);
+
+void __init ub_init_early(void)
+{
+	struct user_beancounter *ub;
+
+	init_cache_counters();
+	ub = get_ub0();
+	ub->ub_uid = 0;
+	init_beancounter_nolimits(ub);
+	init_beancounter_struct(ub);
+	init_beancounter_precharges_early(ub);
+	ub->ub_percpu = &per_cpu_var(ub0_percpu);
+
+	memset(&current->task_bc, 0, sizeof(struct task_beancounter));
+	(void)set_exec_ub(ub);
+	current->task_bc.task_ub = get_beancounter_longterm(ub);
+	__charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE);
+	init_mm.mm_ub = get_beancounter_longterm(ub);
+
+	hlist_add_head(&ub->ub_hash, &ub_hash[ub->ub_uid]);
+	list_add(&ub->ub_list, &ub_list_head);
+	ub->dc_time = 0;
+	ub->dc_shrink_ts = 0;
+	rb_init_node(&ub->dc_node);
+	ub_count++;
+}
+
+static int proc_resource_precharge(ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(lock);
+	struct user_beancounter *ub;
+	int err;
+
+	mutex_lock(&lock);
+
+	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (err || !write)
+		goto out;
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		spin_lock_irq(&ub->ub_lock);
+		init_beancounter_precharges(ub);
+		spin_unlock_irq(&ub->ub_lock);
+	}
+	rcu_read_unlock();
+
+out:
+	mutex_unlock(&lock);
+	return err;
+}
+
+static unsigned int zero = 0;
+static unsigned int one = 1;
+static unsigned int hundred = 100;
+static int ubc_pagecache_isolation_id;
+static DEFINE_MUTEX(pagecache_isolation_lock);
+
+static int proc_pagecache_isolation(ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct user_beancounter *ub;
+	int err;
+
+	mutex_lock(&pagecache_isolation_lock);
+	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (err || !write)
+		goto out;
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (ubc_pagecache_isolation)
+			set_bit(UB_PAGECACHE_ISOLATION, &ub->ub_flags);
+		else
+			clear_bit(UB_PAGECACHE_ISOLATION, &ub->ub_flags);
+	}
+	rcu_read_unlock();
+out:
+	mutex_unlock(&pagecache_isolation_lock);
+	return err;
+}
+
+static int proc_pagecache_isolation_change(ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct user_beancounter *ub;
+	int err;
+
+	mutex_lock(&pagecache_isolation_lock);
+	err = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (err || !write)
+		goto out;
+	ub = get_beancounter_byuid(ubc_pagecache_isolation_id, 0);
+	if (ub) {
+		if (table->extra1)
+			set_bit(UB_PAGECACHE_ISOLATION, &ub->ub_flags);
+		else
+			clear_bit(UB_PAGECACHE_ISOLATION, &ub->ub_flags);
+		put_beancounter(ub);
+	} else
+		err = -ENOENT;
+out:
+	mutex_unlock(&pagecache_isolation_lock);
+	return err;
+}
+
+static ctl_table ub_sysctl_table[] = {
+	{
+		.procname	= "resource_precharge",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ub_resource_precharge,
+		.extra1		= &resource_precharge_min,
+		.extra2		= &resource_precharge_max,
+		.maxlen		= sizeof(ub_resource_precharge),
+		.mode		= 0644,
+		.proc_handler	= &proc_resource_precharge,
+	},
+	{
+		.procname	= "dcache_threshold_ratio",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ub_dcache_thres_ratio,
+		.maxlen		= sizeof(ub_dcache_thres_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &hundred,
+	},
+	{
+		.procname	= "dcache_shrink_time_threshold",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ub_dcache_time_thresh,
+		.maxlen		= sizeof(ub_dcache_time_thresh),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "dcache_lru_popup",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ub_dcache_lru_popup,
+		.maxlen		= sizeof(ub_dcache_lru_popup),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "dcache_no_vzfs_cache",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ub_dcache_no_vzfs_cache,
+		.maxlen		= sizeof(ub_dcache_no_vzfs_cache),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "ioprio",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ubc_ioprio,
+		.maxlen		= sizeof(ubc_ioprio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#ifdef CONFIG_BC_IO_ACCOUNTING
+	{
+		.procname	= "dirty_ratio",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ub_dirty_radio,
+		.maxlen		= sizeof ub_dirty_radio,
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "dirty_background_ratio",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ub_dirty_background_ratio,
+		.maxlen		= sizeof ub_dirty_background_ratio,
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "pagecache_isolation",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ubc_pagecache_isolation,
+		.maxlen		= sizeof ubc_pagecache_isolation,
+		.mode		= 0644,
+		.proc_handler	= proc_pagecache_isolation,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.procname	= "pagecache_isolation_on",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ubc_pagecache_isolation_id,
+		.maxlen		= sizeof ubc_pagecache_isolation_id,
+		.mode		= 0200,
+		.proc_handler	= proc_pagecache_isolation_change,
+		.extra1		= &one,
+	},
+	{
+		.procname	= "pagecache_isolation_off",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &ubc_pagecache_isolation_id,
+		.maxlen		= sizeof ubc_pagecache_isolation_id,
+		.mode		= 0200,
+		.proc_handler	= proc_pagecache_isolation_change,
+	},
+#endif /* CONFIG_BC_IO_ACCOUNTING */
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ub_sysctl_root[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ubc",
+		.mode		= 0555,
+		.child		= ub_sysctl_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+void __init ub_init_late(void)
+{
+	register_sysctl_table(ub_sysctl_root);
+	ub_cachep = kmem_cache_create("user_beancounters",
+			sizeof(struct user_beancounter),
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+	init_oom_control(&global_oom_ctrl);
+
+	init_beancounter_nolimits(&ub0);
+	set_gang_limits(get_ub_gs(&ub0), &ub0.ub_parms[UB_PHYSPAGES].limit,
+					 &node_states[N_HIGH_MEMORY]);
+}
+
+static __init int ub_init_wq(void)
+{
+	ub_clean_wq = create_singlethread_workqueue("ubcleand");
+	if (ub_clean_wq == NULL)
+		panic("Can't create ubclean wq");
+	return 0;
+}
+
+late_initcall(ub_init_wq);
+
+int __init ub_init_cgroup(void)
+{
+	struct vfsmount *mnt;
+	struct cgroup_sb_opts opts = {
+		.name		= "beancounter",
+		.subsys_bits    = 1ul << blkio_subsys_id,
+	};
+
+	mnt = cgroup_kernel_mount(&opts);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+	ub_cgroup_root = cgroup_get_root(mnt);
+
+	if (!ubc_ioprio)
+		return 0;
+
+	ub0.ub_cgroup = cgroup_kernel_open(ub_cgroup_root, CGRP_CREAT, "0");
+	if (IS_ERR(ub0.ub_cgroup))
+		return PTR_ERR(ub0.ub_cgroup);
+
+	return cgroup_kernel_attach(ub0.ub_cgroup, init_pid_ns.child_reaper);
+}
+late_initcall(ub_init_cgroup);
+
+static int __init parse_ubc_ioprio(char *arg)
+{
+	ubc_ioprio = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+__setup("ubc.ioprio=", parse_ubc_ioprio);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/dcache.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/dcache.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/dcache.c	2015-01-21 12:02:43.156228178 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/dcache.c	2015-01-21 12:02:58.805812736 +0300
@@ -0,0 +1,456 @@
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+#include <bc/dcache.h>
+#include <bc/kmem.h>
+
+static struct rb_root bc_dcache_root = RB_ROOT;
+
+static unsigned int dcache_charge_size(int name_len)
+{
+	return dentry_cache->objuse + inode_cachep->objuse +
+		(name_len > DNAME_INLINE_LEN ? name_len : 0);
+}
+
+int ub_dcache_shrink(struct user_beancounter *ub,
+		unsigned long size, gfp_t gfp_mask)
+{
+	int count, pruned;
+
+	if (!(gfp_mask & __GFP_FS))
+		return -EBUSY;
+
+	count = DIV_ROUND_UP(size, dcache_charge_size(0));
+	spin_lock(&dcache_lock);
+	pruned = __shrink_dcache_ub(ub, count, ub_dcache_lru_popup);
+	spin_unlock(&dcache_lock);
+	if (!pruned)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __ub_dcache_insert(struct user_beancounter *ub)
+{
+	struct rb_node **new = &(bc_dcache_root.rb_node), *parent = NULL;
+
+	while (*new) {
+		struct user_beancounter *this;
+
+		this = container_of(*new, struct user_beancounter, dc_node);
+		parent = *new;
+		if (ub->dc_time <= this->dc_time)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&ub->dc_node, parent, new);
+	rb_insert_color(&ub->dc_node, &bc_dcache_root);
+}
+
+void ub_dcache_insert(struct user_beancounter *ub, unsigned int d_time)
+{
+	if (ub->ub_dentry_unused <= ub->ub_dcache_threshold) {
+		if (list_empty(&ub->ub_dentry_lru))
+			ub->dc_time = d_time;
+		return;
+	}
+
+	if (RB_EMPTY_NODE(&ub->dc_node))
+		__ub_dcache_insert(ub);
+}
+
+static inline void __ub_dcache_remove(struct user_beancounter *ub)
+{
+	rb_erase(&ub->dc_node, &bc_dcache_root);
+}
+
+static inline void ub_dcache_remove(struct user_beancounter *ub)
+{
+	if (!RB_EMPTY_NODE(&ub->dc_node)) {
+		__ub_dcache_remove(ub);
+		RB_CLEAR_NODE(&ub->dc_node);
+	}
+}
+
+bool ub_dcache_shrinkable(gfp_t gfp_mask)
+{
+	/* see prune_dcache_popup() */
+	if (gfp_mask & __GFP_REPEAT)
+		return true;
+
+	if (ub_dcache_lru_popup && RB_EMPTY_ROOT(&bc_dcache_root))
+		return false;
+
+	return true;
+}
+
+struct user_beancounter *ub_dcache_next(void)
+{
+	struct rb_node *n;
+	struct user_beancounter *bc;
+	struct dentry *de;
+
+again:
+	/* FIXME, keep the leftmost pointer */
+	n = bc_dcache_root.rb_node;
+	if (!n)
+		return NULL;
+
+	while (n->rb_left != NULL)
+		n = n->rb_left;
+
+	bc = container_of(n, struct user_beancounter, dc_node);
+	/*
+	 * The ub_dentry_lru list may be empty even if ub_dentry_unused > 0,
+	 * because in __shrink_dcache_ub we can drop the dcache_lock between
+	 * removing unused dentries from the list and fixing up the counter.
+	 */
+	if (bc->ub_dentry_unused <= bc->ub_dcache_threshold ||
+	    list_empty(&bc->ub_dentry_lru)) {
+		ub_dcache_remove(bc);
+		goto again;
+	}
+
+	de = list_entry(bc->ub_dentry_lru.prev, struct dentry, d_bclru);
+
+	if (de->d_lru_time > bc->dc_time) {
+		__ub_dcache_remove(bc);
+		bc->dc_time = de->d_lru_time;
+		__ub_dcache_insert(bc);
+		goto again;
+	}
+
+	if (!get_beancounter_rcu(bc)) {
+		ub_dcache_remove(bc);
+		goto again;
+	}
+
+	bc->dc_shrink_ts = dcache_update_time();
+	return bc;
+}
+
+static void __ub_dcache_charge_nofail(struct user_beancounter *ub,
+		unsigned long size)
+{
+	ub_kmem_charge(ub, size, GFP_ATOMIC|__GFP_NOFAIL);
+	charge_beancounter_fast(ub, UB_DCACHESIZE, size, UB_FORCE);
+}
+
+static void __ub_dcache_uncharge(struct user_beancounter *ub,
+		unsigned long size)
+{
+	uncharge_beancounter_fast(ub, UB_DCACHESIZE, size);
+	ub_kmem_uncharge(ub, size);
+}
+
+int ub_dcache_charge(struct user_beancounter *ub, int name_len)
+{
+	int size, shrink;
+
+	if (ub_check_ram_limits(ub, GFP_KERNEL))
+		return -ENOMEM;
+
+	size = dcache_charge_size(name_len);
+	if (ub_kmem_charge(ub, size, GFP_KERNEL|__GFP_NOWARN|__GFP_SOFT_UBC))
+		return -ENOMEM;
+
+	do {
+		if (!charge_beancounter_fast(ub, UB_DCACHESIZE, size,
+					     UB_SOFT | UB_TEST))
+			return 0;
+
+		shrink = max(size, ub->ub_parms[UB_DCACHESIZE].max_precharge);
+	} while (!ub_dcache_shrink(ub, shrink, GFP_KERNEL));
+
+	ub_kmem_uncharge(ub, size);
+
+	spin_lock_irq(&ub->ub_lock);
+	ub->ub_parms[UB_DCACHESIZE].failcnt++;
+	spin_unlock_irq(&ub->ub_lock);
+
+	return -ENOMEM;
+}
+
+void ub_dcache_uncharge(struct user_beancounter *ub, int name_len)
+{
+	unsigned int size;
+
+	size = dcache_charge_size(name_len);
+	__ub_dcache_uncharge(ub, size);
+}
+
+static unsigned long recharge_subtree(struct dentry *d, struct user_beancounter *ub,
+		struct user_beancounter *cub)
+{
+	struct dentry *orig_root;
+	unsigned long size = 0;
+
+	orig_root = d;
+
+	while (1) {
+		if (d->d_ub != cub) {
+			if (!(d->d_flags & DCACHE_BCTOP)) {
+				printk("%s %s %d %d %d %p %p %p %p\n", __func__,
+						d->d_name.name,
+						d->d_ub->ub_uid,
+						ub->ub_uid,
+						cub->ub_uid,
+						d, d->d_ub, ub, cub);
+				WARN_ON(1);
+			}
+			goto skip_subtree;
+		} else if (d->d_ub == ub)
+			goto skip_recharge;
+
+		if (!list_empty(&d->d_lru)) {
+			ub_dcache_insert(ub, d->d_lru_time);
+			list_move(&d->d_bclru, &ub->ub_dentry_lru);
+			cub->ub_dentry_unused--;
+			ub->ub_dentry_unused++;
+		}
+
+		d->d_ub = ub;
+skip_recharge:
+		size += dcache_charge_size(d->d_name.len);
+
+		if (!list_empty(&d->d_subdirs)) {
+			d = list_entry(d->d_subdirs.next,
+					struct dentry, d_u.d_child);
+			continue;
+		}
+skip_subtree:
+		if (d == orig_root)
+			break;
+		while (d == list_entry(d->d_parent->d_subdirs.prev,
+					struct dentry, d_u.d_child)) {
+			d = d->d_parent;
+			if (d == orig_root)
+				goto out;
+		}
+		d = list_entry(d->d_u.d_child.next,
+				struct dentry, d_u.d_child);
+	}
+out:
+	return size;
+}
+
+unsigned long ub_dcache_get_size(struct dentry *dentry)
+{
+	unsigned long size;
+
+	spin_lock(&dcache_lock);
+	size = recharge_subtree(dentry, dentry->d_ub, dentry->d_ub);
+	spin_unlock(&dcache_lock);
+
+	return size;
+}
+
+void ub_dcache_set_owner(struct dentry *root, struct user_beancounter *ub)
+{
+	struct user_beancounter *cub;
+	unsigned long size;
+
+	spin_lock(&dcache_lock);
+
+	cub = root->d_ub;
+	if (ub != cub) {
+		size = recharge_subtree(root, ub, cub);
+		__ub_dcache_uncharge(cub, size);
+		__ub_dcache_charge_nofail(ub, size);
+	}
+
+	if (root->d_flags & DCACHE_BCTOP) {
+		list_del(&root->d_bclru);
+	} else {
+		spin_lock(&root->d_lock);
+		root->d_flags |= DCACHE_BCTOP;
+		spin_unlock(&root->d_lock);
+	}
+
+	if (!list_empty(&root->d_lru)) {
+		list_del_init(&root->d_lru);
+		list_del(&root->d_bclru);
+		root->d_sb->s_nr_dentry_unused--;
+		cub->ub_dentry_unused--;
+		dentry_stat.nr_unused--;
+	}
+
+	list_add_tail(&root->d_bclru, &ub->ub_dentry_top);
+
+	spin_unlock(&dcache_lock);
+}
+EXPORT_SYMBOL(ub_dcache_set_owner);
+
+void ub_dcache_change_owner(struct dentry *dentry, struct user_beancounter *ub)
+{
+	struct user_beancounter *cub = dentry->d_ub;
+	long size;
+
+	size = recharge_subtree(dentry, ub, cub);
+	__ub_dcache_uncharge(cub, size);
+	__ub_dcache_charge_nofail(ub, size);
+}
+
+#define UB_DCACHE_BATCH 32
+
+int ub_dcache_reclaim(struct user_beancounter *ub,
+		unsigned long numerator, unsigned long denominator)
+{
+	unsigned long flags, batch;
+	int ret = 1;
+
+	if (ub->ub_dentry_unused <= ub->ub_dcache_threshold)
+		return 0;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	batch = ub->ub_dentry_unused * numerator / denominator;
+	/*
+	 * Don't raise the reclaimer's hopes if progress is not guaranteed.
+	 * It's better to kill someone than to get stuck forever.
+	 */
+	if (!batch)
+		ret = 0;
+	batch = ub->ub_dentry_batch = batch + ub->ub_dentry_batch;
+	if (batch < UB_DCACHE_BATCH)
+		batch = 0;
+	else
+		ub->ub_dentry_batch = 0;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	if (batch) {
+		spin_lock(&dcache_lock);
+		ret = __shrink_dcache_ub(ub, batch, ub_dcache_lru_popup);
+		spin_unlock(&dcache_lock);
+	}
+	return ret;
+}
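+
+/*
+ * Batching example (illustrative): with UB_DCACHE_BATCH == 32, a caller
+ * whose numerator/denominator share comes to 10 dentries per call
+ * accumulates 10, 20, 30 in ub_dentry_batch without shrinking anything;
+ * the fourth call reaches 40 >= 32, resets the accumulator and prunes
+ * all 40 dentries in one pass under dcache_lock.
+ */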
+
+/* under dcache_lock and dentry->d_lock */
+void ub_dcache_clear_owner(struct dentry *dentry)
+{
+	struct user_beancounter *ub, *cub;
+	long size;
+
+	BUG_ON(!list_empty(&dentry->d_subdirs));
+	BUG_ON(!(dentry->d_flags & DCACHE_BCTOP));
+
+	cub = dentry->d_ub;
+	ub = IS_ROOT(dentry) ? get_ub0() : dentry->d_parent->d_ub;
+	dentry->d_ub = ub;
+
+	size = dcache_charge_size(dentry->d_name.len);
+	__ub_dcache_uncharge(cub, size);
+	__ub_dcache_charge_nofail(ub, size);
+
+	dentry->d_flags &= ~DCACHE_BCTOP;
+
+	list_del(&dentry->d_bclru);
+}
+
+void ub_dcache_unuse(struct user_beancounter *cub)
+{
+	struct dentry *dentry, *tmp;
+	struct user_beancounter *ub;
+	long size;
+
+	spin_lock(&dcache_lock);
+	ub_dcache_remove(cub);
+	list_for_each_entry_safe(dentry, tmp, &cub->ub_dentry_top, d_bclru) {
+		/* umount in progress */
+		if (!atomic_read(&dentry->d_sb->s_active))
+			continue;
+
+		BUG_ON(dentry->d_ub != cub);
+		ub = IS_ROOT(dentry) ? get_ub0() : dentry->d_parent->d_ub;
+
+		size = recharge_subtree(dentry, ub, cub);
+		__ub_dcache_uncharge(cub, size);
+		__ub_dcache_charge_nofail(ub, size);
+
+		spin_lock(&dentry->d_lock);
+		BUG_ON(!(dentry->d_flags & DCACHE_BCTOP));
+		dentry->d_flags &= ~DCACHE_BCTOP;
+		spin_unlock(&dentry->d_lock);
+
+		list_del(&dentry->d_bclru);
+	}
+	spin_unlock(&dcache_lock);
+
+	/* wait for concurrent umounts */
+	while (!list_empty(&cub->ub_dentry_top))
+		schedule_timeout_uninterruptible(1);
+
+	BUG_ON(!list_empty(&cub->ub_dentry_lru));
+}
+
+static void __ub_update_threshold(struct user_beancounter *ub,
+		unsigned long available_mem, unsigned long long sum_limit)
+{
+	unsigned long limit;
+	unsigned long long res;
+
+	limit = min(ub->ub_parms[UB_PHYSPAGES].limit, available_mem);
+
+	res = limit;
+	res *= available_mem / 100;
+	res *= ub_dcache_thres_ratio;
+	res = div64_u64(res, sum_limit);
+
+	ub->ub_dcache_threshold = (unsigned long)(res << PAGE_SHIFT) /
+							dcache_charge_size(0);
+}
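+
+/*
+ * Sketch of the math above (integer divisions aside):
+ *
+ *   threshold = limit * (available_mem * ub_dcache_thres_ratio / 100)
+ *               / sum_limit                                  [pages]
+ *
+ * i.e. ub_dcache_thres_ratio percent of low memory is split between
+ * beancounters in proportion to their clamped physpages limits, then
+ * converted from bytes into dentries by the per-dentry charge size.
+ */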
+
+void ub_update_threshold(void)
+{
+	static DEFINE_MUTEX(mutex);
+	struct user_beancounter *ub;
+	unsigned long available_mem;
+	unsigned long long sum_limit = 0;
+
+	available_mem = totalram_pages - totalhigh_pages;
+
+	mutex_lock(&mutex);
+	rcu_read_lock();
+
+	for_each_beancounter(ub) {
+		if (!get_beancounter_rcu(ub))
+			continue;
+		rcu_read_unlock();
+
+		sum_limit += min(ub->ub_parms[UB_PHYSPAGES].limit,
+				 available_mem);
+
+		rcu_read_lock();
+		put_beancounter(ub);
+	}
+
+	for_each_beancounter(ub) {
+		if (!get_beancounter_rcu(ub))
+			continue;
+		rcu_read_unlock();
+
+		__ub_update_threshold(ub, available_mem, sum_limit);
+
+		rcu_read_lock();
+		put_beancounter(ub);
+	}
+
+	rcu_read_unlock();
+	mutex_unlock(&mutex);
+}
+
+static __init int ub_set_ub0_thres(void)
+{
+	printk("Dcache charge unit %u\n", dcache_charge_size(0));
+	ub_update_threshold();
+	return 0;
+}
+late_initcall(ub_set_ub0_thres);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/io_acct.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/io_acct.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/io_acct.c	2015-01-21 12:02:43.346223134 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/io_acct.c	2015-01-21 12:02:43.346223134 +0300
@@ -0,0 +1,297 @@
+/*
+ *  kernel/ub/io_acct.c
+ *
+ *  Copyright (C) 2006  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ *  Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/mempool.h>
+#include <linux/proc_fs.h>
+#include <linux/virtinfo.h>
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/proc.h>
+#include <bc/vmpages.h>
+
+/*
+ * Start writeback when dirty memory reaches this percentage of the
+ * physpages limit; ub_dirty_background_ratio below is the background
+ * writeback threshold.
+ */
+int ub_dirty_radio = 50;
+int ub_dirty_background_ratio = 30;
+
+/* the helpers below run under the write lock of mapping->tree_lock */
+
+void ub_io_account_dirty(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	WARN_ON_ONCE(!radix_tree_tagged(&mapping->page_tree,
+				PAGECACHE_TAG_DIRTY));
+
+	if (!ub)
+		ub = mapping->dirtied_ub = get_beancounter(get_io_ub());
+
+	ub_stat_inc(ub, dirty_pages);
+}
+
+void ub_io_account_clean(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+	size_t bytes = PAGE_SIZE;
+
+	if (unlikely(!ub)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	ub_stat_dec(ub, dirty_pages);
+
+	ub_percpu_inc(ub, async_write_complete);
+
+	ub = set_exec_ub(ub);	/* run the notifier in the dirtying ub's context */
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_ACCOUNT, &bytes);
+	ub = set_exec_ub(ub);	/* restore; ub points to the dirtying ub again */
+
+	if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) &&
+	    (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_WRITEBACK) ||
+	     !mapping_cap_account_writeback(mapping))) {
+		mapping->dirtied_ub = NULL;
+		__put_beancounter(ub);
+	}
+}
+
+void ub_io_account_cancel(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	if (unlikely(!ub)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	ub_stat_dec(ub, dirty_pages);
+
+	ub_percpu_inc(ub, async_write_canceled);
+
+	if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) &&
+	    (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_WRITEBACK) ||
+	     !mapping_cap_account_writeback(mapping))) {
+		mapping->dirtied_ub = NULL;
+		__put_beancounter(ub);
+	}
+}
+
+void ub_io_writeback_inc(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	WARN_ON_ONCE(!radix_tree_tagged(&mapping->page_tree,
+				PAGECACHE_TAG_WRITEBACK));
+
+	if (!ub)
+		ub = mapping->dirtied_ub = get_beancounter(get_io_ub());
+
+	ub_stat_inc(ub, writeback_pages);
+}
+
+void ub_io_writeback_dec(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	if (unlikely(!ub)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	ub_stat_dec(ub, writeback_pages);
+
+	if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_WRITEBACK) &&
+	    (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) ||
+	     !mapping_cap_account_dirty(mapping))) {
+		mapping->dirtied_ub = NULL;
+		__put_beancounter(ub);
+	}
+}
+
+int ub_dirty_limits(unsigned long *pbackground,
+		    long *pdirty, struct user_beancounter *ub)
+{
+	int dirty_ratio;
+	unsigned long available_memory;
+
+	dirty_ratio = ub_dirty_radio;
+	if (!dirty_ratio)
+		return 0;
+
+	available_memory = ub->ub_parms[UB_PHYSPAGES].limit;
+	if (available_memory == UB_MAXVALUE || available_memory == 0)
+		return 0;
+
+	*pdirty = (dirty_ratio * available_memory) / 100;
+
+	dirty_ratio = ub_dirty_background_ratio;
+	*pbackground = (dirty_ratio * available_memory) / 100;
+	if (!dirty_ratio || *pbackground >= *pdirty)
+		*pbackground = *pdirty / 2;
+
+	return 1;
+}
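+
+/*
+ * Worked example (illustrative, assuming 4 KiB pages): for a
+ * beancounter with a physpages limit of 262144 pages (1 GiB) and the
+ * defaults above, *pdirty = 50% = 131072 pages and *pbackground =
+ * 30% = 78643 pages; throttling starts at the former, background
+ * writeback at the latter.
+ */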
+
+bool ub_should_skip_writeback(struct user_beancounter *ub, struct inode *inode)
+{
+	struct user_beancounter *dirtied_ub;
+	bool ret;
+
+	rcu_read_lock();
+	dirtied_ub = rcu_dereference(inode->i_mapping->dirtied_ub);
+	ret = !dirtied_ub || (dirtied_ub != ub &&
+			!test_bit(UB_DIRTY_EXCEEDED, &dirtied_ub->ub_flags));
+	rcu_read_unlock();
+
+	return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+#define in_flight(var)	(var > var##_done ? var - var##_done : 0)
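+/*
+ * E.g. in_flight(sync) expands to (sync > sync_done ? sync - sync_done : 0);
+ * the clamp guards against transient skew while the per-cpu counters are
+ * summed without synchronization.
+ */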
+
+static int bc_ioacct_show(struct seq_file *f, void *v)
+{
+	int i;
+	unsigned long long read, write, cancel;
+	unsigned long sync, sync_done;
+	unsigned long fsync, fsync_done;
+	unsigned long fdsync, fdsync_done;
+	unsigned long frsync, frsync_done;
+	struct user_beancounter *ub;
+	unsigned long dirty_pages;
+	unsigned long long dirtied;
+	unsigned long fuse_requests, fuse_bytes;
+
+	ub = seq_beancounter(f);
+
+	dirty_pages = __ub_stat_get(ub, dirty_pages);
+
+	read = write = cancel = 0;
+	sync = sync_done = fsync = fsync_done =
+		fdsync = fdsync_done = frsync = frsync_done = 0;
+	fuse_requests = fuse_bytes = 0;
+	for_each_online_cpu(i) {
+		struct ub_percpu_struct *ub_percpu;
+		ub_percpu = per_cpu_ptr(ub->ub_percpu, i);
+
+		read += ub_percpu->sync_read_bytes;
+		write += ub_percpu->sync_write_bytes;
+
+		dirty_pages += ub_percpu->dirty_pages;
+		write += (u64)ub_percpu->async_write_complete << PAGE_SHIFT;
+		cancel += (u64)ub_percpu->async_write_canceled << PAGE_SHIFT;
+
+		sync += ub_percpu->sync;
+		fsync += ub_percpu->fsync;
+		fdsync += ub_percpu->fdsync;
+		frsync += ub_percpu->frsync;
+		sync_done += ub_percpu->sync_done;
+		fsync_done += ub_percpu->fsync_done;
+		fdsync_done += ub_percpu->fdsync_done;
+		frsync_done += ub_percpu->frsync_done;
+
+		fuse_requests += ub_percpu->fuse_requests;
+		fuse_bytes += ub_percpu->fuse_bytes;
+	}
+
+	if ((long)dirty_pages < 0)
+		dirty_pages = 0;
+
+	dirtied = write + cancel;
+	dirtied += (u64)dirty_pages << PAGE_SHIFT;
+
+	seq_printf(f, bc_proc_llu_fmt, "read", read);
+	seq_printf(f, bc_proc_llu_fmt, "write", write);
+	seq_printf(f, bc_proc_llu_fmt, "dirty", dirtied);
+	seq_printf(f, bc_proc_llu_fmt, "cancel", cancel);
+	seq_printf(f, bc_proc_llu_fmt, "missed", 0ull);
+
+	seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync);
+	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync);
+	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync);
+	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync);
+
+	seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync));
+	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync));
+	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fsync));
+	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", in_flight(frsync));
+
+	seq_printf(f, bc_proc_lu_lfmt, "io_pbs", dirty_pages);
+
+	seq_printf(f, bc_proc_lu_lfmt, "fuse_requests", fuse_requests);
+	seq_printf(f, bc_proc_lu_lfmt, "fuse_bytes", fuse_bytes);
+
+	return 0;
+}
+
+static struct bc_proc_entry bc_ioacct_entry = {
+	.name = "ioacct",
+	.u.show = bc_ioacct_show,
+};
+
+static int bc_ioacct_notify(struct vnotifier_block *self,
+		unsigned long event, void *arg, int old_ret)
+{
+	struct user_beancounter *ub;
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long *vm_events;
+	unsigned long long bin, bout;
+	int i;
+
+	if (event != VIRTINFO_VMSTAT)
+		return old_ret;
+
+	ub = get_exec_ub();
+	if (ub == get_ub0())
+		return old_ret;
+
+	/* Think it over: do we need to account bytes_dirty_missed here? */
+	bout = 0;
+	bin = 0;
+	for_each_online_cpu(i) {
+		ub_pcpu = per_cpu_ptr(ub->ub_percpu, i);
+		bout += (u64)ub_pcpu->async_write_complete << PAGE_SHIFT;
+		bout += ub_pcpu->sync_write_bytes;
+		bin += ub_pcpu->sync_read_bytes;
+	}
+
+	/* convert to Kbytes */
+	bout >>= 10;
+	bin >>= 10;
+
+	vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS;
+	vm_events[PGPGOUT] = (unsigned long)bout;
+	vm_events[PGPGIN] = (unsigned long)bin;
+	return NOTIFY_OK;
+}
+
+static struct vnotifier_block bc_ioacct_nb = {
+	.notifier_call = bc_ioacct_notify,
+};
+
+static int __init bc_ioacct_init(void)
+{
+	bc_register_proc_entry(&bc_ioacct_entry);
+
+	virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb);
+	return 0;
+}
+
+late_initcall(bc_ioacct_init);
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/io_prio.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/io_prio.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/io_prio.c	2015-01-21 12:02:43.346223134 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/io_prio.c	2015-01-21 12:02:43.349223054 +0300
@@ -0,0 +1,227 @@
+/*
+ *  kernel/bc/io_prio.c
+ *
+ *  Copyright (C) 2010  Parallels, inc.
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.Parallels" file.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <bc/beancounter.h>
+#include <bc/proc.h>
+#include "blk-cgroup.h"
+
+static u64 ioprio_weight[UB_IOPRIO_MAX] = {320, 365, 410, 460, 500, 550, 600, 640};
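+
+/*
+ * The table maps beancounter io priorities 0..UB_IOPRIO_MAX-1 onto
+ * blkio cgroup weights: prio 0 becomes weight 320, prio 7 weight 640.
+ * bc_ioprio_show() below inverts the mapping by scanning the table
+ * downwards.
+ */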
+
+void ub_init_ioprio(struct user_beancounter *ub)
+{
+	blkio_cgroup_set_ub(ub->ub_cgroup, ub);
+}
+
+void ub_fini_ioprio(struct user_beancounter *ub)
+{
+	blkio_cgroup_set_ub(ub->ub_cgroup, &ub0);
+}
+
+int ub_set_ioprio(int id, int ioprio)
+{
+	struct user_beancounter *ub;
+	int ret;
+
+	ret = -ERANGE;
+	if (ioprio < UB_IOPRIO_MIN || ioprio >= UB_IOPRIO_MAX)
+		goto out;
+
+	ret = -ESRCH;
+	ub = get_beancounter_byuid(id, 0);
+	if (!ub)
+		goto out;
+
+	if (ub->ub_cgroup)
+		ret = blkio_cgroup_set_weight(ub->ub_cgroup,
+				ioprio_weight[ioprio]);
+	else
+		ret = -ENOTSUPP;
+	put_beancounter_longterm(ub);
+out:
+	return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static int bc_iostat(struct seq_file *f, struct user_beancounter *bc)
+{
+	struct blkio_group_stats *stats;
+	struct blkio_cgroup *blkcg;
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+
+	seq_printf(f, "%s %u %c %lu %lu %lu %u %u %lu %lu %lu %lu\n",
+			"flush" ,
+			(unsigned)bc->ub_uid, '.',
+			0ul, 0ul, 0ul, 0, 0,
+			ub_stat_get_exact(bc, wb_requests),
+			ub_stat_get_exact(bc, wb_sectors), 0ul, 0ul);
+
+	seq_printf(f, "%s %u %c %lu %lu %lu %u %u %lu %lu %lu %lu\n",
+			"fuse" ,
+			(unsigned)bc->ub_uid, '.',
+			0ul, 0ul, 0ul, 0, 0,
+			__ub_percpu_sum(bc, fuse_requests),
+			__ub_percpu_sum(bc, fuse_bytes) >> 9, 0ul, 0ul);
+
+	if (!bc->ub_cgroup)
+		return 0;
+
+	blkcg = cgroup_to_blkio_cgroup(bc->ub_cgroup);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		unsigned long queued, rds, wrs, serviced, sectors;
+		unsigned int used_time, wait_time;
+		uint64_t tmp;
+
+		if (!blkg->dev || blkg->plid != BLKIO_POLICY_PROP)
+			continue;
+
+		spin_lock_irq(&blkg->stats_lock);
+		stats = &blkg->stats;
+		queued    = stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
+			    stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
+		rds  = blkio_read_stat_cpu(blkg, BLKIO_STAT_CPU_SERVICED, BLKIO_STAT_READ);
+		wrs = blkio_read_stat_cpu(blkg, BLKIO_STAT_CPU_SERVICED, BLKIO_STAT_WRITE);
+		serviced = rds + wrs;
+
+		tmp	  = stats->stat_arr[BLKIO_STAT_WAIT_TIME][BLKIO_STAT_READ] +
+			    stats->stat_arr[BLKIO_STAT_WAIT_TIME][BLKIO_STAT_WRITE];
+		do_div(tmp, NSEC_PER_MSEC);
+		wait_time = tmp;
+
+		used_time = jiffies_to_msecs(stats->time);
+		sectors   = blkio_read_stat_cpu(blkg, BLKIO_STAT_CPU_SECTORS, 0);
+		spin_unlock_irq(&blkg->stats_lock);
+		seq_printf(f, "%s %u %c %lu %lu %lu %u %u %lu %lu %lu %lu\n",
+				blkg->dev_name ?: "none",
+				(unsigned)bc->ub_uid, '.',
+				queued, 0ul, 0ul,
+				wait_time, used_time,
+				serviced, sectors, rds, wrs);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static int bc_iostat_single(struct seq_file *f, void *v)
+{
+	return bc_iostat(f, seq_beancounter(f));
+}
+
+static struct bc_proc_entry bc_iostat_entry = {
+	.name = "iostat",
+	.u.show = bc_iostat_single,
+};
+
+static void *bc_iostat_start(struct seq_file *f, loff_t *ppos)
+{
+	struct user_beancounter *ub;
+	unsigned long pos = *ppos;
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (!pos--)
+			return ub;
+	}
+	return NULL;
+}
+
+static void *bc_iostat_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+	struct user_beancounter *ub = v;
+	struct list_head *entry;
+
+	entry = &ub->ub_list;
+	list_for_each_continue_rcu(entry, &ub_list_head) {
+		ub = list_entry(entry, struct user_beancounter, ub_list);
+		(*ppos)++;
+		return ub;
+	}
+	return NULL;
+}
+
+static int bc_iostat_show(struct seq_file *f, void *v)
+{
+	return bc_iostat(f, v);
+}
+
+static void bc_iostat_stop(struct seq_file *f, void *v)
+{
+	rcu_read_unlock();
+}
+
+static struct seq_operations iostat_seq_ops = {
+	.start = bc_iostat_start,
+	.next  = bc_iostat_next,
+	.stop  = bc_iostat_stop,
+	.show  = bc_iostat_show,
+};
+
+static int bc_iostat_open(struct inode *inode, struct file *filp)
+{
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EACCES;
+
+	return seq_open(filp, &iostat_seq_ops);
+}
+
+static struct file_operations bc_iostat_ops = {
+	.open		= bc_iostat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct bc_proc_entry bc_root_iostat_entry = {
+	.name = "iostat",
+	.u.fops = &bc_iostat_ops,
+};
+
+static int bc_ioprio_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *bc;
+	struct blkio_cgroup *blkcg;
+	int ioprio;
+
+	bc = seq_beancounter(f);
+
+	if (!bc->ub_cgroup)
+		return 0;
+
+	blkcg = cgroup_to_blkio_cgroup(bc->ub_cgroup);
+
+	ioprio = UB_IOPRIO_MAX - 1;
+	while (ioprio && blkcg->weight < ioprio_weight[ioprio])
+		ioprio--;
+
+	seq_printf(f, "prio: %d\n", ioprio);
+	return 0;
+}
+
+static struct bc_proc_entry bc_ioprio_entry = {
+	.name = "ioprio",
+	.u.show = bc_ioprio_show,
+};
+
+static int __init bc_iostat_init(void)
+{
+	bc_register_proc_entry(&bc_ioprio_entry);
+	bc_register_proc_entry(&bc_iostat_entry);
+	bc_register_proc_root_entry(&bc_root_iostat_entry);
+	return 0;
+}
+late_initcall(bc_iostat_init);
+
+#endif /* CONFIG_PROC_FS */
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/kmem.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/kmem.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/kmem.c	2015-01-21 12:02:43.397221780 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/kmem.c	2015-01-21 12:02:58.743814383 +0300
@@ -0,0 +1,197 @@
+/*
+ *  kernel/bc/kmem.c
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <bc/beancounter.h>
+#include <bc/oom_kill.h>
+#include <bc/vmpages.h>
+#include <bc/dcache.h>
+#include <bc/kmem.h>
+#include <bc/proc.h>
+
+int __ub_kmem_charge(struct user_beancounter *ub,
+		unsigned long size, gfp_t gfp_mask)
+{
+	unsigned long pages, charge, flags;
+	int kmem_strict, phys_strict;
+	int do_precharge = 1;
+	int failres;
+
+	charge = size + (ub->ub_parms[UB_KMEMSIZE].max_precharge >> 1);
+	pages = PAGE_ALIGN(charge) >> PAGE_SHIFT;
+
+	phys_strict = UB_SOFT | UB_TEST;
+	kmem_strict = ub_gfp_sev(gfp_mask) | UB_TEST;
+
+	if (unlikely(gfp_mask & __GFP_NOFAIL)) {
+		kmem_strict = phys_strict = UB_FORCE | UB_TEST;
+		goto no_precharge;
+	}
+
+	if (unlikely(irqs_disabled() || !(gfp_mask & __GFP_WAIT))) {
+		phys_strict = UB_FORCE | UB_TEST;
+		goto no_precharge;
+	}
+
+	ub_oom_start(&ub->oom_ctrl);
+
+try_again:
+	failres = UB_PHYSPAGES;
+	while (charge_beancounter_fast(ub, UB_PHYSPAGES, pages, phys_strict)) {
+		if (test_thread_flag(TIF_MEMDIE) ||
+		    fatal_signal_pending(current)) {
+			do_precharge = 0;
+			goto no_precharge;
+		} else if (!ub_try_to_free_pages(ub, gfp_mask))
+			continue;
+		goto no_precharge;
+	}
+
+	failres = UB_KMEMSIZE;
+	charge = pages << PAGE_SHIFT;
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	while (__charge_beancounter_locked(ub, UB_KMEMSIZE, charge, kmem_strict)) {
+		init_beancounter_precharge(ub, UB_KMEMSIZE);
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+		if (ub_dcache_shrink(ub, charge, gfp_mask)) {
+			uncharge_beancounter(ub, UB_PHYSPAGES, pages);
+			goto no_precharge;
+		}
+		spin_lock_irqsave(&ub->ub_lock, flags);
+	}
+	ub_percpu(ub, smp_processor_id())->
+		precharge[UB_KMEMSIZE] += charge - size;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	return 0;
+
+no_precharge:
+	if (do_precharge) {
+		do_precharge = 0;
+		pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+		goto try_again;
+	}
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ub->ub_parms[failres].failcnt++;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	if (__ratelimit(&ub->ub_ratelimit))
+		printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n",
+				ub_rnames[failres], ub->ub_uid);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(__ub_kmem_charge);
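+
+/*
+ * Precharge example (illustrative): charging 512 bytes with a
+ * max_precharge of 64 KiB rounds 512 + 32 KiB up to 9 pages (36 KiB)
+ * and charges that; the surplus over the 512 bytes actually requested
+ * is parked in the per-cpu precharge[UB_KMEMSIZE] pool, so subsequent
+ * small charges on this cpu can be served from it without taking
+ * ub_lock.
+ */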
+
+void __ub_kmem_uncharge(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		unsigned long size)
+{
+	unsigned long uncharge;
+
+	spin_lock(&ub->ub_lock);
+
+	if (ub->ub_parms[UB_KMEMSIZE].max_precharge !=
+			ub_resource_precharge[UB_KMEMSIZE])
+		init_beancounter_precharge(ub, UB_KMEMSIZE);
+
+	if (!__try_uncharge_beancounter_percpu(ub, ub_pcpu, UB_KMEMSIZE, size))
+		goto out;
+
+	uncharge = (size + ub_pcpu->precharge[UB_KMEMSIZE]
+			- (ub->ub_parms[UB_KMEMSIZE].max_precharge >> 1)
+		   ) & PAGE_MASK;
+	ub_pcpu->precharge[UB_KMEMSIZE] += size - uncharge;
+	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, uncharge);
+	__uncharge_beancounter_locked(ub, UB_PHYSPAGES, uncharge >> PAGE_SHIFT);
+
+out:
+	spin_unlock(&ub->ub_lock);
+}
+EXPORT_SYMBOL(__ub_kmem_uncharge);
+
+int ub_slab_charge(struct kmem_cache *cachep, void *objp, gfp_t flags)
+{
+	unsigned int size;
+	struct user_beancounter *ub;
+
+	ub = get_beancounter(get_exec_ub());
+	if (ub == NULL)
+		return 0;
+
+	size = CHARGE_SIZE(kmem_cache_objuse(cachep));
+	if (ub_kmem_charge(ub, size, flags))
+		goto out_err;
+
+	*ub_slab_ptr(cachep, objp) = ub;
+	return 0;
+
+out_err:
+	put_beancounter(ub);
+	return -ENOMEM;
+}
+
+void ub_slab_uncharge(struct kmem_cache *cachep, void *objp)
+{
+	unsigned int size;
+	struct user_beancounter **ub_ref;
+
+	ub_ref = ub_slab_ptr(cachep, objp);
+	if (*ub_ref == NULL)
+		return;
+
+	size = CHARGE_SIZE(kmem_cache_objuse(cachep));
+	ub_kmem_uncharge(*ub_ref, size);
+	put_beancounter(*ub_ref);
+	*ub_ref = NULL;
+}
+
+/*
+ * Takes init_mm.page_table_lock.  Some outer lock protecting the pages
+ * of the vmalloced area must be held by the caller.
+ */
+struct user_beancounter *vmalloc_ub(void *obj)
+{
+	struct page *pg;
+
+	pg = vmalloc_to_page(obj);
+	if (pg == NULL)
+		return NULL;
+
+	return page_kmem_ub(pg);
+}
+
+EXPORT_SYMBOL(vmalloc_ub);
+
+struct user_beancounter *mem_ub(void *obj)
+{
+	struct user_beancounter *ub;
+
+	if ((unsigned long)obj >= VMALLOC_START &&
+	    (unsigned long)obj  < VMALLOC_END)
+		ub = vmalloc_ub(obj);
+	else
+		ub = slab_ub(obj);
+
+	return ub;
+}
+
+EXPORT_SYMBOL(mem_ub);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/misc.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/misc.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/misc.c	2015-01-21 12:02:43.397221780 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/misc.c	2015-01-21 12:02:43.497219124 +0300
@@ -0,0 +1,167 @@
+/*
+ *  kernel/bc/misc.c
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/signal.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+#include <bc/beancounter.h>
+#include <bc/kmem.h>
+#include <bc/proc.h>
+
+/*
+ * Task stuff
+ */
+
+#define TASK_KMEM_SIZE	(sizeof(struct task_struct) + THREAD_SIZE)
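+
+/*
+ * Each task is charged for its task_struct plus its kernel stack
+ * (THREAD_SIZE); with the usual 8 KiB stacks on x86_64 that is a bit
+ * over 8 KiB per task (THREAD_SIZE is arch-dependent, figure given for
+ * illustration only).
+ */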
+
+int ub_task_charge(struct user_beancounter *ub)
+{
+	if (ub_kmem_charge(ub, TASK_KMEM_SIZE, GFP_KERNEL))
+		goto no_mem;
+
+	if (charge_beancounter_fast(ub, UB_NUMPROC, 1, UB_HARD))
+		goto no_num;
+
+	return 0;
+
+no_num:
+	ub_kmem_uncharge(ub, TASK_KMEM_SIZE);
+no_mem:
+	return -ENOMEM;
+}
+
+void ub_task_uncharge(struct user_beancounter *ub)
+{
+	uncharge_beancounter_fast(ub, UB_NUMPROC, 1);
+	ub_kmem_uncharge(ub, TASK_KMEM_SIZE);
+}
+
+void ub_task_get(struct user_beancounter *ub, struct task_struct *task)
+{
+	struct task_beancounter *new_bc = &task->task_bc;
+
+	new_bc->task_ub = get_beancounter_longterm(ub);
+	new_bc->exec_ub = get_beancounter_longterm(ub);
+}
+
+void ub_task_put(struct task_struct *task)
+{
+	struct task_beancounter *task_bc;
+
+	task_bc = &task->task_bc;
+
+	put_beancounter_longterm(task_bc->exec_ub);
+	put_beancounter_longterm(task_bc->task_ub);
+
+	task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc;
+	task_bc->task_ub = (struct user_beancounter *)0xdead100c;
+}
+
+int ub_file_charge(struct file *f)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	int err;
+
+	err = charge_beancounter_fast(ub, UB_NUMFILE, 1, UB_HARD);
+	if (unlikely(err))
+		goto no_file;
+
+	err = ub_kmem_charge(ub,
+			CHARGE_SIZE(kmem_cache_objuse(filp_cachep)),
+			GFP_KERNEL);
+	if (unlikely(err))
+		goto no_kmem;
+
+	f->f_ub = get_beancounter(ub);
+
+	return 0;
+
+no_kmem:
+	uncharge_beancounter_fast(ub, UB_NUMFILE, 1);
+no_file:
+	return err;
+}
+
+void ub_file_uncharge(struct file *f)
+{
+	struct user_beancounter *ub = f->f_ub;
+
+	ub_kmem_uncharge(ub,
+			CHARGE_SIZE(kmem_cache_objuse(filp_cachep)));
+	uncharge_beancounter_fast(ub, UB_NUMFILE, 1);
+	put_beancounter(ub);
+}
+
+int ub_flock_charge(struct file_lock *fl, int hard)
+{
+	struct user_beancounter *ub;
+	int err;
+
+	/* No need to get_beancounter here: a reference is already held by the slab object */
+	ub = slab_ub(fl);
+	if (ub == NULL)
+		return 0;
+
+	err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT);
+	if (!err)
+		fl->fl_charged = 1;
+	return err;
+}
+
+void ub_flock_uncharge(struct file_lock *fl)
+{
+	struct user_beancounter *ub;
+
+	/* The ub reference will be dropped when the slab object is freed */
+	ub = slab_ub(fl);
+	if (ub == NULL || !fl->fl_charged)
+		return;
+
+	uncharge_beancounter(ub, UB_NUMFLOCK, 1);
+	fl->fl_charged = 0;
+}
+
+/*
+ * PTYs
+ */
+
+int ub_pty_charge(struct tty_struct *tty)
+{
+	struct user_beancounter *ub;
+	int retval;
+
+	ub = slab_ub(tty);
+	retval = 0;
+	if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
+			!test_bit(TTY_CHARGED, &tty->flags)) {
+		retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD);
+		if (!retval)
+			set_bit(TTY_CHARGED, &tty->flags);
+	}
+	return retval;
+}
+
+void ub_pty_uncharge(struct tty_struct *tty)
+{
+	struct user_beancounter *ub;
+
+	ub = slab_ub(tty);
+	if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
+			test_bit(TTY_CHARGED, &tty->flags)) {
+		uncharge_beancounter(ub, UB_NUMPTY, 1);
+		clear_bit(TTY_CHARGED, &tty->flags);
+	}
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/net.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/net.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/net.c	2015-01-21 12:02:43.281224859 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/net.c	2015-01-21 12:02:47.885102631 +0300
@@ -0,0 +1,1124 @@
+/*
+ *  linux/kernel/bc/net.c
+ *
+ *  Copyright (C) 1998-2004  Andrey V. Savochkin <saw@saw.sw.com.sg>
+ *  Copyright (C) 2005 SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * TODO:
+ *   - sizeof(struct inode) charge
+ *   = tcp_mem_schedule() feedback based on ub limits
+ *   + measures so that one socket won't exhaust all send buffers,
+ *     see bug in bugzilla
+ *   = sk->socket check for NULL in snd_wakeups
+ *     (tcp_write_space checks for NULL itself)
+ *   + in tcp_close(), orphaned socket abortion should be based on ubc
+ *     resources (same in tcp_out_of_resources)
+ *     Beancounter should also have separate orphaned socket counter...
+ *   + for rcv, in-order segment should be accepted
+ *     if only barrier is exceeded
+ *   = tcp_rmem_schedule() feedback based on ub limits
+ *   - repair forward_alloc mechanism for receive buffers
+ *     Its idea is that some buffer space is pre-charged so that the receive
+ *     fast path doesn't need to take spinlocks and do other heavy work
+ *   + tcp_prune_queue actions based on ub limits
+ *   + window adjustments depending on available buffers for receive
+ *   - window adjustments depending on available buffers for send
+ *   + race around usewreserv
+ *   + avoid allocating new page for each tiny-gram, see letter from ANK
+ *   + rename ub_sock_lock
+ *   + sk->sleep wait queue probably can be used for all wakeups, and
+ *     sk->ub_wait is unnecessary
+ *   + for UNIX sockets, the current algorithm will lead to
+ *     UB_UNIX_MINBUF-sized messages only for non-blocking case
+ *   - charge for af_packet sockets
+ *   + all datagram sockets should be charged to NUMUNIXSOCK
+ *   - we do not charge for skb copies and clones staying in device queues
+ *   + live-lock if number of sockets is big and buffer limits are small
+ *     [diff-ubc-dbllim3]
+ *   - check that multiple readers/writers on the same socket won't cause fatal
+ *     consequences
+ *   - check allocation/charge orders
+ *   + There is potential problem with callback_lock.  In *snd_wakeup we take
+ *     beancounter first, in sock_def_error_report - callback_lock first.
+ *     then beancounter.  This is not a problem if callback_lock taken
+ *     readonly, but anyway...
+ *   - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator
+ * General kernel problems:
+ *   - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC
+ *     notification won't get signals
+ *   - datagram_poll looks racy
+ *
+ */
+
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/err.h>
+#include <linux/socket.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include <bc/beancounter.h>
+#include <bc/net.h>
+#include <bc/debug.h>
+
+/* for some reason it is not currently used */
+#define UB_SOCK_MAINTAIN_WMEMPRESSURE	0
+
+
+/* Skb truesize definition. Bad place. Den */
+
+static inline int skb_chargesize_head(struct sk_buff *skb)
+{
+	return skb_charge_size(skb_end_pointer(skb) - skb->head +
+				sizeof(struct skb_shared_info));
+}
+
+int skb_charge_fullsize(struct sk_buff *skb)
+{
+	int chargesize;
+	struct sk_buff *skbfrag;
+
+	chargesize = skb_chargesize_head(skb) +
+		PAGE_SIZE * skb_shinfo(skb)->nr_frags;
+	if (likely(skb_shinfo(skb)->frag_list == NULL))
+		return chargesize;
+	for (skbfrag = skb_shinfo(skb)->frag_list;
+	     skbfrag != NULL;
+	     skbfrag = skbfrag->next) {
+		chargesize += skb_charge_fullsize(skbfrag);
+	}
+	return chargesize;
+}
+EXPORT_SYMBOL(skb_charge_fullsize);
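+
+/*
+ * Charge-size example (illustrative): a linear skb is charged for its
+ * head (data buffer plus struct skb_shared_info, rounded by
+ * skb_charge_size()); every page fragment adds a full PAGE_SIZE, and
+ * frag_list members are charged recursively.  A frame with two page
+ * frags therefore costs roughly its head charge plus 2 * PAGE_SIZE.
+ */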
+
+static int ub_sock_makewreserv_locked(struct sock *sk,
+		int bufid, unsigned long size);
+
+int __ub_too_many_orphans(struct sock *sk, int count)
+{
+	struct ubparm *ub_sock;
+
+	ub_sock = &sock_bc(sk)->ub->ub_parms[UB_NUMTCPSOCK];
+	if (sock_has_ubc(sk) && (count >= ub_sock->barrier >> 2))
+		return 1;
+	return 0;
+}
+
+/*
+ * Queueing
+ */
+
+static void ub_sock_snd_wakeup(struct user_beancounter *ub)
+{
+	struct list_head *p;
+	struct sock *sk;
+	struct sock_beancounter *skbc;
+	struct socket *sock;
+
+	while (!list_empty(&ub->ub_other_sk_list)) {
+		p = ub->ub_other_sk_list.next;
+		skbc = list_entry(p, struct sock_beancounter, ub_sock_list);
+		sk = skbc_sock(skbc);
+
+		sock = sk->sk_socket;
+		if (sock == NULL) {
+			/* sk being destroyed */
+			list_del_init(&skbc->ub_sock_list);
+			continue;
+		}
+
+		ub_debug(UBD_NET_SLEEP,
+				"Checking queue, waiting %lu, reserv %lu\n",
+				skbc->ub_waitspc, skbc->poll_reserv);
+		if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF,
+					skbc->ub_waitspc))
+			break;
+
+		list_del_init(&skbc->ub_sock_list);
+
+		/*
+		 * See comments in ub_tcp_snd_wakeup.
+		 * Locking note: both unix_write_space and
+		 * sock_def_write_space take callback_lock themselves.
+		 * We take it here just to be on the safe side and to
+		 * act the same way as ub_tcp_snd_wakeup does.
+		 */
+		sock_hold(sk);
+		spin_unlock(&ub->ub_lock);
+
+		read_lock(&sk->sk_callback_lock);
+		sk->sk_write_space(sk);
+		read_unlock(&sk->sk_callback_lock);
+
+		sock_put(sk);
+
+		spin_lock(&ub->ub_lock);
+	}
+}
+
+static void ub_tcp_snd_wakeup(struct user_beancounter *ub)
+{
+	struct list_head *p;
+	struct sock *sk;
+	struct sock_beancounter *skbc;
+	struct socket *sock;
+
+	while (!list_empty(&ub->ub_tcp_sk_list)) {
+		p = ub->ub_tcp_sk_list.next;
+		skbc = list_entry(p, struct sock_beancounter, ub_sock_list);
+		sk = skbc_sock(skbc);
+
+		sock = sk->sk_socket;
+		if (sock == NULL) {
+			/* sk being destroyed */
+			list_del_init(&skbc->ub_sock_list);
+			continue;
+		}
+
+		ub_debug(UBD_NET_SLEEP,
+				"Checking queue, waiting %lu, reserv %lu\n",
+				skbc->ub_waitspc, skbc->poll_reserv);
+		if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF,
+					skbc->ub_waitspc))
+			break;
+
+		list_del_init(&skbc->ub_sock_list);
+
+		/*
+		 * Send async notifications and wake up.
+		 * Locking note: we get callback_lock here because
+		 * tcp_write_space is over-optimistic about calling context
+		 * (socket lock is presumed).  So we get the lock here although
+		 * it belongs to the callback.
+		 */
+		sock_hold(sk);
+		spin_unlock(&ub->ub_lock);
+
+		read_lock(&sk->sk_callback_lock);
+		sk->sk_write_space(sk);
+		read_unlock(&sk->sk_callback_lock);
+
+		sock_put(sk);
+
+		spin_lock(&ub->ub_lock);
+	}
+}
+
+int ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size)
+{
+	unsigned long flags;
+	struct sock_beancounter *skbc;
+	struct user_beancounter *ub;
+
+	if (!sock_has_ubc(sk))
+		return 0;
+
+	skbc = sock_bc(sk);
+	ub = skbc->ub;
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size);
+	if (!ub_sock_makewreserv_locked(sk, res, size)) {
+		/*
+		 * It looks a bit hackish, but it is compatible with both
+		 * wait_for_xx_ubspace and poll.
+		 * This __set_current_state is equivalent to a wakeup event
+		 * right after spin_unlock_irqrestore.
+		 */
+		__set_current_state(TASK_RUNNING);
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+		return 0;
+	}
+
+	ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n");
+	skbc->ub_waitspc = size;
+	if (!list_empty(&skbc->ub_sock_list)) {
+		ub_debug(UBD_NET_SOCKET,
+				"re-adding socket to beancounter %p.\n", ub);
+		goto out;
+	}
+
+	switch (res) {
+		case UB_TCPSNDBUF:
+			list_add_tail(&skbc->ub_sock_list,
+					&ub->ub_tcp_sk_list);
+			break;
+		case UB_OTHERSOCKBUF:
+			list_add_tail(&skbc->ub_sock_list,
+					&ub->ub_other_sk_list);
+			break;
+		default:
+			BUG();
+	}
+out:
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+	return -ENOMEM;
+}
+
+EXPORT_SYMBOL(ub_sock_snd_queue_add);
+
+long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(sk->sk_sleep, &wait);
+	for (;;) {
+		if (signal_pending(current))
+			break;
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size))
+			break;
+
+		if (sk->sk_shutdown & SEND_SHUTDOWN)
+			break;
+		if (sk->sk_err)
+			break;
+		ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size);
+		timeo = schedule_timeout(timeo);
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk->sk_sleep, &wait);
+	return timeo;
+}
+
+void ub_sock_sndqueuedel(struct sock *sk)
+{
+	struct user_beancounter *ub;
+	struct sock_beancounter *skbc;
+	unsigned long flags;
+
+	if (!sock_has_ubc(sk))
+		return;
+	skbc = sock_bc(sk);
+
+	/* race with write_space callback of other socket */
+	ub = skbc->ub;
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	list_del_init(&skbc->ub_sock_list);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+/*
+ * Helpers
+ */
+
+static inline void __ub_skb_set_charge(struct sk_buff *skb, struct sock *sk,
+		       unsigned long size, int resource)
+{
+	WARN_ON_ONCE(skb_bc(skb)->ub != NULL);
+
+	skb_bc(skb)->ub = sock_bc(sk)->ub;
+	skb_bc(skb)->charged = size;
+	skb_bc(skb)->resource = resource;
+}
+
+void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk,
+		       unsigned long size, int resource)
+{
+	if (!sock_has_ubc(sk))
+		return;
+
+	if (sock_bc(sk)->ub == NULL)
+		BUG();
+
+	__ub_skb_set_charge(skb, sk, size, resource);
+
+	/* Ugly. Ugly. An skb in the sk write queue can live without a ref to sk */
+	if (skb->sk == NULL)
+		skb->sk = sk;
+}
+
+EXPORT_SYMBOL(ub_skb_set_charge);
+
+static inline void ub_skb_set_uncharge(struct sk_buff *skb)
+{
+	skb_bc(skb)->ub = NULL;
+	skb_bc(skb)->charged = 0;
+	skb_bc(skb)->resource = 0;
+}
+
+static void ub_update_rmem_thres(struct sock_beancounter *skub)
+{
+	struct user_beancounter *ub;
+
+	if (skub && skub->ub) {
+		ub = skub->ub;
+		ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier /
+			(ub->ub_parms[UB_NUMTCPSOCK].held + 1);
+	}
+}
+
+static inline void ub_sock_wcharge_dec(struct sock *sk,
+		unsigned long chargesize)
+{
+	/* The sk->sk_family != PF_NETLINK check is made because the skb is
+	 * queued to the kernel end of the socket while charged to the user
+	 * one.  Den */
+	if (unlikely(sock_bc(sk)->ub_wcharged) && sk->sk_family != PF_NETLINK) {
+		if (sock_bc(sk)->ub_wcharged > chargesize)
+			sock_bc(sk)->ub_wcharged -= chargesize;
+		else
+			sock_bc(sk)->ub_wcharged = 0;
+	}
+}
+
+/*
+ * Charge socket number
+ */
+
+static inline void sk_alloc_beancounter(struct sock *sk)
+{
+	struct sock_beancounter *skbc;
+
+	skbc = sock_bc(sk);
+	memset(skbc, 0, sizeof(struct sock_beancounter));
+}
+
+static inline void sk_free_beancounter(struct sock *sk)
+{
+}
+
+static int __sock_charge(struct sock *sk, int res)
+{
+	struct sock_beancounter *skbc;
+	struct user_beancounter *ub;
+	unsigned long added_reserv, added_forw;
+	unsigned long flags;
+
+	ub = get_exec_ub();
+	if (unlikely(ub == NULL))
+		return 0;
+
+	sk_alloc_beancounter(sk);
+	skbc = sock_bc(sk);
+	INIT_LIST_HEAD(&skbc->ub_sock_list);
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	if (unlikely(__charge_beancounter_locked(ub, res, 1, UB_HARD) < 0))
+		goto out_limit;
+
+	added_reserv = 0;
+	added_forw = 0;
+	if (res == UB_NUMTCPSOCK) {
+		added_reserv = skb_charge_size(MAX_TCP_HEADER +
+				1500 - sizeof(struct iphdr) -
+					sizeof(struct tcphdr));
+		added_reserv *= 4;
+		ub->ub_parms[UB_TCPSNDBUF].held += added_reserv;
+		if (!ub_barrier_farsz(ub, UB_TCPSNDBUF)) {
+			ub->ub_parms[UB_TCPSNDBUF].held -= added_reserv;
+			added_reserv = 0;
+		}
+		skbc->poll_reserv = added_reserv;
+		ub_adjust_maxheld(ub, UB_TCPSNDBUF);
+
+		added_forw = SK_MEM_QUANTUM * 4;
+		ub->ub_parms[UB_TCPRCVBUF].held += added_forw;
+		if (!ub_barrier_farsz(ub, UB_TCPRCVBUF)) {
+			ub->ub_parms[UB_TCPRCVBUF].held -= added_forw;
+			added_forw = 0;
+		}
+		skbc->forw_space = added_forw;
+		ub_adjust_maxheld(ub, UB_TCPRCVBUF);
+	}
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	skbc->ub = get_beancounter(ub);
+	return 0;
+
+out_limit:
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+	sk_free_beancounter(sk);
+	return -ENOMEM;
+}
+
+int ub_tcp_sock_charge(struct sock *sk, int kern)
+{
+	int ret;
+
+	if (!kern) {
+		ret = __sock_charge(sk, UB_NUMTCPSOCK);
+		ub_update_rmem_thres(sock_bc(sk));
+	} else
+		ret = 0;
+
+	return ret;
+}
+
+int ub_other_sock_charge(struct sock *sk, int kern)
+{
+	return (!kern) ? __sock_charge(sk, UB_NUMOTHERSOCK) : 0;
+}
+
+EXPORT_SYMBOL(ub_other_sock_charge);
+
+int ub_sock_charge(struct sock *sk, int family, int type, int kern)
+{
+	return (IS_TCP_SOCK(family, type)	?
+		ub_tcp_sock_charge(sk, kern)	:
+		ub_other_sock_charge(sk, kern));
+}
+
+EXPORT_SYMBOL(ub_sock_charge);
+
+/*
+ * Uncharge socket number
+ */
+
+void ub_sock_uncharge(struct sock *sk)
+{
+	int is_tcp_sock;
+	unsigned long flags;
+	struct sock_beancounter *skbc;
+	struct user_beancounter *ub;
+	unsigned long reserv, forw;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return;
+
+	is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type);
+	skbc = sock_bc(sk);
+	ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk);
+
+	ub = skbc->ub;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	if (!list_empty(&skbc->ub_sock_list)) {
+		ub_debug(UBD_NET_SOCKET,
+			 "ub_sock_uncharge: removing from ub(%p) queue.\n",
+			 skbc);
+		list_del_init(&skbc->ub_sock_list);
+	}
+
+	reserv = skbc->poll_reserv;
+	forw = skbc->forw_space;
+	__uncharge_beancounter_locked(ub,
+			(is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF),
+			reserv);
+	if (forw)
+		__uncharge_beancounter_locked(ub,
+				(is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF),
+				forw);
+	__uncharge_beancounter_locked(ub,
+			(is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1);
+
+	ub_sock_wcharge_dec(sk, reserv);
+	if (unlikely(skbc->ub_wcharged))
+		printk(KERN_WARNING
+		       "ub_sock_uncharge: wch=%lu for ub %p (%d).\n",
+		       skbc->ub_wcharged, ub, ub->ub_uid);
+	skbc->poll_reserv = 0;
+	skbc->forw_space = 0;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	put_beancounter(ub);
+	sk_free_beancounter(sk);
+}
+
+/*
+ * Special case for netlink_dump - (un)charges precalculated size
+ */
+
+int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)
+{
+	int ret;
+	unsigned long chargesize;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return 0;
+
+	chargesize = skb_charge_fullsize(skb);
+	ret = charge_beancounter(sock_bc(sk)->ub,
+			UB_OTHERSOCKBUF, chargesize, UB_HARD);
+	if (ret < 0)
+		return ret;
+	ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF);
+	return ret;
+}
+
+/*
+ * Poll reserve accounting
+ *
+ * This is the core of socket buffer management (along with the
+ * queueing/wakeup functions).  The rest of the buffer accounting either
+ * calls these functions or repeats parts of their logic for simpler cases.
+ */
+
+static int ub_sock_makewreserv_locked(struct sock *sk,
+		int bufid, unsigned long size)
+{
+	unsigned long wcharge_added;
+	struct sock_beancounter *skbc;
+	struct user_beancounter *ub;
+
+	skbc = sock_bc(sk);
+	if (skbc->poll_reserv >= size) /* no work to be done */
+		goto out;
+
+	ub = skbc->ub;
+	ub->ub_parms[bufid].held += size - skbc->poll_reserv;
+
+	wcharge_added = 0;
+	/*
+	 * Logic:
+	 *  1) when used memory hits barrier, we set wmem_pressure;
+	 *     wmem_pressure is reset under barrier/2;
+	 *     between barrier/2 and barrier we limit per-socket buffer growth;
+	 *  2) each socket is guaranteed to get (limit-barrier)/maxsockets
+	 *     calculated on the base of memory eaten after the barrier is hit
+	 */
+	skbc = sock_bc(sk);
+#if UB_SOCK_MAINTAIN_WMEMPRESSURE
+	if (!ub_hfbarrier_hit(ub, bufid)) {
+		if (ub->ub_wmem_pressure)
+			ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 "
+				"sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
+				sk, size, skbc->poll_reserv,
+				ub->ub_parms[bufid].held,
+				skbc->ub_wcharged, sk->sk_sndbuf);
+		ub->ub_wmem_pressure = 0;
+	}
+#endif
+	if (ub_barrier_hit(ub, bufid)) {
+#if UB_SOCK_MAINTAIN_WMEMPRESSURE
+		if (!ub->ub_wmem_pressure)
+			ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 "
+				"sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
+				sk, size, skbc->poll_reserv,
+				ub->ub_parms[bufid].held,
+				skbc->ub_wcharged, sk->sk_sndbuf);
+		ub->ub_wmem_pressure = 1;
+#endif
+		if (sk->sk_family == PF_NETLINK)
+			goto unroll;
+		wcharge_added = size - skbc->poll_reserv;
+		skbc->ub_wcharged += wcharge_added;
+		if (skbc->ub_wcharged * ub->ub_parms[bid2sid(bufid)].limit +
+				ub->ub_parms[bufid].barrier >
+					ub->ub_parms[bufid].limit)
+			goto unroll_wch;
+	}
+	if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit)
+		goto unroll;
+
+	ub_adjust_maxheld(ub, bufid);
+	skbc->poll_reserv = size;
+out:
+	return 0;
+
+unroll_wch:
+	skbc->ub_wcharged -= wcharge_added;
+unroll:
+	ub_debug(UBD_NET_SEND,
+			"makewres: deny "
+			"sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
+			sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held,
+			skbc->ub_wcharged, sk->sk_sndbuf);
+	ub->ub_parms[bufid].failcnt++;
+	ub->ub_parms[bufid].held -= size - skbc->poll_reserv;
+
+	if (sk->sk_socket != NULL) {
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+	}
+	return -ENOMEM;
+}
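+
+/*
+ * Guarantee (2) above in numbers (illustrative): with a TCPSNDBUF
+ * barrier of 1 MiB, a limit of 2 MiB and a NUMTCPSOCK limit of 128,
+ * the unroll_wch check denies further charges once a socket's
+ * ub_wcharged share of post-barrier memory exceeds
+ * (2 MiB - 1 MiB) / 128 = 8 KiB.
+ */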
+
+int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size)
+{
+	struct sock_beancounter *skbc;
+	unsigned long flags;
+	int err;
+
+	skbc = sock_bc(sk);
+
+	/*
+	 * This function guarantees a sufficient reserve upon return only if
+	 * sk has a single user.  We can check poll_reserv without
+	 * serialization and avoid locking if the reserve already exists.
+	 */
+	if (unlikely(!sock_has_ubc(sk)) || likely(skbc->poll_reserv >= size))
+		return 0;
+
+	spin_lock_irqsave(&skbc->ub->ub_lock, flags);
+	err = ub_sock_makewreserv_locked(sk, bufid, size);
+	spin_unlock_irqrestore(&skbc->ub->ub_lock, flags);
+
+	return err;
+}
+
+EXPORT_SYMBOL(ub_sock_make_wreserv);
+
+int ub_sock_makewres_poll(struct sock *sk, unsigned long size)
+{
+	struct sock_beancounter *skbc;
+	unsigned long flags;
+	int err;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return 0;
+
+	skbc = sock_bc(sk);
+
+	/*
+	 * This function guarantees a sufficient reserve upon return only if
+	 * sk has a single user.  We can check poll_reserv without
+	 * serialization and avoid locking if the reserve already exists.
+	 */
+	if (likely(skbc->poll_reserv >= size))
+		return 0;
+
+	lock_sock(sk);
+	spin_lock_irqsave(&skbc->ub->ub_lock, flags);
+	err = ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, size);
+	spin_unlock_irqrestore(&skbc->ub->ub_lock, flags);
+	if (err)
+		ub_sock_sndqueueadd_tcp(sk, size);
+	release_sock(sk);
+
+	return err;
+}
+
+int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size)
+{
+	struct sock_beancounter *skbc;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return 0;
+
+	/* optimize for the case if socket has sufficient reserve */
+	ub_sock_make_wreserv(sk, bufid, size);
+	skbc = sock_bc(sk);
+	if (likely(skbc->poll_reserv >= size)) {
+		skbc->poll_reserv -= size;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+EXPORT_SYMBOL(ub_sock_get_wreserv);
+
+static void ub_sock_do_ret_wreserv(struct sock *sk, int bufid,
+		unsigned long size, unsigned long ressize)
+{
+	struct sock_beancounter *skbc;
+	struct user_beancounter *ub;
+	unsigned long extra;
+	unsigned long flags;
+
+	skbc = sock_bc(sk);
+	ub = skbc->ub;
+
+	extra = 0;
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	skbc->poll_reserv += size;
+	if (skbc->poll_reserv > ressize) {
+		extra = skbc->poll_reserv - ressize;
+		ub_sock_wcharge_dec(sk, extra);
+		skbc->poll_reserv = ressize;
+
+		__uncharge_beancounter_locked(ub, bufid, extra);
+		if (bufid == UB_TCPSNDBUF)
+			ub_tcp_snd_wakeup(ub);
+		else
+			ub_sock_snd_wakeup(ub);
+	}
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+void ub_sock_ret_wreserv(struct sock *sk, int bufid,
+		unsigned long size, unsigned long ressize)
+{
+	struct sock_beancounter *skbc;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return;
+
+	skbc = sock_bc(sk);
+	/* check if the reserve can be kept */
+	if (ub_barrier_farsz(skbc->ub, bufid)) {
+		skbc->poll_reserv += size;
+		return;
+	}
+	ub_sock_do_ret_wreserv(sk, bufid, size, ressize);
+}
+
+/*
+ * UB_DGRAMRCVBUF
+ */
+
+static int ub_dgramrcvbuf_charge(struct sock *sk, struct sk_buff *skb)
+{
+	unsigned long chargesize;
+
+	chargesize = skb_charge_fullsize(skb);
+	if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF,
+				 chargesize, UB_HARD))
+		return -ENOMEM;
+
+	ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF);
+	return 0;
+}
+
+int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)
+{
+	if (unlikely(!sock_has_ubc(sk)))
+		return 0;
+
+	if (IS_TCP_SOCK(sk->sk_family, sk->sk_type))
+		return ub_tcprcvbuf_charge(sk, skb);
+	else
+		return ub_dgramrcvbuf_charge(sk, skb);
+}
+
+EXPORT_SYMBOL(ub_sockrcvbuf_charge);
+
+static void ub_sockrcvbuf_uncharge(struct sk_buff *skb)
+{
+	uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF,
+			     skb_bc(skb)->charged);
+	ub_skb_set_uncharge(skb);
+}
+
+/*
+ * UB_TCPRCVBUF
+ */
+
+int ub_sock_tcp_chargerecv(struct sock *sk, struct sk_buff *skb,
+			    enum ub_severity strict)
+{
+	int retval;
+	unsigned long flags;
+	struct user_beancounter *ub;
+	struct sock_beancounter *skbc;
+	unsigned long chargesize;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return 0;
+	skbc = sock_bc(sk);
+
+	chargesize = skb_charge_fullsize(skb);
+	if (likely(skbc->forw_space >= chargesize)) {
+		skbc->forw_space -= chargesize;
+		__ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF);
+		return 0;
+	}
+
+	/*
+	 * Memory pressure reactions:
+	 *  1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND)
+	 *  2) set UB_RMEM_SHRINK and tcp_clamp_window()
+	 *     tcp_collapse_queues() if rmem_alloc > rcvbuf
+	 *  3) drop OFO, tcp_purge_ofo()
+	 *  4) drop all.
+	 * Currently, we do #2 and #3 at once (which means that current
+	 * collapsing of OFO queue in tcp_collapse_queues() is a waste of time,
+	 * for example...)
+	 * On memory pressure we jump from #0 to #3, and when the pressure
+	 * subsides, to #1.
+	 */
+	retval = 0;
+	ub = sock_bc(sk)->ub;
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ub->ub_parms[UB_TCPRCVBUF].held += chargesize;
+	if (ub->ub_parms[UB_TCPRCVBUF].held >
+			ub->ub_parms[UB_TCPRCVBUF].barrier &&
+			strict != UB_FORCE)
+		goto excess;
+	ub_adjust_maxheld(ub, UB_TCPRCVBUF);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+out:
+	if (retval == 0)
+		ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF);
+	return retval;
+
+excess:
+	ub->ub_rmem_pressure = UB_RMEM_SHRINK;
+	if (strict == UB_HARD)
+		retval = -ENOMEM;
+	if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit)
+		retval = -ENOMEM;
+	/*
+	 * We try to leave numsock*maxadvmss as a reserve for sockets not
+	 * queueing any data yet (if the difference between the barrier and the
+	 * limit is enough for this reserve).
+	 */
+	if (ub->ub_parms[UB_TCPRCVBUF].held +
+			ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss
+			> ub->ub_parms[UB_TCPRCVBUF].limit &&
+			atomic_read(&sk->sk_rmem_alloc))
+		retval = -ENOMEM;
+	if (retval) {
+		ub->ub_parms[UB_TCPRCVBUF].held -= chargesize;
+		ub->ub_parms[UB_TCPRCVBUF].failcnt++;
+	}
+	ub_adjust_maxheld(ub, UB_TCPRCVBUF);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+	goto out;
+}
+EXPORT_SYMBOL(ub_sock_tcp_chargerecv);
+
+static void ub_tcprcvbuf_uncharge(struct sk_buff *skb)
+{
+	unsigned long flags;
+	unsigned long held, bar;
+	int prev_pres;
+	struct user_beancounter *ub;
+
+	ub = skb_bc(skb)->ub;
+	if (ub_barrier_farsz(ub, UB_TCPRCVBUF)) {
+		sock_bc(skb->sk)->forw_space += skb_bc(skb)->charged;
+		ub_skb_set_uncharge(skb);
+		return;
+	}
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) {
+		printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n",
+				skb_bc(skb)->charged,
+				ub, ub->ub_parms[UB_TCPRCVBUF].held);
+		/* ass-saving bung */
+		skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held;
+	}
+	ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged;
+	held = ub->ub_parms[UB_TCPRCVBUF].held;
+	bar = ub->ub_parms[UB_TCPRCVBUF].barrier;
+	prev_pres = ub->ub_rmem_pressure;
+	if (held <= bar - (bar >> 2))
+		ub->ub_rmem_pressure = UB_RMEM_EXPAND;
+	else if (held <= bar)
+		ub->ub_rmem_pressure = UB_RMEM_KEEP;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	ub_skb_set_uncharge(skb);
+}
+
+
+/*
+ * UB_OTHERSOCKBUF and UB_TCPSNDBUF
+ */
+
+static void ub_socksndbuf_uncharge(struct sk_buff *skb)
+{
+	unsigned long flags;
+	struct user_beancounter *ub;
+	unsigned long chargesize;
+
+	ub = skb_bc(skb)->ub;
+	chargesize = skb_bc(skb)->charged;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	__uncharge_beancounter_locked(ub, UB_OTHERSOCKBUF, chargesize);
+	if (skb->sk != NULL && sock_has_ubc(skb->sk))
+		ub_sock_wcharge_dec(skb->sk, chargesize);
+	ub_sock_snd_wakeup(ub);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	ub_skb_set_uncharge(skb);
+}
+
+/* expected to be called under socket lock */
+static void ub_tcpsndbuf_uncharge(struct sk_buff *skb)
+{
+	/*
+	 * ub_sock_ret_wreserv call is abused here, we just want to uncharge
+	 * skb size.  However, to reduce duplication of the code doing
+	 * ub_hfbarrier_hit check, ub_wcharged reduction, and wakeup we call
+	 * a function that already does all of this.  2006/04/27  SAW
+	 */
+	ub_sock_ret_wreserv(skb->sk, UB_TCPSNDBUF, skb_bc(skb)->charged,
+			sock_bc(skb->sk)->poll_reserv);
+	ub_skb_set_uncharge(skb);
+}
+
+void ub_skb_uncharge(struct sk_buff *skb)
+{
+	switch (skb_bc(skb)->resource) {
+		case UB_TCPSNDBUF:
+			ub_tcpsndbuf_uncharge(skb);
+			break;
+		case UB_TCPRCVBUF:
+			ub_tcprcvbuf_uncharge(skb);
+			break;
+		case UB_DGRAMRCVBUF:
+			ub_sockrcvbuf_uncharge(skb);
+			break;
+		case UB_OTHERSOCKBUF:
+			ub_socksndbuf_uncharge(skb);
+			break;
+	}
+}
+
+EXPORT_SYMBOL(ub_skb_uncharge);	/* due to skb_orphan()/conntracks */
+
+/*
+ * Other sock reserve management
+ */
+
+int ub_sock_getwres_other(struct sock *sk, unsigned long size)
+{
+	struct sock_beancounter *skbc;
+	struct user_beancounter *ub;
+	unsigned long flags;
+	int err;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return 0;
+
+	/*
+	 * Nothing except beancounter lock protects skbc->poll_reserv.
+	 * So, take the lock and do the job.
+	 */
+	skbc = sock_bc(sk);
+	ub = skbc->ub;
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	err = ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, size);
+	if (!err)
+		skbc->poll_reserv -= size;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	return err;
+}
+EXPORT_SYMBOL(ub_sock_getwres_other);
+
+void ub_sock_retwres_other(struct sock *sk,
+		unsigned long size, unsigned long ressize)
+{
+	if (unlikely(!sock_has_ubc(sk)))
+		return;
+
+	ub_sock_do_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize);
+}
+
+/*
+ * TCP send buffer accounting: paged part
+ */
+
+int ub_sock_tcp_chargepage(struct sock *sk)
+{
+	struct sock_beancounter *skbc;
+	unsigned long extra;
+	int err;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return 0;
+
+	skbc = sock_bc(sk);
+	ub_sock_make_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE);
+	if (likely(skbc->poll_reserv >= PAGE_SIZE)) {
+		skbc->poll_reserv -= PAGE_SIZE;
+		return 0;
+	}
+
+	/*
+	 * Ok, full page is not available.
+	 * However, this function must succeed if poll previously indicated
+	 * that write is possible.  Making a forced charge here is better
+	 * than reserving a whole page in poll.
+	 */
+	err = ub_sock_make_wreserv(sk, UB_TCPSNDBUF, SOCK_MIN_UBCSPACE);
+	if (unlikely(err < 0))
+		goto out;
+	if (skbc->poll_reserv < PAGE_SIZE) {
+		extra = PAGE_SIZE - skbc->poll_reserv;
+		err = charge_beancounter(skbc->ub, UB_TCPSNDBUF, extra,
+				UB_FORCE);
+		if (err < 0)
+			goto out;
+		skbc->poll_reserv += extra;
+	}
+	skbc->poll_reserv -= PAGE_SIZE;
+	return 0;
+
+out:
+	return err;
+}
+
+void ub_sock_tcp_detachpage(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return;
+
+	/* The page has just been detached from the socket.  The last skb
+	   in the queue with a paged part holds a reference to it */
+	skb = skb_peek_tail(&sk->sk_write_queue);
+	if (skb == NULL) {
+		/* If the queue is empty, all data has been sent and the
+		   page is about to be freed */
+		ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE,
+				sock_bc(sk)->poll_reserv);
+	} else {
+		/* The last skb is a good approximation of the last skb
+		   with a paged part */
+		skb_bc(skb)->charged += PAGE_SIZE;
+	}
+}
+
+/*
+ * TCPSNDBUF charge functions below are called in the following cases:
+ *  - sending of SYN, SYN-ACK, FIN; the latter charge is forced for
+ *    technical reasons in the TCP code;
+ *  - fragmentation of TCP packets.
+ * These functions are allowed but not required to use poll_reserv.
+ * Originally they did not, since it made no sense; now that poll_reserv
+ * acts as a general reserve, they do.
+ */
+int ub_sock_tcp_chargesend(struct sock *sk, struct sk_buff *skb,
+			    enum ub_severity strict)
+{
+	int ret;
+	unsigned long chargesize;
+	struct sock_beancounter *skbc;
+	struct user_beancounter *ub;
+	unsigned long flags;
+
+	if (unlikely(!sock_has_ubc(sk)))
+		return 0;
+
+	skbc = sock_bc(sk);
+	chargesize = skb_charge_fullsize(skb);
+	if (likely(skbc->poll_reserv >= chargesize)) {
+		skbc->poll_reserv -= chargesize;
+		__ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
+		/* XXX hack, see ub_skb_set_charge */
+		skb->sk = sk;
+		return 0;
+	}
+
+	ub = skbc->ub;
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ret = __charge_beancounter_locked(ub, UB_TCPSNDBUF,
+			chargesize, strict);
+	/*
+	 * Note: this check is not equivalent of the corresponding check
+	 * in makewreserv.  It's similar in spirit, but an equivalent check
+	 * would be too long and complicated here.
+	 */
+	if (!ret && ub_barrier_hit(ub, UB_TCPSNDBUF))
+		skbc->ub_wcharged += chargesize;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+	if (likely(!ret))
+		ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
+	return ret;
+}
+EXPORT_SYMBOL(ub_sock_tcp_chargesend);
+
+/*
+ * Initialization
+ */
+
+int __init skbc_cache_init(void)
+{
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/oom_kill.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/oom_kill.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/oom_kill.c	2015-01-21 12:02:43.439220664 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/oom_kill.c	2015-01-21 12:02:58.762813880 +0300
@@ -0,0 +1,322 @@
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/cpuset.h>
+#include <linux/module.h>
+#include <linux/oom.h>
+
+#include <bc/beancounter.h>
+#include <bc/oom_kill.h>
+#include <bc/vmpages.h>
+
+#define UB_OOM_TIMEOUT	(5 * HZ)
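+/* how long ub_oom_lock() waits for the current OOM kill to complete */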
+
+void ub_oom_start(struct oom_control *oom_ctrl)
+{
+	current->task_bc.oom_generation = oom_ctrl->generation;
+}
+
+static inline struct user_beancounter *oom_ctrl_ub(struct oom_control *ctrl)
+{
+	if (ctrl == &global_oom_ctrl)
+		return NULL;
+
+	return container_of(ctrl, struct user_beancounter, oom_ctrl);
+}
+
+static inline int oom_ctrl_id(struct oom_control *ctrl)
+{
+	struct user_beancounter *ub = oom_ctrl_ub(ctrl);
+
+	return ub ? ub->ub_uid : -1;
+}
+
+static inline int oom_ctrl_ratelimit(struct oom_control *ctrl)
+{
+	struct user_beancounter *ub = oom_ctrl_ub(ctrl);
+
+	return ub ? __ratelimit(&ub->ub_ratelimit) : printk_ratelimit();
+}
+
+static void __ub_release_oom_control(struct oom_control *oom_ctrl, char *why)
+{
+	if (oom_ctrl_ratelimit(oom_ctrl)) {
+		struct user_beancounter *ub = oom_ctrl_ub(oom_ctrl);
+
+		printk(KERN_WARNING"oom-killer in ub %d generation %d ends: %s\n",
+		       oom_ctrl_id(oom_ctrl), oom_ctrl->generation, why);
+
+		if (ub)
+			__show_ub_mem(ub);
+		else
+			show_mem(SHOW_MEM_FILTER_NODES);
+	}
+
+	oom_ctrl->kill_counter = 0;
+	oom_ctrl->generation++;
+
+	/* wake tasks sleeping in ub_oom_lock(); those with time left re-enter sleep */
+	wake_up_all(&oom_ctrl->wq);
+}
+
+static void ub_release_oom_control(struct oom_control *oom_ctrl)
+{
+	spin_lock(&oom_ctrl->lock);
+	__ub_release_oom_control(oom_ctrl, "task died");
+	spin_unlock(&oom_ctrl->lock);
+}
+
+/*
+ * Must be called under task_lock() held
+ */
+void ub_oom_mark_mm(struct mm_struct *mm, struct oom_control *oom_ctrl)
+{
+	mm_ub(mm)->ub_parms[UB_OOMGUARPAGES].failcnt++;
+
+	if (oom_ctrl == &global_oom_ctrl)
+		mm->global_oom = 1;
+	else if (oom_ctrl == &mm->mm_ub->oom_ctrl)
+		mm->ub_oom = 1;
+	else {
+		/*
+		 * A task can be killed while using either the global oom ctl
+		 * or its mm->mm_ub one.  In any other case we must release
+		 * the ctl now.  When this task dies it will decide which ctl
+		 * to use by looking at these flags, and we have to be sure
+		 * it uses the proper one.
+		 */
+		__ub_release_oom_control(oom_ctrl, "mark bug");
+		WARN_ON(1);
+	}
+}
+
+static inline int ub_oom_completed(struct oom_control *oom_ctrl)
+{
+	if (test_thread_flag(TIF_MEMDIE))
+		/* we were oom killed - just die */
+		return 1;
+	if (current->task_bc.oom_generation != oom_ctrl->generation)
+		/* some task was successfully killed */
+		return 1;
+	return 0;
+}
+
+static void ub_clear_oom(void)
+{
+	struct user_beancounter *ub;
+
+	rcu_read_lock();
+	for_each_beancounter(ub)
+		clear_bit(UB_OOM_NOPROC, &ub->ub_flags);
+	rcu_read_unlock();
+}
+
+int ub_oom_lock(struct oom_control *oom_ctrl, gfp_t gfp_mask)
+{
+	int timeout;
+	DEFINE_WAIT(oom_w);
+
+	if (oom_ctrl != &global_oom_ctrl && global_oom_ctrl.kill_counter) {
+		/*
+		 * Check whether the global OOM killer is on the way.  If so,
+		 * let the senior handle the situation.
+		 */
+		wait_event_killable(global_oom_ctrl.wq,
+					global_oom_ctrl.kill_counter == 0);
+		return -EAGAIN;
+	}
+
+	spin_lock(&oom_ctrl->lock);
+	if (!oom_ctrl->kill_counter && !ub_oom_completed(oom_ctrl))
+		goto out_do_oom;
+
+	timeout = UB_OOM_TIMEOUT;
+	while (1) {
+		if (ub_oom_completed(oom_ctrl)) {
+			spin_unlock(&oom_ctrl->lock);
+			/*
+			 * We raced with some other OOM killer and need
+			 * to update generation to be sure, that we can
+			 * call OOM killer on next loop iteration.
+			 */
+			ub_oom_start(oom_ctrl);
+			return -EAGAIN;
+		}
+
+		if (timeout == 0) {
+			/*
+			 * Time is up, let's kill somebody else but
+			 * release the oom ctl since the stuck task
+			 * wasn't able to do it.
+			 */
+			__ub_release_oom_control(oom_ctrl, "timeout");
+			break;
+		}
+
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		add_wait_queue(&oom_ctrl->wq, &oom_w);
+		spin_unlock(&oom_ctrl->lock);
+
+		timeout = schedule_timeout(timeout);
+
+		spin_lock(&oom_ctrl->lock);
+		remove_wait_queue(&oom_ctrl->wq, &oom_w);
+	}
+
+out_do_oom:
+	ub_clear_oom();
+
+	if (oom_ctrl_ratelimit(oom_ctrl)) {
+		struct user_beancounter *ub = oom_ctrl_ub(oom_ctrl);
+
+		printk(KERN_WARNING"%d (%s) invoked oom-killer in ub %d "
+			"generation %d gfp 0x%x\n",
+			current->pid, current->comm, oom_ctrl_id(oom_ctrl),
+			oom_ctrl->generation, gfp_mask);
+
+		if (ub) {
+			show_ub_mem(ub);
+		} else {
+			dump_stack();
+			show_mem(SHOW_MEM_FILTER_NODES);
+			show_slab_info();
+		}
+	}
+
+	return 0;
+}
+
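+/*
+ * Overdraft: the byte-accounted buffer resources converted to pages,
+ * minus what is left of the oomguarpages guarantee.  The beancounter
+ * with the largest overdraft is picked as the OOM victim.
+ */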
+long ub_current_overdraft(struct user_beancounter *ub)
+{
+	return ((ub->ub_parms[UB_KMEMSIZE].held
+		  + ub->ub_parms[UB_TCPSNDBUF].held
+		  + ub->ub_parms[UB_TCPRCVBUF].held
+		  + ub->ub_parms[UB_OTHERSOCKBUF].held
+		  + ub->ub_parms[UB_DGRAMRCVBUF].held)
+		 >> PAGE_SHIFT) - ub_oomguarpages_left(ub);
+}
+
+int ub_oom_task_skip(struct user_beancounter *ub, struct task_struct *tsk)
+{
+	struct user_beancounter *mm_ub;
+
+	if (ub == NULL)
+		return 0;
+
+	task_lock(tsk);
+	if (tsk->mm == NULL)
+		mm_ub = NULL;
+	else
+		mm_ub = tsk->mm->mm_ub;
+
+	task_unlock(tsk);
+
+	return mm_ub != ub;
+}
+
+struct user_beancounter *ub_oom_select_worst(void)
+{
+	struct user_beancounter *ub, *walkp;
+	long ub_maxover;
+
+	ub_maxover = 0;
+	ub = NULL;
+
+	rcu_read_lock();
+	for_each_beancounter (walkp) {
+		long ub_overdraft;
+
+		if (test_bit(UB_OOM_NOPROC, &walkp->ub_flags))
+			continue;
+
+		ub_overdraft = ub_current_overdraft(walkp);
+		if (ub_overdraft > ub_maxover && get_beancounter_rcu(walkp)) {
+			put_beancounter(ub);
+			ub = walkp;
+			ub_maxover = ub_overdraft;
+		}
+	}
+
+	if (ub) {
+		set_bit(UB_OOM_NOPROC, &ub->ub_flags);
+		printk(KERN_INFO "OOM selected worst BC %d (overdraft %lu):\n",
+				ub->ub_uid, ub_maxover);
+		__show_ub_mem(ub);
+	}
+	rcu_read_unlock();
+
+	return ub;
+}
+
+void ub_oom_unlock(struct oom_control *oom_ctrl)
+{
+	spin_unlock(&oom_ctrl->lock);
+}
+
+void ub_oom_mm_dead(struct mm_struct *mm)
+{
+	if (mm->global_oom)
+		ub_release_oom_control(&global_oom_ctrl);
+
+	if (mm->ub_oom)
+		ub_release_oom_control(&mm_ub(mm)->oom_ctrl);
+}
+
+unsigned long ub_oom_total_pages(struct user_beancounter *ub)
+{
+	return min(totalram_pages, ub->ub_parms[UB_PHYSPAGES].limit) +
+	       min_t(unsigned long, total_swap_pages,
+			       ub->ub_parms[UB_SWAPPAGES].limit);
+}
+
+int out_of_memory_in_ub(struct user_beancounter *ub, gfp_t gfp_mask)
+{
+	struct task_struct *p;
+	int res = 0;
+	unsigned long ub_mem_pages;
+	int points;
+	char message[32];
+
+	if (ub_oom_lock(&ub->oom_ctrl, gfp_mask))
+		goto out;
+
+	snprintf(message, sizeof(message),
+		 "Out of memory in UB %u", ub->ub_uid);
+
+	ub_mem_pages = ub_oom_total_pages(ub);
+	read_lock(&tasklist_lock);
+
+	do {
+		p = select_bad_process(&points, ub_mem_pages, ub, NULL, NULL);
+		if (PTR_ERR(p) == -1UL || !p) {
+			__ub_release_oom_control(&ub->oom_ctrl, "no victims");
+			break;
+		}
+	} while (oom_kill_process(p, gfp_mask, 0, points, ub_mem_pages,
+				  ub, NULL, NULL, message));
+
+	read_unlock(&tasklist_lock);
+	ub_oom_unlock(&ub->oom_ctrl);
+
+	if (!p)
+		res = -ENOMEM;
+out:
+	/*
+	 * Give "p" a good chance of killing itself before we
+	 * retry to allocate memory unless "p" is current
+	 */
+	if (!test_thread_flag(TIF_MEMDIE))
+		schedule_timeout_uninterruptible(1);
+
+	return res;
+}
+
+struct oom_control global_oom_ctrl;
+
+void init_oom_control(struct oom_control *oom_ctrl)
+{
+	spin_lock_init(&oom_ctrl->lock);
+	init_waitqueue_head(&oom_ctrl->wq);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/proc.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/proc.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/proc.c	2015-01-21 12:02:43.397221780 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/proc.c	2015-01-21 12:02:58.940809155 +0300
@@ -0,0 +1,989 @@
+/*
+ *  kernel/bc/proc.c 
+ *
+ *  Copyright (C) 2006 OpenVZ. SWsoft Inc.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ve_proto.h>
+#include <linux/virtinfo.h>
+#include <linux/mmgang.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/mnt_namespace.h>
+
+#include <bc/beancounter.h>
+#include <bc/proc.h>
+#include <bc/dcache.h>
+
+/* Generic output formats */
+#if BITS_PER_LONG == 32
+const char *bc_proc_lu_fmt = "\t%-20s %10lu\n";
+const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
+const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
+const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n";
+#else
+const char *bc_proc_lu_fmt = "\t%-20s %21lu\n";
+const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
+const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
+const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n";
+#endif
+
+#if BITS_PER_LONG == 32
+static const char *head_fmt = "%10s  %-12s %10s %10s %10s %10s %10s\n";
+static const char *res_fmt = "%10s  %-12s %10lu %10lu %10lu %10lu %10lu\n";
+#else
+static const char *head_fmt = "%10s  %-12s %20s %20s %20s %20s %20s\n";
+static const char *res_fmt = "%10s  %-12s %20lu %20lu %20lu %20lu %20lu\n";
+#endif
+
+static void ub_show_res(struct seq_file *f, struct user_beancounter *ub,
+		int r, int precharge, int show_uid)
+{
+	char ub_uid[64];
+	unsigned long held;
+
+	memset(ub_uid, 0, sizeof(ub_uid));
+	if (show_uid && r == 0)
+		snprintf(ub_uid, sizeof(ub_uid), "%u:", ub->ub_uid);
+
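+	/* report usage net of the per-cpu precharge, clamped at zero */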
+	held = ub->ub_parms[r].held;
+	held = (held > precharge) ? (held - precharge) : 0;
+
+	seq_printf(f, res_fmt, ub_uid, ub_rnames[r],
+			held,
+			ub->ub_parms[r].maxheld,
+			ub->ub_parms[r].barrier,
+			ub->ub_parms[r].limit,
+			ub->ub_parms[r].failcnt);
+}
+
+static void ub_show_dummy(struct seq_file *f, struct user_beancounter *ub, int r)
+{
+	seq_printf(f, res_fmt, "", ub_rnames[r],
+			0, 0,
+			ub->ub_parms[r].barrier,
+			ub->ub_parms[r].limit,
+			ub->ub_parms[r].failcnt);
+}
+
+static void __show_resources(struct seq_file *f, struct user_beancounter *ub,
+		int show_uid)
+{
+	int i, precharge[UB_RESOURCES];
+
+	ub_update_resources(ub);
+	ub_precharge_snapshot(ub, precharge);
+
+	for (i = 0; i < UB_RESOURCES_COMPAT; i++)
+		if (strcmp(ub_rnames[i], "dummy") != 0)
+			ub_show_res(f, ub, i, precharge[i], show_uid);
+
+	for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++)
+		ub_show_res(f, ub, i, precharge[i], show_uid);
+}
+
+static int bc_resources_show(struct seq_file *f, void *v)
+{
+	__show_resources(f, seq_beancounter(f), 0);
+	return 0;
+}
+
+static struct bc_proc_entry bc_resources_entry = {
+	.name = "resources",
+	.u.show = bc_resources_show,
+};
+
+#ifdef CONFIG_BC_DEBUG
+static int bc_debug_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+	unsigned int now;
+	
+	now = dcache_update_time();
+	ub = seq_beancounter(f);
+	seq_printf(f, "uid: %u\n", ub->ub_uid);
+	seq_printf(f, "ref: %d\n", atomic_read(&ub->ub_refcount));
+	seq_printf(f, "flags: 0x%lx\n", ub->ub_flags);
+
+	seq_printf(f, "bc: %p\n", ub);
+	seq_printf(f, "sizeof: %lu\n", sizeof(struct user_beancounter));
+	seq_printf(f, "pincount: %d\n", __ub_percpu_sum(ub, pincount));
+
+	seq_printf(f, "dcache_unused: %u\n", ub->ub_dentry_unused);
+	seq_printf(f, "dcache_pruned: %lu\n", ub->ub_dentry_pruned);
+	seq_printf(f, "dcache_cache_age: %d (%c)\n", now - ub->dc_time,
+			RB_EMPTY_NODE(&ub->dc_node) ? '-' : '+');
+
+	seq_printf(f, "dcache_lru_age:\n");
+	spin_lock(&dcache_lock);
+	{
+		struct dentry *de;
+		unsigned nr = 10;
+
+		list_for_each_entry_reverse(de, &ub->ub_dentry_lru, d_bclru) {
+			if (nr-- <= 0) {
+				seq_printf(f, "     ...\n");
+				break;
+			}
+			seq_printf(f, "     d: %d [%s]\n", now - de->d_lru_time, de->d_name.name);
+		}
+	}
+	spin_unlock(&dcache_lock);
+
+	seq_printf(f, "dcache_shrink_age: %d\n", now - ub->dc_shrink_ts);
+	seq_printf(f, "dcache_thresh: %d\n", ub->ub_dcache_threshold);
+
+	seq_printf(f, "pagecache_isolation: %s\n",
+		test_bit(UB_PAGECACHE_ISOLATION, &ub->ub_flags) ? "on" : "off");
+
+	return 0;
+}
+
+static struct bc_proc_entry bc_debug_entry = {
+	.name = "debug",
+	.u.show = bc_debug_show,
+};
+#endif
+
+static int bc_precharge_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+	int i, cpus = num_possible_cpus();
+	int precharge[UB_RESOURCES];
+
+	seq_printf(f, "%-12s %16s %10s %10s\n",
+			"resource", "real_held", "precharge", "max_precharge");
+
+	ub = seq_beancounter(f);
+	ub_precharge_snapshot(ub, precharge);
+	for (i = 0; i < UB_RESOURCES; i++) {
+		if (!strcmp(ub_rnames[i], "dummy"))
+			continue;
+		seq_printf(f, "%-12s %16lu %10d %10d\n", ub_rnames[i],
+				ub->ub_parms[i].held,
+				precharge[i],
+				ub->ub_parms[i].max_precharge * cpus);
+	}
+
+	return 0;
+}
+
+static struct bc_proc_entry bc_precharge_entry = {
+	.name = "precharge",
+	.u.show = bc_precharge_show,
+};
+
+static void bc_count_slab_show_one(const char *name, int count, void *v)
+{
+	if (count != 0)
+		seq_printf((struct seq_file *)v, "%s: %u\n", name, count);
+}
+
+static int bc_count_slab_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+
+	ub = seq_beancounter(f);
+	slab_walk_ub(ub, bc_count_slab_show_one, f);
+	return 0;
+}
+
+static struct bc_proc_entry bc_count_slab_entry = {
+	.name = "slabinfo",
+	.u.show = bc_count_slab_show
+};
+
+static int bc_proc_meminfo_show(struct seq_file *f, void *v)
+{
+	return meminfo_proc_show_ub(f, NULL,
+			seq_beancounter(f), VE_MEMINFO_COMPLETE);
+}
+
+static struct bc_proc_entry bc_meminfo_entry = {
+	.name = "meminfo",
+	.u.show = bc_proc_meminfo_show,
+};
+
+#ifdef CONFIG_BC_RSS_ACCOUNTING
+#define K(x) ((x) << (PAGE_SHIFT - 10))
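+/* K(x): pages -> kilobytes */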
+static int bc_proc_nodeinfo_show(struct seq_file *f, void *v)
+{
+	int nid;
+	nodemask_t nodemask;
+	struct user_beancounter *ub;
+	unsigned long pages[NR_LRU_LISTS];
+	unsigned long shadow[NR_LRU_LISTS];
+	struct idle_page_stats idle;
+
+	ub = seq_beancounter(f);
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		nodemask = nodemask_of_node(nid);
+		gang_page_stat(&ub->gang_set, &nodemask, pages, shadow);
+		gang_idle_page_stat(&ub->gang_set, &nodemask, &idle);
+		seq_printf(f,
+			"Node %d Active:         %8lu kB\n"
+			"Node %d Inactive:       %8lu kB\n"
+			"Node %d Shadow:         %8lu kB\n"
+			"Node %d Active(anon):   %8lu kB\n"
+			"Node %d Inactive(anon): %8lu kB\n"
+			"Node %d Shadow(anon):   %8lu kB\n"
+			"Node %d Active(file):   %8lu kB\n"
+			"Node %d Inactive(file): %8lu kB\n"
+			"Node %d Shadow(file):   %8lu kB\n"
+			"Node %d Unevictable:    %8lu kB\n"
+#ifdef CONFIG_KSTALED
+			"Node %d IdleClean:      %8lu kB\n"
+			"Node %d IdleDirtyFile:  %8lu kB\n"
+			"Node %d IdleDirtySwap:  %8lu kB\n"
+#endif
+			,
+			nid, K(pages[LRU_ACTIVE_ANON] +
+			       pages[LRU_ACTIVE_FILE]),
+			nid, K(pages[LRU_INACTIVE_ANON] +
+			       pages[LRU_INACTIVE_FILE]),
+			nid, K(shadow[LRU_ACTIVE_ANON] +
+			       shadow[LRU_INACTIVE_ANON] +
+			       shadow[LRU_ACTIVE_FILE] +
+			       shadow[LRU_INACTIVE_FILE] +
+			       shadow[LRU_UNEVICTABLE]),
+			nid, K(pages[LRU_ACTIVE_ANON]),
+			nid, K(pages[LRU_INACTIVE_ANON]),
+			nid, K(shadow[LRU_ACTIVE_ANON] +
+			       shadow[LRU_INACTIVE_ANON]),
+			nid, K(pages[LRU_ACTIVE_FILE]),
+			nid, K(pages[LRU_INACTIVE_FILE]),
+			nid, K(shadow[LRU_ACTIVE_FILE] +
+			       shadow[LRU_INACTIVE_FILE]),
+			nid, K(pages[LRU_UNEVICTABLE])
+#ifdef CONFIG_KSTALED
+			,
+			nid, K(idle.idle_clean),
+			nid, K(idle.idle_dirty_file),
+			nid, K(idle.idle_dirty_swap)
+#endif
+			);
+	}
+	return 0;
+}
+#undef K
+
+static struct bc_proc_entry bc_nodeinfo_entry = {
+	.name = "nodeinfo",
+	.u.show = bc_proc_nodeinfo_show,
+};
+#endif
+
+static int bc_dcache_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub = seq_beancounter(f);
+	struct dentry *dentry, *prev = NULL;
+	struct vfsmount *mnt;
+	struct path root;
+
+	seq_printf(f, "       usage device\tfstype\tmount\tdentry\n");
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(dentry, &ub->ub_dentry_top, d_bclru) {
+		struct super_block *sb = dentry->d_sb;
+
+		/* Prevent race with shrink_dcache_for_umount_subtree() */
+		if (!down_read_trylock(&sb->s_umount))
+			continue;
+		dget(dentry);
+		spin_unlock(&dcache_lock);
+		dput(prev);
+		prev = dentry;
+
+		root.mnt = NULL;
+		root.dentry = NULL;
+		spin_lock(&vfsmount_lock);
+		list_for_each_entry(mnt, &current->nsproxy->mnt_ns->list, mnt_list) {
+			if (mnt->mnt_sb == dentry->d_sb) {
+				root.mnt = mnt;
+				root.dentry = mnt->mnt_root;
+				path_get(&root);
+				break;
+			}
+		}
+		spin_unlock(&vfsmount_lock);
+
+		seq_printf(f, "%12lu %s\t%s\t",
+				ub_dcache_get_size(dentry),
+				dentry->d_sb->s_id,
+				dentry->d_sb->s_type->name);
+		if (root.mnt)
+			seq_path(f, &root, " \t\n\\");
+		else
+			seq_puts(f, "none");
+		seq_putc(f, '\t');
+		seq_dentry(f, dentry, " \t\n\\");
+		seq_putc(f, '\n');
+
+		path_put(&root);
+		up_read(&sb->s_umount);
+
+		spin_lock(&dcache_lock);
+		if (dentry->d_ub != ub)
+			break;
+	}
+	spin_unlock(&dcache_lock);
+	dput(prev);
+
+	return 0;
+}
+
+static struct bc_proc_entry bc_dcacheinfo_entry = {
+	.name = "dcacheinfo",
+	.u.show = bc_dcache_show,
+};
+
+static int ub_show(struct seq_file *f, void *v)
+{
+	int i, precharge[UB_RESOURCES];
+	struct user_beancounter *ub = v;
+
+	ub_update_resources(ub);
+	ub_precharge_snapshot(ub, precharge);
+
+	for (i = 0; i < UB_RESOURCES_COMPAT; i++) {
+		if (strcmp(ub_rnames[i], "dummy") != 0)
+			ub_show_res(f, ub, i, precharge[i], 1);
+		else
+			ub_show_dummy(f, ub, i);
+	}
+	return 0;
+}
+
+static int res_show(struct seq_file *f, void *v)
+{
+	__show_resources(f, (struct user_beancounter *)v, 1);
+	return 0;
+}
+
+static int ub_accessible(struct user_beancounter *exec,
+		struct user_beancounter *target)
+{
+	return (exec == get_ub0() || exec == target);
+}
+
+static void ub_show_header(struct seq_file *f)
+{
+	seq_printf(f, "Version: 2.5\n");
+	seq_printf(f, head_fmt, "uid", "resource",
+			"held", "maxheld", "barrier", "limit", "failcnt");
+}
+
+static void *ub_start(struct seq_file *f, loff_t *ppos)
+{
+	struct user_beancounter *ub;
+	struct user_beancounter *exec_ub; 
+	unsigned long pos;
+
+	pos = *ppos;
+	if (pos == 0)
+		ub_show_header(f);
+
+	exec_ub = get_exec_ub();
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (!ub_accessible(exec_ub, ub))
+			continue;
+		if (pos-- == 0)
+			return ub;
+	}
+	return NULL;
+}
+
+static void *ub_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+	struct user_beancounter *ub;
+	struct list_head *entry;
+	struct user_beancounter *exec_ub;
+
+	exec_ub = get_exec_ub();
+	ub = (struct user_beancounter *)v;
+
+	entry = &ub->ub_list;
+
+	list_for_each_continue_rcu(entry, &ub_list_head) {
+		ub = list_entry(entry, struct user_beancounter, ub_list);
+		if (!ub_accessible(exec_ub, ub))
+			continue;
+		(*ppos)++;
+		return ub;
+	}
+	return NULL;
+}
+
+static void ub_stop(struct seq_file *f, void *v)
+{
+	rcu_read_unlock();
+}
+
+static struct seq_operations ub_seq_ops = {
+	.start = ub_start,
+	.next  = ub_next,
+	.stop  = ub_stop,
+	.show  = ub_show,
+};
+
+static int ub_open(struct inode *inode, struct file *filp)
+{
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EACCES;
+
+	return seq_open(filp, &ub_seq_ops);
+}
+
+static struct file_operations ub_file_operations = {
+	.open		= ub_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct seq_operations res_seq_ops = {
+	.start = ub_start,
+	.next  = ub_next,
+	.stop  = ub_stop,
+	.show  = res_show,
+};
+
+static int res_open(struct inode *inode, struct file *filp)
+{
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EACCES;
+
+	return seq_open(filp, &res_seq_ops);
+}
+
+static struct file_operations resources_operations = {
+	.open		= res_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct bc_proc_entry bc_all_resources_entry = {
+	.name = "resources",
+	.u.fops = &resources_operations,
+};
+
+/*
+ * Generic showing stuff
+ */
+
+static int cookies, num_entries;
+static struct bc_proc_entry *bc_entries __read_mostly;
+static struct bc_proc_entry *bc_root_entries __read_mostly;
+static DEFINE_SPINLOCK(bc_entries_lock);
+static struct proc_dir_entry *bc_proc_root;
+
+void bc_register_proc_entry(struct bc_proc_entry *e)
+{
+	spin_lock(&bc_entries_lock);
+	e->cookie = ++cookies;
+	e->next = bc_entries;
+	bc_entries = e;
+	num_entries++;
+	spin_unlock(&bc_entries_lock);
+}
+
+EXPORT_SYMBOL(bc_register_proc_entry);
+
+void bc_register_proc_root_entry(struct bc_proc_entry *e)
+{
+	spin_lock(&bc_entries_lock);
+	e->cookie = ++cookies;
+	e->next = bc_root_entries;
+	bc_root_entries = e;
+	bc_proc_root->nlink++;
+	spin_unlock(&bc_entries_lock);
+}
+
+EXPORT_SYMBOL(bc_register_proc_root_entry);
+
+/*
+ * small helpers
+ */
+
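+/*
+ * Synthetic inode numbers: 0xbc... for per-beancounter directories
+ * (low bits hold uid + 1), 0xbe... for the proc file entries.
+ */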
+static inline unsigned long bc_make_ino(struct user_beancounter *ub)
+{
+	return 0xbc000000 | (ub->ub_uid + 1);
+}
+
+static inline unsigned long bc_make_file_ino(struct bc_proc_entry *de)
+{
+	return 0xbe000000 + de->cookie;
+}
+
+static int bc_d_delete(struct dentry *d)
+{
+	return 1;
+}
+
+static void bc_d_release(struct dentry *d)
+{
+	put_beancounter_longterm((struct user_beancounter *)d->d_fsdata);
+}
+
+static struct inode_operations bc_entry_iops;
+static struct file_operations bc_entry_fops;
+static struct dentry_operations bc_dentry_ops = {
+	.d_delete = bc_d_delete,
+	.d_release = bc_d_release,
+};
+
+/*
+ * common directory operations' helpers
+ */
+
+static int bc_readdir(struct file *file, filldir_t filler, void *data,
+		struct user_beancounter *parent)
+{
+	int err = 0;
+	loff_t pos, filled;
+	struct user_beancounter *ub, *prev;
+	struct bc_proc_entry *pde;
+
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EPERM;
+
+	pos = file->f_pos;
+	if (pos == 0) {
+		err = (*filler)(data, ".", 1, pos,
+				file->f_dentry->d_inode->i_ino, DT_DIR);
+		if (err < 0) {
+			err = 0;
+			goto out;
+		}
+		pos++;
+	}
+
+	if (pos == 1) {
+		err = (*filler)(data, "..", 2, pos,
+				parent_ino(file->f_dentry), DT_DIR);
+		if (err < 0) {
+			err = 0;
+			goto out;
+		}
+		pos++;
+	}
+
+	filled = 2;
+	for (pde = (parent == NULL ? bc_root_entries : bc_entries);
+			pde != NULL; pde = pde->next) {
+		if (filled++ < pos)
+			continue;
+
+		err = (*filler)(data, pde->name, strlen(pde->name), pos,
+				bc_make_file_ino(pde), DT_REG);
+		if (err < 0) {
+			err = 0;
+			goto out;
+		}
+		pos++;
+	}
+
+	if (parent)
+		goto out;
+
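+	/*
+	 * Walk the beancounter list under RCU.  "prev" pins the previous
+	 * entry across the rcu_read_unlock() needed to call the filler,
+	 * so our position in the list cannot be freed under us.
+	 */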
+	rcu_read_lock();
+	prev = NULL;
+	ub = list_entry(&ub_list_head, struct user_beancounter, ub_list);
+	while (1) {
+		int len;
+		unsigned long ino;
+		char buf[64];
+
+		ub = list_entry(rcu_dereference(ub->ub_list.next),
+				struct user_beancounter, ub_list);
+		if (&ub->ub_list == &ub_list_head)
+			break;
+
+		if (!get_beancounter_rcu(ub))
+			continue;
+
+		if (filled++ < pos) {
+			put_beancounter(ub);
+			continue;
+		}
+
+		rcu_read_unlock();
+		put_beancounter(prev);
+
+		len = snprintf(buf, sizeof(buf), "%u", ub->ub_uid);
+		ino = bc_make_ino(ub);
+
+		err = (*filler)(data, buf, len, pos, ino, DT_DIR);
+		if (err < 0) {
+			err = 0;
+			put_beancounter(ub);
+			goto out;
+		}
+		rcu_read_lock();
+		prev = ub;
+		pos++;
+	}
+	list_for_each_entry_rcu(ub, &ub_leaked_list, ub_leaked_list) {
+		int len;
+		unsigned long ino;
+		char buf[64];
+
+		if (!get_beancounter_rcu(ub))
+			continue;
+
+		if (filled++ < pos) {
+			put_beancounter(ub);
+			continue;
+		}
+
+		rcu_read_unlock();
+		put_beancounter(prev);
+
+		len = snprintf(buf, sizeof(buf), "%u-%p", ub->ub_uid, ub);
+		ino = bc_make_ino(ub);
+
+		err = (*filler)(data, buf, len, pos, ino, DT_DIR);
+		if (err < 0) {
+			err = 0;
+			put_beancounter(ub);
+			goto out;
+		}
+		rcu_read_lock();
+		prev = ub;
+		pos++;
+	}
+	rcu_read_unlock();
+	put_beancounter(prev);
+out:
+	file->f_pos = pos;
+	return err;
+}
+
+static int bc_looktest(struct inode *ino, void *data)
+{
+	return ino->i_op == &bc_entry_iops && ino->i_private == data;
+}
+
+static int bc_lookset(struct inode *ino, void *data)
+{
+	struct user_beancounter *ub;
+
+	ub = (struct user_beancounter *)data;
+	ino->i_private = data;
+	ino->i_ino = bc_make_ino(ub);
+	ino->i_fop = &bc_entry_fops;
+	ino->i_op = &bc_entry_iops;
+	ino->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
+	/* subbeancounters are not included, but who cares? */
+	ino->i_nlink = num_entries + 2;
+	ino->i_gid = 0;
+	ino->i_uid = 0;
+	return 0;
+}
+
+static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *ino;
+
+	ino = iget5_locked(dir->i_sb, ub->ub_uid, bc_looktest, bc_lookset, ub);
+	if (ino == NULL)
+		goto out_put;
+
+	if (ino->i_state & I_NEW)
+		unlock_new_inode(ino);
+	dentry->d_op = &bc_dentry_ops;
+	dentry->d_fsdata = ub;
+	d_add(dentry, ino);
+	return NULL;
+
+out_put:
+	put_beancounter_longterm(ub);
+	return ERR_PTR(-ENOENT);
+}
+
+/*
+ * files (bc_proc_entry) manipulations
+ */
+
+static struct dentry *bc_lookup_file(struct inode *dir,
+		struct dentry *dentry, struct bc_proc_entry *root,
+		int (*test)(struct inode *, void *),
+		int (*set)(struct inode *, void *))
+{
+	struct bc_proc_entry *pde;
+	struct inode *ino;
+
+	for (pde = root; pde != NULL; pde = pde->next)
+		if (strcmp(pde->name, dentry->d_name.name) == 0)
+			break;
+
+	if (pde == NULL)
+		return ERR_PTR(-ESRCH);
+
+	ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde);
+	if (ino == NULL)
+		return ERR_PTR(-ENOENT);
+
+	if (ino->i_state & I_NEW)
+		unlock_new_inode(ino);
+	dentry->d_op = &bc_dentry_ops;
+	d_add(dentry, ino);
+	return NULL;
+}
+
+static int bc_file_open(struct inode *ino, struct file *filp)
+{
+	struct bc_proc_entry *de;
+	struct user_beancounter *ub;
+
+	de = (struct bc_proc_entry *)ino->i_private;
+	ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata;
+	BUG_ON(ub->ub_magic != UB_MAGIC);
+
+	/*
+	 * ub can't disappear: we hold d_parent, he holds the beancounter
+	 */
+	return single_open(filp, de->u.show, ub);
+}
+
+static struct file_operations bc_file_ops = {
+	.open		= bc_file_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int bc_looktest_entry(struct inode *ino, void *data)
+{
+	return ino->i_fop == &bc_file_ops && ino->i_private == data;
+}
+
+static int bc_lookset_entry(struct inode *ino, void *data)
+{
+	struct bc_proc_entry *de;
+
+	de = (struct bc_proc_entry *)data;
+	ino->i_private = data;
+	ino->i_ino = bc_make_file_ino(de);
+	ino->i_fop = &bc_file_ops;
+	ino->i_mode = S_IFREG | S_IRUSR;
+	ino->i_nlink = 1;
+	ino->i_gid = 0;
+	ino->i_uid = 0;
+	return 0;
+}
+
+static inline struct dentry *bc_lookup_files(struct inode *dir,
+		struct dentry *de)
+{
+	return bc_lookup_file(dir, de, bc_entries,
+			bc_looktest_entry, bc_lookset_entry);
+}
+
+static int bc_looktest_root_entry(struct inode *ino, void *data)
+{
+	struct bc_proc_entry *de;
+
+	de = (struct bc_proc_entry *)data;
+	return ino->i_fop == de->u.fops && ino->i_private == data;
+}
+
+static int bc_lookset_root_entry(struct inode *ino, void *data)
+{
+	struct bc_proc_entry *de;
+
+	de = (struct bc_proc_entry *)data;
+	ino->i_private = data;
+	ino->i_ino = bc_make_file_ino(de);
+	ino->i_fop = de->u.fops;
+	ino->i_mode = S_IFREG | S_IRUSR;
+	ino->i_nlink = 1;
+	ino->i_gid = 0;
+	ino->i_uid = 0;
+	return 0;
+}
+
+static inline struct dentry *bc_lookup_root_files(struct inode *dir,
+		struct dentry *de)
+{
+	return bc_lookup_file(dir, de, bc_root_entries,
+			bc_looktest_root_entry, bc_lookset_root_entry);
+}
+
+/*
+ * /proc/bc/.../<id> directory operations
+ */
+
+static int bc_entry_readdir(struct file *file, void *data, filldir_t filler)
+{
+	return bc_readdir(file, filler, data,
+			(struct user_beancounter *)file->f_dentry->d_fsdata);
+}
+
+static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry,
+		struct nameidata *nd)
+{
+	struct dentry *de;
+
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return ERR_PTR(-EPERM);
+
+	de = bc_lookup_files(dir, dentry);
+	if (de != ERR_PTR(-ESRCH))
+		return de;
+
+	return ERR_PTR(-ENOENT);
+}
+
+static int bc_entry_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		struct kstat *stat)
+{
+	struct user_beancounter *ub;
+
+	generic_fillattr(dentry->d_inode, stat);
+	ub = (struct user_beancounter *)dentry->d_fsdata;
+	stat->nlink = 2;
+	return 0;
+}
+
+static struct file_operations bc_entry_fops = {
+	.read = generic_read_dir,
+	.readdir = bc_entry_readdir,
+};
+
+static struct inode_operations bc_entry_iops = {
+	.lookup = bc_entry_lookup,
+	.getattr = bc_entry_getattr,
+};
+
+/*
+ * /proc/bc directory operations
+ */
+
+static int bc_root_readdir(struct file *file, void *data, filldir_t filler)
+{
+	return bc_readdir(file, filler, data, NULL);
+}
+
+static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry,
+		struct nameidata *nd)
+{
+	int id;
+	char *end;
+	struct user_beancounter *ub;
+	struct dentry *de;
+
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return ERR_PTR(-EPERM);
+
+	de = bc_lookup_root_files(dir, dentry);
+	if (de != ERR_PTR(-ESRCH))
+		return de;
+
+	id = simple_strtol(dentry->d_name.name, &end, 10);
+	if (*end == '-') {
+		unsigned long ptr;
+
+		if (kstrtoul(end+1, 16, &ptr))
+			return ERR_PTR(-ENOENT);
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(ub, &ub_leaked_list, ub_leaked_list) {
+			if (ub != (void *)ptr || ub->ub_uid != id)
+				continue;
+			get_beancounter_longterm(ub);
+			rcu_read_unlock();
+			return bc_lookup(ub, dir, dentry);
+		}
+		rcu_read_unlock();
+	}
+	if (*end != '\0')
+		return ERR_PTR(-ENOENT);
+
+	ub = get_beancounter_byuid(id, 0);
+	if (ub == NULL)
+		return ERR_PTR(-ENOENT);
+
+	return bc_lookup(ub, dir, dentry);
+}
+
+static int bc_root_getattr(struct vfsmount *mnt, struct dentry *dentry,
+	struct kstat *stat)
+{
+	generic_fillattr(dentry->d_inode, stat);
+	stat->nlink = ub_count + 2;
+	return 0;
+}
+
+static struct file_operations bc_root_fops = {
+	.read = generic_read_dir,
+	.readdir = bc_root_readdir,
+};
+
+static struct inode_operations bc_root_iops = {
+	.lookup = bc_root_lookup,
+	.getattr = bc_root_getattr,
+};
+
+static int ub_vswap_show(struct seq_file *f, void *unused)
+{
+	seq_puts(f, "Version: 1.0\n");
+	return 0;
+}
+
+static int ub_vswap_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ub_vswap_show, NULL);
+}
+
+static struct file_operations ub_vswap_fops = {
+	.open		= ub_vswap_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init ub_init_proc(void)
+{
+	struct proc_dir_entry *entry;
+
+	bc_proc_root = create_proc_entry("bc",
+			S_IFDIR | S_IRUSR | S_IXUSR, NULL);
+	if (bc_proc_root == NULL)
+		panic("Can't create /proc/bc entry");
+
+	bc_proc_root->proc_fops = &bc_root_fops;
+	bc_proc_root->proc_iops = &bc_root_iops;
+
+	bc_register_proc_entry(&bc_resources_entry);
+#ifdef CONFIG_BC_DEBUG
+	bc_register_proc_entry(&bc_debug_entry);
+#endif
+	bc_register_proc_entry(&bc_precharge_entry);
+	bc_register_proc_entry(&bc_count_slab_entry);
+	bc_register_proc_entry(&bc_dcacheinfo_entry);
+	bc_register_proc_root_entry(&bc_all_resources_entry);
+	bc_register_proc_entry(&bc_meminfo_entry);
+#ifdef CONFIG_BC_RSS_ACCOUNTING
+	bc_register_proc_entry(&bc_nodeinfo_entry);
+#endif
+
+	entry = proc_create("user_beancounters",
+			S_IRUSR, &glob_proc_root, &ub_file_operations);
+	proc_create("vswap", S_IRUSR, proc_vz_dir, &ub_vswap_fops);
+	proc_create("beancounter", S_IFDIR|S_IRUSR|S_IXUSR, proc_vz_dir, NULL);
+	return 0;
+}
+
+core_initcall(ub_init_proc);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/statd.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/statd.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/statd.c	2015-01-21 12:02:43.397221780 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/statd.c	2015-01-21 12:02:43.397221780 +0300
@@ -0,0 +1,492 @@
+/*
+ *  kernel/bc/statd.c
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+
+#include <asm/uaccess.h>
+#include <asm/param.h>
+
+#include <bc/beancounter.h>
+#include <bc/statd.h>
+
+static DEFINE_SPINLOCK(ubs_notify_lock);
+static LIST_HEAD(ubs_notify_list);
+static long ubs_min_interval;
+static ubstattime_t ubs_start_time, ubs_end_time;
+static struct timer_list ubs_timer;
+
+static int ubstat_get_list(void __user *buf, long size)
+{
+	int retval;
+	struct user_beancounter *ub, *ubp;
+	long *page, *ptr, *end;
+	int len;
+
+	page = (long *)__get_free_page(GFP_KERNEL);
+	if (page == NULL)
+		return -ENOMEM;
+
+	retval = 0;
+	ubp = NULL;
+	ptr = page;
+	end = page + PAGE_SIZE / sizeof(*ptr);
+
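+	/*
+	 * Fill one page with uids at a time; when it fills up, pin the
+	 * current beancounter, drop the RCU lock, flush the page to
+	 * userspace, and continue from the pinned position.
+	 */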
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		*ptr++ = ub->ub_uid;
+		if (ptr != end)
+			continue;
+
+		if (!get_beancounter_rcu(ub)) {
+			ptr--;
+			continue;
+		}
+		rcu_read_unlock();
+
+		put_beancounter(ubp);
+		ubp = ub;
+
+		len = min_t(long, (ptr - page) * sizeof(*ptr), size);
+		if (copy_to_user(buf, page, len)) {
+			retval = -EFAULT;
+			goto out_put;
+		}
+		retval += len;
+		if (len < PAGE_SIZE)
+			goto out_put;
+		buf += len;
+		size -= len;
+
+		ptr = page;
+		end = page + PAGE_SIZE / sizeof(*ptr);
+
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	size = min_t(long, (ptr - page) * sizeof(*ptr), size);
+	if (size > 0 && copy_to_user(buf, page, size)) {
+		retval = -EFAULT;
+		goto out_put;
+	}
+	retval += size;
+
+out_put:
+	put_beancounter(ubp);
+	free_page((unsigned long)page);
+	return retval;
+}
+
+static int ubstat_gettime(void __user *buf, long size)
+{
+	ubgettime_t data;
+	int retval;
+
+	spin_lock(&ubs_notify_lock);
+	data.start_time = ubs_start_time;
+	data.end_time = ubs_end_time;
+	data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ;
+	spin_unlock(&ubs_notify_lock);
+
+	retval = min_t(long, sizeof(data), size);
+	if (copy_to_user(buf, &data, retval))
+		retval = -EFAULT;
+	return retval;
+}
+
+static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf)
+{
+	struct {
+		ubstattime_t	start_time;
+		ubstattime_t	end_time;
+		ubstatparm_t	param[1];
+	} *data;
+
+	data = kbuf;
+	data->start_time = ubs_start_time;
+	data->end_time = ubs_end_time;
+
+	data->param[0].maxheld = ub->ub_store[res].maxheld;
+	data->param[0].failcnt = ub->ub_store[res].failcnt;
+
+	return sizeof(*data);
+}
+
+static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size)
+{
+	int wrote;
+	struct {
+		ubstattime_t	start_time;
+		ubstattime_t	end_time;
+		ubstatparm_t	param[UB_RESOURCES];
+	} *data;
+	int resource;
+
+	data = kbuf;
+	data->start_time = ubs_start_time;
+	data->end_time = ubs_end_time;
+	wrote = sizeof(data->start_time) + sizeof(data->end_time);
+
+	for (resource = 0; resource < UB_RESOURCES; resource++) {
+		if (size < wrote + sizeof(data->param[resource]))
+			break;
+		data->param[resource].maxheld = ub->ub_store[resource].maxheld;
+		data->param[resource].failcnt = ub->ub_store[resource].failcnt;
+		wrote += sizeof(data->param[resource]); 
+	}
+
+	return wrote;
+}
+
+static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf,
+		int size)
+{
+	int wrote;
+	struct {
+		ubstattime_t	start_time;
+		ubstattime_t	end_time;
+		ubstatparmf_t	param[UB_RESOURCES];
+	} *data;
+	int resource;
+
+	data = kbuf;
+	data->start_time = ubs_start_time;
+	data->end_time = ubs_end_time;
+	wrote = sizeof(data->start_time) + sizeof(data->end_time);
+
+	for (resource = 0; resource < UB_RESOURCES; resource++) {
+		ubstatparmf_t *p = &data->param[resource];
+		struct ubparm *s = &ub->ub_store[resource];
+
+		if (size < wrote + sizeof(data->param[resource]))
+			break;
+
+		p->barrier	= s->barrier;
+		p->limit	= s->limit;
+		p->held		= s->held;
+		p->maxheld	= s->maxheld;
+		p->minheld	= s->minheld;
+		p->failcnt	= s->failcnt;
+		p->__unused1	= 0;
+		p->__unused2	= 0;
+
+		wrote += sizeof(data->param[resource]);
+	}
+	return wrote;
+}
+
+int ubstat_alloc_store(struct user_beancounter *ub)
+{
+	if (ub->ub_store == NULL) {
+		struct ubparm *store;
+
+		store = kmemdup(ub->ub_parms,
+				UB_RESOURCES * sizeof(struct ubparm),
+				GFP_KERNEL);
+		if (store == NULL)
+			return -ENOMEM;
+
+		spin_lock(&ubs_notify_lock);
+		if (ub->ub_store != NULL)
+			kfree(store);
+		else
+			ub->ub_store = store;
+		spin_unlock(&ubs_notify_lock);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ubstat_alloc_store);
+
+static int ubstat_get_stat(struct user_beancounter *ub, long cmd,
+		void __user *buf, long size)
+{
+	void *kbuf;
+	int retval;
+
+	kbuf = (void *)__get_free_page(GFP_KERNEL);
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	retval = ubstat_alloc_store(ub);
+	if (retval)
+		goto out;
+
+	ub_update_resources(ub);
+
+	spin_lock(&ubs_notify_lock);
+	switch (UBSTAT_CMD(cmd)) {
+		case UBSTAT_READ_ONE:
+			retval = -EINVAL;
+			if (UBSTAT_PARMID(cmd) >= UB_RESOURCES)
+				break;
+			retval = ubstat_do_read_one(ub,
+					UBSTAT_PARMID(cmd), kbuf);
+			break;
+		case UBSTAT_READ_ALL:
+			retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE);
+			break;
+		case UBSTAT_READ_FULL:
+			retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE);
+			break;
+		default:
+			retval = -EINVAL;
+	}
+	spin_unlock(&ubs_notify_lock);
+
+	if (retval > 0) {
+		retval = min_t(long, retval, size);
+		if (copy_to_user(buf, kbuf, retval))
+			retval = -EFAULT;
+	}
+out:
+	free_page((unsigned long)kbuf);
+	return retval;
+}
+
+static int ubstat_handle_notifrq(ubnotifrq_t *req)
+{
+	int retval;
+	struct ub_stat_notify *new_notify;
+	struct list_head *entry;
+	struct task_struct *tsk_to_free;
+
+	new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL);
+	if (new_notify == NULL)
+		return -ENOMEM;
+
+	tsk_to_free = NULL;
+	INIT_LIST_HEAD(&new_notify->list);
+
+	spin_lock(&ubs_notify_lock);
+	list_for_each(entry, &ubs_notify_list) {
+		struct ub_stat_notify *notify;
+
+		notify = list_entry(entry, struct ub_stat_notify, list);
+		if (notify->task == current) {
+			kfree(new_notify);
+			new_notify = notify;
+			break;
+		}
+	}
+
+	retval = -EINVAL;
+	if (req->maxinterval < 1)
+		goto out_unlock;
+	if (req->maxinterval > TIME_MAX_SEC)
+		req->maxinterval = TIME_MAX_SEC;
+	if (req->maxinterval < ubs_min_interval) {
+		unsigned long dif;
+
+		ubs_min_interval = req->maxinterval;
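+		/* seconds until the timer fires, rounded up */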
+		dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ;
+		if (dif > req->maxinterval)
+			mod_timer(&ubs_timer,
+					ubs_timer.expires -
+					(dif - req->maxinterval) * HZ);
+	}
+
+	if (entry != &ubs_notify_list) {
+		list_del(&new_notify->list);
+		tsk_to_free = new_notify->task;
+	}
+	if (req->signum) {
+		new_notify->task = current;
+		get_task_struct(new_notify->task);
+		new_notify->signum = req->signum;
+		list_add(&new_notify->list, &ubs_notify_list);
+	} else
+		kfree(new_notify);
+	retval = 0;
+out_unlock:
+	spin_unlock(&ubs_notify_lock);
+	if (tsk_to_free != NULL)
+		put_task_struct(tsk_to_free);
+	return retval;
+}
+
+/*
+ * former sys_ubstat
+ */
+long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
+		void __user *buf, long size)
+{
+	int retval;
+	struct user_beancounter *ub;
+
+	if (func == UBSTAT_UBPARMNUM)
+		return UB_RESOURCES;
+	if (func == UBSTAT_UBLIST)
+		return ubstat_get_list(buf, size);
+	if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)))
+		return -EPERM;
+
+	if (func == UBSTAT_GETTIME) {
+		retval = ubstat_gettime(buf, size);
+		goto notify;
+	}
+
+	ub = get_exec_ub();
+	if (ub != NULL && ub->ub_uid == arg1)
+		get_beancounter_longterm(ub);
+	else /* FIXME must be if (ve_is_super) */
+		ub = get_beancounter_byuid(arg1, 0);
+
+	if (ub == NULL)
+		return -ESRCH;
+
+	retval = ubstat_get_stat(ub, func, buf, size);
+	put_beancounter_longterm(ub);
+notify:
+	/* Handle request for notification */
+	if (retval >= 0) {
+		ubnotifrq_t notifrq;
+		int err;
+
+		err = -EFAULT;
+		if (!copy_from_user(&notifrq, (void __user *)arg2,
+					sizeof(notifrq)))
+			err = ubstat_handle_notifrq(&notifrq);
+		if (err)
+			retval = err;
+	}
+
+	return retval;
+}
+
+static void ubstat_save_onestat(struct user_beancounter *ub)
+{
+	int resource;
+
+	if (ub->ub_store == NULL)
+		return;
+
+	/* called with local irq disabled */
+	spin_lock(&ub->ub_lock);
+	ub_update_resources_locked(ub);
+	for (resource = 0; resource < UB_RESOURCES; resource++) {
+		memcpy(&ub->ub_store[resource], &ub->ub_parms[resource],
+			sizeof(struct ubparm));
+		ub->ub_parms[resource].minheld = 
+			ub->ub_parms[resource].maxheld =
+			ub->ub_parms[resource].held;
+	}
+	spin_unlock(&ub->ub_lock);
+}
+
+static void ubstat_save_statistics(void)
+{
+	unsigned long flags;
+	struct user_beancounter *ub;
+
+	local_irq_save(flags);
+	for_each_beancounter (ub)
+		ubstat_save_onestat(ub);
+	local_irq_restore(flags);
+}
+
+static void ubstatd_timeout(unsigned long __data)
+{
+	struct task_struct *p;
+
+	p = (struct task_struct *) __data;
+	wake_up_process(p);
+}
+
+/*
+ * Safe wrapper for send_sig. It prevents a race with release_task
+ * for sighand.
+ * Should be called under tasklist_lock.
+ */
+static void task_send_sig(struct ub_stat_notify *notify)
+{
+	if (likely(notify->task->sighand != NULL))
+		send_sig(notify->signum, notify->task, 1);
+}
+
+static inline void do_notifies(void)
+{
+	LIST_HEAD(notif_free_list);
+	struct ub_stat_notify *notify;
+	struct ub_stat_notify *tmp;
+
+	spin_lock(&ubs_notify_lock);
+	ubs_start_time = ubs_end_time;
+	/*
+	 * the expression below relies on time being unsigned long and
+	 * arithmetic promotion rules
+	 */
+	ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ;
+	mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ);
+	ubs_min_interval = TIME_MAX_SEC;
+	/* save statistics accumulated for the interval */
+	ubstat_save_statistics();
+	/* send signals */
+	read_lock(&tasklist_lock);
+	while (!list_empty(&ubs_notify_list)) {
+		notify = list_entry(ubs_notify_list.next,
+				struct ub_stat_notify, list);
+		task_send_sig(notify);
+		list_del(&notify->list);
+		list_add(&notify->list, &notif_free_list);
+	}
+	read_unlock(&tasklist_lock);
+	spin_unlock(&ubs_notify_lock);
+
+	list_for_each_entry_safe(notify, tmp, &notif_free_list, list) {
+		put_task_struct(notify->task);
+		kfree(notify);
+	}
+}
+
+/*
+ * Kernel thread
+ */
+static int ubstatd(void *unused)
+{
+	/* daemonize call will take care of signals */
+	daemonize("ubstatd");
+
+	ubs_timer.data = (unsigned long)current;
+	ubs_timer.function = ubstatd_timeout;
+	add_timer(&ubs_timer);
+
+	while (1) {
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		if (time_after(ubs_timer.expires, jiffies)) {
+			schedule();
+			try_to_freeze();
+			continue;
+		}
+
+		__set_task_state(current, TASK_RUNNING);
+		do_notifies();
+	}
+	return 0;
+}
+
+static int __init ubstatd_init(void)
+{
+	init_timer(&ubs_timer);
+	ubs_timer.expires = TIME_MAX_JIF;
+	ubs_min_interval = TIME_MAX_SEC;
+	ubs_start_time = ubs_end_time = 0;
+
+	kernel_thread(ubstatd, NULL, 0);
+	return 0;
+}
+
+module_init(ubstatd_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/sys.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/sys.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/sys.c	2015-01-21 12:02:43.398221754 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/sys.c	2015-01-21 12:02:58.798812923 +0300
@@ -0,0 +1,180 @@
+/*
+ *  kernel/bc/sys.c
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/virtinfo.h>
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/mmgang.h>
+#include <asm/uaccess.h>
+
+#include <bc/beancounter.h>
+#include <bc/dcache.h>
+
+/*
+ *	The (rather boring) getluid syscall
+ */
+SYSCALL_DEFINE0(getluid)
+{
+	struct user_beancounter *ub;
+
+	ub = get_exec_ub();
+	if (ub == NULL)
+		return -EINVAL;
+
+	return ub->ub_uid;
+}
+
+/*
+ *	The setluid syscall
+ */
+SYSCALL_DEFINE1(setluid, uid_t, uid)
+{
+	struct user_beancounter *ub;
+	int error;
+
+	/* You may not disown a setluid */
+	error = -EINVAL;
+	if (uid == (uid_t)-1)
+		goto out;
+
+	/* You may only set an ub as root */
+	error = -EPERM;
+	if (!capable(CAP_SETUID))
+		goto out;
+	/*
+	 * The ub once set is irrevocable to all
+	 * unless it's set from ve0.
+	 */
+	if (!ve_is_super(get_exec_env()))
+		goto out;
+
+	/* Ok - set up a beancounter entry for this user */
+	error = -ENOBUFS;
+	ub = get_beancounter_byuid(uid, 1);
+	if (ub == NULL)
+		goto out;
+
+	ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) "
+			"for %.20s pid %d\n",
+			ub, atomic_read(&ub->ub_refcount),
+			current->comm, current->pid);
+
+	error = set_task_exec_ub(current, ub);
+
+	put_beancounter_longterm(ub);
+out:
+	return error;
+}
+
+long do_setublimit(uid_t uid, unsigned long resource,
+		unsigned long *new_limits)
+{
+	int error;
+	unsigned long flags;
+	struct user_beancounter *ub;
+
+	error = -EPERM;
+	if (!capable(CAP_SYS_RESOURCE))
+		goto out;
+
+	if (!ve_is_super(get_exec_env()))
+		goto out;
+
+	error = -EINVAL;
+	if (resource >= UB_RESOURCES)
+		goto out;
+
+	error = -EINVAL;
+	if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE)
+		goto out;
+
+	error = -ENOENT;
+	ub = get_beancounter_byuid(uid, 0);
+	if (ub == NULL) {
+		ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid);
+		goto out;
+	}
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ub->ub_parms[resource].barrier = new_limits[0];
+	ub->ub_parms[resource].limit = new_limits[1];
+	init_beancounter_precharge(ub, resource);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	if (resource == UB_PHYSPAGES) {
+		ub_update_threshold();
+		set_gang_limits(get_ub_gs(ub),
+				&ub->ub_parms[UB_PHYSPAGES].limit, NULL);
+	}
+
+	put_beancounter_longterm(ub);
+
+	error = 0;
+out:
+	return error;
+}
+
+/*
+ *	The setbeanlimit syscall
+ */
+SYSCALL_DEFINE3(setublimit, uid_t, uid, unsigned long, resource,
+		unsigned long __user *, limits)
+{
+	unsigned long new_limits[2];
+
+	if (copy_from_user(&new_limits, limits, sizeof(new_limits)))
+		return -EFAULT;
+
+	return do_setublimit(uid, resource, new_limits);
+}
+
+extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2, 
+		void __user *buf, long size);
+
+SYSCALL_DEFINE5(ubstat, int, func, unsigned long, arg1, unsigned long, arg2,
+		void __user *, buf, long, size)
+{
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	return do_ubstat(func, arg1, arg2, buf, size);
+}
+
+#ifdef CONFIG_COMPAT
+#define UB_MAXVALUE_COMPAT ((1UL << (sizeof(compat_long_t) * 8 - 1)) - 1)
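+/* the largest value a 32-bit caller can pass; mapped to UB_MAXVALUE below */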
+
+asmlinkage long compat_sys_setublimit(uid_t uid,
+		compat_long_t resource,
+		compat_long_t __user *limits)
+{
+	compat_long_t u_new_limits[2];
+	unsigned long new_limits[2];
+
+	if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits)))
+		return -EFAULT;
+
+	new_limits[0] = u_new_limits[0];
+	new_limits[1] = u_new_limits[1];
+
+	if (u_new_limits[0] == UB_MAXVALUE_COMPAT)
+		new_limits[0] = UB_MAXVALUE;
+	if (u_new_limits[1] == UB_MAXVALUE_COMPAT)
+		new_limits[1] = UB_MAXVALUE;
+
+	return do_setublimit(uid, resource, new_limits);
+}
+
+asmlinkage long compat_sys_ubstat(int func, unsigned int arg1,
+		unsigned int arg2, compat_uptr_t *buf, long size)
+{
+	return sys_ubstat(func, arg1, arg2, buf, size);
+}
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/bc/vm_pages.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/vm_pages.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/bc/vm_pages.c	2015-01-21 12:02:43.398221754 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/bc/vm_pages.c	2015-01-21 12:02:58.940809155 +0300
@@ -0,0 +1,657 @@
+/*
+ *  kernel/bc/vm_pages.c
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/virtinfo.h>
+#include <linux/module.h>
+#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/mmgang.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+#include <bc/proc.h>
+#include <bc/oom_kill.h>
+
+#ifdef CONFIG_BC_RSS_ACCOUNTING
+
+/**
+ * Update the oomguarpages.held value; it includes:
+ *  charged swap-backed pages:	present anonymous pages, swapcache, tmpfs
+ *  unevictable-pages:		mlocked pages, ramfs
+ *  swap-entries:		allocated swap-space
+ */
+void __ub_update_oomguarpages(struct user_beancounter *ub)
+{
+	unsigned long pages[NR_LRU_LISTS];
+
+	gang_page_stat(get_ub_gs(ub), NULL, pages, NULL);
+
+	ub->ub_parms[UB_OOMGUARPAGES].held =
+		pages[LRU_ACTIVE_ANON] +
+		pages[LRU_INACTIVE_ANON] +
+		pages[LRU_UNEVICTABLE] +
+		ub->ub_parms[UB_SWAPPAGES].held;
+
+	ub_adjust_maxheld(ub, UB_OOMGUARPAGES);
+}
+
+#else
+
+void __ub_update_oomguarpages(struct user_beancounter *ub)
+{
+	ub->ub_parms[UB_OOMGUARPAGES].held =
+		ub->ub_parms[UB_PRIVVMPAGES].held +
+		ub->ub_parms[UB_LOCKEDPAGES].held +
+		ub->ub_parms[UB_PHYSPAGES].held +
+		ub->ub_parms[UB_SWAPPAGES].held;
+
+	ub_adjust_maxheld(ub, UB_OOMGUARPAGES);
+}
+
+#endif
+
+long ub_oomguarpages_left(struct user_beancounter *ub)
+{
+	unsigned long flags;
+	long left;
+	int precharge[UB_RESOURCES];
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	__ub_update_oomguarpages(ub);
+	left = ub->ub_parms[UB_OOMGUARPAGES].barrier -
+		ub->ub_parms[UB_OOMGUARPAGES].held;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	ub_precharge_snapshot(ub, precharge);
+	left += precharge[UB_OOMGUARPAGES];
+
+	return left;
+}
+
+void ub_update_resources_locked(struct user_beancounter *ub)
+{
+	__ub_update_oomguarpages(ub);
+}
+EXPORT_SYMBOL(ub_update_resources_locked);
+
+void ub_update_resources(struct user_beancounter *ub)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ub_update_resources_locked(ub);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+EXPORT_SYMBOL(ub_update_resources);
+
+int ub_memory_charge(struct mm_struct *mm, unsigned long size,
+		unsigned vm_flags, struct file *vm_file, int sv)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return 0;
+
+	size >>= PAGE_SHIFT;
+	if (size > UB_MAXVALUE)
+		return -EINVAL;
+
+	BUG_ON(sv != UB_SOFT && sv != UB_HARD);
+
+	if (vm_flags & VM_LOCKED) {
+		if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv))
+			goto out_err;
+	}
+	if (VM_UB_PRIVATE(vm_flags, vm_file)) {
+		if (charge_beancounter_fast(ub, UB_PRIVVMPAGES, size, sv))
+			goto out_private;
+	}
+	return 0;
+
+out_private:
+	if (vm_flags & VM_LOCKED)
+		uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
+out_err:
+	return -ENOMEM;
+}
+
+void ub_memory_uncharge(struct mm_struct *mm, unsigned long size,
+		unsigned vm_flags, struct file *vm_file)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return;
+
+	size >>= PAGE_SHIFT;
+
+	if (vm_flags & VM_LOCKED)
+		uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
+	if (VM_UB_PRIVATE(vm_flags, vm_file))
+		uncharge_beancounter_fast(ub, UB_PRIVVMPAGES, size);
+}
+
+int ub_locked_charge(struct mm_struct *mm, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return 0;
+
+	return charge_beancounter(ub, UB_LOCKEDPAGES,
+			size >> PAGE_SHIFT, UB_HARD);
+}
+
+void ub_locked_uncharge(struct mm_struct *mm, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return;
+
+	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
+}
+
+int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = shi->shmi_ub;
+	if (ub == NULL)
+		return 0;
+
+	return charge_beancounter(ub, UB_LOCKEDPAGES,
+			size >> PAGE_SHIFT, UB_HARD);
+}
+
+void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = shi->shmi_ub;
+	if (ub == NULL)
+		return;
+
+	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
+}
+
+
+static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ub->ub_tmpfs_respages++;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+void ub_tmpfs_respages_inc(struct shmem_inode_info *shi)
+{
+	if (shi->shmi_ub)
+		do_ub_tmpfs_respages_inc(shi->shmi_ub);
+}
+
+static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub,
+		unsigned long size)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	/* catch possible underflow */
+	if (ub->ub_tmpfs_respages < size) {
+		uncharge_warn(ub, "tmpfs_respages",
+				size, ub->ub_tmpfs_respages);
+		size = ub->ub_tmpfs_respages;
+	}
+	ub->ub_tmpfs_respages -= size;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+void ub_tmpfs_respages_sub(struct shmem_inode_info *shi,
+		unsigned long size)
+{
+	if (shi->shmi_ub)
+		do_ub_tmpfs_respages_sub(shi->shmi_ub, size);
+}
+
+#ifdef CONFIG_BC_RSS_ACCOUNTING
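+/*
+ * Try to relieve memory pressure inside @ub: run targeted reclaim over
+ * the beancounter's gangs and, as a last resort, the per-container OOM
+ * killer. Returns 0 when the caller should retry its charge and
+ * -ENOMEM once the allocation is allowed to fail.
+ */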
+int ub_try_to_free_pages(struct user_beancounter *ub, gfp_t gfp_mask)
+{
+	unsigned long progress, flags;
+
+	if (!(gfp_mask & __GFP_WAIT))
+		goto nowait;
+
+	progress = try_to_free_gang_pages(get_ub_gs(ub),
+			gfp_mask | __GFP_HIGHMEM);
+	if (progress)
+		return 0;
+
+nowait:
+	if (gfp_mask & __GFP_NOWARN)
+		goto nowarn;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ub->ub_parms[UB_PHYSPAGES].failcnt++;
+	if (!ub_resource_excess(ub, UB_SWAPPAGES, UB_SOFT))
+		ub->ub_parms[UB_SWAPPAGES].failcnt++;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+nowarn:
+	if ((gfp_mask & __GFP_NORETRY) || !(gfp_mask & __GFP_WAIT) ||
+			out_of_memory_in_ub(ub, gfp_mask))
+		return -ENOMEM;
+
+	return 0;
+}
+
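+/*
+ * Charge @pages into UB_PHYSPAGES, reclaiming on demand. The policy in
+ * the loop below: __GFP_NOFAIL allocations and charges against a
+ * foreign beancounter are forced through, everyone else alternates
+ * "charge, else reclaim", and only a task that is already dying
+ * (TIF_MEMDIE or a fatal signal) gets the UB_FORCE escape hatch.
+ */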
+int __ub_phys_charge(struct user_beancounter *ub,
+		unsigned long pages, gfp_t gfp_mask)
+{
+	int strict = UB_SOFT | UB_TEST;
+
+	if ((gfp_mask & __GFP_NOFAIL) || get_exec_ub() != ub)
+		strict = UB_FORCE;
+
+	ub_oom_start(&ub->oom_ctrl);
+
+	while (charge_beancounter_fast(ub, UB_PHYSPAGES, pages, strict)) {
+		if (test_thread_flag(TIF_MEMDIE) ||
+		    fatal_signal_pending(current))
+			strict = UB_FORCE;
+		else if (ub_try_to_free_pages(ub, gfp_mask))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(__ub_phys_charge);
+
+int __ub_check_ram_limits(struct user_beancounter *ub, gfp_t gfp_mask, int size)
+{
+	if ((gfp_mask & __GFP_NOFAIL) || get_exec_ub() != ub)
+		return 0;
+
+	ub_oom_start(&ub->oom_ctrl);
+
+	do {
+		if (test_thread_flag(TIF_MEMDIE) ||
+		    fatal_signal_pending(current))
+			return 0;
+		if (ub_try_to_free_pages(ub, gfp_mask))
+			return -ENOMEM;
+	} while (precharge_beancounter(ub, UB_PHYSPAGES, size));
+
+	return 0;
+}
+EXPORT_SYMBOL(__ub_check_ram_limits);
+
+#endif
+
+#ifdef CONFIG_HUGETLBFS
+
+int ub_hugetlb_charge(struct user_beancounter *ub, struct page *page)
+{
+	int numpages = 1 << compound_order(page);
+
+	if (ub_phys_charge(ub, numpages, GFP_KERNEL))
+		return -ENOMEM;
+
+	spin_lock_irq(&ub->ub_lock);
+	if (__charge_beancounter_locked(ub, UB_LOCKEDPAGES, numpages, UB_SOFT)) {
+		__uncharge_beancounter_locked(ub, UB_PHYSPAGES, numpages);
+		spin_unlock_irq(&ub->ub_lock);
+		return -ENOMEM;
+	}
+	ub->ub_hugetlb_pages += numpages;
+	spin_unlock_irq(&ub->ub_lock);
+
+	BUG_ON(page->kmem_ub);
+	page->kmem_ub = ub;
+	get_beancounter(ub);
+	return 0;
+}
+
+void ub_hugetlb_uncharge(struct page *page)
+{
+	struct user_beancounter *ub = page->kmem_ub;
+	int numpages = 1 << compound_order(page);
+
+	if (!ub)
+		return;
+
+	spin_lock_irq(&ub->ub_lock);
+	__uncharge_beancounter_locked(ub, UB_LOCKEDPAGES, numpages);
+	__uncharge_beancounter_locked(ub, UB_PHYSPAGES, numpages);
+	ub->ub_hugetlb_pages -= numpages;
+	spin_unlock_irq(&ub->ub_lock);
+
+	page->kmem_ub = NULL;
+	put_beancounter(ub);
+}
+
+#endif /* CONFIG_HUGETLBFS */
+
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+
+/*
+ * All this stuff is protected with swap_lock
+ */
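+
+/*
+ * Call pattern (an illustrative sketch; the helpers are real, the
+ * surrounding swapfile call sites are assumed):
+ *
+ *	spin_lock(&swap_lock);
+ *	ub_swapentry_get(si, offset, ub);
+ *	ub_swapentry_charge(si, offset);
+ *	...
+ *	ub_swapentry_uncharge(si, offset);
+ *	ub_swapentry_put(si, offset);
+ *	spin_unlock(&swap_lock);
+ */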
+
+void ub_swapentry_get(struct swap_info_struct *si, pgoff_t num,
+		      struct user_beancounter *ub)
+{
+	rcu_assign_pointer(si->swap_ubs[num], ub);
+	ub->ub_swapentries++;
+}
+
+void ub_swapentry_put(struct swap_info_struct *si, pgoff_t num)
+{
+	struct user_beancounter *ub = si->swap_ubs[num];
+
+	rcu_assign_pointer(si->swap_ubs[num], NULL);
+	ub->ub_swapentries--;
+}
+
+void ub_swapentry_charge(struct swap_info_struct *si, pgoff_t num)
+{
+	charge_beancounter_fast(si->swap_ubs[num], UB_SWAPPAGES, 1, UB_FORCE);
+}
+
+void ub_swapentry_uncharge(struct swap_info_struct *si, pgoff_t num)
+{
+	uncharge_beancounter_fast(si->swap_ubs[num], UB_SWAPPAGES, 1);
+}
+
+void ub_swapentry_recharge(struct swap_info_struct *si, pgoff_t num,
+			   struct user_beancounter *new_ub)
+{
+	struct user_beancounter *ub;
+
+	ub = si->swap_ubs[num];
+	rcu_assign_pointer(si->swap_ubs[num], new_ub);
+	ub->ub_swapentries--;
+	new_ub->ub_swapentries++;
+	if (!(si->swap_map[num] & SWAP_HAS_CACHE)) {
+		uncharge_beancounter_fast(ub, UB_SWAPPAGES, 1);
+		charge_beancounter_fast(new_ub, UB_SWAPPAGES, 1, UB_FORCE);
+	}
+}
+
+int ub_swap_init(struct swap_info_struct *si, pgoff_t num)
+{
+	struct user_beancounter **ubs;
+
+	ubs = vmalloc(num * sizeof(struct user_beancounter *));
+	if (ubs == NULL)
+		return -ENOMEM;
+
+	memset(ubs, 0, num * sizeof(struct user_beancounter *));
+	si->swap_ubs = ubs;
+	return 0;
+}
+
+void ub_swap_fini(struct swap_info_struct *si)
+{
+	if (si->swap_ubs) {
+		vfree(si->swap_ubs);
+		si->swap_ubs = NULL;
+	}
+}
+#endif
+
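+/*
+ * Present container-virtualized sysinfo: RAM and swap totals come from
+ * the PHYSPAGES/SWAPPAGES limits when set, falling back to host values
+ * (or the per-VE meminfo override) otherwise. Illustrative numbers:
+ * a limit of 524288 pages with 131072 pages in use shows up inside the
+ * container as totalram = 2 GiB, freeram = 1.5 GiB.
+ */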
+static int bc_fill_sysinfo(struct user_beancounter *ub,
+		unsigned long meminfo_val, struct sysinfo *si)
+{
+	unsigned long used, total;
+	unsigned long totalram, totalswap;
+
+	/* No virtualization */
+	if (meminfo_val == VE_MEMINFO_SYSTEM)
+		return NOTIFY_DONE | NOTIFY_STOP_MASK;
+
+	totalram = si->totalram;
+	totalswap = si->totalswap;
+
+	memset(si, 0, sizeof(*si));
+
+	total = ub->ub_parms[UB_PHYSPAGES].limit;
+	used = get_beancounter_usage_percpu(ub, UB_PHYSPAGES);
+
+	if (total == UB_MAXVALUE) {
+		if (meminfo_val < VE_MEMINFO_NR_SPECIAL)
+			total = totalram;
+		else {
+			total = min(meminfo_val, totalram);
+			used = get_beancounter_usage_percpu(ub, UB_PRIVVMPAGES);
+			if (glob_ve_meminfo) {
+				ub_update_resources(ub);
+				used = ub->ub_parms[UB_OOMGUARPAGES].held;
+			}
+		}
+	}
+
+	si->totalram = total;
+	si->freeram = (total > used ? total - used : 0);
+
+	total = ub->ub_parms[UB_SWAPPAGES].limit;
+	used = get_beancounter_usage_percpu(ub, UB_SWAPPAGES);
+
+	if (total == UB_MAXVALUE) {
+		if (meminfo_val < VE_MEMINFO_NR_SPECIAL)
+			total = totalswap;
+		else
+			total = 0;
+	}
+
+	si->totalswap = total;
+	si->freeswap = (total > used ? total - used : 0);
+
+	si->mem_unit = PAGE_SIZE;
+
+	return NOTIFY_OK;
+}
+
+static int bc_fill_meminfo(struct user_beancounter *ub,
+		unsigned long meminfo_val, struct meminfo *mi)
+{
+	int cpu, ret;
+	long dcache, kmem;
+
+	ret = bc_fill_sysinfo(ub, meminfo_val, mi->si);
+	if (ret & NOTIFY_STOP_MASK)
+		goto out;
+
+	gang_page_stat(get_ub_gs(ub), NULL, mi->pages, mi->shadow);
+	gang_idle_page_stat(get_ub_gs(ub), NULL, &mi->idle_page_stats);
+
+	mi->locked = ub->ub_parms[UB_LOCKEDPAGES].held;
+	mi->shmem = ub->ub_parms[UB_SHMPAGES].held;
+	dcache = ub->ub_parms[UB_DCACHESIZE].held;
+	kmem = ub->ub_parms[UB_KMEMSIZE].held;
+
+	mi->dirty_pages = __ub_stat_get(ub, dirty_pages);
+	mi->writeback_pages = __ub_stat_get(ub, writeback_pages);
+	for_each_possible_cpu(cpu) {
+		struct ub_percpu_struct *pcpu = ub_percpu(ub, cpu);
+
+		mi->dirty_pages	+= pcpu->dirty_pages;
+		mi->writeback_pages	+= pcpu->writeback_pages;
+		dcache		-= pcpu->precharge[UB_DCACHESIZE];
+		kmem		-= pcpu->precharge[UB_KMEMSIZE];
+	}
+
+	mi->dirty_pages = max_t(long, 0, mi->dirty_pages);
+	mi->writeback_pages = max_t(long, 0, mi->writeback_pages);
+
+	mi->slab_reclaimable = DIV_ROUND_UP(max(0L, dcache), PAGE_SIZE);
+	mi->slab_unreclaimable =
+		DIV_ROUND_UP(max(0L, kmem - dcache), PAGE_SIZE);
+
+	mi->cached = min(mi->si->totalram - mi->si->freeram -
+			mi->slab_reclaimable - mi->slab_unreclaimable,
+			mi->pages[LRU_INACTIVE_FILE] +
+			mi->pages[LRU_ACTIVE_FILE] +
+			ub->ub_parms[UB_SHMPAGES].held);
+out:
+	return ret;
+}
+
+static int bc_fill_vmstat(struct user_beancounter *ub, unsigned long *stat)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct ub_percpu_struct *pcpu = ub_percpu(ub, cpu);
+
+		stat[NR_VM_ZONE_STAT_ITEMS + PSWPIN]	+= pcpu->swapin;
+		stat[NR_VM_ZONE_STAT_ITEMS + PSWPOUT]	+= pcpu->swapout;
+
+		stat[NR_VM_ZONE_STAT_ITEMS + PSWPIN]	+= pcpu->vswapin;
+		stat[NR_VM_ZONE_STAT_ITEMS + PSWPOUT]	+= pcpu->vswapout;
+	}
+
+	return NOTIFY_OK;
+}
+
+static int bc_mem_notify(struct vnotifier_block *self,
+		unsigned long event, void *arg, int old_ret)
+{
+	switch (event) {
+	case VIRTINFO_MEMINFO: {
+		struct meminfo *mi = arg;
+		return bc_fill_meminfo(mi->ub, mi->meminfo_val, mi);
+	}
+	case VIRTINFO_SYSINFO:
+		return bc_fill_sysinfo(get_exec_ub(),
+				get_exec_env()->meminfo_val, arg);
+	case VIRTINFO_VMSTAT:
+		return bc_fill_vmstat(get_exec_ub(), arg);
+	};
+
+	return old_ret;
+}
+
+static struct vnotifier_block bc_mem_notifier_block = {
+	.notifier_call = bc_mem_notify,
+};
+
+static int __init init_vmguar_notifier(void)
+{
+	virtinfo_notifier_register(VITYPE_GENERAL, &bc_mem_notifier_block);
+	return 0;
+}
+
+static void __exit fini_vmguar_notifier(void)
+{
+	virtinfo_notifier_unregister(VITYPE_GENERAL, &bc_mem_notifier_block);
+}
+
+module_init(init_vmguar_notifier);
+module_exit(fini_vmguar_notifier);
+
+static void __show_one_resource(const char *name, struct ubparm *parm)
+{
+	if (parm->limit == UB_MAXVALUE)
+		printk("%s: %lu / inf [%lu] ", name,
+				parm->held, parm->failcnt);
+	else
+		printk("%s: %lu / %lu [%lu] ", name,
+				parm->held, parm->limit, parm->failcnt);
+}
+
+void __show_ub_mem(struct user_beancounter *ub)
+{
+	__show_one_resource("RAM", ub->ub_parms + UB_PHYSPAGES);
+	__show_one_resource("SWAP", ub->ub_parms + UB_SWAPPAGES);
+	__show_one_resource("KMEM", ub->ub_parms + UB_KMEMSIZE);
+	__show_one_resource("DCSZ", ub->ub_parms + UB_DCACHESIZE);
+	__show_one_resource("OOMG", ub->ub_parms + UB_OOMGUARPAGES);
+
+	printk("Dirty %lu Wback %lu Dche %u Prnd %lu\n",
+			ub_stat_get(ub, dirty_pages),
+			ub_stat_get(ub, writeback_pages),
+			ub->ub_dentry_unused, ub->ub_dentry_pruned);
+}
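+
+/*
+ * Sample output of the above (illustrative values):
+ *
+ *	RAM: 51200 / 262144 [0] SWAP: 0 / inf [3] KMEM: ... OOMG: ...
+ *	Dirty 12 Wback 0 Dche 1024 Prnd 0
+ */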
+
+void show_ub_mem(struct user_beancounter *ub)
+{
+	printk(KERN_INFO "UB-%d-Mem-Info:\n", ub->ub_uid);
+	gang_show_state(get_ub_gs(ub));
+	__show_ub_mem(ub);
+}
+
+#ifdef CONFIG_PROC_FS
+static int bc_vmaux_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long swapin, swapout, vswapin, vswapout, phys_pages;
+	unsigned long shadow_pages;
+	int i;
+
+	ub = seq_beancounter(f);
+
+	swapin = swapout = vswapin = vswapout = 0;
+	phys_pages = ub->ub_parms[UB_PHYSPAGES].held;
+	shadow_pages = ub->ub_parms[UB_SHADOWPAGES].held;
+	for_each_possible_cpu(i) {
+		ub_pcpu = ub_percpu(ub, i);
+		swapin += ub_pcpu->swapin;
+		swapout += ub_pcpu->swapout;
+		vswapin += ub_pcpu->vswapin;
+		vswapout += ub_pcpu->vswapout;
+		phys_pages -= ub_pcpu->precharge[UB_PHYSPAGES];
+		shadow_pages -= ub_pcpu->precharge[UB_SHADOWPAGES];
+	}
+
+	phys_pages = max_t(long, 0, phys_pages);
+	shadow_pages = max_t(long, 0, shadow_pages);
+
+	seq_printf(f, bc_proc_lu_fmt, "tmpfs_respages",
+			ub->ub_tmpfs_respages);
+
+	seq_printf(f, bc_proc_lu_fmt, "swapin", swapin);
+	seq_printf(f, bc_proc_lu_fmt, "swapout", swapout);
+
+	seq_printf(f, bc_proc_lu_fmt, "vswapin", vswapin);
+	seq_printf(f, bc_proc_lu_fmt, "vswapout", vswapout);
+
+	seq_printf(f, bc_proc_lu_fmt, "ram", phys_pages);
+	seq_printf(f, bc_proc_lu_fmt, "shadow", shadow_pages);
+	seq_printf(f, bc_proc_lu_fmt, "swap_entries", ub->ub_swapentries);
+
+	seq_printf(f, bc_proc_lu_fmt, "hugetlb", ub->ub_hugetlb_pages);
+
+	return 0;
+}
+static struct bc_proc_entry bc_vmaux_entry = {
+	.name = "vmaux",
+	.u.show = bc_vmaux_show,
+};
+
+static int __init bc_vmaux_init(void)
+{
+	bc_register_proc_entry(&bc_vmaux_entry);
+	return 0;
+}
+
+late_initcall(bc_vmaux_init);
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cgroup.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cgroup.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cgroup.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cgroup.c	2015-01-21 12:02:58.677816134 +0300
@@ -61,6 +61,7 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <bc/dcache.h>
 
 #include <asm/atomic.h>
 
@@ -127,9 +128,6 @@ struct cgroupfs_root {
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
-	/* The path to use for release notifications. */
-	char release_agent_path[PATH_MAX];
-
 	/* The name for this hierarchy - may be empty */
 	char name[MAX_CGROUP_ROOT_NAMELEN];
 };
@@ -233,12 +231,19 @@ enum {
 	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
 };
 
+static int cgroup_is_disposable(const struct cgroup *cgrp)
+{
+	return (cgrp->flags & ((1 << CGRP_NOTIFY_ON_RELEASE) |
+				(1 << CGRP_SELF_DESTRUCTION))) > 0;
+}
+
 static int cgroup_is_releasable(const struct cgroup *cgrp)
 {
 	const int bits =
 		(1 << CGRP_RELEASABLE) |
-		(1 << CGRP_NOTIFY_ON_RELEASE);
-	return (cgrp->flags & bits) == bits;
+		(1 << CGRP_NOTIFY_ON_RELEASE) |
+		(1 << CGRP_SELF_DESTRUCTION);
+	return (cgrp->flags & bits) > (1 << CGRP_RELEASABLE);
 }
 
 static int notify_on_release(const struct cgroup *cgrp)
@@ -362,7 +367,7 @@ static void __put_css_set(struct css_set
 		list_del(&link->cg_link_list);
 		list_del(&link->cgrp_link_list);
 		if (atomic_dec_and_test(&cgrp->count) &&
-		    notify_on_release(cgrp)) {
+		    cgroup_is_disposable(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
@@ -679,6 +684,42 @@ static struct cgroup *task_cgroup_from_r
 	return res;
 }
 
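+/*
+ * Walk every cgroup under @mnt in pre-order without recursion: descend
+ * into the first child while one exists, otherwise climb until an
+ * unvisited sibling is found. @cb runs for each cgroup, the top one
+ * included, under cgroup_mutex.
+ */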
+int cpt_collect_cgroups(struct vfsmount *mnt,
+			int (*cb)(struct cgroup *cgrp, void *arg), void *arg)
+{
+	struct cgroup *top, *cgrp;
+	int ret = 0;
+
+	top = mnt->mnt_root->d_fsdata;
+	top = top->top_cgroup;
+	cgrp = top;
+
+	cgroup_lock();
+
+	do {
+		ret = cb(cgrp, arg);
+		if (ret)
+			goto out;
+
+		if (!list_empty(&cgrp->children)) {
+			cgrp = list_first_entry(&cgrp->children,
+					struct cgroup, sibling);
+			continue;
+		}
+		while (cgrp != top) {
+			if (cgrp->sibling.next != &cgrp->parent->children) {
+				cgrp = list_entry(cgrp->sibling.next,
+							struct cgroup, sibling);
+				break;
+			} else
+				cgrp = cgrp->parent;
+		}
+	} while (cgrp != top);
+out:
+	cgroup_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(cpt_collect_cgroups);
 /*
  * There is one global cgroup mutex. We also require taking
  * task_lock() when dereferencing a task's cgroup subsys pointers.
@@ -845,6 +886,7 @@ static void cgroup_diput(struct dentry *
 			ss->destroy(ss, cgrp);
 
 		cgrp->root->number_of_cgroups--;
+		kfree(cgrp->release_agent);
 		mutex_unlock(&cgroup_mutex);
 
 		/*
@@ -1016,6 +1058,7 @@ static int rebind_subsystems(struct cgro
 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
+	struct cgroup *top_cgrp;
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_root_mutex);
@@ -1023,26 +1066,20 @@ static int cgroup_show_options(struct se
 		seq_printf(seq, ",%s", ss->name);
 	if (test_bit(ROOT_NOPREFIX, &root->flags))
 		seq_puts(seq, ",noprefix");
-	if (strlen(root->release_agent_path))
-		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+	/* A file from a cgroup directory may be bind-mounted */
+	if (S_ISDIR(vfs->mnt_root->d_inode->i_mode)) {
+		top_cgrp = vfs->mnt_root->d_fsdata;
+		top_cgrp = top_cgrp->top_cgroup;
+		if (top_cgrp->release_agent)
+			seq_printf(seq, ",release_agent=%s",
+					top_cgrp->release_agent);
+	}
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
 	mutex_unlock(&cgroup_root_mutex);
 	return 0;
 }
 
-struct cgroup_sb_opts {
-	unsigned long subsys_bits;
-	unsigned long flags;
-	char *release_agent;
-	char *name;
-	/* User explicitly requested empty subsystem */
-	bool none;
-
-	struct cgroupfs_root *new_root;
-
-};
-
 /* Convert a hierarchy specifier into a bitmask of subsystems and
  * flags. */
 static int parse_cgroupfs_options(char *data,
@@ -1144,6 +1181,15 @@ static int parse_cgroupfs_options(char *
 	if (!opts->subsys_bits && !opts->name)
 		return -EINVAL;
 
+	if (!ve_is_super(get_exec_env())) {
+		/* Forbid all subsystems inside container */
+		if (opts->subsys_bits)
+			return -ENOENT;
+		/* Allow only one hierarchy: "systemd" */
+		if (strcmp(opts->name, "systemd"))
+			return -EPERM;
+	}
+
 	return 0;
 }
 
@@ -1154,6 +1200,9 @@ static int cgroup_remount(struct super_b
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_sb_opts opts;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	lock_kernel();
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
@@ -1183,8 +1232,11 @@ static int cgroup_remount(struct super_b
 	/* (re)populate subsystem files */
 	cgroup_populate_dir(cgrp);
 
-	if (opts.release_agent)
-		strcpy(root->release_agent_path, opts.release_agent);
+	if (opts.release_agent) {
+		kfree(cgrp->release_agent);
+		cgrp->release_agent = opts.release_agent;
+		opts.release_agent = NULL;
+	}
  out_unlock:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1289,8 +1341,6 @@ static struct cgroupfs_root *cgroup_root
 
 	root->subsys_bits = opts->subsys_bits;
 	root->flags = opts->flags;
-	if (opts->release_agent)
-		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
 		strcpy(root->name, opts->name);
 	return root;
@@ -1305,6 +1355,7 @@ static void cgroup_drop_root(struct cgro
 	spin_lock(&hierarchy_id_lock);
 	ida_remove(&hierarchy_ida, root->hierarchy_id);
 	spin_unlock(&hierarchy_id_lock);
+	kfree(root->top_cgroup.release_agent);
 	kfree(root);
 }
 
@@ -1353,6 +1404,7 @@ static int cgroup_get_rootdir(struct sup
 		return -ENOMEM;
 	}
 	sb->s_root = dentry;
+	ub_dcache_set_owner(dentry, get_ub0());
 	return 0;
 }
 
@@ -1368,7 +1420,14 @@ static int cgroup_get_sb(struct file_sys
 	struct inode *inode;
 
 	/* First find the desired set of subsystems */
-	ret = parse_cgroupfs_options(data, &opts);
+	if (!(flags & MS_KERNMOUNT))
+		ret = parse_cgroupfs_options(data, &opts);
+	else {
+		opts = *(struct cgroup_sb_opts *)data;
+		opts.name = kstrdup(opts.name, GFP_KERNEL);
+		opts.release_agent = kstrdup(opts.release_agent, GFP_KERNEL);
+	}
+
 	if (ret)
 		goto out_err;
 
@@ -1464,6 +1523,14 @@ static int cgroup_get_sb(struct file_sys
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
 
+#ifdef CONFIG_VE
+		if (root->subsys_bits)
+#endif
+		{
+			root_cgrp->release_agent = opts.release_agent;
+			opts.release_agent = NULL;
+		}
+
 		cred = override_creds(&init_cred);
 		cgroup_populate_dir(root_cgrp);
 		revert_creds(cred);
@@ -1478,7 +1545,53 @@ static int cgroup_get_sb(struct file_sys
 		cgroup_drop_root(opts.new_root);
 	}
 
-	simple_set_mnt(mnt, sb);
+#ifdef CONFIG_VE
+	if (!root->subsys_bits) {
+		struct cgroup *top_cgrp;
+		char name[16];
+
+		/*
+		 * Construct namespace for hierarchies without subsystems
+		 */
+		snprintf(name, sizeof name, "%d", get_exec_env()->veid);
+		top_cgrp = cgroup_kernel_open(&root->top_cgroup, 0, name);
+		ret = PTR_ERR(top_cgrp);
+		if (IS_ERR(top_cgrp))
+			goto drop_new_super;
+
+		if (top_cgrp == NULL) {
+			top_cgrp = cgroup_kernel_open(&root->top_cgroup, CGRP_CREAT, name);
+			ret = PTR_ERR(top_cgrp);
+			if (IS_ERR(top_cgrp))
+				goto drop_new_super;
+
+			top_cgrp->khelper_wq = get_exec_env()->khelper_wq;
+
+			/*
+			 * Register independent release agent for this fake top cgroup
+			 */
+			mutex_lock(&top_cgrp->dentry->d_inode->i_mutex);
+			mutex_lock(&cgroup_mutex);
+			mutex_lock(&cgroup_root_mutex);
+			top_cgrp->release_agent = opts.release_agent;
+			opts.release_agent = NULL;
+			top_cgrp->top_cgroup = top_cgrp;
+			cgroup_populate_dir(top_cgrp);
+			mutex_unlock(&cgroup_root_mutex);
+			mutex_unlock(&cgroup_mutex);
+			mutex_unlock(&top_cgrp->dentry->d_inode->i_mutex);
+		}
+
+		/*
+		 * Mount this as a bind mount of the first-level fake top cgroup
+		 */
+		mnt->mnt_sb = sb;
+		mnt->mnt_root = dget(top_cgrp->dentry);
+		cgroup_kernel_close(top_cgrp);
+		ub_dcache_set_owner(mnt->mnt_root, get_exec_ub());
+	} else
+#endif /* CONFIG_VE */
+		simple_set_mnt(mnt, sb);
 	kfree(opts.release_agent);
 	kfree(opts.name);
 	return 0;
@@ -1543,11 +1656,13 @@ static void cgroup_kill_sb(struct super_
 	cgroup_drop_root(root);
 }
 
-static struct file_system_type cgroup_fs_type = {
+struct file_system_type cgroup_fs_type = {
 	.name = "cgroup",
 	.get_sb = cgroup_get_sb,
 	.kill_sb = cgroup_kill_sb,
+	.fs_flags = FS_VIRTUALIZED,
 };
+EXPORT_SYMBOL(cgroup_fs_type);
 
 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 {
@@ -1574,7 +1689,7 @@ int cgroup_path(const struct cgroup *cgr
 	char *start;
 	struct dentry *dentry = rcu_dereference(cgrp->dentry);
 
-	if (!dentry || cgrp == dummytop) {
+	if (!dentry || cgrp == dummytop || cgrp == cgrp->top_cgroup) {
 		/*
 		 * Inactive subsystems have no dentry for their root
 		 * cgroup
@@ -1594,6 +1709,9 @@ int cgroup_path(const struct cgroup *cgr
 		cgrp = cgrp->parent;
 		if (!cgrp)
 			break;
+		/* hide fake top-cgroup in path */
+		if (cgrp == cgrp->top_cgroup)
+			cgrp = &cgrp->root->top_cgroup;
 		dentry = rcu_dereference(cgrp->dentry);
 		if (!cgrp->parent)
 			continue;
@@ -1605,6 +1723,12 @@ int cgroup_path(const struct cgroup *cgr
 	return 0;
 }
 
+struct task_and_cgroup {
+	struct task_struct	*task;
+	struct cgroup		*cgrp;
+	struct css_set		*cg;
+};
+
 /*
  * cgroup_task_migrate - move a task from one cgroup to another.
  *
@@ -1612,44 +1736,14 @@ int cgroup_path(const struct cgroup *cgr
  * will already exist. If not set, this function might sleep, and can fail with
  * -ENOMEM. Otherwise, it can only fail with -ESRCH.
  */
-static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
-			       struct task_struct *tsk, bool guarantee)
+static int cgroup_task_migrate(struct cgroup *oldcgrp,
+			       struct task_struct *tsk, struct css_set *newcg)
 {
 	struct css_set *oldcg;
-	struct css_set *newcg;
 
-	/*
-	 * get old css_set. we need to take task_lock and refcount it, because
-	 * an exiting task can change its css_set to init_css_set and drop its
-	 * old one without taking cgroup_mutex.
-	 */
 	task_lock(tsk);
 	oldcg = tsk->cgroups;
-	get_css_set(oldcg);
-	task_unlock(tsk);
-
-	/* locate or allocate a new css_set for this task. */
-	if (guarantee) {
-		/* we know the css_set we want already exists. */
-		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-		read_lock(&css_set_lock);
-		newcg = find_existing_css_set(oldcg, cgrp, template);
-		BUG_ON(!newcg);
-		get_css_set(newcg);
-		read_unlock(&css_set_lock);
-	} else {
-		might_sleep();
-		/* find_css_set will give us newcg already referenced. */
-		newcg = find_css_set(oldcg, cgrp);
-		if (!newcg) {
-			put_css_set(oldcg);
-			return -ENOMEM;
-		}
-	}
-	put_css_set(oldcg);
-
 	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
-	task_lock(tsk);
 	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		put_css_set(newcg);
@@ -1688,6 +1782,7 @@ int cgroup_attach_task(struct cgroup *cg
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
 	struct cgroupfs_root *root = cgrp->root;
+	struct css_set *newcg, *oldcg;
 
 	/* Nothing to do if the task is already in that cgroup */
 	oldcgrp = task_cgroup_from_root(tsk, root);
@@ -1717,7 +1812,24 @@ int cgroup_attach_task(struct cgroup *cg
 		}
 	}
 
-	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+	/*
+	 * get old css_set. we need to take task_lock and refcount it, because
+	 * an exiting task can change its css_set to init_css_set and drop its
+	 * old one without taking cgroup_mutex.
+	 */
+	task_lock(tsk);
+	oldcg = tsk->cgroups;
+	get_css_set(oldcg);
+	task_unlock(tsk);
+
+	newcg = find_css_set(oldcg, cgrp);
+	put_css_set(oldcg);
+	if (!newcg) {
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	retval = cgroup_task_migrate(oldcgrp, tsk, newcg);
 	if (retval)
 		goto out;
 
@@ -1730,8 +1842,6 @@ int cgroup_attach_task(struct cgroup *cg
 			ss->attach(ss, cgrp, oldcgrp, tsk);
 	}
 
-	synchronize_rcu();
-
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
 	 * is no longer empty.
@@ -1779,72 +1889,6 @@ int cgroup_attach_task_all(struct task_s
 }
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
-/*
- * cgroup_attach_proc works in two stages, the first of which prefetches all
- * new css_sets needed (to make sure we have enough memory before committing
- * to the move) and stores them in a list of entries of the following type.
- * TODO: possible optimization: use css_set->rcu_head for chaining instead
- */
-struct cg_list_entry {
-	struct css_set *cg;
-	struct list_head links;
-};
-
-static bool css_set_check_fetched(struct cgroup *cgrp,
-				  struct task_struct *tsk, struct css_set *cg,
-				  struct list_head *newcg_list)
-{
-	struct css_set *newcg;
-	struct cg_list_entry *cg_entry;
-	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-	read_lock(&css_set_lock);
-	newcg = find_existing_css_set(cg, cgrp, template);
-	if (newcg)
-		get_css_set(newcg);
-	read_unlock(&css_set_lock);
-
-	/* doesn't exist at all? */
-	if (!newcg)
-		return false;
-	/* see if it's already in the list */
-	list_for_each_entry(cg_entry, newcg_list, links) {
-		if (cg_entry->cg == newcg) {
-			put_css_set(newcg);
-			return true;
-		}
-	}
-
-	/* not found */
-	put_css_set(newcg);
-	return false;
-}
-
-/*
- * Find the new css_set and store it in the list in preparation for moving the
- * given task to the given cgroup. Returns 0 or -ENOMEM.
- */
-static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
-			    struct list_head *newcg_list)
-{
-	struct css_set *newcg;
-	struct cg_list_entry *cg_entry;
-
-	/* ensure a new css_set will exist for this thread */
-	newcg = find_css_set(cg, cgrp);
-	if (!newcg)
-		return -ENOMEM;
-	/* add it to the list */
-	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
-	if (!cg_entry) {
-		put_css_set(newcg);
-		return -ENOMEM;
-	}
-	cg_entry->cg = newcg;
-	list_add(&cg_entry->links, newcg_list);
-	return 0;
-}
-
 /**
  * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
  * @cgrp: the cgroup to attach to
@@ -1855,23 +1899,16 @@ static int css_set_prefetch(struct cgrou
  */
 int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 {
-	int retval, i, group_size;
+	int retval, i, group_size, nr_migrating_tasks;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	bool cancel_failed_ss = false;
 	/* guaranteed to be initialized later, but the compiler needs this */
-	struct cgroup *oldcgrp = NULL;
 	struct css_set *oldcg;
 	struct cgroupfs_root *root = cgrp->root;
 	/* threadgroup list cursor and array */
 	struct task_struct *tsk;
+	struct task_and_cgroup *tc;
 	struct flex_array *group;
-	/*
-	 * we need to make sure we have css_sets for all the tasks we're
-	 * going to move -before- we actually start moving them, so that in
-	 * case we get an ENOMEM we can bail out before making any changes.
-	 */
-	struct list_head newcg_list;
-	struct cg_list_entry *cg_entry, *temp_nobe;
 
 	/*
 	 * step 0: in order to do expensive, possibly blocking operations for
@@ -1882,8 +1919,7 @@ int cgroup_attach_proc(struct cgroup *cg
 	 */
 	group_size = get_nr_threads(leader);
 	/* flex_array supports very large thread-groups better than kmalloc. */
-	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
-				 GFP_KERNEL);
+	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
 	if (!group)
 		return -ENOMEM;
 	/* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -1907,8 +1943,10 @@ int cgroup_attach_proc(struct cgroup *cg
 	}
 	/* take a reference on each task in the group to go in the array. */
 	tsk = leader;
-	i = 0;
+	i = nr_migrating_tasks = 0;
 	do {
+		struct task_and_cgroup ent;
+
 		/* as per above, nr_threads may decrease, but not increase. */
 		BUG_ON(i >= group_size);
 		get_task_struct(tsk);
@@ -1916,14 +1954,24 @@ int cgroup_attach_proc(struct cgroup *cg
 		 * saying GFP_ATOMIC has no effect here because we did prealloc
 		 * earlier, but it's good form to communicate our expectations.
 		 */
-		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+		ent.task = tsk;
+		ent.cgrp = task_cgroup_from_root(tsk, root);
+		ent.cg = NULL;
+		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
-	} while_each_thread(leader, tsk);
+		if (ent.cgrp != cgrp)
+			nr_migrating_tasks++;
+	} while_each_thread_ve(leader, tsk);
 	/* remember the number of threads in the array for later. */
 	group_size = i;
 	read_unlock(&tasklist_lock);
 
+	/* methods shouldn't be called if no task is actually migrating */
+	retval = 0;
+	if (!nr_migrating_tasks)
+		goto out_put_tasks;
+
 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
 	 */
@@ -1939,8 +1987,10 @@ int cgroup_attach_proc(struct cgroup *cg
 		if (ss->can_attach_task) {
 			/* run on each task in the threadgroup. */
 			for (i = 0; i < group_size; i++) {
-				tsk = flex_array_get_ptr(group, i);
-				retval = ss->can_attach_task(cgrp, tsk);
+				tc = flex_array_get(group, i);
+				if (tc->cgrp == cgrp)
+					continue;
+				retval = ss->can_attach_task(cgrp, tc->task);
 				if (retval) {
 					failed_ss = ss;
 					cancel_failed_ss = true;
@@ -1954,28 +2004,21 @@ int cgroup_attach_proc(struct cgroup *cg
 	 * step 2: make sure css_sets exist for all threads to be migrated.
 	 * we use find_css_set, which allocates a new one if necessary.
 	 */
-	INIT_LIST_HEAD(&newcg_list);
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
+		tc = flex_array_get(group, i);
 		/* nothing to do if this task is already in the cgroup */
-		oldcgrp = task_cgroup_from_root(tsk, root);
-		if (cgrp == oldcgrp)
+		if (tc->cgrp == cgrp)
 			continue;
 		/* get old css_set pointer */
-		task_lock(tsk);
-		oldcg = tsk->cgroups;
+		task_lock(tc->task);
+		oldcg = tc->task->cgroups;
 		get_css_set(oldcg);
-		task_unlock(tsk);
-		/* see if the new one for us is already in the list? */
-		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
-			/* was already there, nothing to do. */
-			put_css_set(oldcg);
-		} else {
-			/* we don't already have it. get new one. */
-			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
-			put_css_set(oldcg);
-			if (retval)
-				goto out_list_teardown;
+		task_unlock(tc->task);
+		tc->cg = find_css_set(tc->task->cgroups, cgrp);
+		put_css_set(oldcg);
+		if (!tc->cg) {
+			retval = -ENOMEM;
+			goto out_put_css_set_refs;
 		}
 	}
 
@@ -1990,18 +2033,19 @@ int cgroup_attach_proc(struct cgroup *cg
 			ss->pre_attach(cgrp);
 	}
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
+		tc = flex_array_get(group, i);
 		/* leave current thread as it is if it's already there */
-		oldcgrp = task_cgroup_from_root(tsk, root);
-		if (cgrp == oldcgrp)
+		if (tc->cgrp == cgrp)
 			continue;
 		/* if the thread is PF_EXITING, it can just get skipped. */
-		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+		if (!tc->cg)
+			continue;
+		retval = cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
 		if (retval == 0) {
 			/* attach each task to each subsystem */
 			for_each_subsys(root, ss) {
 				if (ss->attach_task)
-					ss->attach_task(cgrp, tsk);
+					ss->attach_task(cgrp, tc->task);
 			}
 		} else {
 			BUG_ON(retval != -ESRCH);
@@ -2015,25 +2059,26 @@ int cgroup_attach_proc(struct cgroup *cg
 	 * being moved, this call will need to be reworked to communicate that.
 	 */
 	for_each_subsys(root, ss) {
-		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, leader);
+		if (ss->attach) {
+			tc = flex_array_get(group, 0);
+			ss->attach(ss, cgrp, tc->cgrp, tc->task);
+		}
 	}
 
 	/*
 	 * step 5: success! and cleanup
 	 */
-	synchronize_rcu();
 	cgroup_wakeup_rmdir_waiter(cgrp);
 	retval = 0;
-out_list_teardown:
-	/* clean up the list of prefetched css_sets. */
-	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
-		list_del(&cg_entry->links);
-		put_css_set(cg_entry->cg);
-		kfree(cg_entry);
+out_put_css_set_refs:
+	if (retval) {
+		for (i = 0; i < group_size; i++) {
+			tc = flex_array_get(group, i);
+			if (tc->cg)
+				put_css_set(tc->cg);
+		}
 	}
 out_cancel_attach:
-	/* same deal as in cgroup_attach_task */
 	if (retval) {
 		for_each_subsys(root, ss) {
 			if (ss == failed_ss) {
@@ -2045,10 +2090,11 @@ out_cancel_attach:
 				ss->cancel_attach(ss, cgrp, leader);
 		}
 	}
+out_put_tasks:
 	/* clean up the array of referenced threads in the group. */
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
-		put_task_struct(tsk);
+		tc = flex_array_get(group, i);
+		put_task_struct(tc->task);
 	}
 out_free_group_list:
 	flex_array_free(group);
@@ -2165,11 +2211,19 @@ bool cgroup_lock_live_group(struct cgrou
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-	if (!cgroup_lock_live_group(cgrp))
+	char *release_agent;
+
+	release_agent = kstrdup(buffer, GFP_KERNEL);
+	if (!release_agent)
+		return -ENOMEM;
+
+	if (!cgroup_lock_live_group(cgrp)) {
+		kfree(release_agent);
 		return -ENODEV;
+	}
 	mutex_lock(&cgroup_root_mutex);
-	strcpy(cgrp->root->release_agent_path, buffer);
+	kfree(cgrp->release_agent);
+	cgrp->release_agent = release_agent;
 	mutex_unlock(&cgroup_root_mutex);
 	cgroup_unlock();
 	return 0;
@@ -2180,7 +2234,8 @@ static int cgroup_release_agent_show(str
 {
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	seq_puts(seq, cgrp->root->release_agent_path);
+	if (cgrp->release_agent)
+		seq_puts(seq, cgrp->release_agent);
 	seq_putc(seq, '\n');
 	cgroup_unlock();
 	return 0;
@@ -2633,7 +2688,7 @@ static void cgroup_enable_task_cg_lists(
 	struct task_struct *p, *g;
 	write_lock(&css_set_lock);
 	use_task_css_set_links = 1;
-	do_each_thread(g, p) {
+	do_each_thread_all(g, p) {
 		task_lock(p);
 		/*
 		 * We should check if the process is exiting, otherwise
@@ -2643,7 +2698,7 @@ static void cgroup_enable_task_cg_lists(
 		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
 			list_add(&p->cg_list, &p->cgroups->tasks);
 		task_unlock(p);
-	} while_each_thread(g, p);
+	} while_each_thread_all(g, p);
 	write_unlock(&css_set_lock);
 }
 
@@ -2661,6 +2716,7 @@ void cgroup_iter_start(struct cgroup *cg
 	it->cg_link = &cgrp->css_sets;
 	cgroup_advance_iter(cgrp, it);
 }
+EXPORT_SYMBOL(cgroup_iter_start);
 
 struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 					struct cgroup_iter *it)
@@ -2685,11 +2741,13 @@ struct task_struct *cgroup_iter_next(str
 	}
 	return res;
 }
+EXPORT_SYMBOL(cgroup_iter_next);
 
 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
 {
 	read_unlock(&css_set_lock);
 }
+EXPORT_SYMBOL(cgroup_iter_end);
 
 static inline int started_after_time(struct task_struct *t1,
 				     struct timespec *time,
@@ -2776,6 +2834,7 @@ int cgroup_scan_tasks(struct cgroup_scan
 			return retval;
 	}
 
+	retval = 0;
  again:
 	/*
 	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
@@ -2833,8 +2892,8 @@ int cgroup_scan_tasks(struct cgroup_scan
 				latest_time = q->start_time;
 				latest_task = q;
 			}
-			/* Process the task per the caller's callback */
-			scan->process_task(q, scan);
+			if (!retval)
+				retval = scan->process_task(q, scan);
 			put_task_struct(q);
 		}
 		/*
@@ -2848,8 +2907,9 @@ int cgroup_scan_tasks(struct cgroup_scan
 	}
 	if (heap == &tmp_heap)
 		heap_free(&tmp_heap);
-	return 0;
+	return retval;
 }
+EXPORT_SYMBOL_GPL(cgroup_scan_tasks);
 
 /*
  * Stuff for reading the 'tasks'/'procs' files.
@@ -3454,6 +3514,23 @@ fail:
 	return ret;
 }
 
+static u64 cgroup_read_self_destruction(struct cgroup *cgrp,
+		struct cftype *cft)
+{
+	return test_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
+}
+
+static int cgroup_write_self_destruction(struct cgroup *cgrp,
+		struct cftype *cft, u64 val)
+{
+	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
+	if (val)
+		set_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
+	else
+		clear_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
+	return 0;
+}
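+
+/*
+ * Illustrative shell usage (the mount point is an assumption; the file
+ * name comes from the cftype below):
+ *
+ *	echo 1 > /cgroup/foo/self_destruction
+ *
+ * makes "foo" rmdir itself once it becomes empty instead of invoking
+ * the release agent (see cgroup_release_agent() below).
+ */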
+
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -3484,6 +3561,11 @@ static struct cftype files[] = {
 		.write_string = cgroup_write_event_control,
 		.mode = S_IWUGO,
 	},
+	{
+		.name = "self_destruction",
+		.read_u64 = cgroup_read_self_destruction,
+		.write_u64 = cgroup_write_self_destruction,
+	},
 };
 
 static struct cftype cft_release_agent = {
@@ -3753,7 +3835,12 @@ static int cgroup_clear_css_refs(struct 
 	return !failed;
 }
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern int mem_cgroup_force_empty_hack(struct cgroup *);
+#else
+#define mem_cgroup_force_empty_hack(p)  0
+#endif
+
 
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
@@ -3965,7 +4052,7 @@ int __init cgroup_init(void)
 	if (err < 0)
 		goto out;
 
-	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+	proc_create("cgroups", 0, &glob_proc_root, &proc_cgroupstats_operations);
 
 out:
 	if (err)
@@ -4077,6 +4164,8 @@ static int proc_cgroupstats_show(struct 
 	int i;
 
 	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+	if (!ve_is_super(get_exec_env()))
+		return 0;
 	mutex_lock(&cgroup_mutex);
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
@@ -4418,7 +4507,7 @@ void __css_put(struct cgroup_subsys_stat
 	rcu_read_lock();
 	val = atomic_sub_return(count, &css->refcnt);
 	if (val == 1) {
-		if (notify_on_release(cgrp)) {
+		if (cgroup_is_disposable(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
@@ -4458,19 +4547,39 @@ static void cgroup_release_agent(struct 
 	spin_lock(&release_list_lock);
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
-		int i;
+		int i, err;
 		char *pathbuf = NULL, *agentbuf = NULL;
 		struct cgroup *cgrp = list_entry(release_list.next,
 						    struct cgroup,
 						    release_list);
 		list_del_init(&cgrp->release_list);
 		spin_unlock(&release_list_lock);
+
+		if (test_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags)) {
+			struct inode *parent = cgrp->dentry->d_parent->d_inode;
+			struct super_block *sb = cgrp->dentry->d_sb;
+
+			atomic_inc(&sb->s_active);
+			dget(cgrp->dentry);
+			mutex_unlock(&cgroup_mutex);
+			mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT);
+			vfs_rmdir(parent, cgrp->dentry);
+			mutex_unlock(&parent->i_mutex);
+			dput(cgrp->dentry);
+			deactivate_super(sb);
+			mutex_lock(&cgroup_mutex);
+			goto continue_free;
+		}
+
+		if (!cgrp->top_cgroup->khelper_wq)
+			goto continue_free;
+
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
 		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
 			goto continue_free;
-		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+		agentbuf = kstrdup(cgrp->top_cgroup->release_agent, GFP_KERNEL);
 		if (!agentbuf)
 			goto continue_free;
 
@@ -4489,7 +4598,12 @@ static void cgroup_release_agent(struct 
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
-		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		err = call_usermodehelper_wq(argv[0], argv, envp, UMH_WAIT_EXEC,
+						cgrp->top_cgroup->khelper_wq);
+		if (err < 0)
+			pr_warn_ratelimited("cgroup release_agent "
+					    "%s %s failed: %d\n",
+					    agentbuf, pathbuf, err);
 		mutex_lock(&cgroup_mutex);
  continue_free:
 		kfree(pathbuf);
@@ -4926,3 +5040,217 @@ struct cgroup_subsys debug_subsys = {
 	.subsys_id = debug_subsys_id,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
+
+struct vfsmount *cgroup_kernel_mount(struct cgroup_sb_opts *opts)
+{
+	return kern_mount_data(&cgroup_fs_type, opts);
+}
+EXPORT_SYMBOL(cgroup_kernel_mount);
+
+struct cgroup *cgroup_get_root(struct vfsmount *mnt)
+{
+	return mnt->mnt_root->d_fsdata;
+}
+EXPORT_SYMBOL(cgroup_get_root);
+
+struct cgroup *cgroup_kernel_open(struct cgroup *parent,
+		enum cgroup_open_flags flags, char *name)
+{
+	struct dentry *dentry;
+	struct cgroup *cgrp;
+	int ret = 0;
+
+	mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, parent->dentry, strlen(name));
+	cgrp = ERR_CAST(dentry);
+	if (IS_ERR(dentry))
+		goto out;
+
+	if (flags & CGRP_CREAT) {
+		if ((flags & CGRP_EXCL) && dentry->d_inode)
+			ret = -EEXIST;
+		else if (!dentry->d_inode)
+			ret = vfs_mkdir(parent->dentry->d_inode, dentry, 0755);
+		else
+			flags &= ~CGRP_WEAK;
+	}
+	if (!ret && dentry->d_inode) {
+		cgrp = __d_cgrp(dentry);
+		__cgroup_kernel_open(cgrp);
+		if (flags & CGRP_WEAK)
+			set_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
+	} else
+		cgrp = ret ? ERR_PTR(ret) : NULL;
+	dput(dentry);
+out:
+	mutex_unlock(&parent->dentry->d_inode->i_mutex);
+	return cgrp;
+}
+EXPORT_SYMBOL(cgroup_kernel_open);
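+
+/*
+ * Usage sketch (hypothetical caller): look up or create a child cgroup
+ * and mark a newly created one for self-destruction:
+ *
+ *	cgrp = cgroup_kernel_open(parent, CGRP_CREAT | CGRP_WEAK, "100");
+ *	if (IS_ERR(cgrp))
+ *		return PTR_ERR(cgrp);
+ *	...
+ *	cgroup_kernel_close(cgrp);
+ *
+ * Without CGRP_CREAT a missing name yields NULL rather than an error.
+ */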
+
+/* FIXME remove sub-cgroups too */
+int cgroup_kernel_remove(struct cgroup *parent, char *name)
+{
+	struct dentry *dentry;
+	int ret;
+
+	mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, parent->dentry, strlen(name));
+	ret = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto out;
+	ret = -ENOENT;
+	if (dentry->d_inode)
+		ret = vfs_rmdir(parent->dentry->d_inode, dentry);
+	dput(dentry);
+out:
+	mutex_unlock(&parent->dentry->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(cgroup_kernel_remove);
+
+int cgroup_kernel_attach(struct cgroup *cgrp, struct task_struct *tsk)
+{
+	int ret;
+
+	cgroup_lock();
+	ret = cgroup_attach_task(cgrp, tsk);
+	cgroup_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(cgroup_kernel_attach);
+
+void cgroup_kernel_close(struct cgroup *cgrp)
+{
+	if (!cgroup_is_disposable(cgrp)) {
+		atomic_dec(&cgrp->count);
+	} else if (atomic_dec_and_test(&cgrp->count)) {
+		set_bit(CGRP_RELEASABLE, &cgrp->flags);
+		check_for_release(cgrp);
+	}
+}
+EXPORT_SYMBOL(cgroup_kernel_close);
+
+#ifdef CONFIG_VE
+
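+/*
+ * Remove @cgrp and its whole subtree bottom-up: dive to a leaf, rmdir
+ * it, then continue from its parent until the original parent is
+ * reached. cgroup_mutex is dropped around each vfs_rmdir() because
+ * cgroup_rmdir() takes it by itself.
+ */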
+static int cgroup_genocide(struct cgroup *cgrp)
+{
+	struct cgroup *orig_parent = cgrp->parent, *parent;
+	struct inode *inode;
+	int ret = 0;
+
+	do {
+		if (!list_empty(&cgrp->children)) {
+			cgrp = list_first_entry(&cgrp->children,
+					struct cgroup, sibling);
+			continue;
+		}
+		parent = cgrp->parent;
+		if (!parent)
+			break;
+		atomic_inc(&parent->count);
+		dget(cgrp->dentry);
+		cgroup_unlock();
+		inode = parent->dentry->d_inode;
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+		ret = vfs_rmdir(inode, cgrp->dentry);
+		mutex_unlock(&inode->i_mutex);
+		dput(cgrp->dentry);
+		cgroup_lock();
+		atomic_dec(&parent->count);
+		cgrp = parent;
+		if (parent == orig_parent)
+			break;
+	} while (!ret);
+
+	return ret;
+}
+
+#include <linux/ve_proto.h>
+
+static int cgroup_ve_init(void *data)
+{
+	return 0;
+}
+
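+/*
+ * On container stop, find this VE's fake top cgroup in every
+ * subsystem-less hierarchy and wipe its subtree. s_active is taken to
+ * pin the superblock across the unlocked window, so the hierarchy
+ * cannot be unmounted under us.
+ */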
+static void cgroup_ve_fini(void *data)
+{
+	struct ve_struct *ve = data;
+	struct cgroupfs_root *root, *prev = NULL;
+	struct cgroup *cgrp;
+	char name[16];
+
+	snprintf(name, sizeof name, "%d", ve->veid);
+
+	cgroup_lock();
+	for_each_active_root(root) {
+		if (root->subsys_bits)
+			continue;
+		atomic_inc(&root->sb->s_active);
+		cgroup_unlock();
+		if (prev)
+			deactivate_super(prev->sb);
+		prev = root;
+		cgrp = cgroup_kernel_open(&root->top_cgroup, 0, name);
+		cgroup_lock();
+		if (!IS_ERR_OR_NULL(cgrp)) {
+			cgroup_kernel_close(cgrp);
+			cgroup_genocide(cgrp);
+		}
+	}
+	cgroup_unlock();
+	if (prev)
+		deactivate_super(prev->sb);
+}
+
+void cgroup_ve_khelper_cleanup(void *data)
+{
+	struct ve_struct *ve = data;
+	struct cgroupfs_root *root, *prev = NULL;
+	struct cgroup *cgrp;
+	char name[16];
+
+	snprintf(name, sizeof name, "%d", ve->veid);
+
+	cgroup_lock();
+	for_each_active_root(root) {
+		if (root->subsys_bits)
+			continue;
+		atomic_inc(&root->sb->s_active);
+		cgroup_unlock();
+		if (prev)
+			deactivate_super(prev->sb);
+		prev = root;
+		cgrp = cgroup_kernel_open(&root->top_cgroup, 0, name);
+		cgroup_lock();
+		if (!IS_ERR_OR_NULL(cgrp)) {
+			cgrp->top_cgroup->khelper_wq = NULL;
+			cgroup_kernel_close(cgrp);
+		}
+	}
+	cgroup_unlock();
+	if (prev)
+		deactivate_super(prev->sb);
+}
+EXPORT_SYMBOL(cgroup_ve_khelper_cleanup);
+
+static struct ve_hook cgroup_ve_hook = {
+	.init		= cgroup_ve_init,
+	.fini		= cgroup_ve_fini,
+	.owner		= THIS_MODULE,
+};
+
+static struct ve_hook cgroup_ve_init_exit_hook = {
+	.fini		= cgroup_ve_khelper_cleanup,
+	.owner		= THIS_MODULE,
+};
+
+static int __init init_ve_cgroup(void)
+{
+	ve_hook_register(VE_SS_CHAIN, &cgroup_ve_hook);
+	ve_hook_register(VE_INIT_EXIT_CHAIN, &cgroup_ve_init_exit_hook);
+	return 0;
+}
+module_init(init_ve_cgroup);
+
+#endif /* CONFIG_VE */
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cgroup_freezer.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cgroup_freezer.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cgroup_freezer.c	2014-12-12 23:29:20.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cgroup_freezer.c	2015-01-21 12:02:45.821157426 +0300
@@ -21,12 +21,6 @@
 #include <linux/freezer.h>
 #include <linux/seq_file.h>
 
-enum freezer_state {
-	CGROUP_THAWED = 0,
-	CGROUP_FREEZING,
-	CGROUP_FROZEN,
-};
-
 struct freezer {
 	struct cgroup_subsys_state css;
 	enum freezer_state state;
@@ -311,7 +305,7 @@ static void unfreeze_cgroup(struct cgrou
 	freezer->state = CGROUP_THAWED;
 }
 
-static int freezer_change_state(struct cgroup *cgroup,
+int freezer_change_state(struct cgroup *cgroup,
 				enum freezer_state goal_state)
 {
 	struct freezer *freezer;
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/compat.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/compat.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/compat.c	2014-12-12 23:29:20.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/compat.c	2015-01-21 12:02:58.029833335 +0300
@@ -159,7 +159,7 @@ int put_compat_timespec(const struct tim
 			__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
 
-static long compat_nanosleep_restart(struct restart_block *restart)
+long compat_nanosleep_restart(struct restart_block *restart)
 {
 	struct compat_timespec __user *rmtp;
 	struct timespec rmt;
@@ -181,6 +181,7 @@ static long compat_nanosleep_restart(str
 
 	return ret;
 }
+EXPORT_SYMBOL(compat_nanosleep_restart);
 
 asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 				     struct compat_timespec __user *rmtp)
@@ -1197,4 +1198,4 @@ void __user *compat_alloc_user_space(uns
 
 	return ptr;
 }
-EXPORT_SYMBOL_GPL(compat_alloc_user_space);
+EXPORT_SYMBOL(compat_alloc_user_space);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/Makefile
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/Makefile	2015-01-21 12:02:48.221093712 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/Makefile	2015-01-21 12:02:49.273065785 +0300
@@ -0,0 +1,48 @@
+#
+#
+#  kernel/cpt/Makefile
+#
+#  Copyright (C) 2000-2005  SWsoft
+#  All rights reserved.
+#
+#  Licensing governed by "linux/COPYING.SWsoft" file.
+
+obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o
+
+vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \
+	cpt_mm.o cpt_files.o cpt_kernel.o \
+	cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \
+	cpt_conntrack.o cpt_epoll.o cpt_cgroup.o
+
+vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \
+	rst_mm.o rst_files.o \
+	rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \
+	rst_conntrack.o rst_epoll.o rst_delayfs.o rst_cgroup.o
+
+ifeq ($(CONFIG_BEANCOUNTERS), y)
+vzcpt-objs += cpt_ubc.o
+vzrst-objs += rst_ubc.o
+endif
+
+ifeq ($(CONFIG_INOTIFY_USER), y)
+vzcpt-objs += cpt_inotify.o
+vzrst-objs += rst_inotify.o
+endif
+
+vzrst-objs += cpt_exports.o
+
+ifeq ($(CONFIG_VZ_CHECKPOINT), m)
+vzrst-objs += cpt_obj.o cpt_kernel.o
+endif
+
+ifeq ($(CONFIG_VZ_CHECKPOINT_ITER), y)
+vzcpt-objs += cpt_iterative.o
+vzrst-objs += rst_iterative.o
+endif
+
+ifeq ($(CONFIG_X86_64), y)
+vzcpt-objs += cpt_x8664.o
+ifeq ($(CONFIG_VZ_CHECKPOINT), m)
+vzrst-objs += cpt_x8664.o
+endif
+endif
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_cgroup.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_cgroup.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_cgroup.c	2015-01-21 12:02:49.273065785 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_cgroup.c	2015-01-21 12:02:51.050018612 +0300
@@ -0,0 +1,187 @@
+#include <linux/cgroup.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/nsproxy.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_files.h"
+#include "cpt_process.h"
+
+static int cgroup_index = 0;
+
+static int cpt_dump_one_cgroup_pid(struct task_struct *task,
+				   struct cgroup_scanner *scan)
+{
+	struct cpt_context *ctx = scan->data;
+	u32 pid;
+
+	pid = cpt_task_pid_nr(task, PIDTYPE_PID);
+	if (pid)
+		ctx->write(&pid, sizeof(pid), ctx);
+	else
+		eprintk_ctx("Can't find pid for task '%s'\n", task->comm);
+	return pid ? 0 : -ENOENT;
+}
+
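+/*
+ * Per-cgroup image layout written below: a cpt_cgroup_image header,
+ * then, for everything except the fake top cgroup, the directory name
+ * as a string and the pids of member tasks, and finally a zero pid as
+ * terminator.
+ */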
+static int cpt_dump_one_cgroup(struct cgroup *cgrp, void *args)
+{
+	int ret = 0;
+	cpt_object_t *obj;
+	struct cpt_context *ctx = (struct cpt_context *) args;
+	struct cpt_cgroup_image *v;
+	loff_t saved_obj;
+	u32 pid;
+
+	const char *name = cgrp->dentry->d_name.name;
+
+	obj = cpt_object_add(CPT_OBJ_CGROUP, cgrp, ctx);
+	if (obj == NULL)
+		return -ENOMEM;
+
+	if (obj->o_index == CPT_NOINDEX)
+		cpt_obj_setindex(obj, cgroup_index++, ctx);
+	if (cgroup_index == INT_MAX)
+		return -ENOMEM;
+
+	cpt_open_object(obj, ctx);
+
+	v = cpt_get_buf(ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_CGROUP;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+	v->cpt_index = obj->o_index;
+	v->cpt_flags = 0;	/* cpt_get_buf() may hand back stale data */
+
+	if (test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags))
+		v->cpt_flags |= CPT_CGRP_NOTIFY_ON_RELEASE;
+	if (test_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags))
+		v->cpt_flags |= CPT_CGRP_SELF_DESTRUCTION;
+
+	if (cgrp == cgrp->top_cgroup) {
+		v->cpt_parent = -1;
+	} else {
+		obj = lookup_cpt_object(CPT_OBJ_CGROUP, cgrp->parent, ctx);
+		v->cpt_parent = obj->o_index;
+	}
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	if (v->cpt_parent != -1) {
+		struct cgroup_scanner scan = {
+			.cg = cgrp,
+			.process_task = cpt_dump_one_cgroup_pid,
+			.data = ctx,
+		};
+
+		cpt_dump_string(name, ctx);
+
+		ret = cgroup_scan_tasks(&scan);
+	}
+	cpt_pop_object(&saved_obj, ctx);
+
+	pid = 0;
+	ctx->write(&pid, sizeof(pid), ctx);
+
+	cpt_close_object(ctx);
+
+	return ret;
+}
+
+static int cpt_dump_cgroup_options(struct vfsmount *mnt, struct cpt_context *ctx)
+{
+	struct seq_file sf;
+
+	sf.buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!sf.buf)
+		return -ENOMEM;
+	sf.count = 0;
+	sf.size = PAGE_SIZE;
+
+	seq_printf(&sf, "none");
+	mnt->mnt_sb->s_op->show_options(&sf, mnt);
+
+	cpt_dump_string(sf.buf, ctx);
+
+	free_page((unsigned long) sf.buf);
+
+	return 0;
+}
+
+static int cpt_dump_one_cgroup_mnt(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct cpt_object_hdr *v;
+	loff_t saved_obj;
+	int err;
+
+	cpt_open_object(NULL, ctx);
+	v = cpt_get_buf(ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_CGROUPS;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	err = cpt_dump_cgroup_options(obj->o_parent, ctx);
+	if (err)
+		return err;
+
+	err = cpt_collect_cgroups(obj->o_parent, cpt_dump_one_cgroup, ctx);
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_close_object(ctx);
+
+	return err;
+}
+
+int cpt_dump_cgroups(struct cpt_context *ctx)
+{
+	cpt_object_t *obj, *cgrp_obj;
+	int err;
+	struct vfsmount *mnt;
+
+	for_each_object(cgrp_obj, CPT_OBJ_CGROUP) {
+		mnt = cgrp_obj->o_parent;
+
+		obj = cpt_object_add(CPT_OBJ_CGROUPS, mnt->mnt_sb, ctx);
+		if (obj == NULL)
+			return -ENOMEM;
+
+		obj->o_parent = mnt;
+	}
+
+	cpt_open_section(ctx, CPT_SECT_CGROUPS);
+
+	for_each_object(obj, CPT_OBJ_CGROUPS) {
+		err = cpt_dump_one_cgroup_mnt(obj, ctx);
+		if (err)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+
+	return 0;
+}
+
+int cpt_add_cgroup(struct vfsmount *mnt, struct cpt_context *ctx)
+{
+	struct cgroup *cgrp = mnt->mnt_root->d_fsdata;
+	cpt_object_t *obj;
+
+	obj = cpt_object_add(CPT_OBJ_CGROUP, cgrp, ctx);
+	if (obj == NULL)
+		return CPT_NOINDEX;
+
+	if (obj->o_index == CPT_NOINDEX) {
+		cpt_obj_setindex(obj, cgroup_index++, ctx);
+		obj->o_parent = mnt;
+	}
+
+	return obj->o_index;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_conntrack.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_conntrack.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_conntrack.c	2015-01-21 12:02:48.221093712 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_conntrack.c	2015-01-21 12:02:50.952021213 +0300
@@ -0,0 +1,400 @@
+/*
+ *
+ *  kernel/cpt/cpt_conntrack.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/unistd.h>
+#include <linux/ve.h>
+#include <linux/vzcalluser.h>
+#include <linux/cpt_image.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/rculist_nulls.h>
+
+#if defined(CONFIG_VE_IPTABLES) && \
+    (defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE))
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+
+
+/* How does it work?
+ *
+ * Network is disabled, so new conntrack entries will not appear.
+ * However, some of them can disappear because of timeouts.
+ *
+ * So, we take the read lock and collect all required information
+ * atomically, essentially creating parallel "refcount" structures holding
+ * pointers. We delete the conntrack timers as well, so the structures
+ * cannot disappear once the lock is released. After that we can dump
+ * everything safely, and on exit we restore the timers as they were.
+ *
+ * Note, this approach is not going to work in VE0.
+ */
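+
+/*
+ * The "freeze" described above boils down to this per-record sketch
+ * (names abbreviated):
+ *
+ *	if (del_timer(&timeout)) {
+ *		... record is pinned, safe to keep a pointer ...
+ *		add_timer(&timeout);
+ *	}
+ *
+ * A record whose timer cannot be stopped is already dying on another
+ * cpu and is simply skipped (see dump_expect_list() below).
+ */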
+
+struct ct_holder
+{
+	struct ct_holder *next;
+	struct nf_conntrack_tuple_hash *cth;
+	int index;
+};
+
+static void encode_tuple(struct cpt_ipct_tuple *v, struct nf_conntrack_tuple *tuple)
+{
+	v->cpt_dst = tuple->dst.u3.ip;
+	v->cpt_l3num = tuple->src.l3num;
+	v->cpt_dstport = tuple->dst.u.all;
+	v->cpt_protonum = tuple->dst.protonum;
+	v->cpt_dir = tuple->dst.dir;
+
+	v->cpt_src = tuple->src.u3.ip;
+	v->cpt_srcport = tuple->src.u.all;
+}
+
+static void encode_tuple_mask(struct cpt_ipct_tuple *v, struct nf_conntrack_tuple_mask *tuple)
+{
+	v->cpt_src = tuple->src.u3.ip;
+	v->cpt_srcport = tuple->src.u.all;
+}
+
+static int dump_one_expect(struct cpt_ip_connexpect_image *v,
+			   struct nf_conntrack_expect *exp,
+			   int sibling, cpt_context_t *ctx)
+{
+	int err = 0;
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	encode_tuple(&v->cpt_tuple, &exp->tuple);
+	encode_tuple_mask(&v->cpt_mask, &exp->mask);
+	v->cpt_sibling_conntrack = sibling;
+	v->cpt_flags = exp->flags;
+	v->cpt_dir = 0;
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+	v->cpt_manip_proto = exp->saved_proto.all;
+	v->cpt_dir = exp->dir;
+#endif
+	v->cpt_timeout = exp->timeout.expires - jiffies;
+	v->cpt_class = exp->class;
+	return err;
+}
+
+/* NOTE. We use one page to dump the list of expectations. In theory this
+ * may not be enough; in practice there is only one expectation per
+ * conntrack record. Moreover, since _ALL_ expectations live on one global
+ * list that is searched for every incoming/outgoing packet, the system
+ * would be effectively dead long before a single conntrack accumulated
+ * that many expectations. In short, I am not going to fix this.
+ */
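+
+/*
+ * For scale (illustrative): with 4 KiB pages this caps a single
+ * conntrack at PAGE_SIZE / sizeof(struct cpt_ip_connexpect_image)
+ * expectations, i.e. a few dozen entries, which dump_expect_list()
+ * enforces below by returning -ENOBUFS.
+ */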
+
+static int dump_expect_list(struct nf_conn *ct, struct ct_holder *list,
+			    cpt_context_t *ctx)
+{
+	int err = 0;
+	unsigned long pg;
+	struct cpt_ip_connexpect_image *v;
+	struct nf_conntrack_expect *exp;
+	struct nf_conn_help *help = nfct_help(ct);
+	struct hlist_node *next;
+	int expecting = 0, i;
+
+	if (!help)
+		return 0;
+
+	for (i = 0; i < NF_CT_MAX_EXPECT_CLASSES; i++)
+		expecting += help->expecting[i];
+
+	if (expecting == 0)
+		return err;
+	if (expecting * sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE)
+		return -ENOBUFS;
+
+	pg = __get_free_page(GFP_KERNEL);
+	if (!pg)
+		return -ENOMEM;
+	v = (struct cpt_ip_connexpect_image *)pg;
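+	/* v walks the page as images are filled in; the number of bytes
+	 * written out below is v's final offset from pg. */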
+
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_for_each_entry(exp, next, &help->expectations, lnode) {
+		int sibling;
+
+		if (exp->master != ct)
+			continue;
+
+		if (help->helper == NULL) {
+			eprintk_ctx("conntrack: no helper and non-trivial expectation\n");
+			err = -EINVAL;
+			break;
+		}
+
+		sibling = 0;
+#if 0
+		/* That's all? No need to calculate sibling? */
+		if (exp->sibling) {
+			struct ct_holder *c;
+			for (c = list; c; c = c->next) {
+				if (tuplehash_to_ctrack(c->cth) == exp->sibling) {
+					sibling = c->index;
+					break;
+				}
+			}
+			/* NOTE: exp->sibling could be unconfirmed and, hence,
+			 * absent from the hash table. We should just ignore
+			 * such a sibling: the connection is going to be
+			 * retried, as the packet apparently was lost somewhere.
+			 */
+			if (sibling == 0)
+				dprintk_ctx("sibling conntrack is not found\n");
+		}
+#endif
+
+		/* If the expectation still does not have exp->sibling
+		 * and its timer is not running, it is about to die on
+		 * another cpu. Skip it. */
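+		/* del_timer() returns nonzero only if the timer was still
+		 * pending, so it doubles as the liveness test here. */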
+		if (!del_timer(&exp->timeout)) {
+			dprintk_ctx("conntrack: expectation: no timer\n");
+			continue;
+		}
+
+		err = dump_one_expect(v, exp, sibling, ctx);
+
+		add_timer(&exp->timeout);
+
+		if (err)
+			break;
+
+		v++;
+	}
+	spin_unlock_bh(&nf_conntrack_lock);
+
+	if (err == 0 && (unsigned long)v != pg)
+		ctx->write((void*)pg, (unsigned long)v - pg, ctx);
+
+	free_page(pg);
+	return err;
+}
+
+static int dump_one_ct(struct ct_holder *c, struct ct_holder *list,
+		       cpt_context_t *ctx)
+{
+	struct nf_conntrack_tuple_hash *h = c->cth;
+	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	struct cpt_ip_conntrack_image v;
+	const struct nf_conn_help *help;
+	int err = 0;
+
+	BUILD_BUG_ON(sizeof(v.cpt_proto_data) < sizeof(ct->proto));
+	BUILD_BUG_ON(sizeof(v.cpt_help_data) < sizeof(union nf_conntrack_help));
+
+	rcu_read_lock_bh();
+	help = nfct_help(ct);
+	if (help) {
+		const struct nf_conntrack_helper *helper;
+
+		helper = rcu_dereference(help->helper);
+		if (helper && !strcmp(helper->name, "pptp")) {
+			eprintk_ctx("conntrack: PPTP isn't supported\n");
+			err = -EBUSY;
+		}
+	}
+	rcu_read_unlock_bh();
+
+	if (err)
+		return err;
+
+	cpt_open_object(NULL, ctx);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_CONNTRACK;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_ARRAY;
+
+	rcu_read_lock_bh();
+	v.cpt_status = ct->status;
+	v.cpt_timeout = ct->timeout.expires - jiffies;
+	v.cpt_ct_helper = (nfct_help(ct) != NULL);
+	v.cpt_index = c->index;
+	v.cpt_mark = 0;
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+	v.cpt_mark = ct->mark;
+#endif
+	encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple);
+	encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple);
+	memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data));
+	if (nfct_help(ct))
+		memcpy(&v.cpt_help_data, &nfct_help(ct)->help, sizeof(v.cpt_help_data));
+
+	v.cpt_masq_index = 0;
+	v.cpt_nat_helper = 0;
+	if (nat) {
+#ifdef CONFIG_NF_NAT_NEEDED
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+	defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+		v.cpt_masq_index = nat->masq_index;
+#endif
+	/* "help" data is used by pptp, difficult to support */
+		v.cpt_nat_seq[0].cpt_correction_pos = nat->seq[0].correction_pos;
+		v.cpt_nat_seq[0].cpt_offset_before = nat->seq[0].offset_before;
+		v.cpt_nat_seq[0].cpt_offset_after = nat->seq[0].offset_after;
+		v.cpt_nat_seq[1].cpt_correction_pos = nat->seq[1].correction_pos;
+		v.cpt_nat_seq[1].cpt_offset_before = nat->seq[1].offset_before;
+		v.cpt_nat_seq[1].cpt_offset_after = nat->seq[1].offset_after;
+#endif
+	}
+	rcu_read_unlock_bh();
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	err = dump_expect_list(ct, list, ctx);
+
+	cpt_close_object(ctx);
+	return err;
+}
+
+int cpt_dump_ip_conntrack(cpt_context_t * ctx)
+{
+	struct ct_holder *ct_list = NULL;
+	struct ct_holder *c, **cp;
+	struct nf_conn *ct;
+	int err = 0;
+	int index = 0;
+	int idx;
+	struct net *net = get_exec_env()->ve_netns;
+	struct hlist_nulls_node *n;
+
+	for (idx = atomic_read(&(net->ct.count)); idx >= 0; idx--) {
+		c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL);
+		if (c == NULL) {
+			err = -ENOMEM;
+			goto done;
+		}
+		memset(c, 0, sizeof(struct ct_holder));
+		c->next = ct_list;
+		ct_list = c;
+	}
+
+	c = ct_list;
+
+	rcu_read_lock_bh();
+	for (idx = 0; idx < net->ct.htable_size; idx++) {
+		struct nf_conntrack_tuple_hash *h;
+		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[idx], hnnode) {
+			/* Skip reply tuples; they are covered by the
+			 * original direction. */
+			if (NF_CT_DIRECTION(h))
+				continue;
+
+			/* Oops, we do not have enough holders...
+			 * This should be impossible. */
+			if (unlikely(c == NULL)) {
+				rcu_read_unlock_bh();
+				eprintk_ctx("unexpected conntrack appeared\n");
+				err = -ENOMEM;
+				goto done;
+			}
+
+			/* If the timer is not running, its handler has
+			 * already fired or is running on another cpu.
+			 * We should skip this conntrack; it is about
+			 * to be destroyed. */
+			if (!del_timer(&nf_ct_tuplehash_to_ctrack(h)->timeout)) {
+				dprintk_ctx("conntrack: no timer\n");
+				continue;
+			}
+
+			/* Timer is deleted. refcnt is _not_ decreased.
+			 * We are going to restore the timer on exit
+			 * from this function. */
+			c->cth = h;
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			nf_conntrack_get(&ct->ct_general);
+			c->index = ++index;
+			c = c->next;
+		}
+	}
+	rcu_read_unlock_bh();
+
+	/* No conntracks? Good. */
+	if (index == 0)
+		goto done;
+
+	/* Comb the list a little. */
+	cp = &ct_list;
+	while ((c = *cp) != NULL) {
+		/* Discard unused entries; they can appear if some
+		 * conntracks timed out after we preallocated the list.
+		 */
+		if (c->cth == NULL) {
+			*cp = c->next;
+			kfree(c);
+			continue;
+		}
+
+		/* Move conntracks attached to expectations to the beginning
+		 * of the list. */
+		if (nf_ct_tuplehash_to_ctrack(c->cth)->master && c != ct_list) {
+			*cp = c->next;
+			c->next = ct_list;
+			ct_list = c;
+			dprintk_ctx("conntrack: %d moved in list\n", c->index);
+			continue;
+		}
+		cp = &c->next;
+	}
+
+	cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK);
+
+	for (c = ct_list; c; c = c->next) {
+		err = dump_one_ct(c, ct_list, ctx);
+		if (err)
+			goto done;
+	}
+
+	cpt_close_section(ctx);
+
+done:
+	while ((c = ct_list) != NULL) {
+		ct_list = c->next;
+		if (c->cth) {
+			ct = nf_ct_tuplehash_to_ctrack(c->cth);
+			nf_conntrack_put(&ct->ct_general);
+			/* Restore timer. refcnt is preserved. */
+			add_timer(&nf_ct_tuplehash_to_ctrack(c->cth)->timeout);
+		}
+		kfree(c);
+	}
+	return err;
+}
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_context.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_context.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_context.c	2015-01-21 12:02:48.221093712 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_context.c	2015-01-21 12:02:49.777052406 +0300
@@ -0,0 +1,332 @@
+/*
+ *
+ *  kernel/cpt/cpt_context.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+
+#ifdef CONFIG_PRAM
+struct cpt_pram_ops *cpt_pram_ops;
+EXPORT_SYMBOL(cpt_pram_ops);
+
+int cpt_open_pram(cpt_context_t *ctx)
+{
+	int err = -ENOSYS;
+
+	if (cpt_pram_ops)
+		err = cpt_pram_ops->cpt_open(ctx);
+	if (err)
+		eprintk_ctx("cpt_open_pram: %d\n", err);
+	return err;
+}
+
+void cpt_close_pram(cpt_context_t *ctx, int err)
+{
+	if (cpt_pram_ops)
+		cpt_pram_ops->cpt_close(ctx, err);
+}
+
+void cpt_dump_pram(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct cpt_context *ctx)
+{
+	int err = 0;
+
+	if (cpt_pram_ops)
+		err = cpt_pram_ops->cpt_dump(vma, start, end, ctx);
+	if (err)
+		eprintk_ctx("cpt_dump_pram: %d\n", err);
+}
+#endif
+
+static int check_dumpsize(struct cpt_context *ctx, size_t count, loff_t pos)
+{
+	if (pos + count > ctx->dumpsize)
+		ctx->dumpsize = pos + count;
+	if (ctx->maxdumpsize && ctx->dumpsize > ctx->maxdumpsize) {
+		ctx->write_error = -ENOSPC;
+		return 0;
+	}
+	return 1;
+}
+
+static void file_write(const void *addr, size_t count, struct cpt_context *ctx)
+{
+	mm_segment_t oldfs;
+	ssize_t err = -EBADF;
+	struct file *file = ctx->file;
+
+	if (file && !check_dumpsize(ctx, count, file->f_pos))
+		return;
+
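+	/* Pass a kernel buffer to ->write(): temporarily lift the
+	 * user-space pointer check with set_fs(KERNEL_DS). */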
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if (file)
+		err = file->f_op->write(file, addr, count, &file->f_pos);
+	set_fs(oldfs);
+	if (err != count && !ctx->write_error)
+		ctx->write_error = err < 0 ? err : -EIO;
+}
+
+static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos)
+{
+	mm_segment_t oldfs;
+	ssize_t err = -EBADF;
+	struct file *file = ctx->file;
+
+	if (file && !check_dumpsize(ctx, count, pos))
+		return;
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if (file)
+		err = file->f_op->write(file, addr, count, &pos);
+	set_fs(oldfs);
+	if (err != count && !ctx->write_error)
+		ctx->write_error = err < 0 ? err : -EIO;
+}
+
+static void file_align(struct cpt_context *ctx)
+{
+	struct file *file = ctx->file;
+
+	if (file)
+		file->f_pos = CPT_ALIGN(file->f_pos);
+}
+
+static void cpt_push(loff_t *p, struct cpt_context *ctx)
+{
+	cpt_push_object(p, ctx);
+	cpt_open_object(NULL, ctx);
+}
+
+static void cpt_pop(loff_t *p, struct cpt_context *ctx)
+{
+	cpt_close_object(ctx);
+	cpt_pop_object(p, ctx);
+}
+
+static loff_t lookup_cpt_object_pos(int type, void *p, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+	return obj->o_pos;
+}
+
+struct cpt_ops cpt_ops = {
+	.write = file_write,
+	.push_object = cpt_push,
+	.pop_object = cpt_pop,
+	.lookup_object = lookup_cpt_object_pos,
+};
+
+void cpt_context_init(struct cpt_context *ctx)
+{
+	int i;
+
+	memset(ctx, 0, sizeof(*ctx));
+
+	init_MUTEX(&ctx->main_sem);
+	ctx->refcount = 1;
+
+	ctx->current_section = -1;
+	ctx->current_object = -1;
+	ctx->pagesize = PAGE_SIZE;
+	ctx->write = file_write;
+	ctx->pwrite = file_pwrite;
+	ctx->align = file_align;
+	for (i = 0; i < CPT_SECT_MAX; i++)
+		ctx->sections[i] = CPT_NULL;
+	cpt_object_init(ctx);
+}
+
+int cpt_open_dumpfile(struct cpt_context *ctx)
+{
+	if (ctx->file)
+		ctx->maxdumpsize = i_size_read(ctx->file->f_mapping->host);
+	ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL);
+	if (ctx->tmpbuf == NULL)
+		return -ENOMEM;
+	__cpt_release_buf(ctx);
+	return 0;
+}
+
+int cpt_close_dumpfile(struct cpt_context *ctx)
+{
+	if (ctx->file) {
+		WARN_ON(i_size_read(ctx->file->f_mapping->host) != ctx->dumpsize);
+		fput(ctx->file);
+		ctx->file = NULL;
+	}
+	if (ctx->tmpbuf) {
+		free_page((unsigned long)ctx->tmpbuf);
+		ctx->tmpbuf = NULL;
+	}
+	if (ctx->write_error)
+		eprintk_ctx("error while writing dump file: %d\n", ctx->write_error);
+	return ctx->write_error;
+}
+
+int cpt_major_hdr_out(struct cpt_context *ctx)
+{
+	struct cpt_major_hdr hdr;
+
+	if (ctx->file == NULL)
+		return 0;
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.cpt_signature[0] = CPT_SIGNATURE0;
+	hdr.cpt_signature[1] = CPT_SIGNATURE1;
+	hdr.cpt_signature[2] = CPT_SIGNATURE2;
+	hdr.cpt_signature[3] = CPT_SIGNATURE3;
+	hdr.cpt_hdrlen = sizeof(hdr);
+	hdr.cpt_image_version = CPT_CURRENT_VERSION;
+#ifdef CONFIG_X86_64
+	hdr.cpt_os_arch = CPT_OS_ARCH_EMT64;
+#elif defined(CONFIG_X86_32)
+	hdr.cpt_os_arch = CPT_OS_ARCH_I386;
+#elif defined(CONFIG_IA64)
+	hdr.cpt_os_arch = CPT_OS_ARCH_IA64;
+#else
+#error	Arch is not supported
+#endif
+	hdr.cpt_ve_features = (__u32)ctx->features;
+	hdr.cpt_ve_features2 = (__u32)(ctx->features>>32);
+	hdr.cpt_pagesize = (__u16)PAGE_SIZE;
+	hdr.cpt_hz = HZ;
+	hdr.cpt_start_jiffies64 = ctx->virt_jiffies64;
+	hdr.cpt_start_sec = ctx->start_time.tv_sec;
+	hdr.cpt_start_nsec = ctx->start_time.tv_nsec;
+	hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags;
+	hdr.cpt_kernel_config[0] = ctx->kernel_config_flags;
+	hdr.cpt_iptables_mask = ctx->iptables_mask;
+
+	ctx->write(&hdr, sizeof(hdr), ctx);
+	return 0;
+}
+
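+/* Section and object headers are written with a placeholder cpt_next;
+ * when they are closed, the real length (current f_pos minus the saved
+ * start offset) is backpatched into the header through ->pwrite(). */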
+int cpt_close_section(struct cpt_context *ctx)
+{
+	if (ctx->file && ctx->current_section >= 0) {
+		__u64 next = ctx->file->f_pos - ctx->current_section;
+		ctx->pwrite(&next, 8, ctx, ctx->current_section);
+		ctx->current_section = -1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cpt_close_section);
+
+int cpt_open_section(struct cpt_context *ctx, __u32 type)
+{
+	struct cpt_section_hdr hdr;
+
+	if (ctx->file == NULL)
+		return 0;
+
+	cpt_close_section(ctx);
+
+	ctx->current_section = ctx->file->f_pos;
+	ctx->sections[type] = ctx->current_section;
+
+	hdr.cpt_next = 0;
+	hdr.cpt_section = type;
+	hdr.cpt_hdrlen = sizeof(hdr);
+	hdr.cpt_align = 0;
+	ctx->write(&hdr, sizeof(hdr), ctx);
+
+	return 0;
+}
+EXPORT_SYMBOL(cpt_open_section);
+
+
+int cpt_close_object(struct cpt_context *ctx)
+{
+	if (ctx->file && ctx->current_object >= 0) {
+		__u64 next = ctx->file->f_pos - ctx->current_object;
+		ctx->pwrite(&next, 8, ctx, ctx->current_object);
+		ctx->current_object = -1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cpt_close_object);
+
+int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	if (ctx->file == NULL)
+		return 0;
+
+	cpt_close_object(ctx);
+
+	ctx->current_object = ctx->file->f_pos;
+	if (obj)
+		cpt_obj_setpos(obj, ctx->current_object, ctx);
+
+	return 0;
+}
+EXPORT_SYMBOL(cpt_open_object);
+
+int cpt_push_object(loff_t *saved, struct cpt_context *ctx)
+{
+	if (ctx->file) {
+		*saved = ctx->current_object;
+		ctx->current_object = ctx->file->f_pos;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cpt_push_object);
+
+int cpt_pop_object(loff_t *saved, struct cpt_context *ctx)
+{
+	ctx->current_object = *saved;
+	return 0;
+}
+EXPORT_SYMBOL(cpt_pop_object);
+
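+/* The trailer repeats the image signature and records the start offset
+ * of every section, so a reader can locate all sections starting from
+ * the end of the image. */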
+int cpt_dump_tail(struct cpt_context *ctx)
+{
+	struct cpt_major_tail hdr;
+	int i;
+
+	if (ctx->file == NULL)
+		return 0;
+
+	cpt_open_section(ctx, CPT_SECT_TRAILER);
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.cpt_next = sizeof(hdr);
+	hdr.cpt_object = CPT_OBJ_TRAILER;
+	hdr.cpt_hdrlen = sizeof(hdr);
+	hdr.cpt_content = CPT_CONTENT_VOID;
+	hdr.cpt_lazypages = 0;
+	hdr.cpt_64bit = ctx->tasks64;
+	hdr.cpt_signature[0] = CPT_SIGNATURE0;
+	hdr.cpt_signature[1] = CPT_SIGNATURE1;
+	hdr.cpt_signature[2] = CPT_SIGNATURE2;
+	hdr.cpt_signature[3] = CPT_SIGNATURE3;
+	hdr.cpt_nsect = CPT_SECT_MAX_INDEX;
+	for (i = 0; i < CPT_SECT_MAX_INDEX; i++)
+		hdr.cpt_sections[i] = ctx->sections[i];
+
+	ctx->write(&hdr, sizeof(hdr), ctx);
+	cpt_close_section(ctx);
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_dump.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_dump.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_dump.c	2015-01-21 12:02:48.222093685 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_dump.c	2015-01-21 12:02:54.166935871 +0300
@@ -0,0 +1,1375 @@
+/*
+ *
+ *  kernel/cpt/cpt_dump.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/ptrace.h>
+#include <linux/utrace.h>
+#include <linux/smp_lock.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <bc/task.h>
+#include <linux/cpt_image.h>
+#include <linux/nsproxy.h>
+#include <linux/mnt_namespace.h>
+#include <linux/netdevice.h>
+#include <linux/dcache.h>
+#include <linux/if_tun.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/netdevice.h>
+#include <linux/mount.h>
+#include <linux/ve_nfs.h>
+#include <linux/freezer.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_dump.h"
+#include "cpt_files.h"
+#include "cpt_mm.h"
+#include "cpt_process.h"
+#include "cpt_net.h"
+#include "cpt_socket.h"
+#include "cpt_ubc.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+
+
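+/* Return the depth of task c below root within the same VE, walking up
+ * through group leaders and real parents; -1 means c is not a descendant
+ * of root. */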
+static int vps_child_level(struct task_struct *root, struct task_struct *c)
+{
+	int level = 0;
+	int veid = VE_TASK_INFO(c)->owner_env->veid;
+
+	while (VE_TASK_INFO(c)->owner_env->veid == veid) {
+		if (c->pid != c->tgid)
+			c = c->group_leader;
+		if (c == root)
+			return level;
+
+		c = c->real_parent;
+		level++;
+	}
+	return -1;
+}
+
+static inline int freezable(struct task_struct * p)
+{
+	if (p->exit_state)
+		return 0;
+
+	/* skip kernel threads */
+	if (p->flags & PF_KTHREAD)
+		return 0;
+
+	switch (p->state) {
+	case TASK_STOPPED:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+static void ve_print_frozen(cpt_context_t *ctx)
+{
+	struct task_struct *p, *g;
+	int tainted = 0;
+	struct ve_struct *curr, *ve;
+
+	ve = get_ve_by_id(ctx->ve_id);
+	if (!ve) {
+		eprintk_ctx("%s: Failed to get VE by id %d\n", __func__, ctx->ve_id);
+		return;
+	}
+
+	curr = set_exec_env(ve);
+
+	read_lock(&tasklist_lock);
+	do_each_thread_ve(g, p) {
+		if (freezing(p) || frozen(p)) {
+			if (!tainted++)
+				add_taint(TAINT_CRAP);
+			eprintk_ctx("ve#%d: %s process: " CPT_FID
+				    ", exit_state: %d, state: %ld\n",
+				    get_exec_env()->veid,
+				    (freezing(p)) ? "freezing" : "frozen",
+				    CPT_TID(p), p->exit_state, p->state);
+		}
+	} while_each_thread_ve(g, p);
+	read_unlock(&tasklist_lock);
+
+	(void)set_exec_env(curr);
+
+	put_ve(ve);
+}
+
+static void wake_ve(cpt_context_t *ctx)
+{
+	struct task_struct *p, *g;
+
+	read_lock(&tasklist_lock);
+	do_each_thread_ve(g, p) {
+		thaw_process(p);
+	} while_each_thread_ve(g, p);
+	read_unlock(&tasklist_lock);
+	ve_print_frozen(ctx);
+}
+
+static int check_process_external(struct task_struct *p)
+{
+	if (pid_alive(p)) {
+		if (p->pids[PIDTYPE_PID].pid->level == 0)
+			return PIDTYPE_PID;
+		if (p->pids[PIDTYPE_PGID].pid->level == 0)
+			return PIDTYPE_PGID;
+		if (p->pids[PIDTYPE_SID].pid->level == 0)
+			return PIDTYPE_SID;
+	}
+
+	return PIDTYPE_MAX;
+}
+
+enum {
+	OBSTACLE_NOGO = -1,
+	OBSTACLE_TIMEOUT = -2,
+	OBSTACLE_TRYAGAIN = -3,
+	OBSTACLE_SIGNAL = -4,
+};
+
+#define DEFAULT_SUSPEND_TIMEOUT_MIN		10UL
+#define DEFAULT_SUSPEND_TIMEOUT_MAX		(60UL * 60)       /* 1 hour */
+
+unsigned long suspend_timeout_min = DEFAULT_SUSPEND_TIMEOUT_MIN;
+unsigned long suspend_timeout_max = DEFAULT_SUSPEND_TIMEOUT_MAX;
+
+unsigned int suspend_timeout = DEFAULT_SUSPEND_TIMEOUT_MIN;
+
+unsigned int kill_external = 0;
+
+static int check_trace(struct task_struct *tsk, struct task_struct *root,
+			cpt_context_t *ctx)
+{
+	return task_utrace_attached(tsk);
+}
+
+static int vps_kill_and_reap_external(struct cpt_context *ctx)
+{
+	struct task_struct *p, *g;
+	struct task_struct *root;
+	int external_tasks = 0;
+
+	read_lock(&tasklist_lock);
+
+	root = find_task_by_pid_ns(1, current->nsproxy->pid_ns);
+	if (!root) {
+		eprintk_ctx("cannot find ve init\n");
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+
+	do_each_thread_ve(g, p) {
+		if ((vps_child_level(root, p) < 0) && !cpt_skip_task(p)) {
+			if (!fatal_signal_pending(p)) {
+				p->flags |= PF_EXIT_RESTART;
+				send_sig(SIGKILL, p, true);
+				wake_up_process(p);
+			}
+			external_tasks++;
+		}
+	} while_each_thread_ve(g, p);
+
+	read_unlock(&tasklist_lock);
+
+	if (external_tasks > 0) {
+		/* Wait for external tasks to die. See do_notify_parent(). */
+		schedule_timeout_interruptible(HZ/20);
+	}
+	return external_tasks;
+}
+
+static int vps_handle_external(struct cpt_context *ctx)
+{
+	int status;
+
+	if (!kill_external)
+		return 0;
+
+	do {
+		status = vps_kill_and_reap_external(ctx);
+	} while (status > 0);
+
+	return status;
+}
+
+static int vps_stop_iteration(struct cpt_context *ctx)
+{
+	struct task_struct *p, *g;
+	struct task_struct *root;
+	int todo = 0;
+
+	read_lock(&tasklist_lock);
+
+	root = find_task_by_pid_ns(1, current->nsproxy->pid_ns);
+	if (!root) {
+		eprintk_ctx("cannot find ve init\n");
+		todo = -ESRCH;
+		goto out;
+	}
+
+	do_each_thread_ve(g, p) {
+		if (vps_child_level(root, p) >= 0) {
+			if (!freezable(p) || frozen(p))
+				continue;
+
+			switch (check_process_external(p)) {
+			case PIDTYPE_PID:
+				eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n",
+						cpt_task_pid_nr(p, PIDTYPE_PID), p->pid, p->comm);
+				todo = OBSTACLE_NOGO;
+				goto out;
+			case PIDTYPE_PGID:
+				eprintk_ctx("external process group %d/%d(%s) inside CT "
+						"(e.g. vzctl enter or vzctl exec).\n",
+						cpt_task_pid_nr(p, PIDTYPE_PGID), p->pid, p->comm);
+				todo = OBSTACLE_NOGO;
+				goto out;
+			case PIDTYPE_SID:
+				eprintk_ctx("external process session %d/%d(%s) inside CT "
+						"(e.g. vzctl enter or vzctl exec).\n",
+						cpt_task_pid_nr(p, PIDTYPE_SID), p->pid, p->comm);
+				todo = OBSTACLE_NOGO;
+				goto out;
+			}
+
+			if (p->vfork_done) {
+				/* A task between vfork()...exec()
+				 * cannot be frozen, because its parent
+				 * waits in uninterruptible state.
+				 * So we do nothing and wait for
+				 * exec(), unless:
+				 */
+				if (p->state == TASK_STOPPED ||
+				    p->state == TASK_TRACED) {
+					eprintk_ctx("task " CPT_FID " is stopped while vfork(). "
+							"Checkpointing is impossible.\n",
+							CPT_TID(p));
+					todo = OBSTACLE_NOGO;
+					/* This is fatal: the _user_
+					 * stopped the vfork()ing task,
+					 * so we cannot suspend now.
+					 */
+				} else {
+					todo = OBSTACLE_TRYAGAIN;
+				}
+				goto out;
+			}
+			if (p->signal->group_exit_task &&
+			    p->signal->notify_count) {
+				/* exec() waits for threads' death */
+				wprintk_ctx("task " CPT_FID " waits for threads' death\n", CPT_TID(p));
+				todo = OBSTACLE_TRYAGAIN;
+				goto out;
+			}
+			if (check_trace(p, root, ctx)) {
+				eprintk_ctx("task " CPT_FID " is traced. Checkpointing is impossible.\n", CPT_TID(p));
+				todo = OBSTACLE_NOGO;
+				goto out;
+			}
+			if (p->flags & PF_NOFREEZE) {
+				eprintk_ctx("task " CPT_FID " is unfreezable. Checkpointing is impossible.\n", CPT_TID(p));
+				todo = OBSTACLE_NOGO;
+				goto out;
+			}
+
+			if (freeze_task(p, false) && (todo >= 0))
+				todo++;
+			else if (!frozen(p))
+				eprintk_ctx("This can't be: task " CPT_FID " is not frozen, and the freeze attempt failed.\n", CPT_TID(p));
+		} else {
+			if (!cpt_skip_task(p)) {
+				eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n",
+						cpt_task_pid_nr(p, PIDTYPE_PID), task_pid_nr(p), p->comm);
+				todo = OBSTACLE_NOGO;
+				goto out;
+			}
+		}
+	} while_each_thread_ve(g, p);
+out:
+	read_unlock(&tasklist_lock);
+	return todo;
+}
+
+static int check_stop_status(struct cpt_context *ctx, int todo,
+			     unsigned long start_time,
+			     unsigned long stop_time)
+{
+	int status = todo;
+
+	if (todo > 0) {
+		/* No visible obstacles, but the VE did not freeze
+		 * within the timeout. Interrupt the suspend on the
+		 * major timeout or a signal; on the minor timeout
+		 * we will wake the VE and restart the suspend.
+		 */
+		if (time_after(jiffies, start_time + suspend_timeout*HZ)) {
+			struct task_struct *p, *g;
+			int i = 0;
+
+			eprintk_ctx("timed out (%u seconds).\n", suspend_timeout);
+			eprintk_ctx("Unfrozen tasks (no more than 10): see dmesg output.\n");
+			read_lock(&tasklist_lock);
+			do_each_thread_ve(g, p) {
+				task_lock(p);
+				if (freezable(p) && !frozen(p) && (i++ < 10))
+					sched_show_task(p);
+				task_unlock(p);
+
+			} while_each_thread_ve(g, p);
+			read_unlock(&tasklist_lock);
+
+			todo = OBSTACLE_TIMEOUT;
+		} else if (signal_pending(current))
+			todo = OBSTACLE_SIGNAL;
+		else if (time_after(jiffies, stop_time))
+			todo = OBSTACLE_TRYAGAIN;
+	}
+
+	switch (todo) {
+	case OBSTACLE_NOGO:
+		eprintk_ctx("suspend is impossible now.\n");
+		status = -EAGAIN;
+		break;
+	case OBSTACLE_SIGNAL:
+		{
+			int i = _NSIG;
+			sigset_t *set = &current->pending.signal;
+
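+			/* Print the pending signal mask in hex, four
+			 * signals per digit, highest-numbered first. */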
+			eprintk_ctx("interrupted by signal: ");
+			do {
+				int x = 0;
+
+				i -= 4;
+				if (sigismember(set, i+1)) x |= 1;
+				if (sigismember(set, i+2)) x |= 2;
+				if (sigismember(set, i+3)) x |= 4;
+				if (sigismember(set, i+4)) x |= 8;
+				eprintk_ctx("%x", x);
+			} while (i >= 4);
+			eprintk_ctx(".\n");
+		}
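+	/* fall through: a pending signal is treated like a timeout */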
+	case OBSTACLE_TIMEOUT:
+		status = -EINTR;
+		break;
+	case OBSTACLE_TRYAGAIN:
+		if (time_after(jiffies, start_time + DEFAULT_SUSPEND_TIMEOUT_MIN*HZ) ||
+		    signal_pending(current)) {
+			wprintk_ctx("suspend timed out\n");
+			status = -EAGAIN;
+			break;
+		}
+		/*
+		 * All we need here is to return a positive code, which
+		 * indicates that we have to continue the suspend loop.
+		 */
+		status = 1;
+		break;
+	}
+	return status;
+}
+
+static int vps_stop_tasks(struct cpt_context *ctx)
+{
+	unsigned long start_time = jiffies;
+	unsigned long timeout = HZ/5;
+	int status;
+	int round = 0;
+
+	do_gettimespec(&ctx->start_time);
+	do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time);
+	ctx->virt_jiffies64 = get_jiffies_64() + get_exec_env()->jiffies_fixup;
+
+	atomic_inc(&get_exec_env()->suspend);
+
+	status = vps_handle_external(ctx);
+	if (status)
+		goto out;
+
+	do {
+		unsigned long stop_time = jiffies + timeout;
+		int result;
+
+		result = vps_stop_iteration(ctx);
+
+		status = check_stop_status(ctx, result, start_time, stop_time);
+		if (status > 0) {
+			if (result == OBSTACLE_TRYAGAIN) {
+				wprintk_ctx("minor suspend timeout (%lu) expired, "
+					    "trying again\n", timeout);
+
+				/* Try again. VE is awake, give it some time to run. */
+				current->state = TASK_INTERRUPTIBLE;
+				schedule_timeout(HZ);
+
+				/* After a short wait, restart the
+				 * suspend with a longer timeout. */
+				timeout = min(timeout<<1, DEFAULT_SUSPEND_TIMEOUT_MIN*HZ);
+			} else {
+				if (round++ > 0) {
+					/* The VE is partially frozen; give
+					 * processes a chance to enter the
+					 * refrigerator(). */
+					current->state = TASK_INTERRUPTIBLE;
+					schedule_timeout(HZ/20);
+				} else {
+					yield();
+				}
+			}
+		}
+	} while (status > 0);
+
+out:
+	atomic_dec(&get_exec_env()->suspend);
+
+	return status;
+}
+
+static int cpt_unlock_ve(struct cpt_context *ctx)
+{
+	struct ve_struct *env;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (!env)
+		return -ESRCH;
+	down_write(&env->op_sem);
+	env->is_locked = 0;
+	up_write(&env->op_sem);
+	put_ve(env);
+	return 0;
+}
+
+int cpt_resume(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	cpt_unlock_sockets(ctx);
+
+	if (list_empty(&ctx->object_array[CPT_OBJ_TASK]))
+		goto skip_tasks;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+
+		if (thaw_process(tsk) == 0 && freezable(tsk))
+			eprintk_ctx("strange, %s not frozen\n", tsk->comm);
+		put_task_struct(tsk);
+	}
+
+	ve_print_frozen(ctx);
+
+skip_tasks:
+	cpt_resume_network(ctx);
+
+	cpt_unlock_ve(ctx);
+
+	cpt_finish_ubc(ctx);
+	cpt_finish_vfsmount_ref(ctx);
+	cpt_object_destroy(ctx);
+	return 0;
+}
+
+void cpt_drop_nfs_unhashed(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+		struct dentry *d = file->f_dentry;
+
+		if (d->d_flags & DCACHE_NFSFS_RENAMED) {
+			spin_lock(&d->d_lock);
+			d->d_flags &= ~DCACHE_NFSFS_RENAMED;
+			spin_unlock(&d->d_lock);
+		}
+	}
+}
+
+int cpt_kill(struct cpt_context *ctx)
+{
+	int err = 0;
+	struct ve_struct *env;
+	cpt_object_t *obj;
+	struct task_struct *root_task = NULL;
+
+	if (!ctx->ve_id)
+		return -EINVAL;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (!env)
+		return -ESRCH;
+
+	if (current->ve_task_info.owner_env == env) {
+		wprintk_ctx("attempt to kill ve from inside, escaping...\n");
+		err = -EPERM;
+		goto out;
+	}
+
+	cpt_kill_sockets(ctx);
+
+	cpt_drop_nfs_unhashed(ctx);
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+
+		if (tsk->exit_state) {
+			put_task_struct(tsk);
+			continue;
+		}
+
+		if (cpt_task_pid_nr(tsk, PIDTYPE_PID) == 1) {
+			root_task = tsk;
+			continue;
+		}
+
+		tsk->robust_list = NULL;
+#ifdef CONFIG_COMPAT
+		tsk->compat_robust_list = NULL;
+#endif
+		tsk->clear_child_tid = NULL;
+
+		if (tsk->ptrace) {
+			write_lock_irq(&tasklist_lock);
+			tsk->ptrace = 0;
+			if (!list_empty(&tsk->ptrace_entry)) {
+				tsk->parent = tsk->real_parent;
+				list_del_init(&tsk->ptrace_entry);
+			}
+			write_unlock_irq(&tasklist_lock);
+		}
+
+		send_sig(SIGKILL, tsk, 1);
+		thaw_process(tsk);
+
+		put_task_struct(tsk);
+	}
+
+	yield();
+
+	if (root_task != NULL) {
+		send_sig(SIGKILL, root_task, 1);
+		thaw_process(root_task);
+
+		put_task_struct(root_task);
+	}
+
+	cpt_finish_ubc(ctx);
+	cpt_finish_vfsmount_ref(ctx);
+	cpt_object_destroy(ctx);
+
+	wait_event_interruptible(env->ve_list_wait, list_empty(&env->ve_list));
+
+out:
+	put_ve(env);
+	return err;
+}
+
+#ifdef CONFIG_BEANCOUNTERS
+static void collect_task_ubc(struct task_struct *t, struct cpt_context *ctx)
+{
+	struct task_beancounter *tbc;
+
+	tbc = &(t->task_bc);
+	cpt_add_ubc(tbc->exec_ub, ctx);
+	cpt_add_ubc(tbc->task_ub, ctx);
+}
+#else
+static inline void collect_task_ubc(struct task_struct *t,
+		struct cpt_context *ctx)
+{ return; }
+#endif
+
+static cpt_object_t * remember_task(struct task_struct * child,
+		cpt_object_t * head, cpt_context_t * ctx)
+{
+	cpt_object_t *cobj;
+	int err;
+
+	if (freezable(child) && !frozen(child)) {
+		eprintk_ctx("process " CPT_FID " is not frozen, state: %ld, flags: 0x%x\n",
+				CPT_TID(child), child->state, child->flags);
+		err = -EINVAL;
+		goto err;
+	}
+
+	if (child->exit_state && (child->exit_state != EXIT_ZOMBIE))
+		/*
+		 * Don't collect dead tasks - just skip them and return the
+		 * same head object.
+		 */
+		return head;
+
+	BUG_ON(lookup_cpt_object(CPT_OBJ_TASK, child, ctx));
+	if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
+		eprintk_ctx("task obj allocation failure\n");
+		err = -ENOMEM;
+		goto err;
+	}
+	cobj->o_count = 1;
+	cpt_obj_setobj(cobj, child, ctx);
+	insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx);
+	collect_task_ubc(child, ctx);
+	return cobj;
+
+err:
+	put_task_struct(child);
+	return ERR_PTR(err);
+}
+
+static int vps_collect_tasks(struct cpt_context *ctx)
+{
+	int err = -ESRCH;
+	cpt_object_t *obj;
+	struct task_struct *root;
+	read_lock(&tasklist_lock);
+	root = find_task_by_pid_ns(1, current->nsproxy->pid_ns);
+	if (root)
+		get_task_struct(root);
+	read_unlock(&tasklist_lock);
+
+	if (!root) {
+		err = -ESRCH;
+		eprintk_ctx("vps_collect_tasks: cannot find root\n");
+		goto out;
+	}
+
+	if (root->exit_state) {
+		err = -ESRCH;
+		eprintk_ctx("vps_collect_tasks: init is dead, nothing to dump\n");
+		goto out;
+	}
+
+	if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
+		put_task_struct(root);
+		return -ENOMEM;
+	}
+	obj->o_count = 1;
+	cpt_obj_setobj(obj, root, ctx);
+	intern_cpt_object(CPT_OBJ_TASK, obj, ctx);
+	collect_task_ubc(root, ctx);
+
+	/* Collect process subtree recursively */
+	for_each_object(obj, CPT_OBJ_TASK) {
+		cpt_object_t *head = obj;
+		struct task_struct *tsk = obj->o_obj;
+		struct task_struct *child;
+
+		if (freezable(tsk) && !frozen(tsk)) {
+			eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk));
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (tsk->state == TASK_RUNNING)
+			printk(KERN_WARNING "state TASK_RUNNING (%ld) on collect stage: " CPT_FID "\n", tsk->state, CPT_TID(tsk));
+
+		wait_task_inactive(tsk, 0);
+
+		err = check_task_state(tsk, ctx);
+		if (err)
+			goto out;
+
+		if (tsk->pid == tsk->tgid) {
+			child = tsk;
+			for (;;) {
+				read_lock(&tasklist_lock);
+				child = next_thread(child);
+				if (child != tsk)
+					get_task_struct(child);
+				read_unlock(&tasklist_lock);
+
+				if (child == tsk)
+					break;
+
+				if (child->parent != tsk->parent) {
+					put_task_struct(child);
+					eprintk_ctx("illegal thread structure, kernel bug\n");
+					err = -EINVAL;
+					goto out;
+				}
+
+				head = remember_task(child, head, ctx);
+				if (IS_ERR(head)) {
+					err = PTR_ERR(head);
+					goto out;
+				}
+			}
+		}
+
+		/* About locking. The VE is frozen, but the lists of
+		 * children may still change, at least for init: an entered
+		 * task reparents to init, and a reparented task may exit.
+		 * As long as we take care of this case, we can still drop
+		 * the lock while scanning the task lists.
+		 */
+		read_lock(&tasklist_lock);
+		list_for_each_entry(child, &tsk->children, sibling) {
+			if (child->pid != child->tgid)
+				continue;
+			/* skip kernel threads */
+			if (child->flags & PF_KTHREAD)
+				continue;
+
+			get_task_struct(child);
+			read_unlock(&tasklist_lock);
+
+			head = remember_task(child, head, ctx);
+			if (IS_ERR(head)) {
+				err = PTR_ERR(head);
+				goto out;
+			}
+
+			read_lock(&tasklist_lock);
+		}
+
+		read_unlock(&tasklist_lock);
+	}
+
+	return 0;
+
+out:
+	while (!list_empty(&ctx->object_array[CPT_OBJ_TASK])) {
+		struct list_head *head = ctx->object_array[CPT_OBJ_TASK].next;
+		cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
+		struct task_struct *tsk;
+
+		list_del(head);
+		tsk = obj->o_obj;
+		put_task_struct(tsk);
+		free_cpt_object(obj, ctx);
+	}
+	return err;
+}
+
+static int cpt_collect(struct cpt_context *ctx)
+{
+	int err;
+
+	if ((err = cpt_collect_mm(ctx)) != 0)
+		return err;
+
+	if ((err = cpt_collect_sysv(ctx)) != 0)
+		return err;
+
+	if ((err = cpt_collect_namespace(ctx)) != 0)
+		return err;
+
+	if ((err = cpt_collect_files(ctx)) != 0)
+		return err;
+
+	if ((err = cpt_collect_fs(ctx)) != 0)
+		return err;
+
+	if ((err = cpt_collect_signals(ctx)) != 0)
+		return err;
+
+	if ((err = cpt_collect_posix_timers(ctx)) != 0)
+		return err;
+
+	return 0;
+}
+
+static int cpt_dump_veinfo(cpt_context_t *ctx)
+{
+	struct cpt_veinfo_image *i = cpt_get_buf(ctx);
+	struct ve_struct *ve;
+	struct timespec delta;
+	struct ipc_namespace *ns;
+
+	cpt_open_section(ctx, CPT_SECT_VEINFO);
+	cpt_open_object(NULL, ctx);
+
+	memset(i, 0, sizeof(*i));
+
+	i->cpt_next = CPT_NULL;
+	i->cpt_object = CPT_OBJ_VEINFO;
+	i->cpt_hdrlen = sizeof(*i);
+	i->cpt_content = CPT_CONTENT_VOID;
+
+	ve = get_exec_env();
+	ns = ve->ve_ns->ipc_ns;
+
+	i->shm_ctl_all = ns->shm_ctlall;
+	if (ns->shm_ctlall > 0xFFFFFFFFU)
+		i->shm_ctl_all = 0xFFFFFFFFU;
+	i->shm_ctl_max = ns->shm_ctlmax;
+	if (ns->shm_ctlmax > 0xFFFFFFFFU)
+		i->shm_ctl_max = 0xFFFFFFFFU;
+	i->shm_ctl_mni = ns->shm_ctlmni;
+
+	i->msg_ctl_max = ns->msg_ctlmax;
+	i->msg_ctl_mni = ns->msg_ctlmni;
+	i->msg_ctl_mnb = ns->msg_ctlmnb;
+
+	BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr));
+	i->sem_ctl_arr[0] = ns->sem_ctls[0];
+	i->sem_ctl_arr[1] = ns->sem_ctls[1];
+	i->sem_ctl_arr[2] = ns->sem_ctls[2];
+	i->sem_ctl_arr[3] = ns->sem_ctls[3];
+
+	do_posix_clock_monotonic_gettime(&delta);
+	_set_normalized_timespec(&delta,
+			delta.tv_sec - ve->start_timespec.tv_sec,
+			delta.tv_nsec - ve->start_timespec.tv_nsec);
+	i->start_timespec_delta = cpt_timespec_export(&delta);
+	i->start_jiffies_delta = get_jiffies_64() - ve->start_jiffies;
+
+	do_posix_clock_monotonic_gettime(&delta);
+	monotonic_to_bootbased(&delta);
+	_set_normalized_timespec(&delta,
+			delta.tv_sec - ve->real_start_timespec.tv_sec,
+			delta.tv_nsec - ve->real_start_timespec.tv_nsec);
+	i->real_start_timespec_delta = cpt_timespec_export(&delta);
+
+	i->last_pid = ve->ve_ns->pid_ns->last_pid;
+	i->rnd_va_space = ve->_randomize_va_space + 1;
+	i->vpid_max = ve->ve_ns->pid_ns->pid_max;
+	i->aio_max_nr = ve->aio_max_nr;
+
+	ctx->write(i, sizeof(*i), ctx);
+	cpt_release_buf(ctx);
+	cpt_close_object(ctx);
+	cpt_close_section(ctx);
+	return 0;
+}
+
+static int cpt_dump_utsname(cpt_context_t *ctx)
+{
+	int len;
+	struct cpt_object_hdr o;
+	struct ve_struct *ve;
+	struct uts_namespace *ns;
+
+	cpt_open_section(ctx, CPT_SECT_UTSNAME);
+
+	ve = get_exec_env();
+	ns = ve->ve_ns->uts_ns;
+
+	cpt_open_object(NULL, ctx);
+	len = strlen(ns->name.nodename);
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_NAME;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_NAME;
+
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(ns->name.nodename, len+1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+
+	cpt_open_object(NULL, ctx);
+	len = strlen(ns->name.domainname);
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_NAME;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_NAME;
+
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(ns->name.domainname, len+1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+
+	cpt_open_object(NULL, ctx);
+	len = strlen(ns->name.release);
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_NAME;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_NAME;
+
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(ns->name.release, len+1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+
+	cpt_close_section(ctx);
+	return 0;
+}
+
+#ifndef CONFIG_IA64
+static int cpt_dump_vsyscall(cpt_context_t *ctx)
+{
+	struct cpt_page_block *pgb = cpt_get_buf(ctx);
+
+	cpt_open_section(ctx, CPT_SECT_VSYSCALL);
+	cpt_open_object(NULL, ctx);
+
+	pgb->cpt_next = CPT_NULL;
+	pgb->cpt_object = CPT_OBJ_VSYSCALL;
+	pgb->cpt_hdrlen = sizeof(*pgb);
+	pgb->cpt_content = CPT_CONTENT_DATA;
+	pgb->cpt_start = cpt_ptr_export(vsyscall_addr);
+	pgb->cpt_end = pgb->cpt_start + PAGE_SIZE;
+
+	ctx->write(pgb, sizeof(*pgb), ctx);
+	cpt_release_buf(ctx);
+
+	ctx->write(vsyscall_addr, PAGE_SIZE, ctx);
+
+	cpt_close_object(ctx);
+	cpt_close_section(ctx);
+	return 0;
+}
+#endif
+
+int cpt_dump(struct cpt_context *ctx)
+{
+	struct user_beancounter *bc = get_exec_ub();
+	struct ve_struct *oldenv, *env;
+	struct nsproxy *old_ns;
+	int err, err2 = 0;
+
+	if (!ctx->ve_id)
+		return -EINVAL;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (!env)
+		return -ESRCH;
+
+	down_read(&env->op_sem);
+	err = -ESRCH;
+	if (!env->is_running)
+		goto out_noenv;
+	if (!env->is_locked)
+		goto out_noenv;
+	err = -EINVAL;
+	if (env->ve_ns->pid_ns->flags & PID_NS_HIDDEN) {
+		printk(KERN_WARNING "CT: checkpointing not supported yet"
+				" for hidden pid namespaces.\n");
+		goto out_noenv;
+	}
+
+	oldenv = set_exec_env(env);
+	old_ns = current->nsproxy;
+	current->nsproxy = env->ve_ns;
+
+	/* Phase 2: real checkpointing */
+	err = cpt_open_dumpfile(ctx);
+	if (err)
+		goto out;
+
+	cpt_major_hdr_out(ctx);
+
+	if (!err)
+		err = cpt_dump_veinfo(ctx);
+	if (!err)
+		err = cpt_dump_ubc(ctx);
+
+	/*
+	 * Back up the old limits and temporarily make them unlimited to
+	 * avoid the internal reclaimer, the OOM killer and other
+	 * unpleasantness. The correct values have already been dumped
+	 * into the image at this point.
+	 */
+	set_ubc_unlimited(ctx, bc);
+
+	if (!err)
+		err = cpt_dump_namespace(ctx);
+	if (!err)
+		err = cpt_dump_cgroups(ctx);
+	if (!err)
+		err = cpt_dump_files(ctx);
+	if (!err)
+		err = cpt_dump_files_struct(ctx);
+	if (!err)
+		err = cpt_dump_fs_struct(ctx);
+	/* netdevices should be dumped after the open files,
+	   as we need to restore the netdevice bound to the /dev/net/tun file */
+	if (!err)
+		err = cpt_dump_ifinfo(ctx);
+	if (!err)
+		err = cpt_dump_sighand(ctx);
+	if (!err)
+		err = cpt_dump_posix_timers(ctx);
+	if (!err)
+		err = cpt_dump_vm(ctx);
+	if (!err)
+		err = cpt_dump_sysvsem(ctx);
+	if (!err)
+		err = cpt_dump_sysvmsg(ctx);
+	if (!err)
+		err = cpt_dump_tasks(ctx);
+	if (!err)
+		err = cpt_dump_orphaned_sockets(ctx);
+#if defined(CONFIG_VE_IPTABLES) && \
+    (defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE))
+	if (!err)
+		err = cpt_dump_ip_conntrack(ctx);
+#endif
+	if (!err)
+		err = cpt_dump_utsname(ctx);
+
+#ifndef CONFIG_IA64
+	if (!err)
+		err = cpt_dump_vsyscall(ctx);
+#endif
+
+	if (!err)
+		err = cpt_dump_tail(ctx);
+
+	err2 = cpt_close_dumpfile(ctx);
+
+	cpt_close_pram(ctx, err ? : err2);
+
+	/*
+	 * Restore limits back
+	 */
+	restore_ubc_limits(ctx, bc);
+
+out:
+	current->nsproxy = old_ns;
+	set_exec_env(oldenv);
+out_noenv:
+	up_read(&env->op_sem);
+	put_ve(env);
+	return err ? : err2;
+}
+
+int cpt_vps_suspend(struct cpt_context *ctx)
+{
+	struct ve_struct *oldenv, *env;
+	struct nsproxy *old_ns;
+	int err = 0;
+
+	ctx->kernel_config_flags = test_kernel_config();
+	cpt_object_init(ctx);
+
+	if (!ctx->ve_id) {
+		env = get_exec_env();
+		if (env == get_ve0())
+			return -EINVAL;
+		wprintk("undefined ve_id\n");
+		ctx->ve_id = env->veid;
+		get_ve(env);
+	} else {
+		env = get_ve_by_id(ctx->ve_id);
+		if (!env)
+			return -ESRCH;
+	}
+
+#ifdef CONFIG_VE_IPTABLES
+	ctx->iptables_mask = env->_iptables_modules;
+#endif
+	ctx->features = env->features;
+
+	down_write(&env->op_sem);
+	err = -ESRCH;
+	if (!env->is_running)
+		goto out_noenv;
+
+	err = -EBUSY;
+	if (env->is_locked)
+		goto out_noenv;
+	env->is_locked = 1;
+	downgrade_write(&env->op_sem);
+
+	oldenv = set_exec_env(env);
+	old_ns = current->nsproxy;
+	current->nsproxy = env->ve_ns;
+
+	/* Start syncing NFS */
+	ve_nfs_sync(env, 0);
+
+	/* Find and stop all the tasks */
+	if ((err = vps_stop_tasks(ctx)) != 0)
+		goto out_wake;
+
+	/* Wait for syncing NFS mounts */
+	if ((err = ve_nfs_sync(env, 1)) != 0) {
+		eprintk_ctx("failed to sync nfs\n");
+		goto out_wake;
+	}
+
+	if ((err = cpt_suspend_network(ctx)) != 0)
+		goto out_wake;
+
+	/* At the moment all the state is frozen. We do not need to lock
+	 * the state, which can be changed only if the tasks are running.
+	 */
+
+	/* Collect task tree */
+	if ((err = vps_collect_tasks(ctx)) != 0)
+		goto out_wake;
+
+	/* Collect all the resources */
+	err = cpt_collect(ctx);
+
+out:
+	current->nsproxy = old_ns;
+	set_exec_env(oldenv);
+	up_read(&env->op_sem);
+	put_ve(env);
+	return err;
+
+out_noenv:
+	up_write(&env->op_sem);
+	put_ve(env);
+	return err;
+
+out_wake:
+	wake_ve(ctx);
+	goto out;
+}
+
+static void check_unsupported_netdevices(struct cpt_context *ctx, __u32 *caps)
+{
+	struct net *net = get_exec_env()->ve_netns;
+	struct net_device *dev;
+
+	read_lock(&dev_base_lock);
+	for_each_netdev(net, dev) {
+		if (dev->netdev_ops->ndo_cpt)
+			continue;
+
+		eprintk_ctx("unsupported netdevice %s\n", dev->name);
+		*caps |= (1<<CPT_UNSUPPORTED_NETDEV);
+		break;
+	}
+	read_unlock(&dev_base_lock);
+}
+
+static void check_one_process(struct cpt_context *ctx, __u32 *caps,
+		unsigned int flags, struct ve_struct *env,
+		struct task_struct *root, struct task_struct *p)
+{
+	struct mnt_namespace *ns;
+
+	if (p->flags & PF_KTHREAD)
+		return;
+
+	if (tsk_used_math(p)) {
+		*caps |= flags & ((1<<CPT_CPU_X86_FXSR) |
+				(1<<CPT_CPU_X86_SSE) |
+				(1<<CPT_CPU_X86_SSE2) |
+				(1<<CPT_CPU_X86_SSE4_1) |
+				(1<<CPT_CPU_X86_SSE4_2) |
+				(1<<CPT_CPU_X86_MMX) |
+				(1<<CPT_CPU_X86_3DNOW) |
+				(1<<CPT_CPU_X86_3DNOW2) |
+				(1<<CPT_CPU_X86_SSE4A) |
+				(1<<CPT_CPU_X86_XSAVE) |
+				(1<<CPT_CPU_X86_AVX) |
+				(1<<CPT_CPU_X86_AESNI) |
+				(1<<CPT_CPU_X86_RDRAND));
+	}
+	/* This is not 100% true: a VE could migrate with a vdso that uses
+	 * int80, in which case we do not need the SEP/SYSCALL32 caps. It is
+	 * not easy to test for, so we do not. */
+#ifdef CONFIG_X86_64
+	if (!(task_thread_info(p)->flags & _TIF_IA32))
+		*caps |= flags & (1<<CPT_CPU_X86_EMT64);
+	else if (p->mm && p->mm->context.vdso) {
+		if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+			*caps |= flags & (1<<CPT_CPU_X86_SEP);
+		else
+			*caps |= flags & (1<<CPT_CPU_X86_SYSCALL32);
+	}
+#elif defined(CONFIG_X86_32)
+	if (p->mm && p->mm->context.vdso)
+		*caps |= flags & (1<<CPT_CPU_X86_SEP);
+#endif
+#ifdef CONFIG_IA64
+	if (!IS_IA32_PROCESS(task_pt_regs(p)))
+		*caps |= (1<<CPT_CPU_X86_IA64);
+#endif
+	if (vps_child_level(root, p) >= 0) {
+		switch (check_process_external(p)) {
+		case PIDTYPE_PID:
+			eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n",
+					cpt_task_pid_nr(p, PIDTYPE_PID), p->pid, p->comm);
+			*caps |= (1<<CPT_EXTERNAL_PROCESS);
+			break;
+		case PIDTYPE_PGID:
+			eprintk_ctx("external process group %d/%d(%s) inside CT "
+					"(e.g. vzctl enter or vzctl exec).\n",
+					task_pgrp_vnr(p), p->pid, p->comm);
+			*caps |= (1<<CPT_EXTERNAL_PROCESS);
+			break;
+		case PIDTYPE_SID:
+			eprintk_ctx("external process session %d/%d(%s) inside CT "
+					"(e.g. vzctl enter or vzctl exec).\n",
+					task_session_vnr(p), p->pid, p->comm);
+			*caps |= (1<<CPT_EXTERNAL_PROCESS);
+		}
+	} else {
+		if (!cpt_skip_task(p)) {
+			eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n",
+					cpt_task_pid_nr(p, PIDTYPE_PID), p->pid, p->comm);
+			*caps |= (1<<CPT_EXTERNAL_PROCESS);
+		}
+	}
+	task_lock(p);
+	ns = NULL;
+	if (p->nsproxy) {
+		ns = p->nsproxy->mnt_ns;
+		if (ns)
+			get_mnt_ns(ns);
+	}
+	task_unlock(p);
+	if (ns) {
+		if (ns != current->nsproxy->mnt_ns) {
+			*caps |= (1<<CPT_NAMESPACES);
+		}
+		put_mnt_ns(ns);
+	}
+	if (p->policy != SCHED_NORMAL && p->policy != SCHED_BATCH && p->policy != SCHED_IDLE) {
+		eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n",
+				cpt_task_pid_nr(p, PIDTYPE_PID), p->pid, p->comm);
+		*caps |= (1<<CPT_SCHEDULER_POLICY);
+	}
+	if (check_trace(p, root, ctx)) {
+		eprintk_ctx("task %d/%d(%s) is ptraced from host system\n",
+				p->pid, cpt_task_pid_nr(p, PIDTYPE_PID), p->comm);
+		*caps |= (1<<CPT_PTRACED_FROM_VE0);
+	}
+	if (cpt_check_unsupported(p, ctx)) {
+		*caps |= (1<<CPT_UNSUPPORTED_MISC);
+	}
+}
+
+static void check_unsupported_mounts(struct cpt_context *ctx, __u32 *caps,
+		struct ve_struct *env, struct mnt_namespace *n, char *path_buf)
+{
+	struct list_head *p;
+	char *path;
+
+	down_read(&namespace_sem);
+	list_for_each(p, &n->list) {
+		struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list);
+		struct path pt, tmp = env->root_path;
+
+		pt.dentry = mnt->mnt_root;
+		pt.mnt = mnt;
+		spin_lock(&dcache_lock);
+		path = __d_path(&pt, &tmp,
+				path_buf, PAGE_SIZE);
+		spin_unlock(&dcache_lock);
+		if (IS_ERR(path))
+			continue;
+
+		if (check_one_vfsmount(mnt)) {
+			eprintk_ctx("Unsupported filesystem %s\n", mnt->mnt_sb->s_type->name);
+			*caps |= (1<<CPT_UNSUPPORTED_FSTYPE);
+		}
+	}
+	up_read(&namespace_sem);
+}
+
+int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps)
+{
+	struct task_struct *p;
+	struct task_struct *root;
+	struct ve_struct *env;
+	struct ve_struct *old_env;
+	struct nsproxy *old_ns;
+	struct mnt_namespace *n;
+	int err;
+	unsigned int flags = test_cpu_caps_and_features();
+
+	if (!ctx->ve_id)
+		return -EINVAL;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (env == NULL)
+		return -ESRCH;
+
+	down_read(&env->op_sem);
+	err = -ESRCH;
+	if (!env->is_running) {
+		eprintk_ctx("CT is not running\n");
+		goto out_noenv;
+	}
+
+	err = -EBUSY;
+	if (env->is_locked) {
+		eprintk_ctx("CT is locked\n");
+		goto out_noenv;
+	}
+
+	*caps = flags & ((1<<CPT_CPU_X86_CMOV) | (1 << CPT_NO_IPV6));
+#ifdef CONFIG_X86_64
+	*caps |= flags & (1<<CPT_CPU_X86_SYSCALL);
+#endif
+	if (flags & (1 << CPT_SLM_DMPRST)) {
+		eprintk_ctx("SLM is enabled, but slm_dmprst module is not loaded\n");
+		*caps |= (1 << CPT_SLM_DMPRST);
+	}
+
+	old_env = set_exec_env(env);
+	old_ns = current->nsproxy;
+	current->nsproxy = env->ve_ns;
+
+	check_unsupported_netdevices(ctx, caps);
+
+	read_lock(&tasklist_lock);
+	root = find_task_by_pid_ns(1, current->nsproxy->pid_ns);
+	if (!root) {
+		read_unlock(&tasklist_lock);
+		eprintk_ctx("cannot find ve init\n");
+		err = -ESRCH;
+		goto out;
+	}
+	get_task_struct(root);
+	for (p = __first_task_ve(env); p != NULL; p = __next_task_ve(env, p))
+		check_one_process(ctx, caps, flags, env, root, p);
+	read_unlock(&tasklist_lock);
+
+	task_lock(root);
+	n = NULL;
+	if (root->nsproxy) {
+		n = root->nsproxy->mnt_ns;
+		if (n)
+			get_mnt_ns(n);
+	}
+	task_unlock(root);
+	if (n) {
+		char *path_buf;
+
+		path_buf = (char *) __get_free_page(GFP_KERNEL);
+		if (!path_buf) {
+			put_mnt_ns(n);
+			err = -ENOMEM;
+			goto out_root;
+		}
+
+		check_unsupported_mounts(ctx, caps, env, n, path_buf);
+
+		free_page((unsigned long) path_buf);
+		put_mnt_ns(n);
+	}
+
+	err = 0;
+
+out_root:
+	put_task_struct(root);
+out:
+	current->nsproxy = old_ns;
+	set_exec_env(old_env);
+out_noenv:
+	up_read(&env->op_sem);
+	put_ve(env);
+
+	return err;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_dump.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_dump.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_dump.h	2015-01-21 12:02:48.222093685 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_dump.h	2015-01-21 12:02:50.878023179 +0300
@@ -0,0 +1,22 @@
+int cpt_dump(struct cpt_context *cpt);
+int rst_undump(struct cpt_context *cpt);
+int cpt_suspend(struct cpt_context *cpt);
+int cpt_resume(struct cpt_context *cpt);
+int cpt_kill(struct cpt_context *cpt);
+int rst_clean(struct cpt_context *cpt);
+int rst_resume(struct cpt_context *cpt);
+int rst_kill(struct cpt_context *cpt);
+
+int cpt_freeze_one(pid_t pid, int freeze);
+int cpt_vps_suspend(struct cpt_context *ctx);
+int vps_rst_undump(struct cpt_context *ctx);
+
+int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps);
+
+int cpt_check_unsupported(struct task_struct *tsk, struct cpt_context *ctx);
+
+extern unsigned long suspend_timeout_min;
+extern unsigned long suspend_timeout_max;
+extern unsigned int suspend_timeout;
+
+extern unsigned int kill_external;
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_epoll.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_epoll.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_epoll.c	2015-01-21 12:02:48.222093685 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_epoll.c	2015-01-21 12:02:49.750053122 +0300
@@ -0,0 +1,113 @@
+/*
+ *
+ *  kernel/cpt/cpt_epoll.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/eventpoll.h>
+#include <linux/cpt_image.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx)
+{
+	int err = 0;
+	struct file *file = obj->o_obj;
+	struct eventpoll *ep;
+	struct rb_node *rbp;
+	struct cpt_epoll_image ei;
+
+	if (file->f_op != &eventpoll_fops) {
+		eprintk_ctx("bad epoll file\n");
+		return -EINVAL;
+	}
+
+	ep = file->private_data;
+
+	/* eventpoll.c does not protect against opens via /proc/N/fd, silly.
+	 * The opener will get an invalid file with uninitialized private_data.
+	 */
+	if (unlikely(ep == NULL)) {
+		eprintk_ctx("bad epoll device\n");
+		return -EINVAL;
+	}
+
+	cpt_open_object(NULL, ctx);
+
+	ei.cpt_next = CPT_NULL;
+	ei.cpt_object = CPT_OBJ_EPOLL;
+	ei.cpt_hdrlen = sizeof(ei);
+	ei.cpt_content = CPT_CONTENT_ARRAY;
+	ei.cpt_file = obj->o_pos;
+
+	ctx->write(&ei, sizeof(ei), ctx);
+
+	mutex_lock(&epmutex);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		loff_t saved_obj;
+		cpt_object_t *tobj;
+		struct cpt_epoll_file_image efi;
+		struct epitem *epi;
+		epi = rb_entry(rbp, struct epitem, rbn);
+		tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx);
+		if (tobj == NULL) {
+			eprintk_ctx("epoll device refers to an external file\n");
+			err = -EBUSY;
+			break;
+		}
+		cpt_push_object(&saved_obj, ctx);
+		cpt_open_object(NULL, ctx);
+
+		efi.cpt_next = CPT_NULL;
+		efi.cpt_object = CPT_OBJ_EPOLL_FILE;
+		efi.cpt_hdrlen = sizeof(efi);
+		efi.cpt_content = CPT_CONTENT_VOID;
+		efi.cpt_file = tobj->o_pos;
+		efi.cpt_fd = epi->ffd.fd;
+		efi.cpt_events = epi->event.events;
+		efi.cpt_data = epi->event.data;
+		efi.cpt_revents = 0;
+		efi.cpt_ready = 0;
+		if (!list_empty(&epi->rdllink))
+			efi.cpt_ready = 1;
+
+		ctx->write(&efi, sizeof(efi), ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+	mutex_unlock(&epmutex);
+
+	cpt_close_object(ctx);
+
+	return err;
+}
+
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_exports.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_exports.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_exports.c	2015-01-21 12:02:48.223093658 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_exports.c	2015-01-21 12:02:49.750053122 +0300
@@ -0,0 +1,13 @@
+#include <linux/module.h>
+#include <asm/signal.h>
+
+#include <linux/cpt_obj.h>
+
+EXPORT_SYMBOL(alloc_cpt_object);
+EXPORT_SYMBOL(intern_cpt_object);
+EXPORT_SYMBOL(insert_cpt_object);
+EXPORT_SYMBOL(__cpt_object_add);
+EXPORT_SYMBOL(cpt_object_add);
+EXPORT_SYMBOL(cpt_object_get);
+EXPORT_SYMBOL(lookup_cpt_object);
+EXPORT_SYMBOL(lookup_cpt_obj_bypos);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_files.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_files.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_files.c	2015-01-21 12:02:48.223093658 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_files.c	2015-01-21 12:02:57.970834900 +0300
@@ -0,0 +1,2556 @@
+/*
+ *
+ *  kernel/cpt/cpt_files.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/ve_nfs.h>
+#include <linux/ve_proto.h>
+#include <bc/kmem.h>
+#include <linux/cpt_image.h>
+#include <linux/if_tun.h>
+#include <linux/fdtable.h>
+#include <linux/shm.h>
+#include <linux/signalfd.h>
+#include <linux/nsproxy.h>
+#include <linux/fs_struct.h>
+#include <linux/miscdevice.h>
+#include <linux/eventpoll.h>
+#include <linux/splice.h>
+#include <linux/tty.h>
+#include <linux/timerfd.h>
+#include <linux/cgroup.h>
+#include <linux/eventfd.h>
+#include <linux/anon_inodes.h>
+#include <linux/genhd.h>
+
+#include <linux/nfs_mount.h>
+#include <linux/nfs_fs.h>
+#undef dprintk
+
+#include "../../fs/autofs4/autofs_i.h"
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_socket.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+#include "cpt_process.h"
+
+static cpt_object_t *
+cpt_lookup_bind_source(struct vfsmount *mnt, cpt_context_t *ctx);
+
+void (*vefs_track_notify_hook)(struct dentry *vdentry, int track_cow);
+void (*vefs_track_force_stop_hook)(struct super_block *super);
+struct dentry * (*vefs_replaced_dentry_hook)(struct dentry *de);
+int (*vefs_is_renamed_dentry_hook)(struct dentry *vde, struct dentry *pde);
+EXPORT_SYMBOL(vefs_track_notify_hook);
+EXPORT_SYMBOL(vefs_track_force_stop_hook);
+EXPORT_SYMBOL(vefs_replaced_dentry_hook);
+EXPORT_SYMBOL(vefs_is_renamed_dentry_hook);
+
+static inline int is_signalfd_file(struct file *file)
+{
+	/* no other users of it yet */
+	return file->f_op == &signalfd_fops;
+}
+
+static inline int is_timerfd_file(struct file *file)
+{
+	/* no other users of it yet */
+	return file->f_op == &timerfd_fops;
+}
+
+static inline int is_eventfd_file(struct file *file)
+{
+	/* no other users of it yet */
+	return file->f_op == &eventfd_fops;
+}
+
+static inline int is_fake_file(struct file *file)
+{
+	if (file->f_op != &bad_file_ops)
+		return 0;
+
+	return file->f_dentry->d_inode == anon_inode_inode;
+}
+
+static int chrdev_is_tty(dev_t dev)
+{
+	int major = MAJOR(dev);
+
+	return (major == PTY_MASTER_MAJOR ||
+	    (major >= UNIX98_PTY_MASTER_MAJOR &&
+	     major < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) ||
+	    major == PTY_SLAVE_MAJOR ||
+	    major == UNIX98_PTY_SLAVE_MAJOR ||
+	    major == TTYAUX_MAJOR || major == TTY_MAJOR);
+}
+
+void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt)
+{
+	char *path;
+	struct path p;
+	unsigned long pg = __get_free_page(GFP_KERNEL);
+
+	if (!pg)
+		return;
+
+	p.dentry = d;
+	p.mnt = mnt;
+	path = d_path(&p, (char *)pg, PAGE_SIZE);
+
+	if (!IS_ERR(path))
+		eprintk("<%s>\n", path);
+	free_page(pg);
+}
+
+int cpt_need_delayfs(struct vfsmount *mnt)
+{
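+	/* "delayfs" mounts - NFS and autofs mounts charged to this
+	 * container - are not dumped in place but restored lazily;
+	 * mounts owned by someone else are never delayed. */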
+	if (slab_ub(mnt) != get_exec_ub())
+		return 0;
+	if (mnt->mnt_sb->s_magic == FSMAGIC_NFS)
+		return 1;
+	if (is_autofs_mount(mnt))
+		return 1;
+	if (is_autofs_mount(mnt->mnt_parent))
+		return 1;
+	return 0;
+}
+
+int cpt_need_vfsmount(struct dentry *dentry, struct vfsmount *vfsmnt)
+{
+	if (vfsmnt == get_exec_env()->shmem_mnt)
+		return 0;
+
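+	/* Files on kernel-internal anonymous filesystems need no
+	 * vfsmount reference in the image; anything else without a
+	 * collected mount makes checkpointing fail. */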
+	switch (dentry->d_inode->i_sb->s_magic) {
+		case FSMAGIC_PIPEFS:
+		case FSMAGIC_SOCKFS:
+		case FSMAGIC_BDEV:
+		case FSMAGIC_FUTEX:
+		case FSMAGIC_INOTIFY:
+		case FSMAGIC_MQUEUE:
+		case FSMAGIC_ANON:
+			return 0;
+		default:
+			eprintk("no vfsmount: ");
+			cpt_printk_dentry(dentry, vfsmnt);
+			eprintk(" magic:%lx\n", dentry->d_inode->i_sb->s_magic);
+			return 1;
+	}
+}
+
+static int
+cpt_replaced(struct dentry * de, struct vfsmount *mnt, cpt_context_t * ctx)
+{
+	int result = 0;
+	char *path;
+	unsigned long pg;
+	struct dentry * renamed_dentry;
+	struct path p;
+
+	if (de->d_sb->s_magic != FSMAGIC_VEFS)
+		return 0;
+	if (de->d_inode->i_nlink != 0 ||
+	    atomic_read(&de->d_inode->i_writecount) > 0)
+		return 0;
+
+	renamed_dentry = vefs_replaced_dentry_hook(de);
+	if (renamed_dentry == NULL)
+		return 0;
+
+	pg = __get_free_page(GFP_KERNEL);
+	if (!pg)
+		return 0;
+
+	p.dentry = de;
+	p.mnt = mnt;
+	path = d_path(&p, (char *)pg, PAGE_SIZE);
+	if (!IS_ERR(path)) {
+		int len;
+		struct nameidata nd;
+
+		len = pg + PAGE_SIZE - 1 - (unsigned long)path;
+		if (len >= sizeof("(deleted) ") - 1 &&
+		    !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) {
+			len -= sizeof("(deleted) ") - 1;
+			path += sizeof("(deleted) ") - 1;
+		}
+
+		if (path_lookup(path, 0, &nd) == 0) {
+			if (mnt == nd.path.mnt &&
+			    vefs_is_renamed_dentry_hook(nd.path.dentry, renamed_dentry))
+				result = 1;
+			path_put(&nd.path);
+		}
+	}
+	free_page(pg);
+	return result;
+}
+
+static int cpt_dump_path(struct dentry *d, struct vfsmount *mnt,
+			   int replaced, cpt_context_t *ctx)
+{
+	int len;
+	char *path;
+	char *pg = cpt_get_buf(ctx);
+	loff_t saved;
+	struct path p;
+
+	p.dentry = d;
+	p.mnt = mnt;
+
+	path = d_path(&p, pg, PAGE_SIZE);
+	len = PTR_ERR(path);
+
+	if (IS_ERR(path)) {
+		struct cpt_object_hdr o;
+		char tmp[1];
+
+		/* VZ changes d_path() to return -EINVAL when the path
+		 * is not supposed to be visible inside the VE.
+		 * This differs from the mainstream kernel: for example,
+		 * d_path() fails on any kind of shared memory. There
+		 * may be other such cases, but this is the only one
+		 * known so far. So we just ignore the error on shmem
+		 * mounts and proceed. Otherwise, checkpointing is
+		 * prohibited because of the reference to an invisible
+		 * file.
+		 */
+		if (len != -EINVAL ||
+		    mnt != get_exec_env()->shmem_mnt)
+			eprintk_ctx("d_path err=%d\n", len);
+		else
+			len = 0;
+
+		cpt_push_object(&saved, ctx);
+		cpt_open_object(NULL, ctx);
+		o.cpt_next = CPT_NULL;
+		o.cpt_object = CPT_OBJ_NAME;
+		o.cpt_hdrlen = sizeof(o);
+		o.cpt_content = CPT_CONTENT_NAME;
+		tmp[0] = 0;
+
+		ctx->write(&o, sizeof(o), ctx);
+		ctx->write(tmp, 1, ctx);
+		ctx->align(ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved, ctx);
+
+		__cpt_release_buf(ctx);
+		return len;
+	} else {
+		struct cpt_object_hdr o;
+
+		len = pg + PAGE_SIZE - 1 - path;
+		if (replaced &&
+		    len >= sizeof("(deleted) ") - 1 &&
+		    !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) {
+			len -= sizeof("(deleted) ") - 1;
+			path += sizeof("(deleted) ") - 1;
+		}
+		o.cpt_next = CPT_NULL;
+		o.cpt_object = CPT_OBJ_NAME;
+		o.cpt_hdrlen = sizeof(o);
+		o.cpt_content = CPT_CONTENT_NAME;
+		path[len] = 0;
+
+		cpt_push_object(&saved, ctx);
+		cpt_open_object(NULL, ctx);
+		ctx->write(&o, sizeof(o), ctx);
+		ctx->write(path, len+1, ctx);
+		ctx->align(ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved, ctx);
+		__cpt_release_buf(ctx);
+	}
+	return 0;
+}
+
+static int cpt_dump_nfs_path(struct dentry *d, struct vfsmount *mnt,
+			     cpt_context_t *ctx)
+{
+	char *path;
+	char *pg = cpt_get_buf(ctx);
+	loff_t saved;
+	struct path p;
+	struct nfs_unlinkdata *ud = d->d_fsdata;
+	int dentry_name_len = ud->args.name.len;
+	struct cpt_object_hdr o;
+
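+	/* d_fsdata of a DCACHE_NFSFS_RENAMED dentry holds the pending
+	 * unlink data; dump "<parent dir path>/<name to be unlinked>". */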
+	p.dentry = d->d_parent;
+	p.mnt = mnt;
+
+	path = d_path(&p, pg, PAGE_SIZE);
+	if (IS_ERR(path)) {
+		eprintk_ctx("getting path failed\n");
+		__cpt_release_buf(ctx);
+		return PTR_ERR(path);
+	}
+
+	if (path - pg < dentry_name_len + 1) {
+		eprintk_ctx("full path is too long\n");
+		__cpt_release_buf(ctx);
+		return -ENOMEM;
+	}
+
+	path = strcpy(pg, path);
+	strcat(path, "/");
+	strncat(path, ud->args.name.name, dentry_name_len);
+
+	cpt_push_object(&saved, ctx);
+	cpt_open_object(NULL, ctx);
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_NAME;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_NAME;
+
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(path, strlen(path) + 1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved, ctx);
+
+	__cpt_release_buf(ctx);
+	return 0;
+}
+
+int cpt_dump_string(const char *s, struct cpt_context *ctx)
+{
+	int len;
+	struct cpt_object_hdr o;
+
+	cpt_open_object(NULL, ctx);
+	len = strlen(s);
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_NAME;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_NAME;
+
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(s, len+1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+	return 0;
+}
+
+cpt_object_t *cpt_lookup_vfsmount_obj(struct vfsmount *mnt,
+		struct cpt_context *ctx)
+{
+	while (is_nfs_automount(mnt))
+		mnt = mnt->mnt_parent;
+
+	if (is_autofs_mount(mnt->mnt_parent))
+		mnt = mnt->mnt_parent;
+
+	return lookup_cpt_object(CPT_OBJ_VFSMOUNT_REF, mnt, ctx);
+}
+
+int cpt_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
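+	/* For delayed filesystems avoid calling into the fs: the NFS
+	 * server or autofs daemon may be unreachable while the
+	 * container is frozen, so report cached inode attributes. */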
+	if (cpt_need_delayfs(mnt)) {
+		generic_fillattr(dentry->d_inode, stat);
+		return 0;
+	}
+
+	return vfs_getattr(mnt, dentry, stat);
+}
+
+int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_inode_image *v = cpt_get_buf(ctx);
+	struct kstat sbuf;
+	cpt_object_t *mntobj;
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_INODE;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	if ((err = cpt_getattr(mnt, d, &sbuf)) != 0) {
+		cpt_release_buf(ctx);
+		return err;
+	}
+
+	mntobj = cpt_lookup_vfsmount_obj(mnt, ctx);
+	if (!mntobj && cpt_need_vfsmount(d, mnt)) {
+		cpt_release_buf(ctx);
+		return -ENODEV;
+	}
+
+	v->cpt_dev	= d->d_inode->i_sb->s_dev;
+	v->cpt_ino	= d->d_inode->i_ino;
+	v->cpt_mode	= sbuf.mode;
+	v->cpt_nlink	= sbuf.nlink;
+	v->cpt_uid	= sbuf.uid;
+	v->cpt_gid	= sbuf.gid;
+	v->cpt_rdev	= d->d_inode->i_rdev;
+	v->cpt_size	= sbuf.size;
+	v->cpt_atime	= cpt_timespec_export(&sbuf.atime);
+	v->cpt_mtime	= cpt_timespec_export(&sbuf.mtime);
+	v->cpt_ctime	= cpt_timespec_export(&sbuf.ctime);
+	v->cpt_blksize	= sbuf.blksize;
+	v->cpt_blocks	= sbuf.blocks;
+	v->cpt_sb	= d->d_inode->i_sb->s_magic;
+	v->cpt_vfsmount = mntobj ? mntobj->o_pos : CPT_NULL;
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+	return 0;
+}
+
+int cpt_collect_files(cpt_context_t * ctx)
+{
+	int err;
+	cpt_object_t *obj;
+	int index = 0;
+
+	/* Collect process fd sets */
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL)
+			return -ENOMEM;
+	}
+
+	/* Collect files from fd sets */
+	for_each_object(obj, CPT_OBJ_FILES) {
+		int fd;
+		struct files_struct *f = obj->o_obj;
+
+		cpt_obj_setindex(obj, index++, ctx);
+
+		if (obj->o_count != atomic_read(&f->count)) {
+			eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count));
+			return -EBUSY;
+		}
+
+		for (fd = 0; fd < f->fdt->max_fds; fd++) {
+			struct file *file = fcheck_files(f, fd);
+			if (file && cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL)
+				return -ENOMEM;
+		}
+	}
+
+	/* Collect files queued by AF_UNIX sockets. */
+	if ((err = cpt_collect_passedfds(ctx)) < 0)
+		return err;
+
+	/* OK. At this point we should count all the references. */
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+		struct file *parent;
+		cpt_object_t *ino_obj;
+
+		if (obj->o_count != atomic_long_read(&file->f_count)) {
+			eprintk_ctx("file struct is referenced outside %d %ld\n", obj->o_count, atomic_long_read(&file->f_count));
+			cpt_printk_dentry(file->f_dentry, file->f_vfsmnt);
+			return -EBUSY;
+		}
+
+		switch (file->f_dentry->d_inode->i_sb->s_magic) {
+		case FSMAGIC_FUTEX:
+		case FSMAGIC_MQUEUE:
+		case FSMAGIC_BDEV:
+#ifndef CONFIG_INOTIFY_USER
+		case FSMAGIC_INOTIFY:
+#endif
+			eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic);
+			return -EBUSY;
+		}
+
+		/* Collect the inode. It is needed mostly to resolve
+		 * deleted hard links. */
+		ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
+		if (ino_obj == NULL)
+			return -ENOMEM;
+
+		parent = ino_obj->o_parent;
+		if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry)))
+			ino_obj->o_parent = file;
+
+		if (S_ISCHR(file->f_dentry->d_inode->i_mode)) {
+			if (chrdev_is_tty(file->f_dentry->d_inode->i_rdev)) {
+				err = cpt_collect_tty(file, ctx);
+				if (err)
+					return err;
+			}
+		}
+
+		if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
+			err = cpt_collect_socket(file, ctx);
+			if (err)
+				return err;
+		}
+	}
+
+	err = cpt_index_sockets(ctx);
+
+	return err;
+}
+
+/* /dev/ptmx (char 5:2) is special: all files share one inode, and the
+ * real tty backend is attached via file->private_data.
+ */
+
+static inline int is_cloning_inode(struct inode *ino)
+{
+	return S_ISCHR(ino->i_mode) &&
+		ino->i_rdev == MKDEV(TTYAUX_MAJOR,2);
+}
+
+static int dump_one_flock(struct file_lock *fl, int owner,
+		struct cpt_context *ctx, int delay)
+{
+	pid_t pid;
+	struct cpt_flock_image *v;
+
+	if (delay && !fl->fl_ops)
+		delay = 0; /* no remote locks */
+
+	v = cpt_get_buf(ctx);
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_FLOCK;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	v->cpt_owner = owner;
+
+	if (fl->fl_nspid)
+		pid = cpt_pid_nr(fl->fl_nspid);
+	else
+		pid = fl->fl_pid;
+
+	if (pid == -1) {
+		if (!(fl->fl_flags&FL_FLOCK)) {
+			eprintk_ctx("posix lock from another container?\n");
+			cpt_release_buf(ctx);
+			return -EBUSY;
+		}
+		pid = 0;
+	}
+
+	v->cpt_pid = pid;
+	v->cpt_start = fl->fl_start;
+	v->cpt_end = fl->fl_end;
+	v->cpt_flags = fl->fl_flags;
+	if (delay)
+		v->cpt_flags |= CPT_FLOCK_DELAYED;
+	v->cpt_type = fl->fl_type;
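+	/* Delayed (remote fs) locks record fl_owner_id so the lock can
+	 * be re-acquired on restore; local locks get CPT_NOINDEX. */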
+	v->cpt_svid = delay ? (__u32)fl->fl_ops->fl_owner_id(fl) : CPT_NOINDEX;
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	return 0;
+}
+
+
+int cpt_dump_flock(struct file *file, struct cpt_context *ctx)
+{
+	int err = 0, delay;
+	struct file_lock *fl;
+
+	lock_kernel();
+	for (fl = file->f_dentry->d_inode->i_flock;
+	     fl; fl = fl->fl_next) {
+		if (file != fl->fl_file)
+			continue;
+		if (fl->fl_flags & FL_LEASE) {
+			eprintk_ctx("lease lock is not supported\n");
+			err = -EINVAL;
+			break;
+		}
+
+		delay = cpt_need_delayfs(file->f_vfsmnt);
+
+		if (fl->fl_flags & FL_POSIX) {
+			cpt_object_t *obj;
+			obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx);
+			if (obj) {
+				dump_one_flock(fl, obj->o_index, ctx, delay);
+				continue;
+			} else {
+				eprintk_ctx("unknown lock owner %p\n", fl->fl_owner);
+				err = -EINVAL;
+			}
+		}
+		if (fl->fl_flags & FL_FLOCK) {
+			dump_one_flock(fl, -1, ctx, delay);
+			continue;
+		}
+	}
+	unlock_kernel();
+	return err;
+}
+
+static int dump_content_timerfd(struct file *file, struct cpt_context *ctx)
+{
+	struct cpt_timerfd_image o;
+	loff_t saved_pos;
+	struct timerfd_ctx *timerfd_ctx = file->private_data;
+	struct timespec tv;
+
+	cpt_push_object(&saved_pos, ctx);
+
+	o.cpt_next = sizeof(o);
+	o.cpt_object = CPT_OBJ_TIMERFD;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_VOID;
+
+	o.cpt_clockid = timerfd_ctx->clockid;
+	o.cpt_ticks = timerfd_ctx->ticks;
+	o.cpt_expired = timerfd_ctx->expired;
+
+	tv = ktime_to_timespec(timerfd_get_remaining(timerfd_ctx));
+	o.cpt_it_value = cpt_timespec_export(&tv);
+	tv = ktime_to_timespec(timerfd_ctx->tintv);
+	o.cpt_it_interval = cpt_timespec_export(&tv);
+
+	ctx->write(&o, sizeof(o), ctx);
+
+	cpt_pop_object(&saved_pos, ctx);
+
+	return 0;
+}
+
+static int dump_content_eventfd(struct file *file, struct cpt_context *ctx)
+{
+	struct cpt_eventfd_image o;
+	loff_t saved_pos;
+	struct eventfd_ctx *eventfd_ctx = file->private_data;
+
+	cpt_push_object(&saved_pos, ctx);
+
+	o.cpt_next = sizeof(o);
+	o.cpt_object = CPT_OBJ_EVENTFD;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_VOID;
+
+	o.cpt_count = eventfd_ctx->count;
+	o.cpt_flags = eventfd_ctx->flags;
+
+	ctx->write(&o, sizeof(o), ctx);
+
+	cpt_pop_object(&saved_pos, ctx);
+
+	return 0;
+}
+
+int cpt_pipe_fasync(struct file *file, struct cpt_context *ctx)
+{
+	struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
+	struct fasync_struct *fa;
+
+	for (fa = pipe->fasync_readers; fa; fa = fa->fa_next) {
+		if (fa->fa_file == file)
+			return fa->fa_fd;
+	}
+	for (fa = pipe->fasync_writers; fa; fa = fa->fa_next) {
+		if (fa->fa_file == file)
+			return fa->fa_fd;
+	}
+	return -1;
+}
+
+static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx)
+{
+	int err = 0;
+	cpt_object_t *iobj;
+	struct cpt_file_image *v = cpt_get_buf(ctx);
+	struct kstat sbuf;
+	int replaced = 0;
+	cpt_object_t *mntobj;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_FILE;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_flags = file->f_flags;
+	v->cpt_mode = file->f_mode;
+	v->cpt_pos = file->f_pos;
+	v->cpt_uid = file->f_cred->uid;
+	v->cpt_gid = file->f_cred->gid;
+
+	cpt_getattr(file->f_vfsmnt, file->f_dentry, &sbuf);
+
+	mntobj = cpt_lookup_vfsmount_obj(file->f_vfsmnt, ctx);
+	if (!mntobj && cpt_need_vfsmount(file->f_dentry, file->f_vfsmnt)) {
+		cpt_release_buf(ctx);
+		return -ENODEV;
+	}
+	v->cpt_i_mode = sbuf.mode;
+	v->cpt_lflags = 0;
+
+	if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) {
+		v->cpt_lflags |= CPT_DENTRY_PROC;
+		if (proc_dentry_of_dead_task(file->f_dentry))
+			v->cpt_lflags |= CPT_DENTRY_PROCPID_DEAD;
+	}
+
+	if (cpt_need_delayfs(file->f_vfsmnt)) {
+		struct dentry *de = file->f_dentry;
+
+		if (obj)
+			obj->o_flags |= CPT_FILE_DELAYFS;
+
+		if (de->d_flags & DCACHE_NFSFS_RENAMED) {
+			v->cpt_lflags |= CPT_DENTRY_SILLYRENAME;
+			if (obj)
+				obj->o_flags |= CPT_FILE_SILLYRENAME;
+		}
+	}
+
+	if (is_fake_file(file))
+		v->cpt_lflags |= CPT_DENTRY_FAKEFILE;
+	else if (IS_ROOT(file->f_dentry))
+		v->cpt_lflags |= CPT_DENTRY_ROOT;
+	else if (d_unhashed(file->f_dentry)) {
+		if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) {
+			v->cpt_lflags |= CPT_DENTRY_REPLACED;
+			replaced = 1;
+		} else if (!(v->cpt_lflags & CPT_DENTRY_PROCPID_DEAD)) {
+			if (file->f_dentry->d_flags & DCACHE_NFSFS_RENAMED)
+				v->cpt_lflags |= CPT_DENTRY_SILLYRENAME;
+			v->cpt_lflags |= CPT_DENTRY_DELETED;
+		}
+	}
+	if (is_cloning_inode(file->f_dentry->d_inode))
+		v->cpt_lflags |= CPT_DENTRY_CLONING;
+
+	v->cpt_inode = CPT_NULL;
+	if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) {
+		iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
+		if (iobj) {
+			v->cpt_inode = iobj->o_pos;
+			if (iobj->o_flags & CPT_INODE_HARDLINKED)
+				v->cpt_lflags |= CPT_DENTRY_HARDLINKED;
+		}
+	}
+	v->cpt_priv = CPT_NULL;
+	v->cpt_fown_fd = -1;
+	if (S_ISCHR(v->cpt_i_mode)) {
+		dev_t dev = file->f_dentry->d_inode->i_rdev;
+
+		if (chrdev_is_tty(dev)) {
+			if (file->private_data) {
+				iobj = lookup_cpt_object(CPT_OBJ_TTY, file_tty(file), ctx);
+				if (iobj) {
+					v->cpt_priv = iobj->o_pos;
+					if (file->f_flags&FASYNC)
+						v->cpt_fown_fd = cpt_tty_fasync(file, ctx);
+				}
+			} else if (hlist_empty(&file->f_dentry->d_inode->i_fsnotify_mark_entries)) {
+				eprintk_ctx("BUG: tty char dev without tty "
+					    "struct and not inotify watched\n");
+				cpt_release_buf(ctx);
+				return -EINVAL;
+			}
+		} else if (dev == MKDEV(MISC_MAJOR, TUN_MINOR))
+			v->cpt_lflags |= CPT_DENTRY_TUNTAP;
+	}
+	if (S_ISSOCK(v->cpt_i_mode)) {
+		if (obj->o_index < 0) {
+			eprintk_ctx("BUG: no socket index\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		v->cpt_priv = obj->o_index;
+		if (file->f_flags&FASYNC)
+			v->cpt_fown_fd = cpt_socket_fasync(file, ctx);
+	}
+	if (S_ISFIFO(v->cpt_i_mode)) {
+		if (file->f_flags & FASYNC)
+			v->cpt_fown_fd = cpt_pipe_fasync(file, ctx);
+	}
+	if (file->f_op == &eventpoll_fops) {
+		v->cpt_priv = file->f_dentry->d_inode->i_ino;
+		v->cpt_lflags |= CPT_DENTRY_EPOLL;
+	}
+	if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) {
+		v->cpt_priv = file->f_dentry->d_inode->i_ino;
+		v->cpt_lflags |= CPT_DENTRY_INOTIFY;
+	}
+
+	v->cpt_fown_pid = (file->f_owner.pid == NULL ?
+			   CPT_FOWN_STRAY_PID : cpt_pid_nr(file->f_owner.pid));
+	v->cpt_fown_uid = file->f_owner.uid;
+	v->cpt_fown_euid = file->f_owner.euid;
+	v->cpt_fown_signo = file->f_owner.signum;
+
+	if (is_signalfd_file(file)) {
+		struct signalfd_ctx *ctx = file->private_data;
+		v->cpt_lflags |= CPT_DENTRY_SIGNALFD;
+		v->cpt_priv = cpt_sigset_export(&ctx->sigmask);
+	} else if (is_timerfd_file(file))
+		v->cpt_lflags |= CPT_DENTRY_TIMERFD;
+	else if (is_eventfd_file(file))
+		v->cpt_lflags |= CPT_DENTRY_EVENTFD;
+
+	v->cpt_vfsmount = mntobj ? mntobj->o_pos : CPT_NULL;
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	if (!S_ISSOCK(v->cpt_i_mode)) {
+		err = cpt_dump_path(file->f_dentry, file->f_vfsmnt,
+				replaced, ctx);
+		if (err)
+			return err;
+		if ((file->f_mode & FMODE_WRITE) &&
+				file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_VEFS)
+			vefs_track_notify_hook(file->f_dentry, 1);
+	}
+
+	if (is_timerfd_file(file))
+		dump_content_timerfd(file, ctx);
+
+	if (is_eventfd_file(file))
+		dump_content_eventfd(file, ctx);
+
+	if (file->f_dentry->d_inode->i_flock)
+		err = cpt_dump_flock(file, ctx);
+
+	cpt_close_object(ctx);
+
+	if ((file->f_flags & FASYNC) && (v->cpt_fown_fd == -1)) {
+		eprintk_ctx("No fd for FASYNC %pS\n", file->f_op);
+		return -EINVAL;
+	}
+
+	return err;
+}
+
+int cpt_page_is_zero(struct page * page)
+{
+	int res;
+	unsigned long *kaddr = kmap_atomic(page, KM_USER0);
+
+	if (kaddr[0] ||
+	    memcmp(kaddr, kaddr + 1, PAGE_SIZE - sizeof(unsigned long)))
+		res = 0;
+	else
+		res = 1;
+
+	kunmap_atomic(kaddr, KM_USER0);
+	return res;
+}
+
+enum {
+	TYPE_NONE,
+	TYPE_ZERO,
+	TYPE_DATA,
+	TYPE_ITER
+};
+
+struct dump_data {
+	cpt_context_t *ctx;
+	loff_t obj_opened;
+	struct cpt_page_block pgb;
+	int type;
+};
+
+static void flush_block(struct dump_data *dat)
+{
+	cpt_context_t * ctx = dat->ctx;
+
+	if (dat->type == TYPE_NONE)
+		return;
+	if (dat->type == TYPE_ZERO)
+		return;
+
+	ctx->pwrite(&dat->pgb.cpt_end, 8, ctx,
+		    dat->obj_opened + offsetof(struct cpt_page_block, cpt_end));
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+
+	dat->obj_opened = CPT_NULL;
+	dat->type = TYPE_NONE;
+}
+
+static int
+dump_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+		struct splice_desc *sd)
+{
+	struct dump_data * dat = sd->u.data;
+	cpt_context_t * ctx = dat->ctx;
+	struct page *page = buf->page;
+	unsigned long size;
+	int ret;
+
+	ret = buf->ops->confirm(pipe, buf);
+	if (unlikely(ret))
+		return ret;
+
+	size = sd->len;
+
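+	/* Zero pages never reach the image: flush_block() skips
+	 * TYPE_ZERO runs, so zero-filled ranges simply appear as gaps
+	 * between data blocks. */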
+	if (page == ZERO_PAGE(0) ||
+	    cpt_page_is_zero(page)) {
+		if (dat->type == TYPE_ZERO) {
+			/* Just append. */
+			dat->pgb.cpt_end += PAGE_SIZE;
+		}
+		/* Flush opened segment */
+		if (dat->type != TYPE_NONE)
+			flush_block(dat);
+
+		dat->pgb.cpt_start = page->index << PAGE_CACHE_SHIFT;
+		dat->type = TYPE_ZERO;
+	} else {
+		int ntype = TYPE_DATA;
+
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+		if (PageCheckpointed(page) &&
+		    ctx->iter_shm_start &&
+		    !cpt_verify_wrprot(page, ctx))
+			ntype = TYPE_ITER;
+#endif
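+		/* Iterative blocks store 8-byte pfns instead of page
+		 * contents and are flushed at most every 16 pages. */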
+		if (ntype != dat->type ||
+		    (ntype == TYPE_ITER &&
+		     dat->pgb.cpt_end - dat->pgb.cpt_start >= 16*PAGE_SIZE))
+			flush_block(dat);
+
+		if (ntype != dat->type) {
+			cpt_open_object(NULL, ctx);
+			dat->obj_opened = ctx->file->f_pos;
+			dat->pgb.cpt_next = CPT_NULL;
+			dat->pgb.cpt_object = ntype == TYPE_DATA ? CPT_OBJ_PAGES :
+				CPT_OBJ_ITERPAGES;
+			dat->pgb.cpt_hdrlen = sizeof(dat->pgb);
+			dat->pgb.cpt_content = CPT_CONTENT_DATA;
+			dat->pgb.cpt_start = page->index << PAGE_CACHE_SHIFT;
+			dat->pgb.cpt_end = dat->pgb.cpt_start;
+
+			ctx->write(&dat->pgb, sizeof(dat->pgb), ctx);
+			dat->type = ntype;
+		}
+
+		if (ntype == TYPE_DATA) {
+			char * kaddr = kmap(page);
+			ctx->write(kaddr, size, ctx);
+			kunmap(page);
+			if (size < PAGE_SIZE) {
+				kaddr = kmap(ZERO_PAGE(0));
+				ctx->write(kaddr, PAGE_SIZE - size, ctx);
+				kunmap(ZERO_PAGE(0));
+				size = PAGE_SIZE;
+			}
+		} else {
+			__u64 pfn = page_to_pfn(page);
+			ctx->write(&pfn, 8, ctx);
+			size = PAGE_SIZE;
+		}
+	}
+	dat->pgb.cpt_end += size;
+
+	return sd->len;
+}
+
+static int
+dump_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
+{
+	return __splice_from_pipe(pipe, sd, dump_actor);
+}
+
+static int dump_content_regular(struct file *file, struct cpt_context *ctx)
+{
+	loff_t saved_pos;
+	struct dump_data dat;
+	long retval;
+	struct splice_desc sd;
+
+	if (file->f_op == NULL)
+		return -EINVAL;
+
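+	/* SysV shm segments wrap the backing tmpfs file in
+	 * shm_file_data; unwrap it to dump the real file. */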
+	if (file->f_op == &shm_file_operations)
+		file = ((struct shm_file_data *)file->private_data)->file;
+
+	if (file->f_op == &shmem_file_operations) {
+		cpt_object_t *obj;
+
+		obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx);
+		if (!obj) {
+			eprintk_ctx("failed to find tmpfs file %p\n", file);
+			return -ENOENT;
+		}
+
+		if (obj->o_flags & CPT_FILE_SYSVIPC) {
+			retval = cpt_dump_content_sysvshm(file, ctx);
+			if (retval < 0) {
+				eprintk_ctx("cannot dump SysV IPC Shared Memory %ld\n", retval);
+				return retval;
+			}
+		}
+	}
+
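+	/* splice_direct_to_actor() needs a readable page-cache file:
+	 * reopen write-only and O_DIRECT files O_RDONLY. */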
+	if (!(file->f_mode & FMODE_READ) || (file->f_flags & O_DIRECT)) {
+		struct file *filp;
+
+		filp = dentry_open(dget(file->f_dentry),
+					mntget(file->f_vfsmnt),
+					O_RDONLY | O_LARGEFILE,
+					current_cred());
+		if (IS_ERR(filp)) {
+			cpt_printk_dentry(file->f_dentry, file->f_vfsmnt);
+			eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(filp));
+			return PTR_ERR(filp);
+		}
+		file = filp;
+	} else
+		get_file(file);
+
+	dat.ctx = ctx;
+	dat.type = TYPE_NONE;
+
+	cpt_push_object(&saved_pos, ctx);
+
+	sd.len = 0;
+	sd.total_len = 0x40000000UL;
+	sd.flags = 0;
+	sd.pos = 0;
+	sd.u.data = &dat;
+
+	retval = splice_direct_to_actor(file, &sd, dump_splice_actor);
+	if (unlikely(retval < 0)) {
+		fput(file);
+		return retval;
+	}
+
+	if (dat.type != TYPE_NONE)
+		flush_block(&dat);
+
+	cpt_pop_object(&saved_pos, ctx);
+
+	fput(file);
+
+	return 0;
+}
+
+
+static int dump_content_chrdev(struct file *file, struct cpt_context *ctx)
+{
+	dev_t dev = file->f_dentry->d_inode->i_rdev;
+
+	if (MAJOR(dev) == MEM_MAJOR || dev == MKDEV(MISC_MAJOR, TUN_MINOR))
+		return 0;
+	if (chrdev_is_tty(dev))
+		return cpt_dump_content_tty(file, ctx);
+
+	eprintk_ctx("unsupported chrdev %d/%d\n", MAJOR(dev), MINOR(dev));
+	return -EINVAL;
+}
+
+static int dump_content_blkdev(struct file *file, struct cpt_context *ctx)
+{
+	struct inode *ino = file->f_dentry->d_inode;
+
+	/* We are not going to transfer them. */
+	eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino));
+	return -EINVAL;
+}
+
+static int dump_content_fifo(struct file *file, struct cpt_context *ctx)
+{
+	struct inode *ino = file->f_dentry->d_inode;
+	cpt_object_t *obj;
+	loff_t saved_pos;
+	int readers;
+	int writers;
+	int anon = 0;
+
+	mutex_lock(&ino->i_mutex);
+	readers = ino->i_pipe->readers;
+	writers = ino->i_pipe->writers;
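+	/* Subtract every collected file referencing this fifo; any
+	 * remainder is a reader/writer outside the checkpointed set. */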
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file1 = obj->o_obj;
+		if (file1->f_dentry->d_inode == ino) {
+			if (file1->f_mode & FMODE_READ)
+				readers--;
+			if (file1->f_mode & FMODE_WRITE)
+				writers--;
+		}
+	}
+	mutex_unlock(&ino->i_mutex);
+	if (readers || writers) {
+		struct dentry *dr = file->f_dentry->d_sb->s_root;
+		if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0)
+			anon = 1;
+
+		if (anon) {
+			eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers);
+			return -EBUSY;
+		}
+		/* If the fifo has external readers/writers, we are in
+		 * trouble: if the buffer is not empty, we would have to
+		 * move its content, but if the fifo is owned by a
+		 * service, we cannot do that.
+		 *
+		 * For now we assume that if the fifo is opened by
+		 * another process, we do not own it and hence migrate
+		 * it without data.
+		 */
+		return 0;
+	}
+
+	/* OK, we must save fifo state. No semaphores required. */
+
+	if (ino->i_pipe->nrbufs) {
+		struct cpt_obj_bits *v;
+		struct pipe_inode_info *info;
+		int count, buf, nrbufs;
+
+		cpt_push_object(&saved_pos, ctx);
+		cpt_close_object(ctx);
+		mutex_lock(&ino->i_mutex);
+		info = ino->i_pipe;
+		count = 0;
+		buf = info->curbuf;
+		nrbufs = info->nrbufs;
+		while (--nrbufs >= 0) {
+			if (!info->bufs[buf].ops->can_merge) {
+				mutex_unlock(&ino->i_mutex);
+				cpt_pop_object(&saved_pos, ctx);
+				eprintk_ctx("unknown format of pipe buffer\n");
+				return -EINVAL;
+			}
+			count += info->bufs[buf].len;
+			buf = (buf+1) & (PIPE_BUFFERS-1);
+		}
+
+		if (!count) {
+			mutex_unlock(&ino->i_mutex);
+			cpt_pop_object(&saved_pos, ctx);
+			return 0;
+		}
+		v = cpt_get_buf(ctx);
+		cpt_open_object(NULL, ctx);
+		v->cpt_next = CPT_NULL;
+		v->cpt_object = CPT_OBJ_BITS;
+		v->cpt_hdrlen = sizeof(*v);
+		v->cpt_content = CPT_CONTENT_DATA;
+		v->cpt_size = count;
+		ctx->write(v, sizeof(*v), ctx);
+		cpt_release_buf(ctx);
+
+		count = 0;
+		buf = info->curbuf;
+		nrbufs = info->nrbufs;
+		while (--nrbufs >= 0) {
+			struct pipe_buffer *b = info->bufs + buf;
+			/* need to ->pin first? */
+			void * addr = b->ops->map(info, b, 0);
+			ctx->write(addr + b->offset, b->len, ctx);
+			b->ops->unmap(info, b, addr);
+			buf = (buf+1) & (PIPE_BUFFERS-1);
+		}
+
+		mutex_unlock(&ino->i_mutex);
+
+		ctx->align(ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_pos, ctx);
+	}
+
+	return 0;
+}
+
+static int dump_content_socket(struct file *file, struct cpt_context *ctx)
+{
+	return 0;
+}
+
+struct cpt_dirent {
+	unsigned long	ino;
+	char		*name;
+	int		namelen;
+	int		found;
+};
+
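+/* readdir callback: find the entry whose inode number matches and
+ * remember its name. */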
+static int cpt_filldir(void * __buf, const char * name, int namelen,
+		loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct cpt_dirent * dirent = __buf;
+
+	if ((ino == dirent->ino) && (namelen < PAGE_SIZE - 1)) {
+		memcpy(dirent->name, name, namelen);
+		dirent->name[namelen] = '\0';
+		dirent->namelen = namelen;
+		dirent->found = 1;
+		return 1;
+	}
+	return 0;
+}
+
+static int find_linked_dentry(struct dentry *d, struct vfsmount *mnt,
+		struct inode *ino, struct cpt_context *ctx)
+{
+	int err = -EBUSY;
+	struct file *f = NULL;
+	struct cpt_dirent entry;
+	struct dentry *de, *found = NULL;
+
+	dprintk_ctx("deleted reference to existing inode, try to find file\n");
+	/* 1. Try to find not deleted dentry in ino->i_dentry list */
+	spin_lock(&dcache_lock);
+	list_for_each_entry(de, &ino->i_dentry, d_alias) {
+		if (!IS_ROOT(de) && d_unhashed(de))
+			continue;
+		found = de;
+		dget_locked(found);
+		break;
+	}
+	spin_unlock(&dcache_lock);
+	if (found) {
+		err = cpt_dump_path(found, mnt, 0, ctx);
+		dput(found);
+		if (!err) {
+			dprintk_ctx("dentry found in aliases\n");
+			return 0;
+		}
+	}
+
+	/* 2. Try to find file in current dir */
+	de = dget_parent(d);
+	if (!de)
+		return -EINVAL;
+
+	mntget(mnt);
+	f = dentry_open(de, mnt, O_RDONLY | O_LARGEFILE, current_cred());
+	if (IS_ERR(f))
+		return PTR_ERR(f);
+
+	entry.ino = ino->i_ino;
+	entry.name = cpt_get_buf(ctx);
+	entry.found = 0;
+	err = vfs_readdir(f, cpt_filldir, &entry);
+	if (err || !entry.found) {
+		err = err ? err : -ENOENT;
+		goto err_readdir;
+	}
+
+	mutex_lock(&de->d_inode->i_mutex);
+	found = lookup_one_len(entry.name, de, entry.namelen);
+	mutex_unlock(&de->d_inode->i_mutex);
+	if (IS_ERR(found)) {
+		err = PTR_ERR(found);
+		goto err_readdir;
+	}
+
+	err = -ENOENT;
+	if (found->d_inode != ino)
+		goto err_lookup;
+
+	dprintk_ctx("dentry found in dir\n");
+	__cpt_release_buf(ctx);
+	err = cpt_dump_path(found, mnt, 0, ctx);
+
+err_lookup:
+	dput(found);
+err_readdir:
+	fput(f);
+	__cpt_release_buf(ctx);
+	return err;
+}
+
+static int dump_unlinked_dentry(struct dentry *d, struct vfsmount *mnt,
+				     struct cpt_context *ctx)
+{
+	if (d->d_flags & DCACHE_NFSFS_RENAMED)
+		return cpt_dump_nfs_path(d, mnt, ctx);
+	return find_linked_dentry(d, mnt, d->d_inode, ctx);
+}
+
+static struct dentry *find_linkdir(struct vfsmount *mnt, struct cpt_context *ctx)
+{
+	int i;
+
+	for (i = 0; i < ctx->linkdirs_num; i++)
+		if (ctx->linkdirs[i]->f_vfsmnt == mnt)
+			return ctx->linkdirs[i]->f_dentry;
+	return NULL;
+}
+
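+/* Give an unlinked-but-open inode a dumpable path: create a
+ * ".cpt_hardlink.NNNNNNNN" entry in the per-mount link directory,
+ * as a hard link to d or (when d == NULL) as a fresh file. */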
+static struct dentry *cpt_fake_link(struct dentry *d, struct vfsmount *mnt,
+				    struct inode *ino, struct cpt_context *ctx)
+{
+	int err;
+	int order = 8;
+	const char *prefix = ".cpt_hardlink.";
+	int preflen = strlen(prefix) + order;
+	char name[preflen + 1];
+	struct dentry *dirde, *hardde;
+
+	dirde = find_linkdir(mnt, ctx);
+	if (!dirde) {
+		eprintk_ctx("Can't find fake link mntdir\n");
+		err = -ENOENT;
+		goto out;
+	}
+
+	ctx->linkcnt++;
+	snprintf(name, sizeof(name), "%s%0*u", prefix, order, ctx->linkcnt);
+
+	mutex_lock(&dirde->d_inode->i_mutex);
+	hardde = lookup_one_len(name, dirde, strlen(name));
+	if (IS_ERR(hardde)) {
+		eprintk_ctx("Can't find hardde: %s\n", name);
+		err = PTR_ERR(hardde);
+		goto out_unlock;
+	}
+
+	if (hardde->d_inode) {
+		/* Userspace should clean up hardlinked files left over
+		 * from a previous dump/undump.
+		 */
+		eprintk_ctx("Hardlinked file already exists: %s\n", name);
+		err = -EEXIST;
+		goto out_put;
+	}
+
+	if (d == NULL) {
+		struct nameidata nd;
+
+		nd.flags = LOOKUP_CREATE;
+		nd.intent.open.flags = O_EXCL;
+
+		err = vfs_create(dirde->d_inode, hardde, 0600, &nd);
+	} else
+		err = vfs_link(d, dirde->d_inode, hardde);
+	if (err) {
+		eprintk_ctx("error hardlink %s, %d\n", name, err);
+		goto out_put;
+	}
+
+out_unlock:
+	mutex_unlock(&dirde->d_inode->i_mutex);
+out:
+	return err ? ERR_PTR(err) : hardde;
+
+out_put:
+	dput(hardde);
+	goto out_unlock;
+}
+
+static int create_dump_hardlink(struct dentry *d, struct vfsmount *mnt,
+				struct inode *ino, struct cpt_context *ctx)
+{
+	int err;
+	struct dentry *hardde;
+
+	hardde = cpt_fake_link(d, mnt, ino, ctx);
+	if (IS_ERR(hardde))
+		return PTR_ERR(hardde);
+
+	err = cpt_dump_path(hardde, mnt, 0, ctx);
+	dput(hardde);
+
+	return err;
+}
+
+static int dump_one_inode(struct file *file, struct dentry *d,
+			  struct vfsmount *mnt, struct cpt_context *ctx)
+{
+	int err = 0;
+	struct inode *ino = d->d_inode;
+	cpt_object_t *iobj;
+	int dump_it = 0;
+
+	iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx);
+	if (!iobj)
+		return -EINVAL;
+
+	if (iobj->o_pos >= 0)
+		return 0;
+
+	if (ino->i_sb->s_magic == FSMAGIC_PROC &&
+	    proc_dentry_of_dead_task(d))
+		return 0;
+
+	if ((!IS_ROOT(d) && d_unhashed(d)) &&
+	    !cpt_replaced(d, mnt, ctx))
+		dump_it = 1;
+	if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) {
+		if (file->f_dentry->d_inode == anon_inode_inode)
+			return 0;
+		dump_it = 1;
+	}
+
+	if (!dump_it)
+		return 0;
+
+	cpt_open_object(iobj, ctx);
+	cpt_dump_inode(d, mnt, ctx);
+
+	if (!IS_ROOT(d) && d_unhashed(d)) {
+		struct file *parent;
+		parent = iobj->o_parent;
+		if (!parent ||
+		    (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) {
+			/* The inode is not deleted, but it has no
+			 * references from inside the checkpointed
+			 * process group. */
+			if (ino->i_nlink != 0) {
+				err = dump_unlinked_dentry(d, mnt, ctx);
+				if (err && S_ISREG(ino->i_mode)) {
+					err = create_dump_hardlink(d, mnt, ino, ctx);
+					iobj->o_flags |= CPT_INODE_HARDLINKED;
+				} else if (S_ISCHR(ino->i_mode) ||
+					   S_ISBLK(ino->i_mode) ||
+					   S_ISFIFO(ino->i_mode))
+					err = 0;
+
+				if (err) {
+					eprintk_ctx("deleted reference to existing inode, checkpointing is impossible: %d\n", err);
+					return -EBUSY;
+				}
+				if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode))
+					dump_it = 0;
+			}
+		} else {
+			/* Refer to _another_ file name. */
+			err = cpt_dump_path(parent->f_dentry,
+					parent->f_vfsmnt, 0, ctx);
+			if (err)
+				return err;
+			if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode))
+				dump_it = 0;
+		}
+	}
+	if (dump_it) {
+		if (S_ISREG(ino->i_mode)) {
+			if ((err = dump_content_regular(file, ctx)) != 0) {
+				eprintk_ctx("dump_content_regular ");
+				cpt_printk_dentry(d, mnt);
+			}
+		} else if (S_ISDIR(ino->i_mode)) {
+			/* We cannot do anything. The directory should be
+			 * empty, so it is not a big deal.
+			 */
+		} else if (S_ISCHR(ino->i_mode)) {
+			err = dump_content_chrdev(file, ctx);
+		} else if (S_ISBLK(ino->i_mode)) {
+			err = dump_content_blkdev(file, ctx);
+		} else if (S_ISFIFO(ino->i_mode)) {
+			err = dump_content_fifo(file, ctx);
+		} else if (S_ISSOCK(ino->i_mode)) {
+			err = dump_content_socket(file, ctx);
+		} else {
+			eprintk_ctx("unknown inode mode %o, magic 0x%lx\n", ino->i_mode & S_IFMT, ino->i_sb->s_magic);
+			err = -EINVAL;
+		}
+	}
+	cpt_close_object(ctx);
+
+	return err;
+}
+
+static void cpt_stop_vzfs_trackers(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_VFSMOUNT_REF) {
+		struct vfsmount *mnt = obj->o_obj;
+		if (mnt->mnt_sb->s_magic == FSMAGIC_VEFS)
+			vefs_track_force_stop_hook(mnt->mnt_sb);
+	}
+}
+
+void cpt_stop_tracker(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	struct kstat sbuf;
+
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+
+		cpt_getattr(file->f_vfsmnt, file->f_dentry, &sbuf);
+
+		if (!S_ISSOCK(sbuf.mode) && (file->f_mode & FMODE_WRITE) &&
+		    file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_VEFS)
+			vefs_track_notify_hook(file->f_dentry, 1);
+	}
+
+	cpt_stop_vzfs_trackers(ctx);
+}
+
+int cpt_dump_files(struct cpt_context *ctx)
+{
+	int epoll_nr, inotify_nr;
+	cpt_object_t *obj;
+
+	cpt_open_section(ctx, CPT_SECT_TTY);
+	for_each_object(obj, CPT_OBJ_TTY) {
+		int err;
+
+		if ((err = cpt_dump_tty(obj, ctx)) != 0)
+			return err;
+	}
+	cpt_close_section(ctx);
+
+	cpt_open_section(ctx, CPT_SECT_INODE);
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+		int err;
+
+		if ((err = dump_one_inode(file, file->f_dentry,
+					  file->f_vfsmnt, ctx)) != 0)
+			return err;
+	}
+	for_each_object(obj, CPT_OBJ_FS) {
+		struct fs_struct *fs = obj->o_obj;
+		int err;
+
+		if (fs->root.dentry &&
+		    (err = dump_one_inode(NULL, fs->root.dentry, fs->root.mnt, ctx)) != 0)
+			return err;
+		if (fs->pwd.dentry &&
+		    (err = dump_one_inode(NULL, fs->pwd.dentry, fs->pwd.mnt, ctx)) != 0)
+			return err;
+	}
+	cpt_close_section(ctx);
+
+	epoll_nr = 0;
+	inotify_nr = 0;
+	cpt_open_section(ctx, CPT_SECT_FILES);
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+		int err;
+
+		if ((err = dump_one_file(obj, file, ctx)) != 0)
+			return err;
+		if (file->f_op == &eventpoll_fops)
+			epoll_nr++;
+		if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY)
+			inotify_nr++;
+	}
+	cpt_close_section(ctx);
+
+	if (epoll_nr) {
+		cpt_open_section(ctx, CPT_SECT_EPOLL);
+		for_each_object(obj, CPT_OBJ_FILE) {
+			struct file *file = obj->o_obj;
+			if (file->f_op == &eventpoll_fops) {
+				int err;
+				if ((err = cpt_dump_epolldev(obj, ctx)) != 0)
+					return err;
+			}
+		}
+		cpt_close_section(ctx);
+	}
+
+	if (inotify_nr) {
+		cpt_open_section(ctx, CPT_SECT_INOTIFY);
+		for_each_object(obj, CPT_OBJ_FILE) {
+			struct file *file = obj->o_obj;
+			if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) {
+				int err = -EINVAL;
+#ifdef CONFIG_INOTIFY_USER
+				if ((err = cpt_dump_inotify(obj, ctx)) != 0)
+#endif
+					return err;
+			}
+		}
+		cpt_close_section(ctx);
+	}
+
+	cpt_open_section(ctx, CPT_SECT_SOCKET);
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		int err;
+
+		if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0)
+			return err;
+	}
+	cpt_close_section(ctx);
+
+	cpt_stop_vzfs_trackers(ctx);
+
+	return 0;
+}
+
+static int dump_filedesc(int fd, struct file *file,
+			 struct files_struct *f, struct cpt_context *ctx)
+{
+	struct cpt_fd_image *v = cpt_get_buf(ctx);
+	cpt_object_t *obj;
+
+	cpt_open_object(NULL, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_FILEDESC;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	v->cpt_fd = fd;
+	obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx);
+	BUG_ON(!obj);
+	v->cpt_file = obj->o_pos;
+	v->cpt_flags = 0;
+	if (FD_ISSET(fd, f->fdt->close_on_exec))
+		v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC;
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+	cpt_close_object(ctx);
+
+	return 0;
+}
+
+static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct files_struct *f = obj->o_obj;
+	struct cpt_files_struct_image *v = cpt_get_buf(ctx);
+	int fd;
+	loff_t saved_obj;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_FILES;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_index = obj->o_index;
+	v->cpt_max_fds = f->fdt->max_fds;
+	v->cpt_next_fd = f->next_fd;
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	for (fd = 0; fd < f->fdt->max_fds; fd++) {
+		struct file *file = fcheck_files(f, fd);
+		if (file)
+			dump_filedesc(fd, file, f, ctx);
+	}
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_close_object(ctx);
+
+	return 0;
+}
+
+int cpt_dump_files_struct(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	cpt_open_section(ctx, CPT_SECT_FILES_STRUCT);
+
+	for_each_object(obj, CPT_OBJ_FILES) {
+		int err;
+
+		if ((err = dump_one_file_struct(obj, ctx)) != 0)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+	return 0;
+}
+
+int cpt_collect_fs(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		if (tsk->fs) {
+			if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL)
+				return -ENOMEM;
+			if (tsk->fs->pwd.dentry &&
+			    cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd.dentry->d_inode, ctx) == NULL)
+				return -ENOMEM;
+			if (tsk->fs->root.dentry &&
+			    cpt_object_add(CPT_OBJ_INODE, tsk->fs->root.dentry->d_inode, ctx) == NULL)
+				return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
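+/* Wrap a bare dentry/vfsmount pair (or a fake anon file when d is
+ * NULL) in an on-stack struct file so dump_one_file() can be reused
+ * for fs_struct root and pwd entries. */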
+int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx)
+{
+	struct dentry new_d;
+	struct file file;
+
+	memset(&file, 0, sizeof(file));
+
+	if (!d) {
+		memset(&new_d, 0, sizeof(new_d));
+		new_d.d_parent = &new_d;
+		new_d.d_inode = anon_inode_inode;
+		new_d.d_name.name = FAKE_FILE_NAME;
+		new_d.d_name.len = strlen(FAKE_FILE_NAME);
+		file.f_op = &bad_file_ops;
+		d = &new_d;
+	}
+
+	file.f_dentry = d;
+	file.f_vfsmnt = mnt;
+	file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK;
+	file.f_cred = current->cred;
+
+	return dump_one_file(NULL, &file, ctx);
+}
+
+static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct fs_struct *fs = obj->o_obj;
+	struct cpt_fs_struct_image *v = cpt_get_buf(ctx);
+	loff_t saved_obj;
+	int err;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_FS;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_umask = fs->umask;
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	err = cpt_dump_dir(fs->root.dentry, fs->root.mnt, ctx);
+	if (!err)
+		err = cpt_dump_dir(fs->pwd.dentry, fs->pwd.mnt, ctx);
+
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_close_object(ctx);
+
+	return err;
+}
+
+int cpt_dump_fs_struct(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	cpt_open_section(ctx, CPT_SECT_FS);
+
+	for_each_object(obj, CPT_OBJ_FS) {
+		int err;
+
+		if ((err = dump_one_fs(obj, ctx)) != 0)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+	return 0;
+}
+
+static int check_autofs(struct super_block *sb, struct cpt_context *ctx)
+{
+	struct autofs_sb_info *si;
+	struct file *f;
+
+	si = autofs4_sbi(sb);
+	if (si->version > 5) {
+		eprintk_ctx("autofs higher than ver5 is not supported\n");
+		return -EINVAL;
+	}
+
+	f = get_task_file(si->pipe_pid, si->pipefd);
+	if (IS_ERR(f)) {
+		eprintk_ctx("autofs pipe is not attached (%ld)\n", PTR_ERR(f));
+		return PTR_ERR(f);
+	}
+
+	if (f != si->pipe) {
+		eprintk_ctx("autofs pipe is not attached\n");
+		fput(f);
+		return -EBADF;
+	}
+
+	if (f->f_mode & FMODE_READ) {
+		fput(f);
+		eprintk_ctx("autofs pipe is attached by the wrong end\n");
+		return -EBADF;
+	}
+
+	/*
+	 * Currently autofs' pipefd is
+	 *  a) opened write-only
+	 *  b) attached to the daemon task
+	 * These two points make our life very easy:
+	 *  a) we can attach the file to sbi on restore after
+	 *     unfreeze - the daemon will not try to write to it
+	 *  b) we can avoid dumping the fd for sbi separately,
+	 *     since the required file will be restored with the
+	 *     task struct in question
+	 *
+	 * In case this breaks some time later (I don't believe
+	 * it will), we'll have to dump the opened file ID to the
+	 * pipe_fd_id field of the autofs_mount_data.
+	 */
+	fput(f);
+
+	return cpt_object_add(CPT_OBJ_FILE, si->pipe, ctx) ? 0 : -ENOMEM;
+}
+
+static int collect_vfsmount_tree(struct vfsmount *tree, cpt_object_t *ns_obj,
+				 cpt_context_t *ctx)
+{
+	int err = 0;
+	char *path_buf, *path;
+	struct vfsmount *mnt;
+	cpt_object_t *obj;
+
+	path_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!path_buf)
+		return -ENOMEM;
+
+	down_read(&namespace_sem);
+	for (mnt = tree; mnt; mnt = next_mnt(mnt, tree)) {
+		struct path pt;
+
+		pt.dentry = mnt->mnt_root;
+		pt.mnt = mnt;
+		path = d_path(&pt, path_buf, PAGE_SIZE);
+		if (IS_ERR(path))
+			continue;
+
+		if (check_one_vfsmount(mnt)) {
+			eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name);
+			err = -EINVAL;
+			break;
+		}
+
+		if (is_autofs_mount(mnt->mnt_parent))
+			continue;
+
+		if (is_nfs_automount(mnt))
+			continue;
+
+		if (cpt_need_delayfs(mnt->mnt_parent)) {
+			eprintk_ctx("unsupported delayfs submount: %s\n", path);
+			err = -EINVAL;
+			break;
+		}
+
+		if (strncmp(path, " (deleted)", 10) == 0) {
+			eprintk_ctx("unsupported deleted submount: %s\n", path);
+			err = -EINVAL;
+			break;
+		}
+
+		if (is_autofs_mount(mnt)) {
+			err = check_autofs(mnt->mnt_sb, ctx);
+			if (err)
+				break;
+		}
+
+		obj = cpt_object_add(CPT_OBJ_VFSMOUNT_REF, mnt, ctx);
+		if (!obj) {
+			err = -ENOMEM;
+			break;
+		}
+		mntget(mnt);
+
+		if (mnt != tree) {
+			obj->o_parent = lookup_cpt_object(CPT_OBJ_VFSMOUNT_REF,
+							mnt->mnt_parent, ctx);
+			if (!obj->o_parent) {
+				err = -ENOLINK;
+				break;
+			}
+		}
+	}
+	up_read(&namespace_sem);
+
+	free_page((unsigned long) path_buf);
+
+	return err;
+}
+
+int cpt_collect_namespace(cpt_context_t * ctx)
+{
+	struct vfsmount *root;
+	cpt_object_t *obj, *ns_obj;
+	int err;
+
+	/*
+	 * The main namespace is shared between all containers;
+	 * here we want to collect only the subtree of one VE.
+	 */
+	root = get_exec_env()->root_path.mnt;
+	ns_obj = cpt_object_add(CPT_OBJ_NAMESPACE, root->mnt_ns, ctx);
+	if (!ns_obj)
+		return -ENOMEM;
+	ns_obj->o_flags |= CPT_NAMESPACE_MAIN;
+
+	err = collect_vfsmount_tree(root, ns_obj, ctx);
+	if (err)
+		return err;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+
+		if (!tsk->nsproxy || !tsk->nsproxy->mnt_ns)
+			continue;
+
+		root = tsk->nsproxy->mnt_ns->root;
+		ns_obj = cpt_object_add(CPT_OBJ_NAMESPACE, root->mnt_ns, ctx);
+		if (!ns_obj)
+			return -ENOMEM;
+		if (ns_obj->o_count > 1)
+			continue;
+		err = collect_vfsmount_tree(root, ns_obj, ctx);
+		if (err)
+			 break;
+	}
+
+	return err;
+}
+
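+/* Pack everything needed to re-create this NFS mount on restore -
+ * server and mountd addresses, transport and cache options, the root
+ * file handle - into a two-page buffer. */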
+static void *collect_nfs_mount_data(struct vfsmount *mnt)
+{
+	struct nfs_mount_data_dump *d;
+	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+	struct nfs_fh *mntfh = NFS_FH(mnt->mnt_root->d_inode);
+	struct nfs_client *clp = nfss->nfs_client;
+	struct rpc_clnt *rpc_clp = clp->cl_rpcclient;
+	char *tmp;
+
+	d = (void *)__get_free_pages(GFP_KERNEL, 1);
+	if (!d)
+		return NULL;
+
+	memset(d, 0, PAGE_SIZE << 1);
+
+	d->version = NFS_MOUNT_MIGRATED;
+	d->flags = nfss->flags;
+	d->rsize = nfss->rsize;
+	d->wsize = nfss->wsize;
+	d->timeo = 10U * rpc_clp->cl_timeout->to_initval / HZ;
+	d->retrans = rpc_clp->cl_timeout->to_retries;
+	d->acregmin = nfss->acregmin/HZ;
+	d->acregmax = nfss->acregmax/HZ;
+	d->acdirmin = nfss->acdirmin/HZ;
+	d->acdirmax = nfss->acdirmax/HZ;
+	d->namlen = nfss->namelen;
+	d->options = nfss->options;
+	d->bsize = nfss->bsize;
+	d->minorversion = clp->cl_minorversion;
+
+	strcpy(d->client_address, clp->cl_ipaddr);
+
+	nfs_fscache_dup_uniq_id(d->fscache_uniq, mnt->mnt_sb);
+
+	d->mount_server.addrlen = nfss->mountd_addrlen;
+	memcpy(&d->mount_server.address, &nfss->mountd_address,
+			d->mount_server.addrlen);
+
+	d->mount_server.version = nfss->mountd_version;
+	d->mount_server.port = nfss->mountd_port;
+	d->mount_server.protocol = nfss->mountd_protocol;
+
+	d->nfs_server.addrlen = clp->cl_addrlen;
+	memcpy(&d->nfs_server.address, &clp->cl_addr,
+			d->nfs_server.addrlen);
+	strcpy(d->nfs_server.hostname, clp->cl_hostname);
+
+	tmp = strchr(mnt->mnt_devname, '/');
+	if (tmp)
+		strcpy(d->nfs_server.export_path, tmp);
+
+	d->nfs_server.port = nfss->port;
+	d->nfs_server.protocol = clp->cl_proto;
+
+	d->auth_flavors = clp->cl_rpcclient->cl_auth->au_flavor;
+
+	d->root.size = mntfh->size;
+	memcpy(d->root.data, mntfh->data, sizeof(d->root.data));
+
+	BUILD_BUG_ON(sizeof(*d) > (PAGE_SIZE << 1));
+
+	return d;
+}
+
+static int dump_nfs_mount_data(struct vfsmount *mnt, cpt_context_t * ctx)
+{
+	struct cpt_object_hdr o;
+	void *data;
+
+	BUG_ON(mnt->mnt_sb->s_magic != FSMAGIC_NFS);
+
+	data = collect_nfs_mount_data(mnt);
+	if (!data)
+		return -ENOMEM;
+
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_MOUNT_DATA;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_VOID;
+
+	cpt_open_object(NULL, ctx);
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(data, PAGE_SIZE << 1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+
+	free_pages((unsigned long)data, 1);
+	return 0;
+}
+
+static void dump_autofs_mount_data(struct vfsmount *mnt, cpt_context_t * ctx)
+{
+	struct autofs_mount_data d;
+	struct autofs_sb_info *si;
+	struct cpt_object_hdr o;
+
+	si = autofs4_sbi(mnt->mnt_sb);
+
+	d.i_uid = mnt->mnt_sb->s_root->d_inode->i_uid;
+	d.i_gid = mnt->mnt_sb->s_root->d_inode->i_gid;
+	d.oz_pgrp = cpt_pid_nr(si->oz_pgrp);
+	d.type = si->type;
+	d.min_proto = si->min_proto;
+	d.max_proto = si->max_proto;
+	d.exp_timeout = si->exp_timeout;
+	d.pipefd = si->pipefd;
+	d.pipe_pid = si->pipe_pid;
+	d.is32bit = 0;
+#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION
+	d.is32bit = si->is32bit;
+#endif
+	d.pipe_fd_id = CPT_NULL;
+
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_MOUNT_DATA;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_VOID;
+
+	cpt_open_object(NULL, ctx);
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(&d, sizeof(d), ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+}
+
+struct args_t {
+	int *pfd;
+	char *path;
+	envid_t veid;
+	struct vfsmount *mnt;
+	char *buf;
+};
+
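+/* Runs in a helper kernel thread: enter the container, point stdout
+ * at the write end of the pipe and exec /bin/tar, streaming the
+ * tmpfs content back to the dumper. */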
+static int dumptmpfs(void *arg)
+{
+	int i;
+	struct args_t *args = arg;
+	int *pfd = args->pfd;
+	int fd0, fd2;
+	char *path = args->path;
+	char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL, NULL, NULL };
+
+	i = real_env_create(args->veid, VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
+	if (i < 0) {
+		eprintk("cannot enter ve to dump tmpfs\n");
+		module_put(THIS_MODULE);
+		return 255 << 8;
+	}
+
+	if (args->mnt && !list_empty(&args->mnt->mnt_mounts) && strcmp(path, ".") != 0) {
+		/*
+		 * Child mounts prevent dumping of the parent tmpfs
+		 * content. We use a bind mount to hide them. The
+		 * "--transform" trick lets us keep the full path in
+		 * the tar file.
+		 */
+		args->buf = vmalloc(strlen(path) + sizeof("s#^#/#") + 1);
+		if (!args->buf) {
+			eprintk("cannot alloc memory\n");
+			module_put(THIS_MODULE);
+			return 255 << 8;
+		}
+
+		sprintf(args->buf, "s#^#%s/#", path); /* Add a prefix to path */
+		path = ".";
+		argv[4] = path;
+		argv[5] = "--transform";
+		argv[6] = args->buf;
+	}
+
+	if (strcmp(path, ".") == 0) {
+		struct path pwd;
+
+		pwd.mnt = vfs_bind_mount(args->mnt, args->mnt->mnt_root);
+		if (IS_ERR(pwd.mnt)) {
+			eprintk("cannot create bind mount to dump tmpfs\n");
+			module_put(THIS_MODULE);
+			return 255 << 8;
+		}
+		pwd.dentry = pwd.mnt->mnt_root;
+		set_fs_pwd(current->fs, &pwd);
+		mntput(pwd.mnt);
+	}
+
+	if (pfd[1] != 1)
+		sc_dup2(pfd[1], 1);
+	set_fs(KERNEL_DS);
+	fd0 = sc_open("/dev/null", O_RDONLY, 0);
+	fd2 = sc_open("/dev/null", O_WRONLY, 0);
+	if (fd0 < 0 || fd2 < 0) {
+		eprintk("can not open /dev/null for tar: %d %d\n", fd0, fd2);
+		module_put(THIS_MODULE);
+		return 255 << 8;
+	}
+	if (fd0 != 0)
+		sc_dup2(fd0, 0);
+	if (fd2 != 2)
+		sc_dup2(fd2, 2);
+
+	for (i = 3; i < current->files->fdt->max_fds; i++) {
+		sc_close(i);
+	}
+
+	module_put(THIS_MODULE);
+
+	i = kernel_execve("/bin/tar", argv, NULL);
+	eprintk("failed to exec /bin/tar: %d\n", i);
+	return 255 << 8;
+}
+
+static int cpt_dump_tmpfs(char *path, struct vfsmount *mnt,
+			  struct cpt_context *ctx)
+{
+	int err;
+	int pid;
+	int pfd[2];
+	struct file *f;
+	struct cpt_obj_tar v;
+	char buf[16];
+	int n;
+	loff_t saved_obj;
+	struct args_t args;
+	int status;
+	mm_segment_t oldfs;
+	sigset_t ignore, blocked;
+	struct ve_struct *oldenv;
+	u32 len;
+	loff_t start_pos = ctx->file->f_pos;
+again:
+	len = 0;
+
+	err = sc_pipe(pfd);
+	if (err < 0)
+		return err;
+	args.pfd = pfd;
+	args.path = path;
+	args.veid = VEID(get_exec_env());
+	args.mnt = mnt;
+	args.buf = NULL;
+	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+	sigprocmask(SIG_BLOCK, &ignore, &blocked);
+	oldenv = set_exec_env(get_ve0());
+	err = pid = local_kernel_thread(dumptmpfs, (void*)&args,
+			SIGCHLD | CLONE_VFORK, 0);
+	set_exec_env(oldenv);
+	if (err < 0) {
+		eprintk_ctx("tmpfs local_kernel_thread: %d\n", err);
+		goto out;
+	}
+	f = fget(pfd[0]);
+	sc_close(pfd[1]);
+	sc_close(pfd[0]);
+
+	cpt_push_object(&saved_obj, ctx);
+	cpt_open_object(NULL, ctx);
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NAME;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_DATA;
+	v.cpt_len = 0;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	do {
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
+		set_fs(oldfs);
+		if (n > 0) {
+			ctx->write(buf, n, ctx);
+			len += n;
+		}
+	} while (n > 0);
+
+	fput(f);
+
+	/* Write the real tar'ed length */
+	ctx->pwrite(&len, sizeof(len), ctx,
+		    ctx->current_object + offsetof(struct cpt_obj_tar, cpt_len));
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if ((err = sc_waitx(pid, 0, &status)) < 0)
+		eprintk_ctx("wait4: %d\n", err);
+	else if ((status & 0x7f) == 0) {
+		err = (status & 0xff00) >> 8;
+		if (err != 0) {
+			eprintk_ctx("tar exited with %d\n", err);
+			err = -EINVAL;
+		}
+	} else {
+		eprintk_ctx("tar terminated\n");
+		err = -EINVAL;
+	}
+	if (args.buf)
+		vfree(args.buf);
+	set_fs(oldfs);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+	buf[0] = 0;
+	ctx->write(buf, 1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_obj, ctx);
+
+	if (((status & 0xff00) >> 8) == 64 && mnt && !list_empty(&mnt->mnt_mounts) &&
+	    strcmp(path, ".") != 0) {
+		eprintk_ctx("old tar version is detected inside container, "
+			    "it does not allow us to dump child tmpfs bindmounts "
+			    "correctly, using workaround\n");
+		mnt = NULL;
+		ctx->file->f_pos = start_pos;
+		goto again;
+	}
+
+	return n ? : err;
+
+out:
+	if (pfd[1] >= 0)
+		sc_close(pfd[1]);
+	if (pfd[0] >= 0)
+		sc_close(pfd[0]);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+	return err;
+}
+
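+/* Find the mount this one is a bind of: an earlier collected mount
+ * of the same sb under which our root is still visible. Returns
+ * NULL for a plain (non-bind) mount, ERR_PTR if a bind source is
+ * required but cannot be found. */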
+static cpt_object_t *cpt_lookup_bind_source(struct vfsmount *mnt,
+		cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+	struct vfsmount *src;
+	struct path p;
+
+	p.dentry = mnt->mnt_root;
+
+	for_each_object(obj, CPT_OBJ_VFSMOUNT_REF) {
+		src = obj->o_obj;
+		p.mnt = src;
+
+		if (src == mnt)
+			break;
+		if (src->mnt_sb != mnt->mnt_sb)
+			continue;
+		if (IS_ERR(d_path(&p, NULL, 0)))
+			continue;
+		return obj;
+	}
+	if (mnt->mnt_root != mnt->mnt_sb->s_root)
+		return ERR_PTR(-ENODEV);
+	return NULL;
+}
+
+void uuid_bytes_to_hex(char *buf, const u8 *u)
+{
+	sprintf(buf, "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+			(((((u[0] * 0x100) + u[1]) * 0x100) + u[2]) * 0x100 + u[3]),
+			u[4] * 0x100 + u[5],
+			u[6] * 0x100 + u[7],
+			u[8], u[9],
+			u[10], u[11], u[12], u[13], u[14], u[15]);
+}
+
+EXPORT_SYMBOL(uuid_bytes_to_hex);
+
+static void cpt_dump_uuid(struct vfsmount *mnt, cpt_context_t *ctx)
+{
+	const u8 *uuid = mnt->mnt_sb->s_uuid;
+	char *buf = cpt_get_buf(ctx);
+
+	uuid_bytes_to_hex(buf, uuid);
+	cpt_dump_string(buf, ctx);
+
+	__cpt_release_buf(ctx);
+}
+
+/* Checks if mnt is ploop, which is mounted inside container */
+static int is_ploop(struct vfsmount *mnt, struct cpt_context *ctx)
+{
+	struct super_block *sb = mnt->mnt_sb;
+	const char *name;
+
+	BUG_ON(!rwsem_is_locked(&namespace_sem));
+
+	if (slab_ub(mnt) != get_exec_ub())
+		return 0;
+
+	if (!sb->s_bdev || !sb->s_bdev->bd_disk)
+		return 0;
+
+	name = sb->s_bdev->bd_disk->disk_name;
+
+	if (strncmp(name, "ploop", 5) != 0)
+		return 0;
+
+	if (mnt->mnt_root != mnt->mnt_sb->s_root)
+		return 0;
+
+	return (cpt_lookup_bind_source(mnt, ctx) == NULL);
+}
+
+static int dump_vfsmount(cpt_object_t *obj, cpt_object_t *ns_obj,
+			 struct cpt_context *ctx)
+{
+	struct vfsmount *mnt = obj->o_obj;
+	int err = 0;
+	struct cpt_vfsmount_image v;
+	loff_t saved_obj;
+	char *path_buf, *path;
+	struct path p;
+	cpt_object_t *parent_obj = obj->o_parent, *bind_obj = NULL;
+	int is_cgroup;
+
+	path_buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!path_buf)
+		return -ENOMEM;
+
+	p.dentry = mnt->mnt_root;
+	p.mnt = mnt;
+	path = d_path(&p, path_buf, PAGE_SIZE);
+	if (IS_ERR(path)) {
+		free_page((unsigned long) path_buf);
+		return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path);
+	}
+
+	cpt_open_object(obj, ctx);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_VFSMOUNT;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_ARRAY;
+
+	v.cpt_mntflags = mnt->mnt_flags;
+	v.cpt_mnt_bind = CPT_NULL;
+	v.cpt_mnt_parent = parent_obj ? parent_obj->o_pos : CPT_NULL;
+
+	v.cpt_mnt_shared = CPT_NULL;
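+	/* A shared mount must have at least one peer in the collected
+	 * set; point at the first peer that was already written out. */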
+	if ((mnt->mnt_flags & MNT_SHARED) && !list_empty(&mnt->mnt_share)) {
+		struct vfsmount *m;
+		cpt_object_t *shared = NULL;
+		bool found = false;
+
+		list_for_each_entry(m, &mnt->mnt_share, mnt_share) {
+			shared = lookup_cpt_object(CPT_OBJ_VFSMOUNT_REF, m, ctx);
+			if (!shared)
+				continue;
+			found = true;
+			if (shared->o_pos == CPT_NULL)
+				continue;
+			v.cpt_mnt_shared = shared->o_pos;
+			break;
+		}
+		if (!found) {
+			eprintk_ctx("shared mount not found: %s\n", path);
+			err = -ENOENT;
+			goto out_err;
+		}
+	}
+
+	v.cpt_mnt_master = CPT_NULL;
+	if (mnt->mnt_master) {
+		cpt_object_t *master;
+
+		master = lookup_cpt_object(CPT_OBJ_VFSMOUNT_REF,
+				mnt->mnt_master, ctx);
+		if (!master || master->o_pos == CPT_NULL) {
+			eprintk_ctx("master mount not found: %s\n", path);
+			err = -ENOENT;
+			goto out_err;
+		}
+		v.cpt_mnt_master = master->o_pos;
+	}
+
+	is_cgroup = !strcmp(mnt->mnt_sb->s_type->name, "cgroup");
+
+	if (slab_ub(mnt) != get_exec_ub()) {
+		v.cpt_mntflags |= CPT_MNT_EXT;
+	} else if (is_ploop(mnt, ctx)) {
+		v.cpt_mntflags |= CPT_MNT_PLOOP;
+	} else if (cpt_need_delayfs(mnt)) {
+		v.cpt_mntflags |= CPT_MNT_DELAYFS;
+		obj->o_flags |= CPT_VFSMOUNT_DELAYFS;
+	} else if (is_cgroup) {
+		v.cpt_mnt_bind = cpt_add_cgroup(mnt, ctx);
+		if (v.cpt_mnt_bind == CPT_NOINDEX) {
+			err = -ENOENT;
+			goto out_err;
+		}
+	} else {
+		bind_obj = cpt_lookup_bind_source(mnt, ctx);
+		if (IS_ERR(bind_obj)) {
+			err = PTR_ERR(bind_obj);
+			eprintk_ctx("bind mount source not found: %s\n", path);
+			goto out_err;
+		} else if (bind_obj) {
+			v.cpt_mntflags |= CPT_MNT_BIND;
+			v.cpt_mnt_bind = bind_obj->o_pos;
+		} /* else non-bindmount */
+	}
+	v.cpt_flags = mnt->mnt_sb->s_flags;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	if (!is_ploop(mnt, ctx))
+		cpt_dump_string(mnt->mnt_devname ? : "none", ctx);
+	else
+		cpt_dump_uuid(mnt, ctx);
+	cpt_dump_string(path, ctx);
+	cpt_dump_string(mnt->mnt_sb->s_type->name, ctx);
+
+	if (v.cpt_mntflags & CPT_MNT_BIND)
+		err = cpt_dump_path(mnt->mnt_root, bind_obj->o_obj, 0, ctx);
+	else if (!(v.cpt_mntflags & CPT_MNT_EXT) &&
+		 !(v.cpt_mntflags & CPT_MNT_PLOOP)) {
+		if (mnt->mnt_sb->s_type->fs_flags & FS_REQUIRES_DEV) {
+			eprintk_ctx("Checkpoint supports only nodev fs: %s\n",
+				    mnt->mnt_sb->s_type->name);
+			err = -EXDEV;
+		} else if (!strcmp(mnt->mnt_sb->s_type->name, "tmpfs") ||
+			   !strcmp(mnt->mnt_sb->s_type->name, "devtmpfs")) {
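+			/*
+			 * cpt_dump_tmpfs() can block, so drop namespace_sem
+			 * around the call; the list_empty() check below
+			 * catches a mount that was detached meanwhile.
+			 */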
+			mntget(mnt);
+			up_read(&namespace_sem);
+			if (ns_obj->o_flags & CPT_NAMESPACE_MAIN)
+				err = cpt_dump_tmpfs(path, mnt, ctx);
+			else
+				err = cpt_dump_tmpfs(".", mnt, ctx);
+			down_read(&namespace_sem);
+			if (!err && list_empty(&mnt->mnt_list))
+				err = -EBUSY;
+			mntput(mnt);
+		}
+	}
+	if (v.cpt_mntflags & CPT_MNT_DELAYFS) {
+		if (mnt->mnt_sb->s_magic == FSMAGIC_NFS) {
+			dump_nfs_mount_data(mnt, ctx);
+		} else if (is_autofs_mount(mnt)) {
+			dump_autofs_mount_data(mnt, ctx);
+		} else {
+			//FIXME dump sb show_options output
+			BUG();
+		}
+	}
+
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_close_object(ctx);
+
+out_err:
+	free_page((unsigned long) path_buf);
+
+	return err;
+}
+
+static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct mnt_namespace *ns = obj->o_obj;
+	struct cpt_object_hdr v;
+	cpt_object_t *mnt_obj;
+	loff_t saved_obj;
+	int err = 0;
+
+	cpt_open_object(obj, ctx);
+
+	v.cpt_next = -1;
+	v.cpt_object = CPT_OBJ_NAMESPACE;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_ARRAY;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+
+	down_read(&namespace_sem);
+	for_each_object(mnt_obj, CPT_OBJ_VFSMOUNT_REF) {
+		struct vfsmount *mnt = mnt_obj->o_obj;
+
+		if (!mnt->mnt_ns) {
+			eprintk_ctx("detached vfsmount %s\n", mnt->mnt_devname);
+			err = -ENOLINK;
+			break;
+		}
+
+		if (mnt->mnt_ns != ns)
+			continue;
+
+		err = dump_vfsmount(mnt_obj, obj, ctx);
+		if (err)
+			break;
+	}
+	up_read(&namespace_sem);
+
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_close_object(ctx);
+
+	return err;
+}
+
+int cpt_dump_namespace(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	cpt_open_section(ctx, CPT_SECT_NAMESPACE);
+
+	for_each_object(obj, CPT_OBJ_NAMESPACE) {
+		int err;
+
+		if ((err = dump_one_namespace(obj, ctx)) != 0)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+	return 0;
+}
+
+void cpt_finish_vfsmount_ref(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_VFSMOUNT_REF)
+		mntput(obj->o_obj);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_files.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_files.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_files.h	2015-01-21 12:02:48.224093631 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_files.h	2015-01-21 12:02:57.970834900 +0300
@@ -0,0 +1,126 @@
+int cpt_collect_files(cpt_context_t *);
+int cpt_collect_fs(cpt_context_t *);
+int cpt_collect_namespace(cpt_context_t *);
+int cpt_collect_sysvsem_undo(cpt_context_t *);
+int cpt_collect_tty(struct file *, cpt_context_t *);
+void cpt_stop_tracker(struct cpt_context *);
+int cpt_dump_files(struct cpt_context *ctx);
+int cpt_dump_files_struct(struct cpt_context *ctx);
+int cpt_dump_fs_struct(struct cpt_context *ctx);
+int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx);
+int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx);
+int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx);
+struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx);
+struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx);
+struct file * rst_open_file(cpt_object_t *mntobj, char *name,
+		struct cpt_file_image *fi, unsigned flags, struct cpt_context *ctx);
+struct file * rst_open_tty(cpt_object_t *mntobj, char *name,
+		struct cpt_file_image *fi, struct cpt_inode_image *ii,
+		unsigned flags, struct cpt_context *ctx);
+__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx);
+
+int rst_posix_locks(struct cpt_context *ctx);
+
+struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx);
+int rst_task_namespace(struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_files(struct cpt_task_image *ti, struct cpt_context *ctx);
+__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_restore_fs(struct cpt_context *ctx);
+
+int cpt_collect_sysv(cpt_context_t *);
+int cpt_dump_sysvsem(struct cpt_context *ctx);
+int cpt_dump_sysvmsg(struct cpt_context *ctx);
+int rst_sysv_ipc(struct cpt_context *ctx);
+int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
+__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
+
+int cpt_dump_namespace(struct cpt_context *ctx);
+int rst_root_namespace(struct cpt_context *ctx);
+
+int rst_stray_files(struct cpt_context *ctx);
+int rst_tty_jobcontrol(struct cpt_context *ctx);
+
+void rst_flush_filejobs(struct cpt_context *);
+int rst_do_filejobs(struct cpt_context *);
+
+extern struct file_operations signalfd_fops;
+
+int rst_eventpoll(struct cpt_context *);
+struct file *cpt_open_epolldev(struct cpt_file_image *fi,
+			       unsigned flags,
+			       struct cpt_context *ctx);
+int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *);
+
+int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx);
+int rst_get_dentry(struct dentry **dp, struct vfsmount **mp,
+		   loff_t *pos, struct cpt_context *ctx);
+
+int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx);
+int rst_inotify(cpt_context_t *ctx);
+struct file *rst_open_inotify(struct cpt_file_image *fi,
+			      unsigned flags,
+			      struct cpt_context *ctx);
+
+extern struct dentry_operations delay_dir_dops;
+
+#define FAKE_FILE_NAME "[fake_file]"
+
+int rst_path_lookup_at(struct vfsmount *mnt, struct dentry *dentry,
+		const char *name, unsigned int flags, struct nameidata *nd);
+int rst_path_lookup(cpt_object_t *mntobj, const char *path,
+		unsigned int flags, struct nameidata *nd);
+
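+/* Evaluates to true when the mount's fs type is not among the types
+ * recognized by checkpointing. */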
+#define check_one_vfsmount(mnt) \
+	(strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "ext2") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "unionfs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "devtmpfs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "nfs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "nfs4") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "autofs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "binfmt_misc") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "ext4") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "vzfs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "rpc_pipefs") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "mqueue") != 0 && \
+	 strcmp(mnt->mnt_sb->s_type->name, "cgroup") != 0)
+
+#define is_autofs_mount(mnt) ((mnt)->mnt_sb->s_magic == FSMAGIC_AUTOFS)
+#define is_sunrpc_pipefs(mnt) ((mnt)->mnt_sb->s_magic == FSMAGIC_RPCAUTH)
+
+int cpt_page_is_zero(struct page * page);
+void cpt_finish_vfsmount_ref(struct cpt_context *ctx);
+void rst_finish_vfsmount_ref(struct cpt_context *ctx);
+
+struct vfsmount *rst_kern_mount(const char *fstype, int flags,
+		const char *name, void *data);
+
+cpt_object_t *cpt_lookup_vfsmount_obj(struct vfsmount *mnt,
+		struct cpt_context *ctx);
+
+int cpt_need_delayfs(struct vfsmount *mnt);
+extern struct file_system_type delayfs_type;
+struct file *rst_delayfs_screw(struct vfsmount *mnt, char *name, int flags, loff_t offset, unsigned int mode);
+struct vfsmount *rst_mount_delayfs(char *type, int flags,
+		char *name, void *data, cpt_context_t *ctx);
+int rst_freeze_delayfs(cpt_context_t *ctx);
+int rst_init_delayfs_daemon(cpt_context_t *ctx);
+int rst_delay_flock(struct file *, struct cpt_flock_image *, cpt_context_t *);
+
+int cpt_dump_string(const char *s, struct cpt_context *ctx);
+
+int cpt_dump_cgroups(struct cpt_context *ctx);
+int rst_cgroups(struct cpt_context *ctx);
+
+int cpt_add_cgroup(struct vfsmount *mnt, struct cpt_context *ctx);
+int rst_cgroup_task(struct cpt_context * ctx);
+void rst_cgroup_close(struct cpt_context * ctx);
+
+void uuid_bytes_to_hex(char *buf, const u8 *u);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_fsmagic.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_fsmagic.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_fsmagic.h	2015-01-21 12:02:48.224093631 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_fsmagic.h	2015-01-21 12:02:49.698054503 +0300
@@ -0,0 +1,19 @@
+/* Collected from kernel sources. */
+
+#define FSMAGIC_TMPFS	0x01021994
+#define FSMAGIC_PIPEFS	0x50495045
+#define FSMAGIC_SOCKFS	0x534F434B
+#define FSMAGIC_PFMFS	0xa0b4d889
+#define FSMAGIC_BDEV	0x62646576
+#define FSMAGIC_FUTEX	0x0BAD1DEA
+#define FSMAGIC_INOTIFY	0x2BAD1DEA
+#define FSMAGIC_MQUEUE	0x19800202
+#define FSMAGIC_PROC	0x9fa0
+#define FSMAGIC_DEVPTS	0x1CD1
+#define FSMAGIC_AUTOFS	0x0187
+#define FSMAGIC_EXT2	0xEF53
+#define FSMAGIC_REISER	0x52654973
+#define FSMAGIC_VEFS    0x565a4653
+#define FSMAGIC_ANON	0x09041934
+#define FSMAGIC_NFS	0x6969
+#define FSMAGIC_RPCAUTH 0x67596969
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_inotify.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_inotify.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_inotify.c	2015-01-21 12:02:48.224093631 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_inotify.c	2015-01-21 12:02:50.892022806 +0300
@@ -0,0 +1,189 @@
+/*
+ *
+ *  kernel/cpt/cpt_inotify.c
+ *
+ *  Copyright (C) 2000-2007  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/inotify.h>
+#include <linux/cpt_image.h>
+#include <linux/fsnotify_backend.h>
+
+#include "../../fs/notify/inotify/inotify.h"
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+static int is_fake_file_dentry(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (inode != anon_inode_inode)
+		return 0;
+
+	return !strcmp(dentry->d_name.name, FAKE_FILE_NAME);
+}
+
+static int dump_watch_inode(struct path *path, cpt_context_t *ctx)
+{
+	struct dentry *d;
+
+	if (cpt_need_delayfs(path->mnt)) {
+		eprintk_ctx("inotify migration for delayed mounts (NFS) is not "
+				"supported\n");
+		return -EINVAL;
+	}
+
+	d = path->dentry;
+
+	if ((d_unhashed(d) && !IS_ROOT(d)) || is_fake_file_dentry(d))
+		d = NULL;
+
+	return cpt_dump_dir(d, path->mnt, ctx);
+}
+
+static int cpt_dump_watches(struct fsnotify_group *g, struct cpt_context *ctx)
+{
+	int err = 0;
+	struct fsnotify_mark_entry *fse;
+	struct inotify_inode_mark_entry *ie;
+	struct cpt_inotify_wd_image wi;
+	loff_t saved_obj;
+
+	/* FIXME locking */
+	list_for_each_entry(fse, &g->mark_entries, g_list) {
+		struct nameidata nd;
+
+		ie = container_of(fse, struct inotify_inode_mark_entry,
+				fsn_entry);
+
+		err = path_lookup(ie->cpt_wd_path, 0, &nd);
+		if (err) {
+			/*
+			 * If the watchee we are looking for has been
+			 * deleted, the "delete" event should already be
+			 * in the notification queue: the mark is alive
+			 * but the path is no longer accessible, so we
+			 * simply skip it when dumping -- no new events
+			 * will ever arrive from it.
+			 */
+			if (err == -ENOENT) {
+				err = 0;
+				continue;
+			} else {
+				eprintk_ctx("Unable to resolve inotify mark path `%s': err = %d\n",
+					    ie->cpt_wd_path, err);
+				break;
+			}
+		}
+
+		cpt_open_object(NULL, ctx);
+
+		wi.cpt_next = CPT_NULL;
+		wi.cpt_object = CPT_OBJ_INOTIFY_WATCH;
+		wi.cpt_hdrlen = sizeof(wi);
+		wi.cpt_content = CPT_CONTENT_ARRAY;
+		wi.cpt_wd = ie->wd;
+		wi.cpt_mask = fse->mask;
+
+		ctx->write(&wi, sizeof(wi), ctx);
+
+		cpt_push_object(&saved_obj, ctx);
+
+		err = dump_watch_inode(&nd.path, ctx);
+		path_put(&nd.path);
+
+		cpt_pop_object(&saved_obj, ctx);
+
+		cpt_close_object(ctx);
+
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+static int cpt_dump_events(struct fsnotify_group *g, struct cpt_context *ctx)
+{
+	/* FIXME - implement */
+	if (!list_empty(&g->notification_list))
+		wprintk_ctx("Inotify events are lost. Sorry...\n");
+
+	return 0;
+}
+
+int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx)
+{
+	int err;
+	struct file *file = obj->o_obj;
+	struct fsnotify_group *group;
+	struct cpt_inotify_image ii;
+	loff_t saved_obj;
+
+	if (file->f_op != &inotify_fops) {
+		eprintk_ctx("bad inotify file\n");
+		return -EINVAL;
+	}
+
+	group = file->private_data;
+	if (unlikely(group == NULL)) {
+		eprintk_ctx("bad inotify group\n");
+		return -EINVAL;
+	}
+
+	if (group->inotify_data.fa != NULL) {
+		eprintk_ctx("inotify with fasync\n");
+		return -ENOTSUPP;
+	}
+
+	cpt_open_object(NULL, ctx);
+
+	ii.cpt_next = CPT_NULL;
+	ii.cpt_object = CPT_OBJ_INOTIFY;
+	ii.cpt_hdrlen = sizeof(ii);
+	ii.cpt_content = CPT_CONTENT_ARRAY;
+	ii.cpt_file = obj->o_pos;
+	ii.cpt_user = group->inotify_data.user->uid;
+	ii.cpt_max_events = group->max_events;
+	ii.cpt_last_wd = group->inotify_data.last_wd;
+
+	ctx->write(&ii, sizeof(ii), ctx);
+	cpt_push_object(&saved_obj, ctx);
+
+	err = cpt_dump_watches(group, ctx);
+	if (err == 0)
+		err = cpt_dump_events(group, ctx);
+
+	cpt_pop_object(&saved_obj, ctx);
+	cpt_close_object(ctx);
+
+	return err;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_iterative.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_iterative.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_iterative.c	2015-01-21 12:02:48.692081209 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_iterative.c	2015-01-21 12:02:50.280039053 +0300
@@ -0,0 +1,740 @@
+#include <linux/autoconf.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/errno.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/uio.h>
+#include <linux/mount.h>
+#include <linux/splice.h>
+#ifndef __ia64__
+#include <asm/ldt.h>
+#endif
+#include <asm/mmu.h>
+#include <asm/tlb.h>
+#include <linux/cpt_image.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_pagein.h"
+
+struct iter_data
+{
+#define CPT_XFER_BATCH	64
+	int		xfer_nr;
+	struct page	*xfer_batch[CPT_XFER_BATCH];
+	int		iter_new;
+	int		iter_young;
+	int		iter_shm;
+	int		iter;
+	cpt_context_t	*ctx;
+};
+
+/* The algorithm is the following:
+ *
+ * 1. At the first iteration all appropriate pte's are marked COW,
+ *    pages are marked PG_checkpointed and transferred (indexed
+ *    by pfn).
+ * 2. do_wp_page(), if it wants to pte_mkwrite(), clears PG_checkpointed.
+ *    PG_checkpointed is also cleared when a page is unmapped.
+ * 3. At the next iterations we check PG_checkpointed. If it is set,
+ *    we are lucky. If it is not, the page is new or has been changed,
+ *    so we send a fresh copy.
+ * 4. Iterations stop when the number of new pages falls below thresh_1
+ *    or exceeds (pages found at the first iteration) / 2^N. Hence we
+ *    never transfer more than 2*memsize.
+ * 5. Then we freeze the VE.
+ * 6. cpt_mm, if it sees a page marked PG_checkpointed, sends its pfn
+ *    (and panics if the pte is writable).
+ */
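+
+/*
+ * Why the 2*memsize bound holds: if the first pass transfers M pages,
+ * the loop keeps going only while each later pass resends at most half
+ * of what the previous one found, so the total is bounded by
+ * M * (1 + 1/2 + 1/4 + ...) = 2*M.
+ */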
+
+static int add_to_xfer_list(struct page *pg, struct iter_data *iter,
+			    cpt_context_t *ctx)
+{
+	int slot = iter->xfer_nr;
+
+	BUG_ON(slot >= CPT_XFER_BATCH);
+	iter->xfer_batch[slot] = pg;
+	return ((iter->xfer_nr = slot + 1) == CPT_XFER_BATCH);
+}
+
+static int submit_page(struct page *pg, cpt_context_t *ctx)
+{
+	int err;
+	struct iovec iov[2];
+	struct file *file = ctx->pagein_file_out;
+	mm_segment_t oldfs;
+	struct pgin_reply rep;
+
+	if (!file)
+		return -EBADF;
+
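+	/* Wire format: a pgin_reply header carrying the pfn as handle,
+	 * immediately followed by the raw page, sent in one writev(). */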
+	rep.rmid = PGIN_RMID;
+	rep.error = 0;
+	rep.handle = page_to_pfn(pg);
+
+	iov[0].iov_base = &rep;
+	iov[0].iov_len = sizeof(rep);
+	iov[1].iov_base = kmap(pg);
+	iov[1].iov_len = PAGE_SIZE;
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	err = vfs_writev(file, iov, 2, &file->f_pos);
+	set_fs(oldfs);
+	kunmap(pg);
+	if (err < 0)
+		return err;
+	if (err != sizeof(rep) + PAGE_SIZE)
+		return -EIO;
+	return 0;
+}
+
+static int flush_transfer(struct iter_data *iter, cpt_context_t *ctx)
+{
+	int err = 0;
+	int slot;
+
+	for (slot = 0; slot < iter->xfer_nr; slot++) {
+		struct page *pg = iter->xfer_batch[slot];
+		if (!err)
+			err = submit_page(pg, ctx);
+		page_cache_release(pg);
+	}
+	iter->xfer_nr = 0;
+	return err;
+}
+
+static inline int iter_one_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+			       unsigned long addr, unsigned long end,
+			       struct iter_data *iter, cpt_context_t *ctx)
+{
+	int err = 0;
+	pte_t *pte;
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	do {
+		pte_t ptent = *pte;
+		struct page *pg;
+		int retr = 0;
+
+retry:
+		if (pte_none(ptent))
+			continue;
+		if (!pte_present(*pte)) {
+			if (pte_file(ptent))
+				continue;
+
+			pte_unmap_unlock(pte, ptl);
+			err = handle_mm_fault(mm, vma, addr, 0);
+			if (err & VM_FAULT_OOM)
+				return -ENOMEM;
+			if (err & VM_FAULT_ERROR)
+				return -EFAULT;
+			err = 0;
+			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+			ptent = *pte;
+			retr = 1;
+			goto retry;
+		}
+
+		pg = vm_normal_page(vma, addr, ptent);
+
+		if (pg == NULL ||
+		    !pg->mapping ||
+		    !PageAnon(pg) ||
+		    PageReserved(pg) ||
+		    pg == ZERO_PAGE(addr))
+			continue;
+
+		if (iter->iter >= 0) {
+			if (ptep_test_and_clear_young(vma, addr, pte) && !retr)
+				iter->iter_young++;
+		}
+
+		if (iter->iter == 0) {
+			/* Just clear the state */
+			ClearPageCheckpointed(pg);
+			iter->iter_new++;
+			continue;
+		}
+
+		if (PageCheckpointed(pg)) {
+			if (pte_write(ptent)) {
+				pte_unmap_unlock(pte, ptl);
+				eprintk("COW lost %lu %lu!\n", addr, page_to_pfn(pg));
+				return -EFAULT;
+			}
+			continue;
+		}
+
+		iter->iter_new++;
+		get_page(pg);
+		SetPageCheckpointed(pg);
+		ptep_set_wrprotect(vma->vm_mm, addr, pte);
+		if (add_to_xfer_list(pg, iter, ctx)) {
+			pte_unmap_unlock(pte, ptl);
+			err = flush_transfer(iter, ctx);
+			flush_tlb_range(vma, vma->vm_start, vma->vm_end);
+			if (err)
+				return err;
+			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	pte_unmap_unlock(pte - 1, ptl);
+
+	return err;
+}
+
+static inline int
+iter_one_pud(struct vm_area_struct * vma, pud_t *pud,
+		unsigned long addr, unsigned long end, struct iter_data *iter,
+		cpt_context_t *ctx)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		int err;
+		next = pmd_addr_end(addr, end);
+		split_huge_page_pmd(vma->vm_mm, pmd);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		err = iter_one_pmd(vma, pmd, addr, next, iter, ctx);
+		if (err)
+			return err;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+static inline int
+iter_one_pgd(struct vm_area_struct * vma, pgd_t *pgd,
+	     unsigned long addr, unsigned long end, struct iter_data *iter,
+	     cpt_context_t *ctx)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		int err;
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		err = iter_one_pud(vma, pud, addr, next, iter, ctx);
+		if (err)
+			return err;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+static int iter_one_vma(struct iter_data *iter, struct vm_area_struct *vma,
+			struct task_struct *tsk, cpt_context_t *ctx)
+{
+	pgd_t *pgd;
+	unsigned long addr, end, next;
+
+	addr = vma->vm_start;
+	end = vma->vm_end;
+
+	pgd = pgd_offset(vma->vm_mm, addr);
+	do {
+		int err;
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		err = iter_one_pgd(vma, pgd, addr, next, iter, ctx);
+		if (err)
+			return err;
+	} while (pgd++, addr = next, addr != end);
+	return 0;
+}
+
+static int iter_one_mm(struct task_struct *tsk, struct mm_struct *mm,
+		       void *data, cpt_context_t *ctx)
+{
+	int err = 0, err2 = 0;
+	struct iter_data *iter = data;
+	struct vm_area_struct *vma;
+
+	/* OK, now we are going to scan VM */
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		/* We have to mangle page protection bits on shared memory
+		 * vmas to enforce write protection.
+		 */
+		if (ctx->iter_shm_start &&
+		    (vma->vm_flags & VM_SHARED) &&
+		    vma->vm_file &&
+		    vma->vm_file->f_vfsmnt == VE_TASK_INFO(tsk)->owner_env->shmem_mnt) {
+			int vm_flags = vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
+			pgprot_t nprot = vm_get_page_prot(vm_flags);
+			pgprot_t oprot = vm_get_page_prot(vm_flags|VM_SHARED);
+
+			if (iter->iter == 0) {
+				if (pgprot_val(oprot) != pgprot_val(vma->vm_page_prot)) {
+					iprintk_ctx("Unusual page protection %llx %llx " CPT_FID " %lx\n",
+						    (unsigned long long)pgprot_val(oprot),
+						    (unsigned long long)pgprot_val(vma->vm_page_prot),
+						    CPT_TID(tsk), vma->vm_start);
+					ctx->iter_shm_start = 0;
+				} else {
+					vma->vm_page_prot = nprot;
+				}
+			} else {
+				/* Old vmas were updated at the 0th iteration.
+				 * New ones must have the correct protection
+				 * because we set AS_CHECKPOINT on the shmem
+				 * mapping. If pgprot is wrong, something has
+				 * gone wrong.
+				 */
+				if (pgprot_val(nprot) != pgprot_val(vma->vm_page_prot)) {
+					iprintk_ctx("Page protection lost\n");
+					ctx->iter_shm_start = 0;
+				}
+			}
+		}
+
+		/* Handle only true, simple anonymous VMAs. */
+		if (!vma->anon_vma)
+			continue;
+		if (is_vm_hugetlb_page(vma))
+			continue;
+		if ((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) != VM_MAYWRITE)
+			continue;
+		err = iter_one_vma(iter, vma, tsk, ctx);
+		if (iter->xfer_nr) {
+			flush_tlb_range(vma, vma->vm_start, vma->vm_end);
+			if (iter->iter)
+				err2 = flush_transfer(iter, ctx);
+		}
+		if (err || err2)
+			break;
+	}
+	up_read(&mm->mmap_sem);
+	return err ? : err2;
+}
+
+int cpt_walk_mm(int (*doit)(struct task_struct *tsk, struct mm_struct *mm,
+			  void *data, cpt_context_t *ctx),
+		void *data,
+		cpt_context_t *ctx)
+{
+	int err = 0;
+	struct task_struct *p;
+	struct ve_struct *env;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (env == NULL)
+		return -ESRCH;
+
+	write_lock_irq(&tasklist_lock);
+
+	do {
+		struct mm_struct *mm;
+
+		/* VE is empty, stop scanning. */
+		if (list_empty(&env->vetask_auxlist))
+			break;
+
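+		/*
+		 * Rotate: take the task at the tail and move it to the
+		 * head, so one full pass visits every task once even
+		 * though tasklist_lock is dropped below.
+		 */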
+		p = list_entry(env->vetask_auxlist.prev, struct task_struct, ve_task_info.aux_list);
+		list_del(&VE_TASK_INFO(p)->aux_list);
+		list_add(&VE_TASK_INFO(p)->aux_list, &env->vetask_auxlist);
+
+		get_task_struct(p);
+		write_unlock_irq(&tasklist_lock);
+
+		mm = get_task_mm(p);
+		if (mm) {
+			err = doit(p, mm, data, ctx);
+			mmput(mm);
+		}
+
+		put_task_struct(p);
+
+		cond_resched();
+
+		write_lock_irq(&tasklist_lock);
+		if (err)
+			break;
+	} while (p != __first_task_ve(env));
+
+	write_unlock_irq(&tasklist_lock);
+
+	put_ve(env);
+
+	return err;
+}
+
+/* Just clear the state */
+
+static int iter_one_shm_zero(struct inode * inode,
+			     void *data, cpt_context_t *ctx)
+{
+	struct iter_data *iter = data;
+	unsigned long idx;
+
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+
+	for (idx = 0;
+	     idx < (i_size_read(inode)+PAGE_CACHE_SIZE-1)/PAGE_CACHE_SIZE;
+	     idx++) {
+		struct page * pg;
+
+		pg = find_lock_page(inode->i_mapping, idx);
+		if (pg && !radix_tree_exceptional_entry(pg)) {
+			ClearPageCheckpointed(pg);
+			iter->iter_new++;
+			iter->iter_shm++;
+			unlock_page(pg);
+			page_cache_release(pg);
+		}
+	}
+	if (iter->iter_shm) {
+		set_bit(AS_CHECKPOINT, &inode->i_mapping->flags);
+		ctx->iter_shm_start = 1;
+	}
+	return 0;
+}
+
+static int write_protect(struct page * page)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int mapcnt = 0;
+
+	SetPageCheckpointed(page);
+
+	if (!page_mapcount(page))
+		return 0;
+
+	/* Lazy... */
+	if (!list_empty(&mapping->i_mmap_nonlinear)) {
+		ClearPageCheckpointed(page);
+		return -EBUSY;
+	}
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &page->mapping->i_mmap,
+			      pgoff, pgoff) {
+		unsigned long addr = vma_address(page, vma);
+		BUG_ON(IS_ERR_VALUE(addr));
+		if (cpt_check_page(vma, addr, page, 1)) {
+			flush_tlb_page(vma, addr);
+			mapcnt++;
+		}
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+	return mapcnt;
+}
+
+int cpt_verify_wrprot(struct page * page, cpt_context_t * ctx)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int mapcnt = 0;
+
+	if (!list_empty(&mapping->i_mmap_nonlinear)) {
+		iprintk_ctx("Unexpected nonlinear mapping %Ld\n", ctx->file->f_pos);
+		return -EBUSY;
+	}
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &page->mapping->i_mmap,
+			      pgoff, pgoff) {
+		unsigned long addr = vma_address(page, vma);
+		BUG_ON(IS_ERR_VALUE(addr));
+		if (cpt_check_page(vma, addr, page, -1)) {
+			mapcnt++;
+		}
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+	if (mapcnt)
+		iprintk("WRPROT broken, %Ld\n", ctx->file->f_pos);
+	return mapcnt;
+}
+
+static int
+iter_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+		struct splice_desc *sd)
+{
+	struct iter_data * iter = sd->u.data;
+	struct page *page = buf->page;
+	cpt_context_t * ctx = iter->ctx;
+	int err = 0, ret;
+
+	ret = buf->ops->confirm(pipe, buf);
+	if (unlikely(ret))
+		return ret;
+
+	if (page != ZERO_PAGE(0) && !cpt_page_is_zero(page)) {
+		lock_page(page);
+		if (!PageCheckpointed(page)) {
+			if (write_protect(page) >= 0) {
+				iter->iter_new++;
+				iter->iter_shm++;
+				get_page(page);
+				if (add_to_xfer_list(page, iter, ctx))
+					err = flush_transfer(iter, ctx);
+			}
+		}
+		unlock_page(page);
+	}
+
+	return (err) ? : sd->len;
+}
+
+static int
+iter_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
+{
+	return __splice_from_pipe(pipe, sd, iter_actor);
+}
+
+static int iter_one_shm(struct inode * inode,
+			void *data, cpt_context_t *ctx)
+{
+	struct iter_data *iter = data;
+	struct file dummyf;
+	struct dentry dummyd;
+	struct splice_desc sd;
+	ssize_t retval;
+
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+
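+	/* Build a minimal fake file over the shmem inode so that
+	 * splice_direct_to_actor() can read its pages through the
+	 * normal splice path. */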
+	dummyf.f_op = fops_get(inode->i_fop);
+	dummyf.f_mapping = inode->i_mapping;
+	dummyf.f_dentry = &dummyd;
+	dummyf.f_flags = O_NOATIME;
+	dummyf.f_mode = FMODE_READ;
+	dummyd.d_inode = inode;
+
+	sd.len = 0;
+	sd.total_len = 0x40000000UL;
+	sd.flags = 0;
+	sd.pos = 0;
+	sd.u.data = iter;
+
+	retval = splice_direct_to_actor(&dummyf, &sd, iter_splice_actor);
+
+	fops_put(dummyf.f_op);
+
+	return (retval < 0) ? retval : 0;
+}
+
+extern spinlock_t inode_lock;
+
+int cpt_walk_shm(int (*doit)(struct inode * inode,
+			     void *data, cpt_context_t *ctx),
+		void *data,
+		cpt_context_t *ctx)
+{
+	int err = 0;
+	struct ve_struct *env;
+	struct super_block *sb;
+	struct inode *inode, *old;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (env == NULL)
+		return -ESRCH;
+
+	down_read(&env->op_sem);
+	err = -ESRCH;
+	if (!env->is_running)
+		goto out;
+
+	err = 0;
+	if (env->shmem_mnt == NULL || (sb = env->shmem_mnt->mnt_sb) == NULL)
+		goto out;
+
+	old = NULL;
+	spin_lock(&inode_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_WILL_FREE))
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_lock);
+
+		err = doit(inode, data, ctx);
+
+		if (old != NULL)
+			iput(old);
+		old = inode;
+		spin_lock(&inode_lock);
+
+		if (err)
+			break;
+	}
+	spin_unlock(&inode_lock);
+	if (old != NULL)
+		iput(old);
+
+out:
+	up_read(&env->op_sem);
+
+	put_ve(env);
+
+	return err;
+}
+
+static int nread(struct file *file, void *buf, int len)
+{
+	int offset = 0;
+
+	while (offset < len) {
+		int res;
+		mm_segment_t oldfs;
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		res = vfs_read(file, buf+offset, len-offset, &file->f_pos);
+		set_fs(oldfs);
+		if (res < 0)
+			return res;
+		if (res == 0)
+			return -EIO;
+		offset += res;
+	}
+	return 0;
+}
+
+int cpt_iteration(cpt_context_t *ctx)
+{
+	int err;
+	int prev_iter, first_iter, prev_young;
+	struct iter_data *iter;
+	int tmo;
+
+#ifdef ITER_DEBUG
+	ctx->pagein_file_out = filp_open("/var/tmp/dmp_", O_WRONLY|O_TRUNC|O_CREAT, 0666);
+	if (IS_ERR(ctx->pagein_file_out))
+		ctx->pagein_file_out = NULL;
+#endif
+
+	if (ctx->pagein_file_out == NULL)
+		return -EBADF;
+
+	iter = kmalloc(sizeof(struct iter_data), GFP_KERNEL);
+	if (iter == NULL)
+		return -ENOMEM;
+	memset(iter, 0, sizeof(struct iter_data));
+
+	iter->ctx = ctx;
+
+	/* Clear the state */ 
+	cpt_walk_shm(iter_one_shm_zero, iter, ctx);
+	cpt_walk_mm(iter_one_mm, iter, ctx);
+
+	iter->iter_new = iter->iter_young = iter->iter_shm = 0;
+	iter->iter = 1;
+	err = cpt_walk_mm(iter_one_mm, iter, ctx);
+	if (!err && ctx->iter_shm_start)
+		err = cpt_walk_shm(iter_one_shm, iter, ctx);
+	prev_iter = first_iter = iter->iter_new;
+	prev_young = iter->iter_young;
+	dprintk_ctx("%d: Found %d pages, %d young, %d shm\n",
+		    iter->iter, prev_iter, iter->iter_young, iter->iter_shm);
+	iter->iter_new = iter->iter_young = iter->iter_shm = 0;
+	if (err)
+		goto out;
+
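+	/* Start with ~50 ms between passes; the delay shrinks while
+	 * passes keep finding many new or young pages. */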
+	tmo = HZ/20;
+
+	for (;;) {
+		iter->iter++;
+		current->state = TASK_UNINTERRUPTIBLE;
+		schedule_timeout(tmo);
+		err = cpt_walk_mm(iter_one_mm, iter, ctx);
+		if (err)
+			break;
+		if (ctx->iter_shm_start) {
+			err = cpt_walk_shm(iter_one_shm, iter, ctx);
+			if (err)
+				break;
+		}
+		dprintk_ctx("%d: Found %d pages, %d young, %d shm, %d tmo\n",
+			    iter->iter, iter->iter_new, iter->iter_young,
+			    iter->iter_shm, tmo);
+		if (iter->iter_new > prev_iter/2 ||
+		    iter->iter_young > prev_young/2) {
+			tmo /= 2;
+			if (tmo < 2)
+				tmo = 2;
+		}
+		if (iter->iter_new > first_iter/2 ||
+		    iter->iter_new < 10 ||
+		    iter->iter > 10) {
+			current->state = TASK_UNINTERRUPTIBLE;
+			schedule_timeout(tmo/2);
+			iter->iter = -1;
+			prev_iter = iter->iter_new;
+			iter->iter_new = iter->iter_shm = 0;
+			cpt_walk_mm(iter_one_mm, iter, ctx);
+			if (ctx->iter_shm_start)
+				cpt_walk_shm(iter_one_shm, iter, ctx);
+			dprintk_ctx("%d: Found %d pages, shm %d, tmo %d\n",
+				    iter->iter, iter->iter_new,
+				    iter->iter_shm, tmo);
+			ctx->iter_done = 1;
+#ifndef ITER_DEBUG
+			do {
+				union {
+					struct pgin_reply rep;
+					struct pgin_request req;
+				} u;
+				mm_segment_t oldfs;
+				struct file * file = ctx->pagein_file_out;
+
+				u.rep.rmid = PGIN_RMID;
+				u.rep.error = ITER_STOP;
+				u.rep.handle = 0;
+
+				oldfs = get_fs(); set_fs(KERNEL_DS);
+				vfs_write(file, (void*)&u.rep, sizeof(u.rep), &file->f_pos);
+				err = nread(ctx->pagein_file_in, &u.req, sizeof(u.req));
+				set_fs(oldfs);
+				if (!err) {
+					if (u.req.rmid != PGIN_RMID ||
+					    u.req.size != PGIN_STOP)
+						err = -EIO;
+				}
+			} while (0);
+#endif
+			break;
+		}
+		prev_iter = iter->iter_new;
+		prev_young = iter->iter_young;
+		first_iter /= 2;
+		iter->iter_new = iter->iter_young = iter->iter_shm = 0;
+	}
+
+out:
+	if (err) {
+		if (ctx->pagein_file_out) {
+			fput(ctx->pagein_file_out);
+			ctx->pagein_file_out = NULL;
+		}
+		if (ctx->pagein_file_in) {
+			fput(ctx->pagein_file_in);
+			ctx->pagein_file_in = NULL;
+		}
+	}
+	kfree(iter);
+	return err;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_kernel.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_kernel.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_kernel.c	2015-01-21 12:02:48.224093631 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_kernel.c	2015-01-21 12:02:51.105017152 +0300
@@ -0,0 +1,187 @@
+/*
+ *
+ *  kernel/cpt/cpt_kernel.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#define __KERNEL_SYSCALLS__ 1
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+#include <linux/cpt_image.h>
+
+#include "cpt_kernel.h"
+#include "cpt_syscalls.h"
+
+int debug_level = 1;
+
+#ifdef CONFIG_X86_32
+
+/*
+ * Create a kernel thread
+ */
+extern void kernel_thread_helper(void);
+int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
+{
+	struct pt_regs regs;
+
+	memset(&regs, 0, sizeof(regs));
+
+	regs.bx = (unsigned long) fn;
+	regs.dx = (unsigned long) arg;
+
+	regs.ds = __USER_DS;
+	regs.es = __USER_DS;
+	regs.fs = __KERNEL_PERCPU;
+	regs.gs = __KERNEL_STACK_CANARY;
+	regs.orig_ax = -1;
+	regs.ip = (unsigned long) kernel_thread_helper;
+	regs.cs = __KERNEL_CS | get_kernel_rpl();
+	regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+
+	/* Ok, create the new process.. */
+	return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL, pid);
+}
+#endif
+
+#ifdef CONFIG_IA64
+pid_t
+asm_kernel_thread (int (*fn)(void *), void *arg, unsigned long flags, pid_t pid)
+{
+	extern void start_kernel_thread (void);
+	unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
+	struct {
+		struct switch_stack sw;
+		struct pt_regs pt;
+	} regs;
+
+	memset(&regs, 0, sizeof(regs));
+	regs.pt.cr_iip = helper_fptr[0];	/* set entry point (IP) */
+	regs.pt.r1 = helper_fptr[1];		/* set GP */
+	regs.pt.r9 = (unsigned long) fn;	/* 1st argument */
+	regs.pt.r11 = (unsigned long) arg;	/* 2nd argument */
+	/* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read.  */
+	regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
+	regs.pt.cr_ifs = 1UL << 63;		/* mark as valid, empty frame */
+	regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
+	regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
+	regs.sw.pr = (1 << 2 /*PRED_KERNEL_STACK*/);
+	return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL, pid);
+}
+#endif
+
+int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
+{
+	pid_t ret;
+
+	if (current->fs == NULL) {
+		/* do_fork_pid() hates processes without fs, oopses. */
+		printk("CPT BUG: local_kernel_thread: current->fs==NULL\n");
+		return -EINVAL;
+	}
+	if (!try_module_get(THIS_MODULE))
+		return -EBUSY;
+
+	ret = asm_kernel_thread(fn, arg, flags, pid);
+	if (ret < 0)
+		module_put(THIS_MODULE);
+	return ret;
+}
+
+unsigned int test_cpu_caps_and_features(void)
+{
+#define has_cpu_cap(cap) test_bit((cap), (unsigned long *)caps)
+
+	u32 caps[RHNCAPINTS];
+	unsigned int flags = 0;
+
+#ifdef CONFIG_X86
+	get_cpu_cap_masked(caps);
+
+	if (has_cpu_cap(X86_FEATURE_CMOV))
+		flags |= 1 << CPT_CPU_X86_CMOV;
+	if (has_cpu_cap(X86_FEATURE_FXSR))
+		flags |= 1 << CPT_CPU_X86_FXSR;
+	if (has_cpu_cap(X86_FEATURE_XMM))
+		flags |= 1 << CPT_CPU_X86_SSE;
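+	/* SSE2 is architectural on x86_64, so it is only tested on 32-bit. */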
+#ifndef CONFIG_X86_64
+	if (has_cpu_cap(X86_FEATURE_XMM2))
+#endif
+		flags |= 1 << CPT_CPU_X86_SSE2;
+	if (has_cpu_cap(X86_FEATURE_XMM4_1))
+		flags |= 1 << CPT_CPU_X86_SSE4_1;
+	if (has_cpu_cap(X86_FEATURE_XMM4_2))
+		flags |= 1 << CPT_CPU_X86_SSE4_2;
+	if (has_cpu_cap(X86_FEATURE_MMX))
+		flags |= 1 << CPT_CPU_X86_MMX;
+	if (has_cpu_cap(X86_FEATURE_3DNOW))
+		flags |= 1 << CPT_CPU_X86_3DNOW;
+	if (has_cpu_cap(X86_FEATURE_3DNOWEXT))
+		flags |= 1 << CPT_CPU_X86_3DNOW2;
+	if (has_cpu_cap(X86_FEATURE_SSE4A))
+		flags |= 1 << CPT_CPU_X86_SSE4A;
+	if (has_cpu_cap(X86_FEATURE_SYSCALL))
+		flags |= 1 << CPT_CPU_X86_SYSCALL;
+#ifdef CONFIG_X86_64
+	if (has_cpu_cap(X86_FEATURE_SYSCALL) &&
+			boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		flags |= 1 << CPT_CPU_X86_SYSCALL32;
+#endif
+	if (has_cpu_cap(X86_FEATURE_SEP)
+#ifdef CONFIG_X86_64
+			&& boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
+#endif
+	   )
+		flags |= ((1 << CPT_CPU_X86_SEP) | (1 << CPT_CPU_X86_SEP32));
+
+	if (has_cpu_cap(X86_FEATURE_XSAVE))
+		flags |= 1 << CPT_CPU_X86_XSAVE;
+
+	if (has_cpu_cap(X86_FEATURE_AVX))
+		flags |= 1 << CPT_CPU_X86_AVX;
+
+	if (has_cpu_cap(X86_FEATURE_AES))
+		flags |= 1 << CPT_CPU_X86_AESNI;
+
+	if (has_cpu_cap(X86_FEATURE_RDRAND))
+		flags |= 1 << CPT_CPU_X86_RDRAND;
+
+#ifdef CONFIG_X86_64
+	flags |= 1 << CPT_CPU_X86_EMT64;
+#endif
+#endif
+#ifdef CONFIG_IA64
+	flags |= 1 << CPT_CPU_X86_IA64;
+	flags |= 1 << CPT_CPU_X86_FXSR;
+#endif
+	if (!is_sock_registered(PF_INET6))
+		flags |= 1 << CPT_NO_IPV6;
+
+	flags |= 1 << CPT_NAMESPACES;
+
+	return flags;
+
+#undef has_cpu_cap
+}
+
+unsigned int test_kernel_config(void)
+{
+	unsigned int flags = 0;
+#ifdef CONFIG_X86
+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+	flags |= 1 << CPT_KERNEL_CONFIG_PAE;
+#endif
+#endif
+	return flags;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_kernel.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_kernel.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_kernel.h	2015-01-21 12:02:48.224093631 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_kernel.h	2015-01-21 12:02:51.014019567 +0300
@@ -0,0 +1,81 @@
+/* Interface to kernel vars which we had to _add_. */
+
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
+#define TASK_TRACED TASK_STOPPED
+#define unix_peer(sk) ((sk)->sk_pair)
+#define page_mapcount(pg) ((pg)->mapcount)
+#else
+#define unix_peer(sk) (unix_sk(sk)->peer)
+#endif
+
+#ifdef CONFIG_IA64
+#define cpu_has_fxsr 1
+#endif
+
+#define CPT_SIG_IGNORE_MASK (\
+        (1 << (SIGCONT - 1)) | (1 << (SIGCHLD - 1)) | \
+	(1 << (SIGWINCH - 1)) | (1 << (SIGURG - 1)))
+
+static inline void do_gettimespec(struct timespec *ts)
+{
+	struct timeval tv;
+	do_gettimeofday(&tv);
+	ts->tv_sec = tv.tv_sec;
+	ts->tv_nsec = tv.tv_usec*1000;
+}
+
+int local_kernel_thread(int (*fn)(void *),
+		void * arg,
+		unsigned long flags,
+		pid_t pid);
+int asm_kernel_thread(int (*fn)(void *),
+		void * arg,
+		unsigned long flags,
+		pid_t pid);
+
+unsigned int test_cpu_caps_and_features(void);
+int rst_image_acceptable(unsigned long version);
+unsigned int test_kernel_config(void);
+
+#define test_one_flag(src, dst, flag, message, ret) \
+if (src & (1 << flag)) \
+	if (!(dst & (1 << flag))) { \
+		eprintk_ctx("Destination cpu does not have " message "\n"); \
+		ret = VECAPS_NO_CPU_FEATURE; \
+	}
+
+static inline void
+_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
+{
+	while (nsec >= NSEC_PER_SEC) {
+		nsec -= NSEC_PER_SEC;
+		++sec;
+	}
+	while (nsec < 0) {
+		nsec += NSEC_PER_SEC;
+		--sec;
+	}
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}
+
+static inline struct timespec
+_ns_to_timespec(const s64 nsec)
+{
+	struct timespec ts;
+	s32 rem;
+
+	if (!nsec)
+		return (struct timespec) {0, 0};
+
+	ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
+	if (unlikely(rem < 0)) {
+		ts.tv_sec--;
+		rem += NSEC_PER_SEC;
+	}
+	ts.tv_nsec = rem;
+
+	return ts;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_mm.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_mm.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_mm.c	2015-01-21 12:02:48.225093605 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_mm.c	2015-01-21 12:02:51.087017630 +0300
@@ -0,0 +1,1028 @@
+/*
+ *
+ *  kernel/cpt/cpt_mm.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/errno.h>
+#include <linux/ve.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#ifdef CONFIG_X86
+#include <asm/ldt.h>
+#endif
+#include <asm/mmu.h>
+#include <linux/cpt_image.h>
+#include <linux/shm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_ubc.h"
+
+static int is_packet_sock_vma(struct vm_area_struct *vma)
+{
+	extern const struct vm_operations_struct packet_mmap_ops;
+	return vma->vm_ops == &packet_mmap_ops;
+}
+
+/*
+ * Locking order between mmap_sem and i_mutex
+ *
+ * vfs_write() -> get_user_pages()	: i_mutex    -> mmap_sem
+ * dup_mmap()				: mmap_sem   -> mmap_sem/1
+ * cpt_dump_vm() -> file_write()	: mmap_sem/2 -> i_mutex
+ */
+#define MMAP_SEM_CPT_DUMP	2
+
+static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx,
+			       cpt_context_t *ctx)
+{
+	if (!list_empty(&aio_ctx->run_list)) {
+		/* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */
+		eprintk_ctx("run list is not empty, cannot suspend AIO\n");
+		return -EBUSY;
+	}
+
+	/* Wait for pending IOCBs. Linux AIO is mostly _fake_.
+	 * It is actually synchronous, except for direct IO and
+	 * some funny raw USB things, which cannot happen inside VE.
+	 * However, we do this for the future.
+	 *
+	 * Later note: in 2.6.16 we may allow O_DIRECT, so this
+	 * code is not meaningless.
+	 */
+	wait_for_all_aios(aio_ctx);
+
+	if (!list_empty(&aio_ctx->run_list) ||
+	    !list_empty(&aio_ctx->active_reqs) ||
+	    aio_ctx->reqs_active) {
+		eprintk_ctx("were not able to suspend AIO\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx)
+{
+	struct vm_area_struct *vma;
+	struct hlist_node *n;
+	struct kioctx *aio_ctx;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_file) {
+			if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL)
+				return -ENOMEM;
+		}
+	}
+
+	if (mm->exe_file &&
+	    cpt_object_add(CPT_OBJ_FILE, mm->exe_file, ctx) == NULL)
+		return -ENOMEM;
+
+#ifdef CONFIG_BEANCOUNTERS
+	if (cpt_add_ubc(mm->mm_ub, ctx) == NULL)
+		return -ENOMEM;
+#endif
+
+	hlist_for_each_entry(aio_ctx, n, &mm->ioctx_list, list) {
+		int err;
+
+		if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
+			return err;
+	}
+
+	return 0;
+}
+
+int cpt_collect_mm(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+	int err;
+	int index;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL)
+			return -ENOMEM;
+	}
+
+	index = 1;
+	for_each_object(obj, CPT_OBJ_MM) {
+		struct mm_struct *mm = obj->o_obj;
+		struct task_struct *g, *p;
+		int mm_users = 0;
+
+		rcu_read_lock();
+		do_each_thread_all(g, p) {
+			if (p->mm == mm)
+				mm_users++;
+		} while_each_thread_all(g, p);
+		rcu_read_unlock();
+
+		if (obj->o_count != mm_users) {
+			eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, mm_users);
+			return -EAGAIN;
+		}
+		cpt_obj_setindex(obj, index++, ctx);
+
+		if ((err = collect_one_mm(mm, ctx)) != 0)
+			return err;
+	}
+
+	return 0;
+}
+
+static int zcnt, scnt, scnt0, ucnt;
+
+/* where_is_anon_page() returns the location of an anonymous page in the mm
+ * of an already dumped process. This happens e.g. after fork(). We do not use
+ * this right now, just keep statistics; it is difficult to restore such state,
+ * but the most direct use is to save space in the dumped image. */
+
+
+static inline unsigned long
+vma_address0(struct page *page, struct vm_area_struct *vma)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	unsigned long address;
+
+	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+		address |= 1;
+	return address;
+}
+
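+/*
+ * Returns non-zero iff @page is mapped at @address in @vma.
+ * wrprot > 0: also write-protect the pte;
+ * wrprot < 0: instead return whether the pte is writable.
+ */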
+int cpt_check_page(struct vm_area_struct *vma, unsigned long address,
+		   struct page *page, int wrprot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	spinlock_t *ptl;
+	int result;
+
+	pgd = pgd_offset(mm, address);
+	if (unlikely(!pgd_present(*pgd)))
+		return 0;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return 0;
+
+	pmd = pmd_offset(pud, address);
+	if (unlikely(!pmd_present(*pmd)))
+		return 0;
+
+	result = 0;
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte)) {
+		pte_unmap(pte);
+		return 0;
+	}
+
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
+		result = 1;
+		if (wrprot < 0)
+			result = pte_write(*pte);
+		else if (wrprot)
+			ptep_set_wrprotect(mm, address, pte);
+	}
+	pte_unmap_unlock(pte, ptl);
+	return result;
+}
+
+static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr,
+				 struct page *page, cpt_context_t * ctx)
+{
+	loff_t mmptr = CPT_NULL;
+	struct anon_vma *anon_vma;
+	struct anon_vma_chain *avc;
+	int idx = mmobj->o_index;
+
+	if (!PageAnon(page))
+		return CPT_NULL;
+
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
+		return CPT_NULL;
+
+	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+		struct vm_area_struct *vma = avc->vma;
+		unsigned long addr = vma_address0(page, vma);
+		cpt_object_t *obj;
+
+		/* We do not try to support mremapped regions (addr != mapaddr),
+		 * only mmaps directly inherited via fork().
+		 * With this limitation we may check self-consistency of
+		 * vmas (vm_start, vm_pgoff, anon_vma) before
+		 * doing __copy_page_range() in rst_mm.
+		 */
+		if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) {
+			obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx);
+			if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) {
+				if (cpt_check_page(vma, addr, page, 0)) {
+					mmptr = obj->o_pos;
+					idx = obj->o_index;
+				}
+			}
+		}
+	}
+	page_unlock_anon_vma(anon_vma);
+
+	return mmptr;
+}
+
+struct page_area
+{
+	int type;
+	unsigned long start;
+	unsigned long end;
+	pgoff_t pgoff;
+	loff_t mm;
+	__u64 list[16];
+
+#define MAX_PAGE_BATCH 16
+	struct page *pages[MAX_PAGE_BATCH];
+};
+
+struct page_desc
+{
+	int	type;
+	pgoff_t	index;
+	loff_t	mm;
+	int	shared;
+};
+
+enum {
+	PD_ABSENT,
+	PD_COPY,
+	PD_ZERO,
+	PD_CLONE,
+	PD_FUNKEY,
+	PD_ITER,
+	PD_ITERYOUNG,
+};
+
+/* PD_ABSENT (0): page can be obtained from the backstore, or is a not yet
+      mapped anonymous page, or something else that does not require a copy.
+   PD_COPY (1):   page requires a copy.
+   PD_ZERO (2):   page requires a copy but its content is zero. Quite useless.
+   PD_CLONE (3):  wrprotected page shared after fork(); it is to be COWed
+                  when modified.
+   PD_FUNKEY (4): page is something unsupported... We copy it right now.
+ */
+
+
+
+static void page_get_desc(cpt_object_t *mmobj,
+			  struct vm_area_struct *vma, unsigned long addr,
+			  struct page_desc *pdesc, cpt_context_t * ctx)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	struct page *pg = NULL;
+	pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
+
+	pdesc->index = linear_index;
+	pdesc->shared = 0;
+	pdesc->mm = CPT_NULL;
+
+	if (vma->vm_flags & VM_IO) {
+		pdesc->type = PD_ABSENT;
+		return;
+	}
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out_absent;
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out_absent;
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		goto out_absent;
+#ifdef CONFIG_X86
+	if (pmd_trans_huge(*pmd))
+		split_huge_page_pmd(mm, pmd);
+#endif
+
+	if (unlikely(pmd_bad(*pmd)))
+		goto out_absent;
+
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+retry:
+#endif
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = *ptep;
+	pte_unmap(ptep);
+
+	if (pte_none(pte))
+		goto out_absent_unlock;
+
+	if (!pte_present(pte)) {
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+		int err;
+#endif
+		if (pte_file(pte)) {
+			pdesc->index = pte_to_pgoff(pte);
+			goto out_absent_unlock;
+		}
+		if (vma->vm_flags & VM_SHARED) {
+			/* It is impossible: shared mappings cannot be in swap */
+			eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos);
+			goto out_unsupported_unlock;
+		}
+#ifdef CONFIG_PSWAP
+		if (ctx->pram_stream &&
+		    is_swap_pte(pte) && !non_swap_entry(pte_to_swp_entry(pte))) {
+			pdesc->type = PD_COPY;
+			goto out_unlock;
+		}
+#endif
+#if defined(CONFIG_VZ_CHECKPOINT_ITER) || defined(CONFIG_PSWAP)
+		/*
+		 * Bring it in from swap now, so that we save space at
+		 * least when the page is shared.
+		 */
+		spin_unlock(ptl);
+		err = handle_mm_fault(mm, vma, addr, 0);
+		if (err == VM_FAULT_SIGBUS)
+			goto out_absent;
+		if (err == VM_FAULT_OOM)
+			goto out_absent;
+		goto retry;
+#else
+		pdesc->type = PD_COPY;
+		goto out_unlock;
+#endif
+	}
+
+	if ((pg = vm_normal_page(vma, addr, pte)) == NULL) {
+		pdesc->type = PD_COPY;
+		goto out_unlock;
+	}
+
+	get_page(pg);
+
+	if (pg->mapping && !PageAnon(pg)) {
+		if (vma->vm_file == NULL) {
+			eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
+			goto out_unsupported_unlock;
+		}
+		/*
+		 * The vma and page mappings can differ if the inode has peers.
+		 * Actually the vma mapping must be in the page mapping's peer
+		 * list, but checking that here is overkill.
+		 *
+		 * The list checks are protected by ptl: close_inode_peer()
+		 * takes it to unmap this page and remove the inode from the
+		 * peers list.
+		 */
+		if (vma->vm_file->f_mapping != pg->mapping &&
+		    (list_empty(&vma->vm_file->f_mapping->i_peer_list) ||
+		     list_empty(&pg->mapping->i_peer_list))) {
+			eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n",
+				    addr, vma->vm_file->f_mapping, pg->mapping,
+				    mmobj->o_pos);
+			goto out_unsupported_unlock;
+		}
+		pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+		/* The page is in the backstore. For us that is the
+		 * same as not present.
+		 */
+		goto out_absent_unlock;
+	}
+
+	spin_unlock(ptl);
+
+	if (PageReserved(pg)) {
+		/* Special case: ZERO_PAGE is used when an
+		 * anonymous page is accessed but never written. */
+		if (pg == ZERO_PAGE(addr)) {
+			if (pte_write(pte)) {
+				eprintk_ctx("not funny already, writable ZERO_PAGE\n");
+				goto out_unsupported;
+			}
+			zcnt++;
+			goto out_absent;
+		}
+		eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index,
+			    addr, mmobj->o_pos);
+		goto out_unsupported;
+	}
+
+	if (pg == ZERO_PAGE(addr)) {
+		wprintk_ctx("that's how it works now\n");
+	}
+
+	if (!pg->mapping && !is_packet_sock_vma(vma)) {
+		print_bad_pte(vma, addr, pte, pg);
+		goto out_unsupported;
+	}
+
+	if (pg->mapping && page_mapcount(pg) > 1) {
+		pdesc->shared = 1;
+		pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx);
+		if (pdesc->mm != CPT_NULL) {
+			scnt0++;
+			pdesc->type = PD_CLONE;
+			goto out_put;
+		} else {
+			scnt++;
+		}
+	}
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+	if (ctx->iter_done && PageCheckpointed(pg)) {
+		if (pte_write(pte)) {
+			wprintk_ctx("writable PG_checkpointed page\n");
+		}
+		pdesc->index = page_to_pfn(pg);
+		pdesc->type = pte_young(pte) ? PD_ITERYOUNG : PD_ITER;
+		goto out_put;
+	}
+#endif
+	pdesc->type = PD_COPY;
+
+out_put:
+	if (pg)
+		put_page(pg);
+	return;
+
+out_unlock:
+	spin_unlock(ptl);
+	goto out_put;
+
+out_absent_unlock:
+	spin_unlock(ptl);
+out_absent:
+	pdesc->type = PD_ABSENT;
+	goto out_put;
+
+out_unsupported_unlock:
+	spin_unlock(ptl);
+out_unsupported:
+	ucnt++;
+	pdesc->type = PD_FUNKEY;
+	goto out_put;
+}
+
+static inline void dump_page(struct page *page, struct cpt_context *ctx)
+{
+	char *maddr;
+
+	maddr = kmap(page);
+	ctx->write(maddr, PAGE_SIZE, ctx);
+	kunmap(page);
+}
+
+/* ATTN: We pass "current" to get_user_pages(). This is wrong, but
+ * get_user_pages() does not really need it; it only stores some page
+ * fault stats there.
+ *
+ * BUG: some archs (e.g. sparc64, but not Intel) require flushing cache
+ * pages before accessing the vma.
+ */
+void dump_pages(struct vm_area_struct *vma, struct page_area *pa,
+	       	struct cpt_context *ctx)
+{
+	unsigned long start = pa->start;
+	int npages = (pa->end - pa->start) / PAGE_SIZE;
+	int count = 0;
+
+	while (count < npages) {
+		int copy = npages - count;
+		int n;
+
+		if (copy > MAX_PAGE_BATCH)
+			copy = MAX_PAGE_BATCH;
+		n = get_user_pages(current, vma->vm_mm, start, copy,
+				   0, 1, pa->pages, NULL);
+		if (n == copy) {
+			int i;
+			for (i=0; i<n; i++)
+				dump_page(pa->pages[i], ctx);
+		} else {
+			eprintk_ctx("get_user_pages fault\n");
+			for ( ; n > 0; n--)
+				page_cache_release(pa->pages[n-1]);
+			return;
+		}
+		start += n*PAGE_SIZE;
+		count += n;
+		for ( ; n > 0; n--)
+			page_cache_release(pa->pages[n-1]);
+	}
+	return;
+}
+
+int dump_page_block(struct vm_area_struct *vma, struct page_area *pa,
+		    struct cpt_context *ctx)
+{
+	loff_t saved_object;
+	struct cpt_page_block pgb;
+
+	cpt_push_object(&saved_object, ctx);
+
+	pgb.cpt_object = CPT_OBJ_PAGES;
+	pgb.cpt_hdrlen = sizeof(pgb);
+	pgb.cpt_content = (pa->type == PD_COPY) ?
+			CPT_CONTENT_DATA : CPT_CONTENT_VOID;
+#ifdef CONFIG_PRAM
+	if (pa->type == PD_COPY && ctx->pram_stream)
+		pgb.cpt_content = CPT_CONTENT_PRAM;
+#endif
+	pgb.cpt_start = pa->start;
+	pgb.cpt_end = pa->end;
+
+	ctx->write(&pgb, sizeof(pgb), ctx);
+	if (pa->type == PD_COPY) {
+		if (pgb.cpt_content == CPT_CONTENT_PRAM)
+			cpt_dump_pram(vma, pa->start, pa->end, ctx);
+		else
+			dump_pages(vma, pa, ctx);
+	}
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_object, ctx);
+	return 0;
+}
+
+int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa,
+			 struct cpt_context *ctx)
+{
+	struct cpt_remappage_block pgb;
+	loff_t saved_object;
+
+	cpt_push_object(&saved_object, ctx);
+
+	pgb.cpt_object = CPT_OBJ_REMAPPAGES;
+	pgb.cpt_hdrlen = sizeof(pgb);
+	pgb.cpt_content = CPT_CONTENT_VOID;
+	pgb.cpt_start = pa->start;
+	pgb.cpt_end = pa->end;
+	pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1;
+
+	ctx->write(&pgb, sizeof(pgb), ctx);
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_object, ctx);
+	return 0;
+}
+
+int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa,
+			struct cpt_context *ctx)
+{
+	struct cpt_copypage_block pgb;
+	loff_t saved_object;
+
+	cpt_push_object(&saved_object, ctx);
+
+	pgb.cpt_object = CPT_OBJ_COPYPAGES;
+	pgb.cpt_hdrlen = sizeof(pgb);
+	pgb.cpt_content = CPT_CONTENT_VOID;
+	pgb.cpt_start = pa->start;
+	pgb.cpt_end = pa->end;
+	pgb.cpt_source = pa->mm;
+
+	ctx->write(&pgb, sizeof(pgb), ctx);
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_object, ctx);
+	return 0;
+}
+
+int dump_iterpage_block(struct vm_area_struct *vma, struct page_area *pa,
+			cpt_context_t *ctx)
+{
+	struct cpt_iterpage_block pgb;
+	loff_t saved_object;
+
+	cpt_push_object(&saved_object, ctx);
+
+	pgb.cpt_object = pa->type == PD_ITER ? CPT_OBJ_ITERPAGES :
+		CPT_OBJ_ITERYOUNGPAGES;
+	pgb.cpt_hdrlen = sizeof(pgb);
+	pgb.cpt_content = CPT_CONTENT_VOID;
+	pgb.cpt_start = pa->start;
+	pgb.cpt_end = pa->end;
+	ctx->write(&pgb, sizeof(pgb), ctx);
+
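+	/* The header is followed by one 64-bit pfn per page in
+	 * [cpt_start, cpt_end). */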
+	ctx->write(pa->list, 8*((pa->end-pa->start)/PAGE_SIZE), ctx);
+
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_object, ctx);
+	return 0;
+}
+
+
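+/* Can the page described by @pd extend the current run @pa?  Runs of
+ * same-typed pages are flushed as single blocks once they stop growing. */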
+static int can_expand(struct page_area *pa, struct page_desc *pd)
+{
+	if (pa->start == pa->end)
+		return 1;
+	if (pa->type != pd->type)
+		return 0;
+	if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) {
+		if (pa->end - pa->start >= PAGE_SIZE*16)
+			return 0;
+		pa->list[(pa->end - pa->start)/PAGE_SIZE] = pd->index;
+	}
+	if (pa->type == PD_ABSENT)
+		return pd->index == pa->pgoff + 1;
+	if (pa->type == PD_CLONE)
+		return pd->mm == pa->mm;
+	return 1;
+}
+
+#ifdef CONFIG_X86_64
+extern int vdso_is_rhel5(struct page *page);
+static int vdso_is_old(struct vm_area_struct *vma)
+{
+	int n, ret;
+	struct page *p;
+
+	n = get_user_pages(current, vma->vm_mm, vma->vm_start, 1,
+			   0, 0, &p, NULL);
+	if (n < 1)
+		return -EINVAL;
+
+	ret = vdso_is_rhel5(p);
+
+	page_cache_release(p);
+
+	return ret;
+}
+#else
+#define vdso_is_old(vma) 0
+#endif
+
+static int dump_one_vma(cpt_object_t *mmobj,
+			struct vm_area_struct *vma, struct cpt_context *ctx)
+{
+	struct cpt_vma_image *v = cpt_get_buf(ctx);
+	unsigned long addr;
+	loff_t saved_object;
+	struct page_area *pa;
+	int cloned_pages = 0;
+
+	cpt_push_object(&saved_object, ctx);
+
+	v->cpt_object = CPT_OBJ_VMA;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_start = vma->vm_start;
+	v->cpt_end = vma->vm_end;
+	v->cpt_flags = vma->vm_flags;
+	if (vma->vm_flags&VM_HUGETLB) {
+		eprintk_ctx("huge TLB VMAs are still not supported\n");
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+	v->cpt_pgprot = vma->vm_page_prot.pgprot;
+	v->cpt_pgoff = vma->vm_pgoff;
+	v->cpt_file = CPT_NULL;
+#ifndef CONFIG_IA64
+	if ((void *)vma->vm_start == vma->vm_mm->context.vdso &&
+			vma->vm_ops == &special_mapping_vmops) {
+		int old = vdso_is_old(vma);
+
+		if (old < 0) {
+			eprintk_ctx("can't get vdso page\n");
+			cpt_release_buf(ctx);
+			return old;
+		}
+
+		if (old)
+			v->cpt_type = CPT_VMA_VDSO_OLD;
+		else
+			v->cpt_type = CPT_VMA_VDSO;
+	} else
+#endif
+		v->cpt_type = CPT_VMA_TYPE_0;
+	v->cpt_anonvma = 0;
+
+	/*
+	 * Dump anon_vma->root instead of the current anon_vma.
+	 * This simplifies the restore process and lets all processes share
+	 * a single vma structure after restore.
+	 * The absolute address of the anon_vma makes a handy identifier.
+	 * FIXME: Implement dumping the whole anon_vma tree
+	 */
+	if (vma->anon_vma)
+		v->cpt_anonvmaid = (unsigned long)vma->anon_vma->root;
+	else
+		v->cpt_anonvmaid = 0;
+
+	if (vma->vm_file) {
+		struct file *filp;
+		cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx);
+		BUG_ON(obj == NULL);
+		filp = obj->o_obj;
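+		/* SysV shm mapping: reference the underlying shm file instead */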
+		if (filp->f_op == &shm_file_operations) {
+			struct shm_file_data *sfd = filp->private_data;
+
+			v->cpt_type = CPT_VMA_TYPE_SHM;
+			obj = lookup_cpt_object(CPT_OBJ_FILE, sfd->file, ctx);
+		}
+		v->cpt_file = obj->o_pos;
+	}
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+	if (v->cpt_type == CPT_VMA_VDSO || v->cpt_type == CPT_VMA_VDSO_OLD)
+		goto out;
+
+	pa = cpt_get_buf(ctx);
+
+	pa->type = PD_ABSENT;
+	pa->pgoff = vma->vm_pgoff;
+	pa->mm = CPT_NULL;
+	pa->start = vma->vm_start;
+	pa->end = vma->vm_start;
+
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+		struct page_desc pd;
+
+		page_get_desc(mmobj, vma, addr, &pd, ctx);
+		cloned_pages += pd.shared;
+
+		if (pd.type == PD_FUNKEY) {
+			eprintk_ctx("dump_one_vma: funkey page\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+
+		if (!can_expand(pa, &pd)) {
+			if (pa->type == PD_COPY ||
+			    pa->type == PD_ZERO) {
+				dump_page_block(vma, pa, ctx);
+			} else if (pa->type == PD_CLONE) {
+				dump_copypage_block(vma, pa, ctx);
+				cloned_pages++;
+			} else if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) {
+				dump_iterpage_block(vma, pa, ctx);
+				cloned_pages++;
+			} else if (pa->type == PD_ABSENT &&
+				   pa->pgoff != (pa->end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) {
+				dump_remappage_block(vma, pa, ctx);
+			}
+			pa->start = addr;
+		}
+		pa->type = pd.type;
+		pa->end = addr + PAGE_SIZE;
+		pa->pgoff = pd.index;
+		if (addr == pa->start)
+			pa->list[0] = pd.index;
+		pa->mm = pd.mm;
+	}
+
+	if (pa->end > pa->start) {
+		if (pa->type == PD_COPY ||
+		    pa->type == PD_ZERO) {
+			dump_page_block(vma, pa, ctx);
+		} else if (pa->type == PD_CLONE) {
+			dump_copypage_block(vma, pa, ctx);
+			cloned_pages++;
+		} else if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) {
+			dump_iterpage_block(vma, pa, ctx);
+			cloned_pages++;
+		} else if (pa->type == PD_ABSENT &&
+			   pa->pgoff != (pa->end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) {
+			dump_remappage_block(vma, pa, ctx);
+		}
+	}
+
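+	/*
+	 * Some pages were cloned: patch the cpt_anonvma field of the VMA
+	 * header, which was already written above, in place via pwrite().
+	 */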
+	if (cloned_pages) {
+		__u32 anonvma = 1;
+		loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma);
+		ctx->pwrite(&anonvma, 4, ctx, anonpos);
+	}
+
+	cpt_release_buf(ctx);
+out:
+	cpt_close_object(ctx);
+
+	cpt_pop_object(&saved_object, ctx);
+
+	return 0;
+}
+
+static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx,
+			    cpt_context_t *ctx)
+{
+	loff_t saved_object;
+	struct cpt_aio_ctx_image aimg;
+
+	if (!list_empty(&aio_ctx->run_list) ||
+	    !list_empty(&aio_ctx->active_reqs) ||
+	    aio_ctx->reqs_active) {
+		eprintk_ctx("AIO is active after suspend\n");
+		return -EBUSY;
+	}
+
+	cpt_push_object(&saved_object, ctx);
+
+	aimg.cpt_next = CPT_ALIGN(sizeof(aimg));
+	aimg.cpt_object = CPT_OBJ_AIO_CONTEXT;
+	aimg.cpt_hdrlen = sizeof(aimg);
+	aimg.cpt_content = CPT_CONTENT_ARRAY;
+
+	aimg.cpt_max_reqs = aio_ctx->max_reqs;
+	aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages;
+	aimg.cpt_nr = aio_ctx->ring_info.nr;
+	aimg.cpt_tail = aio_ctx->ring_info.tail;
+	aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base;
+
+	ctx->write(&aimg, sizeof(aimg), ctx);
+
+	cpt_pop_object(&saved_object, ctx);
+	return 0;
+}
+
+static void dump_mm_auxv(struct mm_struct *mm, cpt_context_t *ctx)
+{
+	loff_t saved_object;
+	struct cpt_object_hdr hdr;
+	unsigned nwords = 0;
+	__u64 *auxv = cpt_get_buf(ctx);
+
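+	/* saved_auxv holds (type, value) pairs terminated by a zero type */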
+	while (mm->saved_auxv[nwords]) {
+		auxv[nwords] = mm->saved_auxv[nwords];
+		nwords++;
+		auxv[nwords] = mm->saved_auxv[nwords];
+		nwords++;
+	}
+
+	if (nwords) {
+		hdr.cpt_next = CPT_NULL;
+		hdr.cpt_object = CPT_OBJ_MM_AUXV;
+		hdr.cpt_hdrlen = sizeof(hdr);
+		hdr.cpt_content = CPT_CONTENT_DATA;
+
+		cpt_push_object(&saved_object, ctx);
+		cpt_open_object(NULL, ctx);
+		ctx->write(&hdr, sizeof(hdr), ctx);
+		ctx->write(auxv, nwords * sizeof(auxv[0]), ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_object, ctx);
+	}
+
+	cpt_release_buf(ctx);
+}
+
+static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct mm_struct *mm = obj->o_obj;
+	struct vm_area_struct *vma;
+	struct cpt_mm_image *v = cpt_get_buf(ctx);
+	struct kioctx *aio_ctx;
+	struct hlist_node *n;
+	int err;
+
+	down_write_nested(&mm->mmap_sem, MMAP_SEM_CPT_DUMP);
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = -1;
+	v->cpt_object = CPT_OBJ_MM;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_start_code = mm->start_code;
+	v->cpt_end_code = mm->end_code;
+	v->cpt_start_data = mm->start_data;
+	v->cpt_end_data = mm->end_data;
+	v->cpt_start_brk = mm->start_brk;
+	v->cpt_brk = mm->brk;
+	v->cpt_start_stack = mm->start_stack;
+	v->cpt_start_arg = mm->arg_start;
+	v->cpt_end_arg = mm->arg_end;
+	v->cpt_start_env = mm->env_start;
+	v->cpt_end_env = mm->env_end;
+	v->cpt_def_flags = mm->def_flags;
+#ifdef CONFIG_BEANCOUNTERS
+	v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx);
+#endif
+	v->cpt_mm_flags = mm->flags;
+	v->cpt_vps_dumpable = mm->vps_dumpable;
+	v->cpt_used_hugetlb = 0; /* not used */
+#ifndef CONFIG_IA64
+	v->cpt_vdso = (__u32)(unsigned long)mm->context.vdso;
+#endif
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+#ifdef CONFIG_X86
+	if (mm->context.size) {
+		loff_t saved_object;
+		struct cpt_obj_bits b;
+		int size;
+
+		dprintk_ctx("nontrivial LDT\n");
+
+		cpt_push_object(&saved_object, ctx);
+
+		cpt_open_object(NULL, ctx);
+		b.cpt_next = CPT_NULL;
+		b.cpt_object = CPT_OBJ_BITS;
+		b.cpt_hdrlen = sizeof(b);
+		b.cpt_content = CPT_CONTENT_MM_CONTEXT;
+		b.cpt_size = mm->context.size*LDT_ENTRY_SIZE;
+
+		ctx->write(&b, sizeof(b), ctx);
+
+		size = mm->context.size*LDT_ENTRY_SIZE;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_XEN) || \
+			LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19)
+		ctx->write(mm->context.ldt, size, ctx);
+#else
+		{
+			int i;
+
+			for (i = 0; i < size; i += PAGE_SIZE) {
+				int nr = i / PAGE_SIZE, bytes;
+				char *kaddr = kmap(mm->context.ldt_pages[nr]);
+
+				bytes = size - i;
+				if (bytes > PAGE_SIZE)
+					bytes = PAGE_SIZE;
+				ctx->write(kaddr, bytes, ctx);
+				kunmap(mm->context.ldt_pages[nr]);
+			}
+		}
+#endif
+
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_object, ctx);
+	}
+#endif
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if ((err = dump_one_vma(obj, vma, ctx)) != 0)
+			goto out;
+	}
+
+	hlist_for_each_entry(aio_ctx, n, &mm->ioctx_list, list) {
+		if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
+			goto out;
+	}
+
+	dump_mm_auxv(mm, ctx);
+
+	cpt_close_object(ctx);
+
+	up_write(&mm->mmap_sem);
+
+	return 0;
+
+out:
+	up_write(&mm->mmap_sem);
+
+	return err;
+}
+
+int cpt_dump_vm(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	scnt = scnt0 = zcnt = 0;
+
+	cpt_open_section(ctx, CPT_SECT_MM);
+
+	for_each_object(obj, CPT_OBJ_MM) {
+		int err;
+
+		if ((err = dump_one_mm(obj, ctx)) != 0)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+
+	if (scnt)
+		dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt);
+	if (scnt0)
+		dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0);
+	if (zcnt)
+		dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt);
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_mm.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_mm.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_mm.h	2015-01-21 12:02:48.225093605 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_mm.h	2015-01-21 12:02:57.970834900 +0300
@@ -0,0 +1,31 @@
+int cpt_collect_mm(cpt_context_t *);
+
+int cpt_dump_vm(struct cpt_context *ctx);
+
+__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
+int set_mlock_creds(int cap);
+
+int cpt_iteration(cpt_context_t *ctx);
+int rst_iteration(cpt_context_t *ctx);
+void rst_drop_iter_rbtree(cpt_context_t *ctx);
+int rst_iter(struct vm_area_struct *vma, u64 pfn,
+	     unsigned long addr, cpt_context_t * ctx);
+int rst_iter_chunk(struct file *file, loff_t pos, struct cpt_page_block * pgb,
+			cpt_context_t *ctx);
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack,
+				       unsigned long map_address);
+#endif
+
+#ifdef CONFIG_X86
+extern struct page *vdso32_pages[1];
+#define vsyscall_addr page_address(vdso32_pages[0])
+#endif
+
+int cpt_check_page(struct vm_area_struct *vma, unsigned long address,
+		   struct page *page, int wrprot);
+int cpt_verify_wrprot(struct page * page, cpt_context_t * ctx);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_net.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_net.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_net.c	2015-01-21 12:02:48.225093605 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_net.c	2015-01-21 12:02:50.937021612 +0300
@@ -0,0 +1,741 @@
+/*
+ *
+ *  kernel/cpt/cpt_net.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/rtnetlink.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/vzcalluser.h>
+#include <linux/cpt_image.h>
+#include <linux/if_tun.h>
+#include <linux/veth.h>
+#include <linux/fdtable.h>
+#include <net/ip.h>
+
+#include <linux/cpt_export.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_kernel.h"
+#include "cpt_syscalls.h"
+
+static void cpt_dump_netstats(struct net_device *dev, struct cpt_context * ctx)
+{
+	const struct net_device_stats *stats;
+	struct cpt_netstats_image *n;
+
+	n = cpt_get_buf(ctx);
+	stats = dev_get_stats(dev);
+	cpt_open_object(NULL, ctx);
+
+	n->cpt_next = CPT_NULL;
+	n->cpt_object = CPT_OBJ_NET_STATS;
+	n->cpt_hdrlen = sizeof(*n);
+	n->cpt_content = CPT_CONTENT_VOID;
+
+	n->cpt_rx_packets = stats->rx_packets;
+	n->cpt_tx_packets = stats->tx_packets;
+	n->cpt_rx_bytes = stats->rx_bytes;
+	n->cpt_tx_bytes = stats->tx_bytes;
+	n->cpt_rx_errors = stats->rx_errors;
+	n->cpt_tx_errors = stats->tx_errors;
+	n->cpt_rx_dropped = stats->rx_dropped;
+	n->cpt_tx_dropped = stats->tx_dropped;
+	n->cpt_multicast = stats->multicast;
+	n->cpt_collisions = stats->collisions;
+	n->cpt_rx_length_errors = stats->rx_length_errors;
+	n->cpt_rx_over_errors = stats->rx_over_errors;
+	n->cpt_rx_crc_errors = stats->rx_crc_errors;
+	n->cpt_rx_frame_errors = stats->rx_frame_errors;
+	n->cpt_rx_fifo_errors = stats->rx_fifo_errors;
+	n->cpt_rx_missed_errors = stats->rx_missed_errors;
+	n->cpt_tx_aborted_errors = stats->tx_aborted_errors;
+	n->cpt_tx_carrier_errors = stats->tx_carrier_errors;
+	n->cpt_tx_fifo_errors = stats->tx_fifo_errors;
+	n->cpt_tx_heartbeat_errors = stats->tx_heartbeat_errors;
+	n->cpt_tx_window_errors = stats->tx_window_errors;
+	n->cpt_rx_compressed = stats->rx_compressed;
+	n->cpt_tx_compressed = stats->tx_compressed;
+
+	ctx->write(n, sizeof(*n), ctx);
+	cpt_close_object(ctx);
+	cpt_release_buf(ctx);
+	return;
+}
+
+static void cpt_dump_idev_cnf(struct net_device *dev, struct cpt_context * ctx)
+{
+	struct in_device *idev;
+	struct cpt_idev_cnf_image *d;
+
+	d = cpt_get_buf(ctx);
+	idev = in_dev_get(dev);
+	if (!idev)
+		goto out;
+	cpt_open_object(NULL, ctx);
+
+	d->cpt_next = CPT_NULL;
+	d->cpt_object = CPT_OBJ_NET_IDEV_CNF;
+	d->cpt_hdrlen = sizeof(*d);
+	d->cpt_content = CPT_CONTENT_VOID;
+
+	memcpy(d->cpt_data, idev->cnf.data, sizeof(d->cpt_data));
+	ctx->write(d, sizeof(*d), ctx);
+	cpt_close_object(ctx);
+	in_dev_put(idev);
+out:
+	cpt_release_buf(ctx);
+	return;
+}
+
+int cpt_dump_link(struct cpt_context * ctx)
+{
+	struct net *net = get_exec_env()->ve_netns;
+	struct net_device *dev;
+	int dump_bridges = 0;
+
+	cpt_open_section(ctx, CPT_SECT_NET_DEVICE);
+dump:
+	for_each_netdev(net, dev) {
+		struct cpt_netdev_image v;
+		struct cpt_hwaddr_image hw;
+		loff_t saved_obj;
+
+		if (dev->netdev_ops->ndo_cpt == NULL) {
+			eprintk_ctx("unsupported netdev %s\n", dev->name);
+			cpt_close_section(ctx);
+			return -EBUSY;
+		}
+
+		/*
+		 * First dump all net devices except bridges, then dump
+		 * bridges on the next iteration.  This guarantees that
+		 * non-bridge network devices are restored before bridges,
+		 * so that they can be re-added to a bridge on restore.
+		 */
+		if (!dump_bridges && (dev->priv_flags & IFF_EBRIDGE))
+			continue;
+		if (dump_bridges && (!(dev->priv_flags & IFF_EBRIDGE)))
+			continue;
+
+		cpt_open_object(NULL, ctx);
+
+		v.cpt_next = CPT_NULL;
+		v.cpt_object = CPT_OBJ_NET_DEVICE;
+		v.cpt_hdrlen = sizeof(v);
+		v.cpt_content = CPT_CONTENT_ARRAY;
+
+		v.cpt_index = dev->ifindex;
+		v.cpt_flags = dev->flags;
+		v.cpt_mtu = dev->mtu;
+		memcpy(v.cpt_name, dev->name, IFNAMSIZ);
+		ctx->write(&v, sizeof(v), ctx);
+
+		cpt_push_object(&saved_obj, ctx);
+
+		cpt_open_object(NULL, ctx);
+		dev->netdev_ops->ndo_cpt(dev, &cpt_ops, ctx);
+
+		/* Dump hardware address */
+		cpt_open_object(NULL, ctx);
+		hw.cpt_next = CPT_NULL;
+		hw.cpt_object = CPT_OBJ_NET_HWADDR;
+		hw.cpt_hdrlen = sizeof(hw);
+		hw.cpt_content = CPT_CONTENT_VOID;
+
+		if (dev->dev_addrs.count != 1) {
+			eprintk_ctx("multiple hwaddrs on %s\n", dev->name);
+			cpt_close_section(ctx);
+			return -EINVAL;
+		}
+
+		BUILD_BUG_ON(sizeof(hw.cpt_dev_addr) != MAX_ADDR_LEN);
+		memcpy(hw.cpt_dev_addr, dev->dev_addr, sizeof(hw.cpt_dev_addr));
+		ctx->write(&hw, sizeof(hw), ctx);
+		cpt_close_object(ctx);
+
+		cpt_dump_netstats(dev, ctx);
+
+		cpt_dump_idev_cnf(dev, ctx);
+
+		cpt_pop_object(&saved_obj, ctx);
+
+		cpt_close_object(ctx);
+	}
+
+	if (!dump_bridges) {
+		dump_bridges = 1;
+		goto dump;
+	}
+
+	cpt_close_section(ctx);
+	return 0;
+}
+
+int cpt_suspend_network(struct cpt_context *ctx)
+{
+	get_exec_env()->disable_net = 1;
+	synchronize_net();
+	return 0;
+}
+
+int cpt_resume_network(struct cpt_context *ctx)
+{
+	struct ve_struct *env;
+	env = get_ve_by_id(ctx->ve_id);
+	if (!env)
+		return -ESRCH;
+	env->disable_net = 0;
+	put_ve(env);
+	return 0;
+}
+
+int cpt_dump_ifaddr(struct cpt_context * ctx)
+{
+	struct net *net = get_exec_env()->ve_netns;
+	struct net_device *dev;
+
+	cpt_open_section(ctx, CPT_SECT_NET_IFADDR);
+	for_each_netdev(net, dev) {
+		struct in_device *idev = in_dev_get(dev);
+		struct in_ifaddr *ifa;
+
+		if (!idev)
+			continue;
+
+		for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) {
+			struct cpt_ifaddr_image v;
+			cpt_open_object(NULL, ctx);
+
+			v.cpt_next = CPT_NULL;
+			v.cpt_object = CPT_OBJ_NET_IFADDR;
+			v.cpt_hdrlen = sizeof(v);
+			v.cpt_content = CPT_CONTENT_VOID;
+
+			v.cpt_index = dev->ifindex;
+			v.cpt_family = AF_INET;
+			v.cpt_masklen = ifa->ifa_prefixlen;
+			v.cpt_flags = ifa->ifa_flags;
+			v.cpt_scope = ifa->ifa_scope;
+			memset(&v.cpt_address, 0, sizeof(v.cpt_address));
+			memset(&v.cpt_peer, 0, sizeof(v.cpt_peer));
+			memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast));
+			v.cpt_address[0] = ifa->ifa_local;
+			v.cpt_peer[0] = ifa->ifa_address;
+			v.cpt_broadcast[0] = ifa->ifa_broadcast;
+			memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ);
+			ctx->write(&v, sizeof(v), ctx);
+			cpt_close_object(ctx);
+		}
+		in_dev_put(idev);
+	}
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	for_each_netdev(net, dev) {
+		struct inet6_dev *idev = in6_dev_get(dev);
+		struct inet6_ifaddr *ifa;
+
+		if (!idev)
+			continue;
+
+		for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) {
+			struct cpt_ifaddr_image v;
+
+			if (dev == net->loopback_dev &&
+			    ifa->prefix_len == 128 &&
+			    ifa->addr.s6_addr32[0] == 0 &&
+			    ifa->addr.s6_addr32[1] == 0 &&
+			    ifa->addr.s6_addr32[2] == 0 &&
+			    ifa->addr.s6_addr32[3] == htonl(1))
+				continue;
+
+			cpt_open_object(NULL, ctx);
+
+			v.cpt_next = CPT_NULL;
+			v.cpt_object = CPT_OBJ_NET_IFADDR;
+			v.cpt_hdrlen = sizeof(v);
+			v.cpt_content = CPT_CONTENT_VOID;
+
+			v.cpt_index = dev->ifindex;
+			v.cpt_family = AF_INET6;
+			v.cpt_masklen = ifa->prefix_len;
+			v.cpt_flags = ifa->flags;
+			v.cpt_scope = ifa->scope;
+			v.cpt_valid_lft = ifa->valid_lft;
+			v.cpt_prefered_lft = ifa->prefered_lft;
+			memcpy(&v.cpt_address, &ifa->addr, 16);
+			memcpy(&v.cpt_peer, &ifa->addr, 16);
+			memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast));
+			memcpy(v.cpt_label, dev->name, IFNAMSIZ);
+			ctx->write(&v, sizeof(v), ctx);
+			cpt_close_object(ctx);
+		}
+		__in6_dev_put(idev);
+	}
+#endif
+	cpt_close_section(ctx);
+	return 0;
+}
+
+#ifdef CONFIG_IP_FIB_TRIE
+#error "Trie fib rules are known not to be restored properly yet"
+#endif
+
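+/*
+ * Dump the VE routing tables: send an RTM_GETROUTE dump request over an
+ * in-kernel netlink socket and copy the raw nlmsg stream into the image
+ * as one CPT_CONTENT_NLMARRAY object, first for IPv4 and then for IPv6.
+ */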
+static int cpt_dump_route(struct cpt_context * ctx)
+{
+	int err;
+	struct socket *sock;
+	struct msghdr msg;
+	struct iovec iov;
+	struct {
+		struct nlmsghdr nlh;
+		struct rtgenmsg g;
+	} req;
+	struct sockaddr_nl nladdr;
+	struct cpt_object_hdr v;
+	mm_segment_t oldfs;
+	char *pg;
+
+	err = sock_create(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock);
+	if (err)
+		return err;
+
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+
+	req.nlh.nlmsg_len = sizeof(req);
+	req.nlh.nlmsg_type = RTM_GETROUTE;
+	req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
+	req.nlh.nlmsg_pid = 0;
+	req.g.rtgen_family = AF_INET;
+
+	iov.iov_base=&req;
+	iov.iov_len=sizeof(req);
+	msg.msg_name=&nladdr;
+	msg.msg_namelen=sizeof(nladdr);
+	msg.msg_iov=&iov;
+	msg.msg_iovlen=1;
+	msg.msg_control=NULL;
+	msg.msg_controllen=0;
+	msg.msg_flags=MSG_DONTWAIT;
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	err = sock_sendmsg(sock, &msg, sizeof(req));
+	set_fs(oldfs);
+
+	if (err < 0)
+		goto out_sock;
+
+	pg = (char*)__get_free_page(GFP_KERNEL);
+	if (pg == NULL) {
+		err = -ENOMEM;
+		goto out_sock;
+	}
+
+	cpt_open_section(ctx, CPT_SECT_NET_ROUTE);
+	cpt_open_object(NULL, ctx);
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_ROUTE;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_NLMARRAY;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+restart:
+#endif
+	for (;;) {
+		struct nlmsghdr *h;
+
+		iov.iov_base = pg;
+		iov.iov_len = PAGE_SIZE;
+
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
+		set_fs(oldfs);
+
+		if (err < 0)
+			goto out_sock_pg;
+		if (msg.msg_flags & MSG_TRUNC) {
+			err = -ENOBUFS;
+			goto out_sock_pg;
+		}
+
+		h = (struct nlmsghdr*)pg;
+		while (NLMSG_OK(h, err)) {
+			if (h->nlmsg_type == NLMSG_DONE) {
+				err = 0;
+				goto done;
+			}
+			if (h->nlmsg_type == NLMSG_ERROR) {
+				struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h);
+				err = errm->error;
+				eprintk_ctx("NLMSG error: %d\n", errm->error);
+				goto done;
+			}
+			if (h->nlmsg_type != RTM_NEWROUTE) {
+				eprintk_ctx("NLMSG: %d\n", h->nlmsg_type);
+				err = -EINVAL;
+				goto done;
+			}
+			ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx);
+			h = NLMSG_NEXT(h, err);
+		}
+		if (err) {
+			eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type);
+			err = -EINVAL;
+			break;
+		}
+	}
+done:
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (!err && req.g.rtgen_family == AF_INET) {
+		req.g.rtgen_family = AF_INET6;
+		iov.iov_base=&req;
+		iov.iov_len=sizeof(req);
+		msg.msg_name=&nladdr;
+		msg.msg_namelen=sizeof(nladdr);
+		msg.msg_iov=&iov;
+		msg.msg_iovlen=1;
+		msg.msg_control=NULL;
+		msg.msg_controllen=0;
+		msg.msg_flags=MSG_DONTWAIT;
+
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = sock_sendmsg(sock, &msg, sizeof(req));
+		set_fs(oldfs);
+
+		if (err > 0)
+			goto restart;
+	}
+#endif
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+	cpt_close_section(ctx);
+
+out_sock_pg:
+	free_page((unsigned long)pg);
+out_sock:
+	sock_release(sock);
+	return err;
+}
+
+struct args_t
+{
+	int* pfd;
+	envid_t veid;
+	int is_ipv6;
+};
+
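+/*
+ * Thread body for dumping xtables state: enter the target VE, redirect
+ * stdout to the write end of the pipe, close all other descriptors and
+ * exec iptables-save (or ip6tables-save).
+ */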
+static int dumpfn(void *arg)
+{
+	int i;
+	struct args_t *args = arg;
+	int *pfd = args->pfd;
+	char *argv[] = { "iptables-save", "-c", NULL };
+	bool may_fail = false;
+	const char *path1, *path2;
+
+	if (!args->is_ipv6) {
+		path1 = "/sbin/iptables-save";
+		path2 = "/usr/sbin/iptables-save";
+	} else {
+		argv[0] = "ip6tables-save";
+		path1 = "/sbin/ip6tables-save";
+		path2 = "/usr/sbin/ip6tables-save";
+		/* Nonexistent ip6tables tools are ignored */
+		may_fail = true;
+	}
+
+	i = real_env_create(args->veid, VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
+	if (i < 0) {
+		eprintk("cannot enter ve to execute %s\n", argv[0]);
+		module_put(THIS_MODULE);
+		return 255 << 8;
+	}
+
+	if (pfd[1] != 1)
+		sc_dup2(pfd[1], 1);
+
+	for (i=0; i<current->files->fdt->max_fds; i++) {
+		if (i != 1)
+			sc_close(i);
+	}
+
+	module_put(THIS_MODULE);
+
+	set_fs(KERNEL_DS);
+	i = kernel_execve(path1, argv, NULL);
+	if (i == -ENOENT)
+		i = kernel_execve(path2, argv, NULL);
+	if (i == -ENOENT && may_fail) {
+		sc_close(1);
+		eprintk("Can't find %s, ignoring...\n", argv[0]);
+		return 0;
+	}
+
+	eprintk("failed to exec %s: %d\n", argv[0], i);
+	return 255 << 8;
+}
+
+static int cpt_dump_xtables(struct cpt_context *ctx, bool is_ipv6)
+{
+	int err = 0;
+#ifdef CONFIG_VE_IPTABLES
+	int pid;
+	int pfd[2];
+	struct file *f;
+	struct cpt_object_hdr v;
+	char buf[16];
+	loff_t pos;
+	int n;
+	int status;
+	mm_segment_t oldfs;
+	sigset_t ignore, blocked;
+	struct args_t args;
+	struct ve_struct *oldenv;
+
+	err = sc_pipe(pfd);
+	if (err < 0) {
+		eprintk_ctx("sc_pipe: %d\n", err);
+		return err;
+	}
+	args.pfd = pfd;
+	args.veid = VEID(get_exec_env());
+	args.is_ipv6 = is_ipv6;
+	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+	sigprocmask(SIG_BLOCK, &ignore, &blocked);
+	oldenv = set_exec_env(get_ve0());
+	err = pid = local_kernel_thread(dumpfn, (void*)&args,
+			SIGCHLD | CLONE_VFORK, 0);
+	set_exec_env(oldenv);
+	if (err < 0) {
+		eprintk_ctx("local_kernel_thread: %d\n", err);
+		goto out;
+	}
+
+	f = fget(pfd[0]);
+	sc_close(pfd[1]);
+	sc_close(pfd[0]);
+
+	cpt_open_object(NULL, ctx);
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NAME;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = is_ipv6 ? CPT_NULL : CPT_CONTENT_NAME;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	pos = ctx->file->f_pos;
+	do {
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
+		set_fs(oldfs);
+		if (n > 0)
+			ctx->write(buf, n, ctx);
+	} while (n > 0);
+
+	if (n < 0)
+		eprintk_ctx("read: %d\n", n);
+
+	fput(f);
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if ((err = sc_waitx(pid, 0, &status)) < 0)
+		eprintk_ctx("wait4: %d\n", err);
+	else if ((status & 0x7f) == 0) {
+		err = (status & 0xff00) >> 8;
+		if (err != 0) {
+			eprintk_ctx("iptables-save exited with %d\n", err);
+			err = -EINVAL;
+		}
+	} else {
+		eprintk_ctx("iptables-save terminated\n");
+		err = -EINVAL;
+	}
+	set_fs(oldfs);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+	if (ctx->file->f_pos != pos) {
+		buf[0] = 0;
+		ctx->write(buf, 1, ctx);
+		ctx->align(ctx);
+		cpt_close_object(ctx);
+	} else {
+		pos = ctx->current_object;
+		cpt_close_object(ctx);
+		ctx->file->f_pos = pos;
+	}
+	return n ? : err;
+
+out:
+	if (pfd[1] >= 0)
+		sc_close(pfd[1]);
+	if (pfd[0] >= 0)
+		sc_close(pfd[0]);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+#endif
+	return err;
+}
+
+static int cpt_dump_iptables(struct cpt_context *ctx)
+{
+	u64 mask = get_exec_env()->_iptables_modules;
+	int pos, ret = 0;
+
+	if (!(mask & (VE_IP_IPTABLES_MOD|VE_IP_IPTABLES6_MOD)))
+		goto out;
+
+	cpt_open_section(ctx, CPT_SECT_NET_IPTABLES);
+	pos = ctx->file->f_pos;
+
+	if ((mask & VE_IP_IPTABLES_MOD) != 0) {
+		ret = cpt_dump_xtables(ctx, false);
+		if (ret)
+			goto close;
+	}
+
+	if (((mask & VE_IP_IPTABLES6_MOD) != 0) && ipv6_is_enabled())
+		ret = cpt_dump_xtables(ctx, true);
+close:
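+	/*
+	 * If nothing was written or the dump failed, rewind the file
+	 * position and drop the empty section from the section table.
+	 */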
+	if (pos == ctx->file->f_pos || ret) {
+		pos = ctx->current_section;
+		cpt_close_section(ctx);
+		ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL;
+		ctx->file->f_pos = pos;
+	} else {
+		/* Already aligned */
+		cpt_close_section(ctx);
+	}
+out:
+	return ret;
+}
+
+static void __maybe_unused cpt_dump_snmp_stub(struct cpt_context *ctx);
+
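+/*
+ * Fold the per-cpu mib counters into a flat array of __u32 and store it
+ * as a CPT_OBJ_BITS data object; an empty stub is written instead when
+ * the mib does not exist.
+ */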
+static void cpt_dump_snmp_stat(struct cpt_context *ctx, void *mib[], int n)
+{
+	int i;
+	struct cpt_object_hdr o;
+	__u32 *stats;
+
+	/*
+	 * IPv6 may be disabled or not loaded at all.
+	 */
+	if (mib[0] == NULL) {
+		cpt_dump_snmp_stub(ctx);
+		return;
+	}
+
+	stats = cpt_get_buf(ctx);
+
+	cpt_open_object(NULL, ctx);
+
+	for (i = 0; i < n; i++)
+		stats[i] = snmp_fold_field(mib, i);
+
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_BITS;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_DATA;
+
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(stats, n * sizeof(*stats), ctx);
+	ctx->align(ctx);
+
+	cpt_close_object(ctx);
+
+	cpt_release_buf(ctx);
+}
+
+static void __maybe_unused cpt_dump_snmp_stub(struct cpt_context *ctx)
+{
+	struct cpt_object_hdr o;
+
+	cpt_open_object(NULL, ctx);
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_BITS;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_VOID;
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+}
+
+static int cpt_dump_snmp(struct cpt_context *ctx)
+{
+	struct ve_struct *ve;
+	struct net *net;
+
+	ve = get_exec_env();
+	net = ve->ve_netns;
+
+	cpt_open_section(ctx, CPT_SECT_SNMP_STATS);
+
+	cpt_dump_snmp_stat(ctx, (void **)&net->mib.net_statistics,
+				LINUX_MIB_MAX);
+	cpt_dump_snmp_stat(ctx, (void **)&net->mib.ip_statistics,
+				IPSTATS_MIB_MAX);
+	cpt_dump_snmp_stat(ctx, (void **)&net->mib.tcp_statistics,
+				TCP_MIB_MAX);
+	cpt_dump_snmp_stat(ctx, (void **)&net->mib.udp_statistics,
+				UDP_MIB_MAX);
+	cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmp_statistics,
+				ICMP_MIB_MAX);
+	cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmpmsg_statistics,
+				ICMPMSG_MIB_MAX);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	cpt_dump_snmp_stat(ctx, (void **)&net->mib.ipv6_statistics,
+				IPSTATS_MIB_MAX);
+	cpt_dump_snmp_stat(ctx, (void **)&net->mib.udp_stats_in6,
+				UDP_MIB_MAX);
+	cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmpv6_statistics,
+				ICMP6_MIB_MAX);
+#else
+	cpt_dump_snmp_stub(ctx);
+	cpt_dump_snmp_stub(ctx);
+	cpt_dump_snmp_stub(ctx);
+#endif
+	cpt_close_section(ctx);
+
+	return 0;
+}
+
+int cpt_dump_ifinfo(struct cpt_context * ctx)
+{
+	int err;
+
+	rtnl_lock();
+	err = cpt_dump_link(ctx);
+	if (!err)
+		err = cpt_dump_ifaddr(ctx);
+	rtnl_unlock();
+	if (!err)
+		err = cpt_dump_route(ctx);
+	if (!err)
+		err = cpt_dump_iptables(ctx);
+	if (!err)
+		err = cpt_dump_snmp(ctx);
+	return err;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_net.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_net.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_net.h	2015-01-21 12:02:48.225093605 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_net.h	2015-01-21 12:02:48.225093605 +0300
@@ -0,0 +1,7 @@
+int cpt_dump_ifinfo(struct cpt_context *ctx);
+int rst_restore_net(struct cpt_context *ctx);
+int cpt_suspend_network(struct cpt_context *ctx);
+int cpt_resume_network(struct cpt_context *ctx);
+int rst_resume_network(struct cpt_context *ctx);
+int cpt_dump_ip_conntrack(struct cpt_context *ctx);
+int rst_restore_ip_conntrack(struct cpt_context * ctx);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_obj.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_obj.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_obj.c	2015-01-21 12:02:48.225093605 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_obj.c	2015-01-21 12:02:49.751053096 +0300
@@ -0,0 +1,161 @@
+/*
+ *
+ *  kernel/cpt/cpt_obj.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+
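+/*
+ * Checkpointed objects are kept on per-type lists in ctx->object_array;
+ * the lookup helpers below do a linear scan of the requested type's list.
+ */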
+cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = kmalloc(sizeof(cpt_object_t), gfp);
+	if (obj) {
+		INIT_LIST_HEAD(&obj->o_list);
+		INIT_LIST_HEAD(&obj->o_hash);
+		obj->o_count = 1;
+		obj->o_pos = CPT_NULL;
+		obj->o_lock = 0;
+		obj->o_parent = NULL;
+		obj->o_index = CPT_NOINDEX;
+		obj->o_obj = NULL;
+		obj->o_image = NULL;
+		obj->o_flags = 0;
+		ctx->objcount++;
+	}
+	return obj;
+}
+
+void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx)
+{
+	kfree(obj);
+	ctx->objcount--;
+}
+
+void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx)
+{
+	list_add_tail(&obj->o_list, &ctx->object_array[type]);
+}
+
+void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj,
+			cpt_object_t *head, cpt_context_t *ctx)
+{
+	list_add(&obj->o_list, &head->o_list);
+}
+
+cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p,
+		unsigned gfp_mask, cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+
+	if (obj) {
+		obj->o_count++;
+		return obj;
+	}
+
+	if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) {
+		if (p)
+			cpt_obj_setobj(obj, p, ctx);
+		intern_cpt_object(type, obj, ctx);
+		return obj;
+	}
+	return NULL;
+}
+
+cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+	return __cpt_object_add(type, p, GFP_KERNEL, ctx);
+}
+
+cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+
+	if (obj)
+		obj->o_count++;
+
+	return obj;
+}
+
+int cpt_object_init(cpt_context_t *ctx)
+{
+	int i;
+
+	for (i=0; i<CPT_OBJ_MAX; i++) {
+		INIT_LIST_HEAD(&ctx->object_array[i]);
+	}
+	return 0;
+}
+
+int cpt_object_destroy(cpt_context_t *ctx)
+{
+	int i;
+
+	for (i=0; i<CPT_OBJ_MAX; i++) {
+		while (!list_empty(&ctx->object_array[i])) {
+			struct list_head *head = ctx->object_array[i].next;
+			cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
+			list_del(head);
+			if (obj->o_image)
+				kfree(obj->o_image);
+			free_cpt_object(obj, ctx);
+		}
+	}
+	if (ctx->objcount != 0)
+		eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount);
+	return 0;
+}
+
+cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, type) {
+		if (obj->o_obj == p)
+			return obj;
+	}
+	return NULL;
+}
+
+cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, type) {
+		if (obj->o_pos == pos)
+			return obj;
+	}
+	return NULL;
+}
+
+cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, type) {
+		if (obj->o_index == index)
+			return obj;
+	}
+	return NULL;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_pagein.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_pagein.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_pagein.h	2015-01-21 12:02:48.692081209 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_pagein.h	2015-01-21 12:02:48.719080491 +0300
@@ -0,0 +1,21 @@
+
+#define PGIN_RMID	0xF1AD1966
+#define PGIN_STOP	0xFFFFFFFE
+
+#define ITER_PASS	0x1
+#define ITER_STOP	0x2
+
+struct pgin_request
+{
+	__u32	rmid;
+	__u32	size;
+	__u64	index;
+	__u64	handle;
+};
+
+struct pgin_reply
+{
+	__u32	rmid;
+	__u32	error;
+	__u64	handle;
+};
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_proc.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_proc.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_proc.c	2015-01-21 12:02:48.226093579 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_proc.c	2015-01-21 12:02:51.105017152 +0300
@@ -0,0 +1,822 @@
+/*
+ *
+ *  kernel/cpt/cpt_proc.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_ioctl.h>
+#include <linux/delay.h>
+#include <linux/ve_proto.h>
+#include <linux/kmod.h>
+#include <linux/freezer.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_dump.h"
+#include "cpt_mm.h"
+#include "cpt_kernel.h"
+#include "cpt_files.h"
+
+MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>");
+MODULE_LICENSE("GPL");
+
+/* List of contexts and lock protecting the list */
+static struct list_head cpt_context_list;
+static spinlock_t cpt_context_lock;
+
+static int proc_read(char *buffer, char **start, off_t offset,
+		     int length, int *eof, void *data)
+{
+	off_t pos = 0;
+	off_t begin = 0;
+	int len = 0;
+	cpt_context_t *ctx;
+
+	len += sprintf(buffer, "Ctx      Id       VE       State\n");
+
+	spin_lock(&cpt_context_lock);
+
+	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+		len += sprintf(buffer+len,"%p %08x %-8u %d",
+			       ctx,
+			       ctx->contextid,
+			       ctx->ve_id,
+			       ctx->ctx_state
+			       );
+
+		buffer[len++] = '\n';
+
+		pos = begin+len;
+		if (pos < offset) {
+			len = 0;
+			begin = pos;
+		}
+		if (pos > offset+length)
+			goto done;
+	}
+	*eof = 1;
+
+done:
+	spin_unlock(&cpt_context_lock);
+	*start = buffer + (offset - begin);
+	len -= (offset - begin);
+	if(len > length)
+		len = length;
+	if(len < 0)
+		len = 0;
+	return len;
+}
+
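+/*
+ * Called with cpt_context_lock held.  The lock is dropped for the
+ * duration of the release work and re-acquired before returning.
+ */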
+void cpt_context_release(cpt_context_t *ctx)
+{
+	int i;
+
+	list_del(&ctx->ctx_list);
+	spin_unlock(&cpt_context_lock);
+
+	cpt_close_pram(ctx, -1);
+
+	if (ctx->ctx_state > 0)
+		cpt_resume(ctx);
+	ctx->ctx_state = CPT_CTX_ERROR;
+
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+	if (ctx->pagein_file_out)
+		fput(ctx->pagein_file_out);
+	if (ctx->pagein_file_in)
+		fput(ctx->pagein_file_in);
+#endif
+	if (ctx->objcount)
+		eprintk_ctx("%d objects leaked\n", ctx->objcount);
+	if (ctx->file)
+		fput(ctx->file);
+	cpt_flush_error(ctx);
+	if (ctx->errorfile) {
+		fput(ctx->errorfile);
+		ctx->errorfile = NULL;
+	}
+	for (i = 0; i < ctx->linkdirs_num; i++)
+		fput(ctx->linkdirs[i]);
+	if (ctx->error_msg) {
+		free_page((unsigned long)ctx->error_msg);
+		ctx->error_msg = NULL;
+	}
+	if (ctx->statusfile)
+		fput(ctx->statusfile);
+	if (ctx->lockfile)
+		fput(ctx->lockfile);
+	kfree(ctx);
+
+	spin_lock(&cpt_context_lock);
+}
+
+static void __cpt_context_put(cpt_context_t *ctx)
+{
+	if (!--ctx->refcount)
+		cpt_context_release(ctx);
+}
+
+static void cpt_context_put(cpt_context_t *ctx)
+{
+	spin_lock(&cpt_context_lock);
+	__cpt_context_put(ctx);
+	spin_unlock(&cpt_context_lock);
+}
+
+cpt_context_t * cpt_context_open(void)
+{
+	cpt_context_t *ctx;
+
+	if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) {
+		cpt_context_init(ctx);
+		spin_lock(&cpt_context_lock);
+		list_add_tail(&ctx->ctx_list, &cpt_context_list);
+		spin_unlock(&cpt_context_lock);
+		ctx->error_msg = (char*)__get_free_page(GFP_KERNEL);
+		if (ctx->error_msg != NULL)
+			ctx->error_msg[0] = 0;
+	}
+	return ctx;
+}
+
+static cpt_context_t * cpt_context_lookup(unsigned int contextid)
+{
+	cpt_context_t *ctx;
+
+	spin_lock(&cpt_context_lock);
+	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+		if (ctx->contextid == contextid) {
+			ctx->refcount++;
+			spin_unlock(&cpt_context_lock);
+			return ctx;
+		}
+	}
+	spin_unlock(&cpt_context_lock);
+	return NULL;
+}
+
+int cpt_context_lookup_veid(unsigned int veid)
+{
+	cpt_context_t *ctx;
+
+	spin_lock(&cpt_context_lock);
+	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+		if (ctx->ve_id == veid && ctx->ctx_state > 0) {
+			spin_unlock(&cpt_context_lock);
+			return 1;
+		}
+	}
+	spin_unlock(&cpt_context_lock);
+	return 0;
+}
+
+/*
+ * Check capabilities on the destination node.
+ *
+ * Note: we do not return early from this function when an error is found,
+ * so that the administrator gets detailed information in the log about
+ * every missing capability and module.
+ */
+static int cpt_test_vecaps_features(cpt_context_t *ctx, __u32 dst_flags,
+				    __u32 *features)
+{
+	int err;
+	__u32 src_flags;
+
+	/* The only early return allowed: the capabilities could not be obtained */
+	err = cpt_vps_caps(ctx, &src_flags);
+	if (err)
+		return err;
+
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE4_1, "sse4_1", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE4_2, "sse4_2", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE4A, "sse4a", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL, "syscall", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL32, "syscall32", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_XSAVE, "xsave", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_AVX, "avx", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_AESNI, "aesni", err);
+	test_one_flag(src_flags, dst_flags, CPT_CPU_X86_RDRAND, "rdrand", err);
+
+	if (dst_flags & (1 << CPT_SLM_DMPRST)) {
+		eprintk_ctx("SLM is enabled on destination node, but slm_dmprst module is not loaded\n");
+		err = VECAPS_NO_SLM_MODULE;
+	}
+
+	if (src_flags & CPT_UNSUPPORTED_MASK)
+		err = VECAPS_UNSUPPORTED_FEATURE;
+
+	if ((dst_flags & (1 << CPT_NO_IPV6)) &&
+	     !(src_flags & (1 << CPT_NO_IPV6))) {
+		eprintk_ctx("IPv6 not loaded or disabled on destination node\n");
+		err = VECAPS_NO_IPV6_MODULE;
+	}
+
+	if ((src_flags & (1 << CPT_NAMESPACES)) &&
+	    !(dst_flags & (1 << CPT_NAMESPACES))) {
+		eprintk_ctx("Mount namespace migration support is not"
+			    " present on the destination node\n");
+		err = VECAPS_NO_MNT_NAMESPACES;
+	}
+
+	if (features)
+		*features = src_flags & CPT_UNSUPPORTED_MASK;
+
+	return err;
+}
+
+static int cpt_test_vecaps(cpt_context_t *ctx, __u32 dst_flags)
+{
+	return cpt_test_vecaps_features(ctx, dst_flags, NULL);
+}
+
+static int cpt_test_vecaps2(cpt_context_t *ctx, void __user *data)
+{
+	struct vecaps caps;
+	int err;
+
+	if (copy_from_user(&caps, data, sizeof(caps)))
+		return -EFAULT;
+
+	err = cpt_test_vecaps_features(ctx, caps.dst_flags, &caps.features);
+
+	/* Userspace may want to know the mask of unsupported features */
+	if (copy_to_user(data, &caps, sizeof(caps)))
+		return -EFAULT;
+
+	return err;
+}
+
+static int cpt_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg)
+{
+	int err = 0;
+	cpt_context_t *ctx;
+	struct file *dfile = NULL;
+	int try;
+
+	unlock_kernel();
+
+	request_module("vzcptpram");
+
+	if (cmd == CPT_TEST_CAPS) {
+		/* Obsoleted by CPT_TEST_VECAPS */
+		err = -ENOSYS;
+		goto out_lock;
+	}
+
+	if (cmd == CPT_TEST_VERSION) {
+		err = CPT_CURRENT_VERSION;
+		goto out_lock;
+	}
+
+	if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) {
+		cpt_context_t *old_ctx;
+
+		ctx = NULL;
+		if (cmd == CPT_JOIN_CONTEXT) {
+			err = -ENOENT;
+			ctx = cpt_context_lookup(arg);
+			if (!ctx)
+				goto out_lock;
+		}
+
+		spin_lock(&cpt_context_lock);
+		old_ctx = (cpt_context_t*)file->private_data;
+		file->private_data = ctx;
+
+		if (old_ctx) {
+			if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) {
+				old_ctx->sticky = 0;
+				old_ctx->refcount--;
+			}
+			__cpt_context_put(old_ctx);
+		}
+		spin_unlock(&cpt_context_lock);
+		err = 0;
+		goto out_lock;
+	}
+
+	spin_lock(&cpt_context_lock);
+	ctx = (cpt_context_t*)file->private_data;
+	if (ctx)
+		ctx->refcount++;
+	spin_unlock(&cpt_context_lock);
+
+	if (!ctx) {
+		cpt_context_t *old_ctx;
+
+		err = -ENOMEM;
+		ctx = cpt_context_open();
+		if (!ctx)
+			goto out_lock;
+
+		spin_lock(&cpt_context_lock);
+		old_ctx = (cpt_context_t*)file->private_data;
+		if (!old_ctx) {
+			ctx->refcount++;
+			file->private_data = ctx;
+		} else {
+			old_ctx->refcount++;
+		}
+		if (old_ctx) {
+			__cpt_context_put(ctx);
+			ctx = old_ctx;
+		}
+		spin_unlock(&cpt_context_lock);
+	}
+
+	if (cmd == CPT_GET_CONTEXT) {
+		unsigned int contextid = (unsigned int)arg;
+
+		if (ctx->contextid && ctx->contextid != contextid) {
+			err = -EINVAL;
+			goto out_nosem;
+		}
+		if (!ctx->contextid) {
+			cpt_context_t *c1 = cpt_context_lookup(contextid);
+			if (c1) {
+				cpt_context_put(c1);
+				err = -EEXIST;
+				goto out_nosem;
+			}
+			ctx->contextid = contextid;
+		}
+		spin_lock(&cpt_context_lock);
+		if (!ctx->sticky) {
+			ctx->sticky = 1;
+			ctx->refcount++;
+		}
+		spin_unlock(&cpt_context_lock);
+		goto out_nosem;
+	}
+
+	down(&ctx->main_sem);
+
+	err = -EBUSY;
+	if (ctx->ctx_state < 0)
+		goto out;
+
+	err = 0;
+	switch (cmd) {
+	case CPT_SET_DUMPFD:
+		if (ctx->ctx_state == CPT_CTX_DUMPING) {
+			err = -EBUSY;
+			break;
+		}
+		if (arg >= 0) {
+			err = -EBADF;
+			dfile = fget(arg);
+			if (dfile == NULL)
+				break;
+			if (dfile->f_op == NULL ||
+			    dfile->f_op->write == NULL) {
+				fput(dfile);
+				break;
+			}
+			err = 0;
+		}
+		if (ctx->file)
+			fput(ctx->file);
+		ctx->file = dfile;
+		break;
+	case CPT_LINKDIR_ADD:
+		if (ctx->linkdirs_num >= CPT_MAX_LINKDIRS) {
+			err = -EMLINK;
+			break;
+		}
+
+		dfile = fget(arg);
+		if (!dfile) {
+			err = -EBADFD;
+			break;
+		}
+
+		if (!S_ISDIR(dfile->f_dentry->d_inode->i_mode)) {
+			err = -ENOTDIR;
+			fput(dfile);
+			break;
+		}
+
+		ctx->linkdirs[ctx->linkdirs_num++] = dfile;
+		break;
+	case CPT_SET_ERRORFD:
+		if (arg >= 0) {
+			dfile = fget(arg);
+			if (dfile == NULL) {
+				err = -EBADF;
+				break;
+			}
+		}
+		if (ctx->errorfile)
+			fput(ctx->errorfile);
+		ctx->errorfile = dfile;
+		break;
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+	case CPT_SET_PAGEINFDIN:
+		if (arg >= 0) {
+			dfile = fget(arg);
+			if (dfile == NULL) {
+				err = -EBADF;
+				break;
+			}
+		}
+		if (ctx->pagein_file_in)
+			fput(ctx->pagein_file_in);
+		ctx->pagein_file_in = dfile;
+		break;
+	case CPT_SET_PAGEINFDOUT:
+		if (arg >= 0) {
+			dfile = fget(arg);
+			if (dfile == NULL) {
+				err = -EBADF;
+				break;
+			}
+		}
+		if (ctx->pagein_file_out)
+			fput(ctx->pagein_file_out);
+		ctx->pagein_file_out = dfile;
+		break;
+	case CPT_SET_LAZY:
+		if (!arg)
+			break;
+		printk(KERN_ERR "%s: CPT_SET_LAZY ioctl is obsolete.\n", __func__);
+		eprintk_ctx("CPT_SET_LAZY ioctl is obsolete.\n");
+		err = -EOPNOTSUPP;
+		break;
+	case CPT_ITER:
+		err = cpt_iteration(ctx);
+		break;
+#endif
+	case CPT_SET_VEID:
+		if (ctx->ctx_state > 0) {
+			err = -EBUSY;
+			break;
+		}
+		ctx->ve_id = arg;
+		break;
+	case CPT_SET_CPU_FLAGS:
+		if (ctx->ctx_state > 0) {
+			err = -EBUSY;
+			break;
+		}
+		ctx->dst_cpu_flags = arg;
+		ctx->src_cpu_flags = test_cpu_caps_and_features();
+		break;
+	case CPT_SET_PRAM:
+		if (arg)
+			err = cpt_open_pram(ctx);
+		else
+			cpt_close_pram(ctx, -1);
+		break;
+	case CPT_SUSPEND:
+		if (cpt_context_lookup_veid(ctx->ve_id) ||
+		    ctx->ctx_state > 0) {
+			err = -EBUSY;
+			break;
+		}
+
+#ifdef ITER_DEBUG
+		cpt_iteration(ctx);
+#endif
+
+		ctx->ctx_state = CPT_CTX_SUSPENDING;
+		try = 0;
+		do {
+			err = cpt_vps_suspend(ctx);
+			if (err)
+				cpt_resume(ctx);
+			if (err == -EAGAIN)
+				msleep(1000);
+			try++;
+		} while (err == -EAGAIN && try < 3);
+		if (err) {
+			ctx->ctx_state = CPT_CTX_IDLE;
+		} else {
+			ctx->ctx_state = CPT_CTX_SUSPENDED;
+		}
+		break;
+	case CPT_STOP_TRACKER:
+		if (ctx->ctx_state != CPT_CTX_SUSPENDED) {
+			err = -EBADRQC;
+			break;
+		}
+		cpt_stop_tracker(ctx);
+		break;
+	case CPT_DUMP:
+		if (!ctx->ctx_state) {
+			err = -ENOENT;
+			break;
+		}
+		if (!ctx->file) {
+			err = -EBADF;
+			break;
+		}
+		err = cpt_dump(ctx);
+		if (!err)
+			printk(KERN_INFO "CT: %d: checkpointed\n", ctx->ve_id);
+		break;
+	case CPT_RESUME:
+		if (ctx->ctx_state == CPT_CTX_IDLE) {
+			err = -ENOENT;
+			break;
+		}
+		err = cpt_resume(ctx);
+		if (!err)
+			ctx->ctx_state = CPT_CTX_IDLE;
+		break;
+	case CPT_KILL:
+		if (ctx->ctx_state == CPT_CTX_IDLE) {
+			err = -ENOENT;
+			break;
+		}
+		err = cpt_kill(ctx);
+		if (!err)
+			ctx->ctx_state = CPT_CTX_IDLE;
+		break;
+	case CPT_TEST_VECAPS:
+		err = cpt_test_vecaps(ctx, arg);
+		break;
+	case CPT_TEST_VECAPS2:
+		err = cpt_test_vecaps2(ctx, (void __user *)arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+out:
+	cpt_flush_error(ctx);
+	up(&ctx->main_sem);
+out_nosem:
+	cpt_context_put(ctx);
+out_lock:
+	lock_kernel();
+	if (err == -ERESTARTSYS || err == -ERESTARTNOINTR ||
+	    err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK)
+		err = -EINTR;
+	return err;
+}
+
+static int cpt_open(struct inode *inode, struct file *file)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int cpt_release(struct inode * inode, struct file * file)
+{
+	cpt_context_t *ctx;
+
+	spin_lock(&cpt_context_lock);
+	ctx = (cpt_context_t*)file->private_data;
+	file->private_data = NULL;
+
+	if (ctx)
+		__cpt_context_put(ctx);
+	spin_unlock(&cpt_context_lock);
+
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+
+static struct file_operations cpt_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = cpt_open,
+	.release = cpt_release,
+	.ioctl	 = cpt_ioctl,
+};
+
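+/*
+ * /proc/vz/thaw write handler: parse a VE id and thaw every freezing or
+ * frozen task in that VE, "melting" a partially frozen container.
+ */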
+static ssize_t melt_write( struct file *file,
+			   const char __user *buffer,
+			   size_t len,
+			   loff_t *offset )
+{
+	struct task_struct *p, *g;
+	char veid_str[32];
+	unsigned long veid;
+	struct ve_struct *ve, *curr_ve;
+
+	memset(veid_str, 0, sizeof(veid_str));
+
+	if (len >= sizeof(veid_str))
+		return -ENOMEM;
+
+	if (copy_from_user(veid_str, buffer, len))
+		return -EFAULT;
+
+	if (strict_strtoul(veid_str, 10, &veid) < 0)
+		return -EINVAL;
+
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -ENOENT;
+
+	if (ve_is_super(ve)) {
+		len = -EPERM;
+		goto out;
+	}
+
+	curr_ve = set_exec_env(ve);
+
+	read_lock(&tasklist_lock);
+	do_each_thread_ve(g, p) {
+		if (freezing(p) || frozen(p)) {
+			if (!thaw_process(p)) {
+				printk(KERN_ERR "Failed to thaw: " CPT_FID "\n",
+						CPT_TID(p));
+			}
+		}
+	} while_each_thread_ve(g, p);
+	read_unlock(&tasklist_lock);
+
+	set_exec_env(curr_ve);
+	put_ve(ve);
+out:
+	return len;
+}
+
+static int melt_open(struct inode *inode, struct file *file)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int melt_release(struct inode * inode, struct file * file)
+{
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+static struct file_operations melt_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = melt_open,
+	.write	 = melt_write,
+	.release = melt_release,
+};
+
+static struct proc_dir_entry *proc_ent;
+static struct proc_dir_entry *melt_ent;
+
+static struct ctl_table_header *cpt_control;
+
+static int zero = 0;
+static int one = 1;
+
+static ctl_table tunables_table[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "suspend_timeout_sec",
+		.data		= &suspend_timeout,
+		.maxlen		= sizeof(suspend_timeout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &suspend_timeout_min,
+		.extra2		= &suspend_timeout_max,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "kill_external_processes",
+		.data		= &kill_external,
+		.maxlen		= sizeof(kill_external),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{ .ctl_name = 0 }
+};
+static ctl_table control_table[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "cpt",
+		.mode		= 0555,
+		.child		= tunables_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+
+static struct ctl_table_header *ctl_header;
+
+static ctl_table debug_table[] = {
+	{
+		.procname	= "cpt",
+		.data		= &debug_level,
+		.maxlen		= sizeof(debug_level),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+static ctl_table root_table[] = {
+	{
+		.ctl_name	= CTL_DEBUG,
+		.procname	= "debug",
+		.mode		= 0555,
+		.child		= debug_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static int __init init_cpt(void)
+{
+	int err;
+
+	err = -ENOMEM;
+	ctl_header = register_sysctl_table(root_table);
+	if (!ctl_header)
+		goto err_mon;
+
+	cpt_control = register_sysctl_table(control_table);
+	if (!cpt_control)
+		goto err_control;
+
+	spin_lock_init(&cpt_context_lock);
+	INIT_LIST_HEAD(&cpt_context_list);
+
+	err = -EINVAL;
+	proc_ent = proc_create("cpt", 0600, NULL, NULL);
+	if (!proc_ent)
+		goto err_out;
+
+	cpt_fops.read = proc_ent->proc_fops->read;
+	cpt_fops.write = proc_ent->proc_fops->write;
+	cpt_fops.llseek = proc_ent->proc_fops->llseek;
+	proc_ent->proc_fops = &cpt_fops;
+
+	proc_ent->read_proc = proc_read;
+	proc_ent->data = NULL;
+
+	melt_ent = proc_create("thaw", 0200, proc_vz_dir, &melt_fops);
+	if (!melt_ent)
+		goto err_melt;
+
+	return 0;
+
+err_melt:
+	remove_proc_entry("cpt", NULL);
+err_out:
+	unregister_sysctl_table(cpt_control);
+err_control:
+	unregister_sysctl_table(ctl_header);
+err_mon:
+	return err;
+}
+module_init(init_cpt);
+
+static void __exit exit_cpt(void)
+{
+	remove_proc_entry("thaw", proc_vz_dir);
+	remove_proc_entry("cpt", NULL);
+	unregister_sysctl_table(cpt_control);
+	unregister_sysctl_table(ctl_header);
+
+	spin_lock(&cpt_context_lock);
+	while (!list_empty(&cpt_context_list)) {
+		cpt_context_t *ctx;
+		ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list);
+
+		if (!ctx->sticky)
+			ctx->refcount++;
+		ctx->sticky = 0;
+
+		BUG_ON(ctx->refcount != 1);
+
+		__cpt_context_put(ctx);
+	}
+	spin_unlock(&cpt_context_lock);
+}
+module_exit(exit_cpt);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_process.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_process.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_process.c	2015-01-21 12:02:48.226093579 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_process.c	2015-01-21 12:02:51.087017630 +0300
@@ -0,0 +1,1532 @@
+/*
+ *
+ *  kernel/cpt/cpt_process.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/compat.h>
+#include <linux/cpt_image.h>
+#include <linux/nsproxy.h>
+#include <linux/futex.h>
+#include <linux/posix-timers.h>
+
+#ifdef CONFIG_X86
+#include <asm/i387.h>
+#endif
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_ubc.h"
+#include "cpt_process.h"
+#include "cpt_kernel.h"
+
+#ifdef CONFIG_X86_32
+#undef task_pt_regs
+#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1)
+#endif
+
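+/*
+ * Refuse to checkpoint a 64-bit task whose instruction pointer is inside
+ * the vsyscall page or the vdso: these mappings may be laid out
+ * differently after restore.  -EAGAIN tells the caller to retry later.
+ */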
+int check_task_state(struct task_struct *tsk, struct cpt_context *ctx)
+{
+#ifdef CONFIG_X86_64
+	struct vm_area_struct *vma;
+	if (!(task_thread_info(tsk)->flags&_TIF_IA32)) {
+		if (task_pt_regs(tsk)->ip >= VSYSCALL_START &&
+				task_pt_regs(tsk)->ip < VSYSCALL_END) {
+			eprintk_ctx(CPT_FID " cannot be checkpointed while in vsyscall, try later\n", CPT_TID(tsk));
+			return -EAGAIN;
+		}
+		vma = find_vma(current->mm, task_pt_regs(tsk)->ip);
+		if (vma && vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) {
+			eprintk_ctx(CPT_FID " cannot be checkpointed while in vdso, try later\n", CPT_TID(tsk));
+			return -EAGAIN;
+		}
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_X86
+
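+/*
+ * Translate a hardware segment selector into an architecture-neutral
+ * CPT_SEG_* encoding that the restore side can map back to a selector.
+ */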
+static u32 encode_segment(u32 segreg)
+{
+	segreg &= 0xFFFF;
+
+	if (segreg == 0)
+		return CPT_SEG_ZERO;
+	if ((segreg & 3) != 3) {
+		wprintk("Invalid RPL of a segment reg %x\n", segreg);
+		return CPT_SEG_ZERO;
+	}
+
+	/* LDT descriptor, it is just an index to LDT array */
+	if (segreg & 4)
+		return CPT_SEG_LDT + (segreg >> 3);
+
+	/* TLS descriptor. */
+	if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN &&
+	    (segreg >> 3) <= GDT_ENTRY_TLS_MAX)
+		return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN);
+
+	/* One of the standard descriptors */
+#ifdef CONFIG_X86_64
+	if (segreg == __USER32_DS)
+		return CPT_SEG_USER32_DS;
+	if (segreg == __USER32_CS)
+		return CPT_SEG_USER32_CS;
+	if (segreg == __USER_DS)
+		return CPT_SEG_USER64_DS;
+	if (segreg == __USER_CS)
+		return CPT_SEG_USER64_CS;
+#else
+	if (segreg == __USER_DS)
+		return CPT_SEG_USER32_DS;
+	if (segreg == __USER_CS)
+		return CPT_SEG_USER32_CS;
+#endif
+	wprintk("Invalid segment reg %x\n", segreg);
+	return CPT_SEG_ZERO;
+}
+
+#ifdef CONFIG_X86_64
+static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s,
+		struct task_struct *tsk)
+{
+	d->cpt_ebp = s->bp;
+	d->cpt_ebx = s->bx;
+	d->cpt_eax = s->ax;
+	d->cpt_ecx = s->cx;
+	d->cpt_edx = s->dx;
+	d->cpt_esi = s->si;
+	d->cpt_edi = s->di;
+	d->cpt_orig_eax = s->orig_ax;
+	d->cpt_eip = s->ip;
+	d->cpt_xcs = encode_segment(s->cs);
+	d->cpt_eflags = s->flags;
+	d->cpt_esp = s->sp;
+	d->cpt_xss = encode_segment(s->ss);
+	d->cpt_xds = encode_segment(tsk->thread.ds);
+	d->cpt_xes = encode_segment(tsk->thread.es);
+}
+
+static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+	cpt_open_object(NULL, ctx);
+
+	if (task_thread_info(tsk)->flags & _TIF_IA32) {
+		struct cpt_x86_regs ri;
+		ri.cpt_next = sizeof(ri);
+		ri.cpt_object = CPT_OBJ_X86_REGS;
+		ri.cpt_hdrlen = sizeof(ri);
+		ri.cpt_content = CPT_CONTENT_VOID;
+
+		ri.cpt_debugreg[0] = tsk->thread.debugreg0;
+		ri.cpt_debugreg[1] = tsk->thread.debugreg1;
+		ri.cpt_debugreg[2] = tsk->thread.debugreg2;
+		ri.cpt_debugreg[3] = tsk->thread.debugreg3;
+		ri.cpt_debugreg[4] = 0;
+		ri.cpt_debugreg[5] = 0;
+		ri.cpt_debugreg[6] = tsk->thread.debugreg6;
+		ri.cpt_debugreg[7] = tsk->thread.debugreg7;
+		ri.cpt_fs = encode_segment(tsk->thread.fsindex);
+		ri.cpt_gs = CPT_SEG_ZERO;
+		ri.cpt_ugs = encode_segment(tsk->thread.gsindex);
+
+		xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk);
+
+		ctx->write(&ri, sizeof(ri), ctx);
+	} else {
+		struct cpt_x86_64_regs ri;
+		ri.cpt_next = sizeof(ri);
+		ri.cpt_object = CPT_OBJ_X86_64_REGS;
+		ri.cpt_hdrlen = sizeof(ri);
+		ri.cpt_content = CPT_CONTENT_VOID;
+
+		ri.cpt_fsbase = tsk->thread.fs;
+		ri.cpt_gsbase = tsk->thread.gs;
+		ri.cpt_fsindex = encode_segment(tsk->thread.fsindex);
+		ri.cpt_gsindex = encode_segment(tsk->thread.gsindex);
+		ri.cpt_ds = encode_segment(tsk->thread.ds);
+		ri.cpt_es = encode_segment(tsk->thread.es);
+		ri.cpt_debugreg[0] = tsk->thread.debugreg0;
+		ri.cpt_debugreg[1] = tsk->thread.debugreg1;
+		ri.cpt_debugreg[2] = tsk->thread.debugreg2;
+		ri.cpt_debugreg[3] = tsk->thread.debugreg3;
+		ri.cpt_debugreg[4] = 0;
+		ri.cpt_debugreg[5] = 0;
+		ri.cpt_debugreg[6] = tsk->thread.debugreg6;
+		ri.cpt_debugreg[7] = tsk->thread.debugreg7;
+
+		memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs));
+
+		ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs);
+		ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss);
+
+		ctx->write(&ri, sizeof(ri), ctx);
+
+	}
+	cpt_close_object(ctx);
+
+	return 0;
+}
+
+#else
+
+static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+	struct cpt_x86_regs ri;
+	struct pt_regs *pt_regs;
+
+	cpt_open_object(NULL, ctx);
+
+	ri.cpt_next = sizeof(ri);
+	ri.cpt_object = CPT_OBJ_X86_REGS;
+	ri.cpt_hdrlen = sizeof(ri);
+	ri.cpt_content = CPT_CONTENT_VOID;
+
+	ri.cpt_debugreg[0] = tsk->thread.debugreg0;
+	ri.cpt_debugreg[1] = tsk->thread.debugreg1;
+	ri.cpt_debugreg[2] = tsk->thread.debugreg2;
+	ri.cpt_debugreg[3] = tsk->thread.debugreg3;
+	ri.cpt_debugreg[4] = 0;
+	ri.cpt_debugreg[5] = 0;
+	ri.cpt_debugreg[6] = tsk->thread.debugreg6;
+	ri.cpt_debugreg[7] = tsk->thread.debugreg7;
+
+	pt_regs = task_pt_regs(tsk);
+
+	ri.cpt_fs = encode_segment(pt_regs->fs);
+	ri.cpt_gs = encode_segment(tsk->thread.gs);
+	ri.cpt_ugs = encode_segment(task_user_gs(tsk));
+
+	ri.cpt_ebx = pt_regs->bx;
+	ri.cpt_ecx = pt_regs->cx;
+	ri.cpt_edx = pt_regs->dx;
+	ri.cpt_esi = pt_regs->si;
+	ri.cpt_edi = pt_regs->di;
+	ri.cpt_ebp = pt_regs->bp;
+	ri.cpt_eax = pt_regs->ax;
+	ri.cpt_orig_eax = pt_regs->orig_ax;
+	ri.cpt_eip = pt_regs->ip;
+	ri.cpt_eflags = pt_regs->flags;
+	ri.cpt_esp = pt_regs->sp;
+
+	ri.cpt_xcs = encode_segment(pt_regs->cs);
+	ri.cpt_xss = encode_segment(pt_regs->ss);
+	ri.cpt_xds = encode_segment(pt_regs->ds);
+	ri.cpt_xes = encode_segment(pt_regs->es);
+
+	ctx->write(&ri, sizeof(ri), ctx);
+	cpt_close_object(ctx);
+
+	return 0;
+}
+#endif
+#endif
+
+#ifdef CONFIG_IA64
+
+/*
+   PMD?
+ */
+
+#define _C(x) do { if ((err = (x)) < 0) { printk("atm:" CPT_FID #x " %d\n", \
+						 CPT_TID(tsk), err); return -EINVAL; } } while (0) 
+
+static int ass_to_mouth(struct cpt_ia64_regs *r, struct task_struct *tsk,
+			struct cpt_context *ctx)
+{
+	int err;
+	struct unw_frame_info info;
+	struct ia64_fpreg fpval;
+	int i;
+
+	unw_init_from_blocked_task(&info, tsk);
+	_C(unw_unwind_to_user(&info));
+
+	/* NAT_BITS */
+	do {
+		unsigned long scratch_unat;
+
+		scratch_unat = info.sw->caller_unat;
+		if (info.pri_unat_loc)
+			scratch_unat = *info.pri_unat_loc;
+
+		r->nat[0] = ia64_get_scratch_nat_bits(task_pt_regs(tsk), scratch_unat);
+		/* Just to be on the safe side. */
+		r->nat[0] &= 0xFFFFFFFFUL;
+	} while (0);
+
+	/* R4-R7 */
+	for (i = 4; i <= 7; i++) {
+		char nat = 0;
+		_C(unw_access_gr(&info, i, &r->gr[i], &nat, 0));
+		r->nat[0] |= (nat != 0) << i;
+	}
+
+	/* B1-B5 */
+	for (i = 1; i <= 5; i++) {
+		_C(unw_access_br(&info, i, &r->br[i], 0));
+	}
+
+	/* AR_EC, AR_LC */
+	_C(unw_access_ar(&info, UNW_AR_EC, &r->ar_ec, 0));
+	_C(unw_access_ar(&info, UNW_AR_LC, &r->ar_lc, 0));
+
+	/* F2..F5, F16..F31 */
+	for (i = 2; i <= 5; i++) {
+		_C(unw_get_fr(&info, i, &fpval));
+		memcpy(&r->fr[i*2], &fpval, 16);
+	}
+	for (i = 16; i <= 31; i++) {
+		_C(unw_get_fr(&info, i, &fpval));
+		memcpy(&r->fr[i*2], &fpval, 16);
+	}
+	return 0;
+}
+
+#undef _C
+
+static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+	int err;
+	unsigned long pg;
+	struct cpt_ia64_regs *r;
+	struct ia64_psr *psr;
+	struct switch_stack *sw;
+	struct pt_regs *pt;
+	void *krbs = (void *)tsk + IA64_RBS_OFFSET;
+	unsigned long reg;
+
+	if (tsk->exit_state)
+		return 0;
+
+	pt = task_pt_regs(tsk);
+
+	sw = (struct switch_stack *) (tsk->thread.ksp + 16);
+
+	if ((pg = __get_free_page(GFP_KERNEL)) == 0)
+		return -ENOMEM;
+
+	r = (void*)pg;
+	/* To catch if we forgot some register */
+	memset(r, 0xA5, sizeof(*r));
+
+	r->gr[0] = 0;
+	r->fr[0] = r->fr[1] = 0;
+	r->fr[2] = 0x8000000000000000UL;
+	r->fr[3] = 0xffff;
+
+	r->nat[0] = r->nat[1] = 0;
+
+	err = ass_to_mouth(r, tsk, ctx);
+	if (err) {
+		printk("ass_to_mouth error %d\n", err);
+		goto out;
+	}
+
+	/* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */
+	memcpy(&r->gr[1], &pt->r1, 8*(2-1));
+	memcpy(&r->gr[2], &pt->r2, 8*(4-2));
+	memcpy(&r->gr[8], &pt->r8, 8*(12-8));
+	memcpy(&r->gr[12], &pt->r12, 8*(14-12));
+	memcpy(&r->gr[14], &pt->r14, 8*(15-14));
+	memcpy(&r->gr[15], &pt->r15, 8*(16-15));
+	memcpy(&r->gr[16], &pt->r16, 8*(32-16));
+
+	r->br[0] = pt->b0;
+	r->br[6] = pt->b6;
+	r->br[7] = pt->b7;
+
+	r->ar_bspstore = pt->ar_bspstore;
+	r->ar_unat = pt->ar_unat;
+	r->ar_pfs = pt->ar_pfs;
+	r->ar_ccv = pt->ar_ccv;
+	r->ar_fpsr = pt->ar_fpsr;
+	r->ar_csd = pt->ar_csd;
+	r->ar_ssd = pt->ar_ssd;
+	r->ar_rsc = pt->ar_rsc;
+
+	r->cr_iip = pt->cr_iip;
+	r->cr_ipsr = pt->cr_ipsr;
+
+	r->pr = pt->pr;
+
+	r->cfm = pt->cr_ifs;
+	r->ar_rnat = pt->ar_rnat;
+
+	/* fpregs 6..9,10..11 are in pt_regs */
+	memcpy(&r->fr[2*6], &pt->f6, 16*(10-6));
+	memcpy(&r->fr[2*10], &pt->f10, 16*(12-10));
+	/* fpreg 12..15 are on switch stack */
+	memcpy(&r->fr[2*12], &sw->f12, 16*(16-12));
+	/* fpregs 32...127 */
+	psr = ia64_psr(task_pt_regs(tsk));
+	preempt_disable();
+	if (ia64_is_local_fpu_owner(tsk) && psr->mfh) {
+		psr->mfh = 0;
+		tsk->thread.flags |= IA64_THREAD_FPH_VALID;
+		ia64_save_fpu(&tsk->thread.fph[0]);
+	}
+	preempt_enable();
+	memcpy(&r->fr[32*2], tsk->thread.fph, 16*(128-32));
+
+	if (tsk->thread.flags & IA64_THREAD_DBG_VALID) {
+		memcpy(r->ibr, tsk->thread.ibr, sizeof(r->ibr));
+		memcpy(r->dbr, tsk->thread.dbr, sizeof(r->dbr));
+	} else {
+		memset(r->ibr, 0, sizeof(r->ibr));
+		memset(r->dbr, 0, sizeof(r->dbr));
+	}
+
+	r->loadrs = pt->loadrs;
+	r->num_regs = ia64_rse_num_regs(krbs, krbs + 8*(pt->loadrs >> 19));
+	if ((long)pt->cr_ifs > 0)
+		r->num_regs += (pt->cr_ifs & 0x7f);
+
+	if (r->num_regs > 96) {
+		eprintk_ctx(CPT_FID " too many RSE regs %lu\n",
+			    CPT_TID(tsk), r->num_regs);
+		err = -EINVAL;
+		goto out;
+	}
+
+	for (reg = 0; reg < r->num_regs; reg++) {
+		unsigned long *ptr = ia64_rse_skip_regs(krbs, reg);
+		unsigned long *rnatp = ia64_rse_rnat_addr(ptr);
+
+		r->gr[32+reg] = *ptr;
+
+		if ((unsigned long)rnatp >= sw->ar_bspstore)
+			rnatp = &sw->ar_rnat;
+		if (*rnatp & (1UL<<ia64_rse_slot_num(ptr))) {
+			if (reg < 32)
+				r->nat[0] |= (1UL<<(reg+32));
+			else
+				r->nat[1] |= (1UL<<(reg-32));
+		}
+	}
+	if (r->nat[0] | r->nat[1])
+		wprintk_ctx(CPT_FID " nat bits %lx%016lx\n", CPT_TID(tsk),
+			    r->nat[1], r->nat[0]);
+
+	cpt_open_object(NULL, ctx);
+	r->cpt_next = sizeof(*r);
+	r->cpt_object = CPT_OBJ_IA64_REGS;
+	r->cpt_hdrlen = sizeof(*r);
+	r->cpt_content = CPT_CONTENT_VOID;
+	ctx->write(r, sizeof(*r), ctx);
+	cpt_close_object(ctx);
+	err = 0;
+
+out:
+	free_page(pg);
+	return err;
+}
+#endif
+
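+/*
+ * Dump the live part of the kernel stack: everything between the
+ * saved stack pointer and the stack base, so the restore side can
+ * recreate the in-kernel context of a sleeping task.
+ */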
+static int dump_kstack(struct task_struct *tsk, struct cpt_context *ctx)
+{
+	struct cpt_obj_bits hdr;
+	unsigned long size;
+	void *start;
+
+	cpt_open_object(NULL, ctx);
+
+#ifdef CONFIG_X86
+	size = tsk->thread.sp0 - tsk->thread.sp;
+	start = (void*)tsk->thread.sp;
+#elif defined(CONFIG_IA64)
+	size = (unsigned long)(task_pt_regs(tsk)+1) - tsk->thread.ksp;
+	start = (void*)tsk->thread.ksp;
+#else
+#error Arch is not supported
+#endif
+
+	hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
+	hdr.cpt_object = CPT_OBJ_BITS;
+	hdr.cpt_hdrlen = sizeof(hdr);
+	hdr.cpt_content = CPT_CONTENT_STACK;
+	hdr.cpt_size = size;
+
+	ctx->write(&hdr, sizeof(hdr), ctx);
+	ctx->write(start, size, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+	return 0;
+}
+
+#ifdef CONFIG_X86
+/* Determine size and type of FPU struct to store */
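+/* XSAVE-capable CPUs dump the full extended state (xstate_size bytes);
+ * 32-bit CPUs without FXSR fall back to the legacy FSAVE layout;
+ * everything else uses the FXSAVE image. */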
+static void init_fpu_hdr(struct cpt_obj_bits *hdr)
+{
+	unsigned long size;
+	int type;
+
+	if (likely(cpu_has_xsave)) {
+		type = CPT_CONTENT_X86_XSAVE;
+		size = xstate_size;
+	} else
+#ifndef CONFIG_X86_64
+	if (!cpu_has_fxsr) {
+		size = sizeof(struct i387_fsave_struct);
+		type = CPT_CONTENT_X86_FPUSTATE_OLD;
+	} else
+#endif
+	{
+		type = CPT_CONTENT_X86_FPUSTATE;
+		size = sizeof(struct i387_fxsave_struct);
+	}
+
+	hdr->cpt_next = sizeof(struct cpt_obj_bits) + CPT_ALIGN(size);
+	hdr->cpt_object = CPT_OBJ_BITS;
+	hdr->cpt_hdrlen = sizeof(struct cpt_obj_bits);
+	hdr->cpt_content = type;
+	hdr->cpt_size = size;
+}
+
+/* Formats of i387_fxsave_struct are the same for x86_64
+ * and i386. Plain luck. */
+
+static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
+{
+	struct cpt_obj_bits hdr;
+
+	if (!tsk->thread.xstate)
+		return 0;
+
+	cpt_open_object(NULL, ctx);
+
+	init_fpu_hdr(&hdr);
+
+	ctx->write(&hdr, sizeof(hdr), ctx);
+	ctx->write(tsk->thread.xstate, hdr.cpt_size, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_IA64
+
+static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
+{
+	return 0;
+}
+#endif
+
+static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info)
+{
+	si->cpt_signo = info->si_signo;
+	si->cpt_errno = info->si_errno;
+	si->cpt_code = info->si_code;
+
+	/* Allow old kernels (i.e. those which do not save _sifields) to restore */
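+	/* The high bits of si_code (__SI_MASK) select which member of
+	 * the _sifields union is in use; the cpt_pid/cpt_uid/cpt_sigval
+	 * slots are reused accordingly. */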
+	switch(si->cpt_code & __SI_MASK) {
+	case __SI_POLL:
+		si->cpt_pid = info->si_band;
+		si->cpt_uid = info->si_fd;
+		break;
+	case __SI_FAULT:
+		si->cpt_sigval = cpt_ptr_export(info->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		si->cpt_pid = info->si_trapno;
+#endif
+		break;
+	case __SI_CHLD:
+		si->cpt_pid = info->si_pid;
+		si->cpt_uid = info->si_uid;
+		si->cpt_sigval = info->si_status;
+		si->cpt_stime = info->si_stime;
+		si->cpt_utime = info->si_utime;
+		break;
+	case __SI_KILL:
+	case __SI_RT:
+	case __SI_MESGQ:
+	default:
+		si->cpt_pid = info->si_pid;
+		si->cpt_uid = info->si_uid;
+		si->cpt_sigval = cpt_ptr_export(info->si_ptr);
+		break;
+	}
+
+	/* Modern kernel will restore whole _sifields */
+	memcpy(si->cpt_sifields, &info->_sifields, sizeof(info->_sifields));
+	BUILD_BUG_ON(sizeof(info->_sifields) != sizeof(si->cpt_sifields));
+
+	return 0;
+}
+
+static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx)
+{
+	struct sigqueue *q;
+	loff_t saved_obj;
+
+	if (list_empty(&list->list))
+		return 0;
+
+	cpt_push_object(&saved_obj, ctx);
+	list_for_each_entry(q, &list->list, list) {
+		struct cpt_siginfo_image si;
+
+		/* posix timers are collected separately */
+		if (q->info.si_code == SI_TIMER)
+			continue;
+
+		si.cpt_next = sizeof(si);
+		si.cpt_object = CPT_OBJ_SIGINFO;
+		si.cpt_hdrlen = sizeof(si);
+		si.cpt_content = CPT_CONTENT_VOID;
+
+		si.cpt_qflags = q->flags;
+		si.cpt_user = q->user->uid;
+
+		if (encode_siginfo(&si, &q->info))
+			return -EINVAL;
+
+		ctx->write(&si, sizeof(si), ctx);
+	}
+	cpt_pop_object(&saved_obj, ctx);
+	return 0;
+}
+
+
+
+static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct signal_struct *sig = obj->o_obj;
+	struct cpt_signal_image *v = cpt_get_buf(ctx);
+	struct task_struct *tsk;
+	int i;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_SIGNAL_STRUCT;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_pgrp_type = CPT_PGRP_NORMAL;
+	v->cpt_pgrp = 0;
+
+#if 0 /* the code below seems to be unneeded */
+	if (sig->__pgrp <= 0) {
+		eprintk_ctx("bad pgid\n");
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid_ns(sig->__pgrp, &init_pid_ns);
+	if (tsk == NULL)
+		v->cpt_pgrp_type = CPT_PGRP_ORPHAN;
+	read_unlock(&tasklist_lock);
+	v->cpt_pgrp = pid_to_vpid(sig->__pgrp);
+#endif
+
+	v->cpt_old_pgrp = 0;
+/*	if (!sig->tty_old_pgrp) {
+		eprintk_ctx("bad tty_old_pgrp\n");
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}*/
+	if (sig->tty_old_pgrp) {
+		v->cpt_old_pgrp_type = CPT_PGRP_NORMAL;
+		read_lock(&tasklist_lock);
+		tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PID);
+		if (tsk == NULL) {
+			v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN;
+			tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PGID);
+		}
+		read_unlock(&tasklist_lock);
+		if (tsk == NULL) {
+			eprintk_ctx("tty_old_pgrp does not exist anymore\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		v->cpt_old_pgrp = cpt_pid_nr(sig->tty_old_pgrp);
+		if ((int)v->cpt_old_pgrp < 0) {
+			dprintk_ctx("stray tty_old_pgrp %d\n", pid_nr(sig->tty_old_pgrp));
+			v->cpt_old_pgrp = -1;
+			v->cpt_old_pgrp_type = CPT_PGRP_STRAY;
+		}
+	}
+
+	v->cpt_session_type = CPT_PGRP_NORMAL;
+	v->cpt_session = 0;
+
+#if 0 /* the code below seems to be unneeded */
+	if (sig->__session <= 0) {
+		eprintk_ctx("bad session\n");
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid_ns(sig->__session, &init_pid_ns);
+	if (tsk == NULL)
+		v->cpt_session_type = CPT_PGRP_ORPHAN;
+	read_unlock(&tasklist_lock);
+	v->cpt_session = pid_to_vpid(sig->__session);
+#endif
+
+	v->cpt_leader = sig->leader;
+	v->cpt_ctty = CPT_NULL;
+	if (sig->tty) {
+		cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx);
+		if (cobj)
+			v->cpt_ctty = cobj->o_pos;
+		else {
+			eprintk_ctx("controlling tty is not found\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8);
+
+	v->cpt_curr_target = 0;
+	if (sig->curr_target)
+		v->cpt_curr_target = cpt_task_pid_nr(sig->curr_target, PIDTYPE_PID);
+	v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0);
+	v->cpt_group_exit_code = sig->group_exit_code;
+	v->cpt_group_exit_task = 0;
+	if (sig->group_exit_task)
+		v->cpt_group_exit_task = cpt_task_pid_nr(sig->group_exit_task, PIDTYPE_PID);
+	v->cpt_notify_count = sig->notify_count;
+	v->cpt_group_stop_count = sig->group_stop_count;
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8)
+	v->cpt_utime = sig->utime;
+	v->cpt_stime = sig->stime;
+	v->cpt_cutime = sig->cutime;
+	v->cpt_cstime = sig->cstime;
+	v->cpt_nvcsw = sig->nvcsw;
+	v->cpt_nivcsw = sig->nivcsw;
+	v->cpt_cnvcsw = sig->cnvcsw;
+	v->cpt_cnivcsw = sig->cnivcsw;
+	v->cpt_min_flt = sig->min_flt;
+	v->cpt_maj_flt = sig->maj_flt;
+	v->cpt_cmin_flt = sig->cmin_flt;
+	v->cpt_cmaj_flt = sig->cmaj_flt;
+
+	v->cpt_flags = 0;
+	if (sig->flags & SIGNAL_STOP_STOPPED)
+		v->cpt_flags |= CPT_SIGNAL_STOP_STOPPED;
+	if (sig->flags & SIGNAL_STOP_CONTINUED)
+		v->cpt_flags |= CPT_SIGNAL_STOP_CONTINUED;
+	if (sig->flags & SIGNAL_CLD_STOPPED)
+		v->cpt_flags |= CPT_SIGNAL_CLD_STOPPED;
+	if (sig->flags & SIGNAL_CLD_CONTINUED)
+		v->cpt_flags |= CPT_SIGNAL_CLD_CONTINUED;
+
+	BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS);
+
+	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+		if (i < RLIM_NLIMITS) {
+			v->cpt_rlim_cur[i] = sig->rlim[i].rlim_cur;
+			v->cpt_rlim_max[i] = sig->rlim[i].rlim_max;
+		} else {
+			v->cpt_rlim_cur[i] = CPT_NULL;
+			v->cpt_rlim_max[i] = CPT_NULL;
+		}
+	}
+#endif
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	dump_sigqueue(&sig->shared_pending, ctx);
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+int cpt_check_unsupported(struct task_struct *tsk, cpt_context_t *ctx)
+{
+#ifdef CONFIG_KEYS
+	if (tsk->cred->request_key_auth || tsk->cred->thread_keyring) {
+		eprintk_ctx("keys are used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#endif
+#ifdef CONFIG_NUMA
+	if (tsk->mempolicy) {
+		eprintk_ctx("NUMA mempolicy is used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#endif
+#ifdef CONFIG_TUX
+	if (tsk->tux_info) {
+		eprintk_ctx("TUX is used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#endif
+	return 0;
+}
+
+int cpt_skip_task(struct task_struct *tsk)
+{
+	if (tsk->flags & PF_KTHREAD)
+		return 1;
+
+	if (tsk == current)
+		return 1;
+
+	return 0;
+}
+
+static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct task_struct *tsk = obj->o_obj;
+	const struct cred *cred;
+	int last_thread;
+	struct cpt_task_image *v;
+	cpt_object_t *tobj;
+	cpt_object_t *tg_obj;
+	loff_t saved_obj;
+	int i;
+	int err;
+	struct timespec delta;
+	struct mm_struct * tsk_mm;
+	struct files_struct * tsk_files;
+	struct fs_struct * tsk_fs;
+	struct mnt_namespace * tsk_ns;
+
+	if (cpt_skip_task(tsk))
+		return 0;
+
+	cpt_open_object(obj, ctx);
+
+	v = cpt_get_buf(ctx);
+	v->cpt_signal = CPT_NULL;
+	tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx);
+	BUG_ON(!tg_obj);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_TASK;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_state = tsk->state;
+	if (tsk->exit_state) {
+		v->cpt_state = tsk->exit_state;
+		if (tsk->state != TASK_DEAD) {
+			eprintk_ctx("invalid tsk->state %ld/%d on" CPT_FID "\n",
+				tsk->state, tsk->exit_state, CPT_TID(tsk));
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	if (cpt_check_unsupported(tsk, ctx)) {
+		cpt_release_buf(ctx);
+		return -EBUSY;
+	}
+
+	v->cpt_flags = tsk->flags & CPT_TASK_FLAGS_MASK;
+	v->cpt_ptrace = tsk->ptrace;
+	v->cpt_prio = tsk->prio;
+	v->cpt_exit_code = tsk->exit_code;
+	v->cpt_exit_signal = tsk->exit_signal;
+	v->cpt_pdeath_signal = tsk->pdeath_signal;
+	v->cpt_static_prio = tsk->static_prio;
+	v->cpt_rt_priority = tsk->rt_priority;
+	v->cpt_policy = tsk->policy;
+	if (v->cpt_policy != SCHED_NORMAL && v->cpt_policy != SCHED_BATCH && v->cpt_policy != SCHED_IDLE) {
+		eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n",
+				cpt_task_pid_nr(tsk, PIDTYPE_PID), tsk->pid, tsk->comm);
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+
+	/* Unpleasant moment. When the leader of a thread group exits,
+	 * it remains a zombie until the whole group exits.
+	 * We save non-NULL pointers to the group's mm/files/fs, so
+	 * that we can restore this thread group.
+	 */
+	tsk_mm = tsk->mm;
+	tsk_files = tsk->files;
+	tsk_fs = tsk->fs;
+	tsk_ns = tsk->nsproxy ? tsk->nsproxy->mnt_ns : NULL;
+
+	if (tsk->exit_state && !thread_group_empty(tsk) &&
+	    thread_group_leader(tsk)) {
+		struct task_struct * p = tsk;
+
+		read_lock(&tasklist_lock);
+		do {
+			if (p->mm)
+				tsk_mm = p->mm;
+			if (p->files)
+				tsk_files = p->files;
+			if (p->fs)
+				tsk_fs = p->fs;
+			if (p->nsproxy && p->nsproxy->mnt_ns)
+				tsk_ns = p->nsproxy->mnt_ns;
+			p = next_thread(p);
+		} while (p != tsk);
+		read_unlock(&tasklist_lock);
+	}
+
+	v->cpt_mm = CPT_NULL;
+	if (tsk_mm) {
+		tobj = lookup_cpt_object(CPT_OBJ_MM, tsk_mm, ctx);
+		BUG_ON(!tobj);
+		v->cpt_mm = tobj->o_pos;
+	}
+	v->cpt_files = CPT_NULL;
+	if (tsk_files) {
+		tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk_files, ctx);
+		BUG_ON(!tobj);
+		v->cpt_files = tobj->o_pos;
+	}
+	v->cpt_fs = CPT_NULL;
+	if (tsk_fs) {
+		tobj = lookup_cpt_object(CPT_OBJ_FS, tsk_fs, ctx);
+		BUG_ON(!tobj);
+		v->cpt_fs = tobj->o_pos;
+	}
+	v->cpt_namespace = CPT_NULL;
+	if (tsk_ns) {
+		tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk_ns, ctx);
+		BUG_ON(!tobj);
+		v->cpt_namespace = tobj->o_pos;
+	}
+	v->cpt_sysvsem_undo = CPT_NULL;
+	if (tsk->sysvsem.undo_list && !tsk->exit_state) {
+		tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx);
+		BUG_ON(!tobj);
+		v->cpt_sysvsem_undo = tobj->o_pos;
+	}
+	v->cpt_sighand = CPT_NULL;
+	if (tsk->sighand) {
+		tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx);
+		BUG_ON(!tobj);
+		v->cpt_sighand = tobj->o_pos;
+	}
+	v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked);
+	v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked);
+	v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask);
+
+	v->cpt_posix_timers = CPT_NULL;
+	if (thread_group_leader(tsk) && tsk->signal &&
+	    !list_empty(&tsk->signal->posix_timers)) {
+		tobj = lookup_cpt_object(CPT_OBJ_POSIX_TIMER_LIST,
+					 &tsk->signal->posix_timers, ctx);
+		BUG_ON(!tobj);
+		v->cpt_posix_timers = tobj->o_pos;
+	}
+
+	v->cpt_pid = cpt_task_pid_nr(tsk, PIDTYPE_PID);
+	v->cpt_tgid = cpt_pid_nr(task_tgid(tsk));
+	v->cpt_ppid = 0;
+	if (tsk->parent) {
+		if (tsk->parent != tsk->real_parent &&
+		    !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) {
+			eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid,
+					cpt_task_pid_nr(tsk, PIDTYPE_PID), tsk->comm);
+			cpt_release_buf(ctx);
+			return -EBUSY;
+		}
+		v->cpt_ppid = cpt_task_pid_nr(tsk->parent, PIDTYPE_PID);
+	}
+	v->cpt_rppid = tsk->real_parent ? cpt_task_pid_nr(tsk->real_parent, PIDTYPE_PID) : 0;
+	v->cpt_pgrp = cpt_task_pid_nr(tsk, PIDTYPE_PGID);
+	v->cpt_session = cpt_task_pid_nr(tsk, PIDTYPE_SID);
+	v->cpt_old_pgrp = 0;
+	if (tsk->signal->tty_old_pgrp)
+		v->cpt_old_pgrp = cpt_pid_nr(tsk->signal->tty_old_pgrp);
+	v->cpt_leader = tsk->group_leader ? cpt_task_pid_nr(tsk->group_leader, PIDTYPE_PID) : 0;
+	v->cpt_set_tid = (unsigned long)tsk->set_child_tid;
+	v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid;
+	memcpy(v->cpt_comm, tsk->comm, 16);
+
+	cred = tsk->cred;
+	v->cpt_user = cred->user->uid;
+	v->cpt_uid = cred->uid;
+	v->cpt_euid = cred->euid;
+	v->cpt_suid = cred->suid;
+	v->cpt_fsuid = cred->fsuid;
+	v->cpt_gid = cred->gid;
+	v->cpt_egid = cred->egid;
+	v->cpt_sgid = cred->sgid;
+	v->cpt_fsgid = cred->fsgid;
+	v->cpt_ngids = 0;
+	if (cred->group_info && cred->group_info->ngroups != 0) {
+		int i = cred->group_info->ngroups;
+		if (i > 32) {
+			/* Shame... I did a simplified version and _forgot_
+			 * about this. Later, later. */
+			eprintk_ctx("too many of groups " CPT_FID "\n", CPT_TID(tsk));
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		v->cpt_ngids = i;
+		for (i--; i>=0; i--)
+			v->cpt_gids[i] = cred->group_info->small_block[i];
+	}
+	v->cpt_prctl_uac = 0;
+	v->cpt_prctl_fpemu = 0;
+	v->__cpt_pad1 = 0;
+#ifdef CONFIG_IA64
+	v->cpt_prctl_uac = (tsk->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT;
+	v->cpt_prctl_fpemu = (tsk->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT;
+#endif
+	memcpy(&v->cpt_ecap, &cred->cap_effective, 8);
+	memcpy(&v->cpt_icap, &cred->cap_inheritable, 8);
+	memcpy(&v->cpt_pcap, &cred->cap_permitted, 8);
+	memcpy(&v->cpt_bcap, &cred->cap_bset, 8);
+	v->cpt_keepcap = cred->securebits;
+
+	v->cpt_did_exec = tsk->did_exec;
+	v->cpt_exec_domain = -1;
+	v->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<<TIF_FREEZE);
+	v->cpt_64bit = 0;
+#ifdef CONFIG_X86_64
+	/* Clear x86_64 specific flags */
+	v->cpt_thrflags &= ~(_TIF_FORK|_TIF_IA32);
+	if (!(task_thread_info(tsk)->flags & _TIF_IA32)) {
+		ctx->tasks64++;
+		v->cpt_64bit = 1;
+	}
+#endif
+#ifdef CONFIG_IA64
+	/* Clear ia64 specific flags */
+	//// v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
+	if (!IS_IA32_PROCESS(task_pt_regs(tsk))) {
+		ctx->tasks64++;
+		v->cpt_64bit = 1;
+	}
+#endif
+	v->cpt_thrstatus = task_thread_info(tsk)->status;
+	v->cpt_addr_limit = -1;
+
+	v->cpt_personality = tsk->personality;
+
+#ifdef CONFIG_X86
+	for (i=0; i<GDT_ENTRY_TLS_ENTRIES; i++) {
+		if (i>=3) {
+			eprintk_ctx("too many tls descs\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a;
+	}
+#endif
+
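+	/* If the task sleeps in a restartable syscall, record which
+	 * restart_block handler it uses plus the handler's arguments so
+	 * the sleep can be re-armed on restore. Absolute expiry times
+	 * are saved as offsets from the checkpoint's monotonic clock.
+	 * A task whose restart fn matches current's is assumed to have
+	 * no restart pending. */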
+	v->cpt_restart.fn = CPT_RBL_0;
+	if (task_thread_info(tsk)->restart_block.fn != task_thread_info(current)->restart_block.fn) {
+		struct restart_block *rb = &task_thread_info(tsk)->restart_block;
+		ktime_t e;
+
+		if (rb->fn == hrtimer_nanosleep_restart) {
+			v->cpt_restart.fn = CPT_RBL_NANOSLEEP;
+
+			e.tv64 = rb->nanosleep.expires;
+			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+			v->cpt_restart.arg0 = (__u64)rb->nanosleep.index;
+			v->cpt_restart.arg1 = (unsigned long)rb->nanosleep.rmtp;
+			v->cpt_restart.arg2 = 0;
+			v->cpt_restart.arg3 = ktime_to_ns(e);
+			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (__u64)v->cpt_restart.arg0);
+			goto continue_dump;
+		}
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+		if (rb->fn == compat_nanosleep_restart) {
+			v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP;
+
+			e.tv64 = rb->nanosleep.expires;
+			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+			v->cpt_restart.arg0 = (__u64)rb->nanosleep.index;
+			v->cpt_restart.arg1 = (__u64)rb->nanosleep.rmtp;
+			v->cpt_restart.arg2 = (__u64)rb->nanosleep.compat_rmtp;
+			v->cpt_restart.arg3 = ktime_to_ns(e);
+			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (__u64)v->cpt_restart.arg0);
+			goto continue_dump;
+		}
+#endif
+		if (rb->fn == do_restart_poll) {
+			struct timespec ts;
+
+			ts.tv_sec = rb->poll.tv_sec;
+			ts.tv_nsec = rb->poll.tv_nsec;
+
+			v->cpt_restart.fn = CPT_RBL_POLL;
+			v->cpt_restart.arg0 = (unsigned long)rb->poll.ufds;
+			v->cpt_restart.arg1 = (__u64)rb->poll.has_timeout << 32 | rb->poll.nfds;
+			v->cpt_restart.arg2 = timespec_to_ns(&ts);
+			v->cpt_restart.arg3 = 0;
+			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (__u64)v->cpt_restart.arg0);
+			goto continue_dump;
+		}
+		if (rb->fn == futex_wait_restart) {
+			v->cpt_restart.fn = CPT_RBL_FUTEX_WAIT;
+
+			e.tv64 = rb->futex.time;
+			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+			v->cpt_restart.arg0 = (unsigned long)rb->futex.uaddr;
+			v->cpt_restart.arg1 = rb->futex.val;
+			v->cpt_restart.arg2 = ktime_to_ns(e);
+			v->cpt_restart.arg3 = rb->futex.flags;
+			goto continue_dump;
+		}
+		if (rb->fn == posix_cpu_nsleep_restart) {
+			v->cpt_restart.fn = CPT_RBL_POSIX_CPU_NSLEEP;
+			v->cpt_restart.arg0 = rb->arg0;
+			v->cpt_restart.arg1 = rb->arg1;
+			v->cpt_restart.arg2 = rb->arg2;
+			v->cpt_restart.arg3 = rb->arg3;
+			goto continue_dump;
+		}
+		eprintk_ctx("unknown restart block %pS\n", rb->fn);
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+
+continue_dump:
+	v->cpt_it_real_incr = 0;
+	v->cpt_it_prof_incr = 0;
+	v->cpt_it_virt_incr = 0;
+	v->cpt_it_real_value = 0;
+	v->cpt_it_prof_value = 0;
+	v->cpt_it_virt_value = 0;
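+	/* Interval timers live in the shared signal struct; dump them
+	 * once, from the live group leader only. */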
+	if (thread_group_leader(tsk) && tsk->exit_state == 0) {
+		ktime_t rem;
+
+		v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr);
+		v->cpt_it_prof_incr = tsk->signal->it[CPUCLOCK_PROF].incr;
+		v->cpt_it_virt_incr = tsk->signal->it[CPUCLOCK_VIRT].incr;
+
+		rem = hrtimer_get_remaining(&tsk->signal->real_timer);
+
+		if (hrtimer_active(&tsk->signal->real_timer)) {
+			if (rem.tv64 <= 0)
+				rem.tv64 = NSEC_PER_USEC;
+			v->cpt_it_real_value = ktime_to_ns(rem);
+			dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_it_real_value);
+		}
+		v->cpt_it_prof_value = tsk->signal->it[CPUCLOCK_PROF].expires;
+		v->cpt_it_virt_value = tsk->signal->it[CPUCLOCK_VIRT].expires;
+	}
+	v->cpt_used_math = (tsk_used_math(tsk) != 0);
+
+	if (tsk->notifier) {
+		eprintk_ctx("task notifier is in use: process %d/%d(%s)\n",
+				cpt_task_pid_nr(tsk, PIDTYPE_PID), tsk->pid, tsk->comm);
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+
+	v->cpt_utime = tsk->utime;
+	v->cpt_stime = tsk->stime;
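+	/* Save the start time relative to the container's own start
+	 * time, so it stays meaningful after migration. */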
+	delta = tsk->start_time;
+	_set_normalized_timespec(&delta,
+			delta.tv_sec - get_exec_env()->start_timespec.tv_sec,
+			delta.tv_nsec - get_exec_env()->start_timespec.tv_nsec);
+	v->cpt_starttime = cpt_timespec_export(&delta);
+	v->cpt_nvcsw = tsk->nvcsw;
+	v->cpt_nivcsw = tsk->nivcsw;
+	v->cpt_min_flt = tsk->min_flt;
+	v->cpt_maj_flt = tsk->maj_flt;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
+	v->cpt_cutime = tsk->cutime;
+	v->cpt_cstime = tsk->cstime;
+	v->cpt_cnvcsw = tsk->cnvcsw;
+	v->cpt_cnivcsw = tsk->cnivcsw;
+	v->cpt_cmin_flt = tsk->cmin_flt;
+	v->cpt_cmaj_flt = tsk->cmaj_flt;
+
+	BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS);
+
+	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+		if (i < RLIM_NLIMITS) {
+			v->cpt_rlim_cur[i] = tsk->rlim[i].rlim_cur;
+			v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max;
+		} else {
+			v->cpt_rlim_cur[i] = CPT_NULL;
+			v->cpt_rlim_max[i] = CPT_NULL;
+		}
+	}
+#else
+	v->cpt_cutime = tsk->signal->cutime;
+	v->cpt_cstime = tsk->signal->cstime;
+	v->cpt_cnvcsw = tsk->signal->cnvcsw;
+	v->cpt_cnivcsw = tsk->signal->cnivcsw;
+	v->cpt_cmin_flt = tsk->signal->cmin_flt;
+	v->cpt_cmaj_flt = tsk->signal->cmaj_flt;
+
+	BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS);
+
+	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+		if (i < RLIM_NLIMITS) {
+			v->cpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur;
+			v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max;
+		} else {
+			v->cpt_rlim_cur[i] = CPT_NULL;
+			v->cpt_rlim_max[i] = CPT_NULL;
+		}
+	}
+#endif
+
+#ifdef CONFIG_BEANCOUNTERS
+	if (tsk->mm)
+		v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx);
+	else
+		v->cpt_mm_ub = CPT_NULL;
+	v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx);
+	v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx);
+	v->cpt_fork_sub = v->cpt_exec_ub;
+#endif
+
+	v->cpt_ptrace_message = tsk->ptrace_message;
+	v->cpt_stopped_state = tsk->stopped_state;
+
+#ifdef CONFIG_X86_32
+	if (tsk->thread.vm86_info) {
+		eprintk_ctx("vm86 task is running\n");
+		cpt_release_buf(ctx);
+		return -EBUSY;
+	}
+#endif
+
+	v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal);
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	dump_kstack(tsk, ctx);
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	err = dump_registers(tsk, ctx);
+	cpt_pop_object(&saved_obj, ctx);
+	if (err)
+		return err;
+
+	if (tsk_used_math(tsk)) {
+		cpt_push_object(&saved_obj, ctx);
+		dump_fpustate(tsk, ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+
+	if (tsk->last_siginfo &&
+	    tsk->last_siginfo->si_code != SI_TIMER) {
+		struct cpt_siginfo_image si;
+		cpt_push_object(&saved_obj, ctx);
+
+		si.cpt_next = sizeof(si);
+		si.cpt_object = CPT_OBJ_LASTSIGINFO;
+		si.cpt_hdrlen = sizeof(si);
+		si.cpt_content = CPT_CONTENT_VOID;
+
+		if (encode_siginfo(&si, tsk->last_siginfo))
+			return -EINVAL;
+
+		ctx->write(&si, sizeof(si), ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+
+	if (tsk->sas_ss_size) {
+		struct cpt_sigaltstack_image si;
+		cpt_push_object(&saved_obj, ctx);
+
+		si.cpt_next = sizeof(si);
+		si.cpt_object = CPT_OBJ_SIGALTSTACK;
+		si.cpt_hdrlen = sizeof(si);
+		si.cpt_content = CPT_CONTENT_VOID;
+
+		si.cpt_stack = tsk->sas_ss_sp;
+		si.cpt_stacksize = tsk->sas_ss_size;
+
+		ctx->write(&si, sizeof(si), ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+
+	if (tsk->robust_list
+#ifdef CONFIG_COMPAT
+	    || tsk->compat_robust_list
+#endif
+	    ) {
+		struct cpt_task_aux_image ai;
+		cpt_push_object(&saved_obj, ctx);
+
+		ai.cpt_next = sizeof(ai);
+		ai.cpt_object = CPT_OBJ_TASK_AUX;
+		ai.cpt_hdrlen = sizeof(ai);
+		ai.cpt_content = CPT_CONTENT_VOID;
+
+		ai.cpt_robust_list = (unsigned long)tsk->robust_list;
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_COMPAT
+		if (task_thread_info(tsk)->flags & _TIF_IA32)
+			ai.cpt_robust_list = (unsigned long)tsk->compat_robust_list;
+#endif
+#endif
+		ctx->write(&ai, sizeof(ai), ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+
+	dump_sigqueue(&tsk->pending, ctx);
+
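+	/* The shared signal struct is dumped after the last thread of
+	 * the group; its position is then back-patched into the
+	 * cpt_signal field of every task image of this thread group
+	 * written so far. */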
+	last_thread = 1;
+	read_lock(&tasklist_lock);
+	do {
+		struct task_struct * next = next_thread(tsk);
+		if (next != tsk && !thread_group_leader(next))
+			last_thread = 0;
+	} while (0);
+	read_unlock(&tasklist_lock);
+
+	if (last_thread) {
+		struct task_struct *prev_tsk;
+		int err;
+		loff_t pos = ctx->file->f_pos;
+
+		cpt_push_object(&saved_obj, ctx);
+		err = dump_one_signal_struct(tg_obj, ctx);
+		cpt_pop_object(&saved_obj, ctx);
+		if (err)
+			return err;
+
+		prev_tsk = tsk;
+		for (;;) {
+			if (prev_tsk->tgid == tsk->tgid) {
+				loff_t tg_pos;
+
+				tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal);
+				ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos);
+				if (thread_group_leader(prev_tsk))
+					break;
+			}
+
+			if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) {
+				eprintk_ctx("bug: thread group leader is lost\n");
+				return -EINVAL;
+			}
+
+			obj = list_entry(obj->o_list.prev, cpt_object_t, o_list);
+			prev_tsk = obj->o_obj;
+		}
+	}
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+int cpt_dump_tasks(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	cpt_open_section(ctx, CPT_SECT_TASKS);
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		int err;
+
+		if ((err = dump_one_process(obj, ctx)) != 0)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+	return 0;
+}
+
+int cpt_collect_signals(cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	/* Collect signal and sighand structs referenced by the tasks */
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL)
+			return -ENOMEM;
+		if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+
+static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct sighand_struct *sig = obj->o_obj;
+	struct cpt_sighand_image *v = cpt_get_buf(ctx);
+	int i;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_SIGHAND_STRUCT;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	for (i=0; i< _NSIG; i++) {
+		if (sig->action[i].sa.sa_handler != SIG_DFL ||
+		    sig->action[i].sa.sa_flags) {
+			loff_t saved_obj;
+			struct cpt_sighandler_image *o = cpt_get_buf(ctx);
+
+			cpt_push_object(&saved_obj, ctx);
+			cpt_open_object(NULL, ctx);
+
+			o->cpt_next = CPT_NULL;
+			o->cpt_object = CPT_OBJ_SIGHANDLER;
+			o->cpt_hdrlen = sizeof(*o);
+			o->cpt_content = CPT_CONTENT_VOID;
+
+			o->cpt_signo = i;
+			o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler;
+			o->cpt_restorer = 0;
+#ifdef CONFIG_X86
+			o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer;
+#endif
+			o->cpt_flags = sig->action[i].sa.sa_flags;
+			memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8);
+			ctx->write(o, sizeof(*o), ctx);
+			cpt_release_buf(ctx);
+			cpt_close_object(ctx);
+			cpt_pop_object(&saved_obj, ctx);
+		}
+	}
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+int cpt_dump_sighand(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT);
+
+	for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) {
+		int err;
+
+		if ((err = dump_one_sighand_struct(obj, ctx)) != 0)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+	return 0;
+}
+
+int cpt_collect_posix_timers(cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+
+		if (!thread_group_leader(tsk) || !tsk->signal ||
+		    list_empty(&tsk->signal->posix_timers))
+			continue;
+
+		if (!cpt_object_add(CPT_OBJ_POSIX_TIMER_LIST,
+				    &tsk->signal->posix_timers, ctx))
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+static int dump_one_posix_timer_list(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct list_head *timer_list = obj->o_obj;
+	struct cpt_object_hdr v;
+	struct k_itimer *timer;
+
+	cpt_open_object(obj, ctx);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_POSIX_TIMER_LIST;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_ARRAY;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	list_for_each_entry(timer, timer_list, list) {
+		loff_t saved_obj;
+		struct timespec dump_time;
+		struct itimerspec setting;
+		int overrun, overrun_last;
+		int signal_pending;
+		struct cpt_posix_timer_image o;
+
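+		/* get_timer_setting() reads the timer's current setting
+		 * together with its overrun counters and whether its
+		 * signal is currently queued. */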
+		get_timer_setting(timer, &setting,
+				  &overrun, &overrun_last, &signal_pending);
+
+		cpt_push_object(&saved_obj, ctx);
+		cpt_open_object(NULL, ctx);
+
+		o.cpt_next = CPT_NULL;
+		o.cpt_object = CPT_OBJ_POSIX_TIMER;
+		o.cpt_hdrlen = sizeof(o);
+		o.cpt_content = CPT_CONTENT_VOID;
+
+		o.cpt_timer_id = timer->it_id;
+		o.cpt_timer_clock = timer->it_clock;
+		o.cpt_timer_overrun = overrun;
+		o.cpt_timer_overrun_last = overrun_last;
+		o.cpt_timer_signal_pending = signal_pending;
+		o.cpt_timer_interval =
+			cpt_timespec_export(&setting.it_interval);
+		o.cpt_timer_value =
+			cpt_timespec_export(&setting.it_value);
+
+		o.cpt_sigev_value =
+			cpt_ptr_export(timer->sigq->info.si_value.sival_ptr);
+		o.cpt_sigev_signo = timer->sigq->info.si_signo;
+		o.cpt_sigev_notify = timer->it_sigev_notify;
+		o.cpt_sigev_notify_tid = cpt_pid_nr(timer->it_pid);
+
+		do_gettimespec(&dump_time);
+		o.cpt_dump_time = cpt_timespec_export(&dump_time);
+
+		ctx->write(&o, sizeof(o), ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+int cpt_dump_posix_timers(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	cpt_open_section(ctx, CPT_SECT_POSIX_TIMERS);
+
+	for_each_object(obj, CPT_OBJ_POSIX_TIMER_LIST) {
+		int err;
+
+		err = dump_one_posix_timer_list(obj, ctx);
+		if (err)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_process.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_process.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_process.h	2015-01-21 12:02:48.227093553 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_process.h	2015-01-21 12:02:50.525032549 +0300
@@ -0,0 +1,31 @@
+#include <linux/sched.h>
+
+int cpt_collect_signals(cpt_context_t *);
+int cpt_dump_signal(struct cpt_context *);
+int cpt_dump_sighand(struct cpt_context *);
+int cpt_collect_posix_timers(struct cpt_context *);
+int cpt_dump_posix_timers(struct cpt_context *);
+int cpt_dump_tasks(struct cpt_context *);
+
+int rst_posix_timers(struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_signal_complete(struct cpt_task_image *ti, int *exiting, struct cpt_context *ctx);
+int restore_signal_struct(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx);
+__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
+
+int rst_restore_process(struct cpt_context *ctx);
+int rst_process_linkage(struct cpt_context *ctx);
+
+int check_task_state(struct task_struct *tsk, struct cpt_context *ctx);
+struct pid *alloc_vpid_safe(pid_t vnr);
+struct pid *alloc_dummy_vpid(pid_t vnr);
+int cpt_skip_task(struct task_struct *tsk);
+
+static inline pid_t cpt_pid_nr(struct pid *pid)
+{
+	return pid_nr_ns(pid, current->nsproxy->pid_ns);
+}
+
+static inline pid_t cpt_task_pid_nr(struct task_struct *p, enum pid_type type)
+{
+	return __task_pid_nr_ns(p, type, current->nsproxy->pid_ns);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_socket.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_socket.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_socket.c	2015-01-21 12:02:48.227093553 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_socket.c	2015-01-21 12:02:50.929021825 +0300
@@ -0,0 +1,948 @@
+/*
+ *
+ *  kernel/cpt/cpt_socket.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/tcp.h>
+#include <linux/mount.h>
+#include <net/sock.h>
+#include <net/scm.h>
+#include <net/af_unix.h>
+#include <net/tcp.h>
+#include <net/netlink_sock.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_socket.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_process.h"
+
+static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx);
+
+
+/* Sockets are quite different from other kinds of files.
+ * There is one simplification: only one struct file can refer to a socket,
+ * so we can store information about a socket directly in section FILES as
+ * the description of a file and append e.g. the array of not-yet-accepted
+ * connections of a listening socket as auxiliary data.
+ *
+ * Complications are:
+ * 1. TCP sockets can be orphans. We have to relocate orphans as well,
+ *    so we have to create a special section for orphans.
+ * 2. AF_UNIX sockets are distinguished objects: the set of links between
+ *    AF_UNIX sockets is quite arbitrary.
+ *    A. Each socket can refer to many files due to FD passing.
+ *    B. Each socket, except for connected ones, can have skbs in its
+ *       queue sent by any other socket.
+ *
+ *    2A is relatively easy: after our tasks are frozen we make an additional
+ *    recursive pass through the set of collected files and pick up the
+ *    FD-passed files they reference. When the recursion ends, all the files
+ *    are treated in the same way. All of them will be stored in section FILES.
+ *
+ *    2B. We have to resolve all those references at some point.
+ *    This is the place where a pipe-like approach to the image fails.
+ *
+ * All this makes socket checkpointing quite cumbersome.
+ * Right now we collect all the sockets and assign some numeric index value
+ * to each of them. The socket section is separate and placed after section
+ * FILES, so section FILES refers to sockets by index, while section SOCKET
+ * refers to FILES as usual, by position in the image. All the refs inside
+ * the socket section are by index. When restoring, we read the socket
+ * section and create objects to hold the index <-> pos mappings. On the
+ * second pass we open the sockets (simultaneously with their pairs) and
+ * create FILE objects.
+ */
+
+
+/* ====== FD passing ====== */
+
+/* Almost nobody does FD passing via AF_UNIX sockets, nevertheless we
+ * have to implement this. A problem is that in the general case we
+ * receive skbs from an unknown context, so new files can arrive to the
+ * checkpointed set of processes even after they are stopped. Well, we
+ * are simply going to ignore unknown fds while doing the real
+ * checkpointing. It is fair because links outside the checkpointed set
+ * are going to fail anyway.
+ *
+ * ATTN: the procedure is recursive. We linearize the recursion by adding
+ * newly found files to the end of the file list, so they will be analyzed
+ * in the same loop.
+ */
+
+static int collect_one_passedfd(struct file *file, cpt_context_t * ctx)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct socket *sock;
+	struct sock *sk;
+	struct sk_buff *skb;
+
+	if (!S_ISSOCK(inode->i_mode))
+		return -ENOTSOCK;
+
+	sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+
+	if (sock->ops->family != AF_UNIX)
+		return 0;
+
+	sk = sock->sk;
+
+	/* Subtle locking issue. skbs cannot be removed while
+	 * we are scanning, because all the processes are stopped.
+	 * They still can be added to tail of queue. Locking while
+	 * we dereference skb->next is enough to resolve this.
+	 * See above about collision with skbs added after we started
+	 * checkpointing.
+	 */
+
+	skb = skb_peek(&sk->sk_receive_queue);
+	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
+		if (UNIXCB(skb).fp && skb->sk &&
+		    (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) {
+			struct scm_fp_list *fpl = UNIXCB(skb).fp;
+			int i;
+
+			for (i = fpl->count-1; i >= 0; i--) {
+				if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL)
+					return -ENOMEM;
+			}
+		}
+
+		spin_lock_irq(&sk->sk_receive_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&sk->sk_receive_queue.lock);
+	}
+
+	return 0;
+}
+
+int cpt_collect_passedfds(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+
+		if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
+			int err;
+
+			if ((err = collect_one_passedfd(file, ctx)) < 0)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/* ====== End of FD passing ====== */
+
+/* Must be called under bh_lock_sock() */
+
+void clear_backlog(struct sock *sk)
+{
+	struct sk_buff *skb = sk->sk_backlog.head;
+
+	sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+	while (skb) {
+		struct sk_buff *next = skb->next;
+
+		skb->next = NULL;
+		kfree_skb(skb);
+		skb = next;
+	}
+}
+
+void release_sock_nobacklog(struct sock *sk)
+{
+	spin_lock_bh(&(sk->sk_lock.slock));
+	clear_backlog(sk);
+	sk->sk_lock.owned = 0;
+	if (waitqueue_active(&(sk->sk_lock.wq)))
+		wake_up(&(sk->sk_lock.wq));
+	spin_unlock_bh(&(sk->sk_lock.slock));
+}
+
+static void generic_dump_skb_cb(struct cpt_skb_image *v, struct sk_buff *skb)
+{
+	memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb));
+}
+
+static void dump_inet_skb_cb(struct cpt_skb_image *v, struct sk_buff *skb,
+			     struct sock *sk, struct cpt_context *ctx)
+{
+	/*
+	 * IPv6 enabled 'tcp_skb_cb' does not fit into 'cpt_skb_image.cb'.
+	 * 'ack_seq' is missing, but hopefully it is not needed while
+	 * skb is in queue.
+	 * BUILD_BUG_ON(sizeof(v->cpt_cb) < sizeof(skb->cb));
+	 */
+	BUILD_BUG_ON(sizeof(v->cpt_cb) != 40);
+	BUILD_BUG_ON(sizeof(struct inet_skb_parm) != 16);
+	BUILD_BUG_ON(sizeof(struct inet6_skb_parm) != 24);
+	BUILD_BUG_ON(sizeof(*TCP_SKB_CB(skb)) -
+		     sizeof(TCP_SKB_CB(skb)->header) != 20);
+#if !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE)
+	if (sk->sk_protocol == IPPROTO_TCP) {
+		/* Save control block according to tcp_skb_cb with IPv6 */
+
+		/*
+		 * IPv6 enabled 'tcp_skb_cb' does not fit into 'cpt_skb_image.cb'.
+		 * BUILD_BUG_ON(sizeof(v->cpt_cb) - sizeof(struct inet6_skb_parm) <
+		 *	sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm));
+		 */
+		memcpy(v->cpt_cb, skb->cb, sizeof(struct inet_skb_parm));
+		memcpy((void *)v->cpt_cb + sizeof(struct inet6_skb_parm),
+		       skb->cb + sizeof(struct inet_skb_parm),
+		       min(sizeof(v->cpt_cb) - sizeof(struct inet6_skb_parm),
+			   sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm)));
+	} else
+#endif
+		generic_dump_skb_cb(v, skb);
+}
+
+static void dump_unix_skb_cb(struct cpt_skb_image *v, struct sk_buff *skb,
+			     struct sock *sk, struct cpt_context *ctx)
+{
+	/*
+	 * UNIXCB keeps pointers to pid and cred. Convert them to
+	 * numbers.
+	 */
+	struct ucred *ucred = (struct ucred *)v->cpt_cb;
+
+	BUILD_BUG_ON(sizeof(*ucred) > sizeof(v->cpt_cb));
+	ucred->pid = cpt_pid_nr(UNIXCB(skb).pid);
+	ucred->uid = UNIXCB(skb).cred ? UNIXCB(skb).cred->uid : -1;
+	ucred->gid = UNIXCB(skb).cred ? UNIXCB(skb).cred->gid : -1;
+}
+
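+/* Write one skb as a CPT_OBJ_SKB record: the image header first, then
+ * the raw skb data (headroom + linear part + paged fragments) as a
+ * CPT_OBJ_BITS block, then one CPT_OBJ_FILEDESC entry per file passed
+ * over an AF_UNIX socket inside this skb. */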
+int cpt_dump_skb(int type, int owner, struct sk_buff *skb,
+		 struct sock *sk, struct cpt_context *ctx)
+{
+	struct cpt_skb_image *v = cpt_get_buf(ctx);
+	loff_t saved_obj;
+	struct timeval tmptv;
+	int ret = 0;
+
+	cpt_push_object(&saved_obj, ctx);
+	cpt_open_object(NULL, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_SKB;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_owner = owner;
+	v->cpt_queue = type;
+	skb_get_timestamp(skb, &tmptv);
+	v->cpt_stamp = cpt_timeval_export(&tmptv);
+	v->cpt_hspace = skb->data - skb->head;
+	v->cpt_tspace = skb->end - skb->tail;
+	v->cpt_h = skb_transport_header(skb) - skb->head;
+	v->cpt_nh = skb_network_header(skb) - skb->head;
+	v->cpt_mac = skb_mac_header(skb) - skb->head;
+	memset(v->cpt_cb, 0, sizeof(v->cpt_cb));
+
+	switch (sk->sk_family) {
+	case AF_INET:
+		dump_inet_skb_cb(v, skb, sk, ctx);
+		break;
+	case AF_UNIX:
+		dump_unix_skb_cb(v, skb, sk, ctx);
+		break;
+	default:
+		generic_dump_skb_cb(v, skb);
+		break;
+	}
+
+	if (sizeof(skb->cb) > sizeof(v->cpt_cb)) {
+		int i;
+		for (i=sizeof(v->cpt_cb); i<sizeof(skb->cb); i++) {
+			if (skb->cb[i]) {
+				wprintk_ctx("dirty skb cb");
+				break;
+			}
+		}
+	}
+	v->cpt_len = skb->len;
+	v->cpt_mac_len = skb->mac_len;
+	v->cpt_csum = skb->csum;
+	v->cpt_local_df = skb->local_df;
+	v->cpt_pkt_type = skb->pkt_type;
+	v->cpt_ip_summed = skb->ip_summed;
+	v->cpt_priority = skb->priority;
+	v->cpt_protocol = skb->protocol;
+	v->cpt_security = 0;
+	v->cpt_gso_segs = skb_shinfo(skb)->gso_segs;
+	v->cpt_gso_size = skb_shinfo(skb)->gso_size;
+	v->cpt_gso_type = skb_shinfo(skb)->gso_type;
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+		eprintk_ctx("skb ufo is not supported\n");
+		cpt_release_buf(ctx);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	if (skb->len + (skb->data - skb->head) > 0) {
+		struct cpt_obj_bits ob;
+		loff_t saved_obj2;
+
+		cpt_push_object(&saved_obj2, ctx);
+		cpt_open_object(NULL, ctx);
+		ob.cpt_next = CPT_NULL;
+		ob.cpt_object = CPT_OBJ_BITS;
+		ob.cpt_hdrlen = sizeof(ob);
+		ob.cpt_content = CPT_CONTENT_DATA;
+		/* v was released above; recompute the headroom directly */
+		ob.cpt_size = skb->len + (skb->data - skb->head);
+
+		ctx->write(&ob, sizeof(ob), ctx);
+
+		ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx);
+		if (skb->data_len) {
+			int offset = skb->len - skb->data_len;
+			while (offset < skb->len) {
+				int copy = skb->len - offset;
+				if (copy > PAGE_SIZE)
+					copy = PAGE_SIZE;
+				(void)cpt_get_buf(ctx);
+				if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy))
+					BUG();
+				ctx->write(ctx->tmpbuf, copy, ctx);
+				__cpt_release_buf(ctx);
+				offset += copy;
+			}
+		}
+
+		ctx->align(ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_obj2, ctx);
+	}
+
+	if (skb->sk && skb->sk->sk_family == AF_UNIX) {
+		struct scm_fp_list *fpl = UNIXCB(skb).fp;
+
+		if (fpl) {
+			int i;
+
+			for (i = 0; i < fpl->count; i++) {
+				struct cpt_fd_image v;
+				cpt_object_t *obj;
+				loff_t saved_obj2;
+
+				obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx);
+
+				if (!obj) {
+					eprintk_ctx("lost passed FD\n");
+					ret = -EINVAL;
+					goto out;
+				}
+
+				cpt_push_object(&saved_obj2, ctx);
+				cpt_open_object(NULL, ctx);
+				v.cpt_next = CPT_NULL;
+				v.cpt_object = CPT_OBJ_FILEDESC;
+				v.cpt_hdrlen = sizeof(v);
+				v.cpt_content = CPT_CONTENT_VOID;
+
+				v.cpt_fd = i;
+				v.cpt_file = obj->o_pos;
+				v.cpt_flags = 0;
+				ctx->write(&v, sizeof(v), ctx);
+				cpt_close_object(ctx);
+				cpt_pop_object(&saved_obj2, ctx);
+			}
+		}
+	}
+
+out:
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_obj, ctx);
+	return ret;
+}
+
+static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx)
+{
+	struct sk_buff *skb;
+	struct sock *sk_cache = NULL;
+
+	skb = skb_peek(&sk->sk_receive_queue);
+	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
+		int err;
+
+		if (sk->sk_family == AF_UNIX) {
+			cpt_object_t *obj;
+			if (skb->sk != sk_cache) {
+				idx = -1;
+				sk_cache = NULL;
+				obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx);
+				if (obj) {
+					idx = obj->o_index;
+					sk_cache = skb->sk;
+				} else if (unix_peer(sk) != skb->sk)
+					goto next_skb;
+			}
+		}
+
+		err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, sk, ctx);
+		if (err)
+			return err;
+
+next_skb:
+		spin_lock_irq(&sk->sk_receive_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&sk->sk_receive_queue.lock);
+	}
+	return 0;
+}
+
+static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx)
+{
+	struct sk_buff *skb;
+
+	skb = skb_peek(&sk->sk_write_queue);
+	while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) {
+		int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, sk, ctx);
+		if (err)
+			return err;
+
+		spin_lock_irq(&sk->sk_write_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&sk->sk_write_queue.lock);
+	}
+	return 0;
+}
+
+static void cpt_dump_sock_packet_mclist(struct sock *sk,
+					struct cpt_context *ctx)
+{
+	struct cpt_sock_packet_mc_image mi;
+	loff_t saved_obj;
+	void *iter = NULL;
+
+	cpt_push_object(&saved_obj, ctx);
+	while ((iter = sock_packet_cpt_one_mc(sk, &mi, iter)) != NULL) {
+		cpt_open_object(NULL, ctx);
+		mi.cpt_next = CPT_NULL;
+		mi.cpt_object = CPT_OBJ_SOCK_PACKET_MC;
+		mi.cpt_hdrlen = sizeof(mi);
+		mi.cpt_content = CPT_CONTENT_VOID;
+		ctx->write(&mi, sizeof(mi), ctx);
+		cpt_close_object(ctx);
+	}
+	cpt_pop_object(&saved_obj, ctx);
+}
+
+void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx)
+{
+	loff_t saved_obj;
+	if (sk->sk_filter) {
+		struct cpt_obj_bits v;
+
+		cpt_push_object(&saved_obj, ctx);
+		cpt_open_object(NULL, ctx);
+
+		v.cpt_next = CPT_NULL;
+		v.cpt_object = CPT_OBJ_SKFILTER;
+		v.cpt_hdrlen = sizeof(v);
+		v.cpt_content = CPT_CONTENT_DATA;
+		v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter);
+
+		ctx->write(&v, sizeof(v), ctx);
+		ctx->write(sk->sk_filter->insns, v.cpt_size, ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
+		cpt_push_object(&saved_obj, ctx);
+		cpt_dump_mcfilter(sk, ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+	if (sk->sk_family == AF_PACKET) {
+		struct cpt_sock_packet_image v;
+
+		memset(&v, 0, sizeof(v));
+
+		cpt_push_object(&saved_obj, ctx);
+		cpt_open_object(NULL, ctx);
+
+		v.cpt_next = CPT_NULL;
+		v.cpt_object = CPT_OBJ_SOCK_PACKET;
+		v.cpt_hdrlen = sizeof(v);
+		v.cpt_content = CPT_CONTENT_ARRAY;
+		sock_packet_cpt_attr(sk, &v);
+
+		ctx->write(&v, sizeof(v), ctx);
+		cpt_dump_sock_packet_mclist(sk, ctx);
+
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+}
+
+static int cpt_dump_unix_mount(struct sock *sk, struct cpt_sock_image *v,
+		cpt_context_t *ctx)
+{
+	cpt_object_t *mntobj;
+
+	mntobj = cpt_lookup_vfsmount_obj(unix_sk(sk)->mnt, ctx);
+	if (mntobj == NULL) {
+		eprintk_ctx("can't get unix vfsmount\n");
+		return -EINVAL;
+	}
+
+	v->cpt_vfsmount_ref = mntobj->o_pos;
+	return 0;
+}
+
+static int cpt_dump_unix_socket(struct sock *sk, struct cpt_sock_image *v, cpt_context_t *ctx)
+{
+	v->cpt_vfsmount_ref = CPT_NULL;
+	v->cpt_i_uid = -1;
+	v->cpt_i_gid = -1;
+
+	if (unix_sk(sk)->dentry) {
+		struct dentry *d = unix_sk(sk)->dentry;
+		v->cpt_i_uid = d->d_inode->i_uid;
+		v->cpt_i_gid = d->d_inode->i_gid;
+
+		if (IS_ROOT(d) || !d_unhashed(d)) {
+			int err = 0;
+			struct path p = {unix_sk(sk)->mnt, d};
+			char *path;
+			unsigned long pg = __get_free_page(GFP_KERNEL);
+
+			if (!pg)
+				return -ENOMEM;
+
+			path = d_path(&p, (char *)pg, PAGE_SIZE);
+
+			if (!IS_ERR(path)) {
+				int len = strlen(path);
+				if (len < 126) {
+					strcpy(((char*)v->cpt_laddr)+2, path); 
+					v->cpt_laddrlen = len + 2;
+				} else {
+					wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2);
+				}
+				if (cpt_need_delayfs(unix_sk(sk)->mnt))
+					v->cpt_sockflags |= CPT_SOCK_DELAYED;
+
+				v->cpt_i_mode = d->d_inode->i_mode & S_IALLUGO;
+
+				err = cpt_dump_unix_mount(sk, v, ctx);
+			} else {
+				eprintk_ctx("cannot get path of an af_unix socket\n");
+				err = PTR_ERR(path);
+			}
+			free_page(pg);
+			if (err)
+				return err;
+		} else
+			v->cpt_sockflags |= CPT_SOCK_DELETED;
+	}
+
+	/* If the socket is connected, find its peer. If the peer is not
+	 * in our table, the socket is connected to an external process
+	 * and we consider it disconnected.
+	 */
+	if (unix_peer(sk)) {
+		cpt_object_t *pobj;
+		pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx);
+		if (pobj)
+			v->cpt_peer = pobj->o_index;
+		else
+			v->cpt_shutdown = SHUTDOWN_MASK;
+
+		/*
+		 * There could be a situation where a socket is connected
+		 * to itself. Stupid, but valid.
+		 * Let's not mix it up with socket pairs...
+		 */
+		if (unix_peer(sk) != sk && unix_peer(unix_peer(sk)) == sk)
+			v->cpt_socketpair = 1;
+	}
+
+	/* If the socket shares its address with another socket, it is a
+	 * child of some listening socket. Find and record it. */
+	if (unix_sk(sk)->addr &&
+			atomic_read(&unix_sk(sk)->addr->refcnt) > 1 &&
+			sk->sk_state != TCP_LISTEN) {
+		cpt_object_t *pobj;
+		for_each_object(pobj, CPT_OBJ_SOCKET) {
+			struct sock *psk = pobj->o_obj;
+			if (psk->sk_family == AF_UNIX &&
+					psk->sk_state == TCP_LISTEN &&
+					unix_sk(psk)->addr == unix_sk(sk)->addr) {
+				v->cpt_parent = pobj->o_index;
+				break;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* Dump socket content */
+
+int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx)
+{
+	struct cpt_sock_image *v = cpt_get_buf(ctx);
+	struct socket *sock;
+	struct timeval tmptv;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_SOCKET;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_file = CPT_NULL;
+	sock = sk->sk_socket;
+	if (sock && sock->file) {
+		cpt_object_t *tobj;
+		tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, ctx);
+		if (tobj)
+			v->cpt_file = tobj->o_pos;
+	}
+	v->cpt_index = index;
+	v->cpt_parent = parent;
+
+	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
+		if (sock && !obj->o_lock) {
+			lockdep_off();
+			lock_sock(sk);
+			lockdep_on();
+			obj->o_lock = 1;
+		}
+	}
+
+	/* Some bits stored in inode */
+	v->cpt_ssflags = sock ? sock->flags : 0;
+	v->cpt_sstate = sock ? sock->state : 0;
+	v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0;
+
+	/* Common data */
+	v->cpt_family = sk->sk_family;
+	v->cpt_type = sk->sk_type;
+	v->cpt_state = sk->sk_state;
+	v->cpt_reuse = sk->sk_reuse;
+	v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED);
+	v->cpt_shutdown = sk->sk_shutdown;
+	v->cpt_userlocks = sk->sk_userlocks;
+	v->cpt_no_check = sk->sk_no_check;
+	v->cpt_debug = sock_flag(sk, SOCK_DBG); /* cpt_zapped already set above */
+	v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP);
+	v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE);
+	v->cpt_protocol = sk->sk_protocol;
+	v->cpt_err = sk->sk_err;
+	v->cpt_err_soft = sk->sk_err_soft;
+	v->cpt_max_ack_backlog = sk->sk_max_ack_backlog;
+	v->cpt_priority = sk->sk_priority;
+	v->cpt_rcvlowat = sk->sk_rcvlowat;
+	v->cpt_rcvtimeo = CPT_NULL;
+	if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT)
+		v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo;
+	v->cpt_sndtimeo = CPT_NULL;
+	if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT)
+		v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo;
+	v->cpt_rcvbuf = sk->sk_rcvbuf;
+	v->cpt_sndbuf = sk->sk_sndbuf;
+	v->cpt_bound_dev_if = sk->sk_bound_dev_if;
+	v->cpt_flags = sk->sk_flags;
+	v->cpt_lingertime = CPT_NULL;
+	if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT)
+		v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? INT_MAX : sk->sk_lingertime;
+	v->cpt_peer_pid = cpt_pid_nr(sk_extended(sk)->sk_peer_pid);
+	v->cpt_peer_uid = sk_extended(sk)->sk_peer_cred ? sk_extended(sk)->sk_peer_cred->euid : -1;
+	v->cpt_peer_gid = sk_extended(sk)->sk_peer_cred ? sk_extended(sk)->sk_peer_cred->egid : -1;
+	tmptv = ktime_to_timeval(sk->sk_stamp);
+	v->cpt_stamp = cpt_timeval_export(&tmptv);
+
+	v->cpt_peer = -1;
+	v->cpt_socketpair = 0;
+	v->cpt_sockflags = 0;
+
+	v->cpt_laddrlen = 0;
+	if (sock) {
+		int alen = sizeof(v->cpt_laddr);
+		int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+		v->cpt_laddrlen = alen;
+	}
+	v->cpt_raddrlen = 0;
+	if (sock) {
+		int alen = sizeof(v->cpt_raddr);
+		int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2);
+		if (!err)
+			v->cpt_raddrlen = alen;
+	}
+
+	if (sk->sk_family == AF_UNIX) {
+		int err;
+
+		err = cpt_dump_unix_socket(sk, v, ctx);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+	}
+
+	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
+		cpt_dump_socket_in(v, sk, ctx);
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_dump_sock_attr(sk, ctx);
+
+	dump_rqueue(index, sk, ctx);
+	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
+		dump_wqueue(index, sk, ctx);
+		cpt_dump_ofo_queue(index, sk, ctx);
+	}
+
+	if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
+	    && sk->sk_state == TCP_LISTEN)
+		cpt_dump_synwait_queue(sk, index, ctx);
+
+	cpt_close_object(ctx);
+
+	if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
+	    && sk->sk_state == TCP_LISTEN) {
+		int err = cpt_dump_accept_queue(sk, index, ctx);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+int cpt_dump_orphaned_sockets(struct cpt_context *ctx)
+{
+	int i, err = 0;
+
+	cpt_open_section(ctx, CPT_SECT_ORPHANS);
+
+	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+		struct sock *sk;
+		struct hlist_nulls_node *node;
+		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i);
+retry:
+		spin_lock_bh(lock);
+		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) {
+
+			if (sk->owner_env != get_exec_env())
+				continue;
+			if (sk->sk_socket)
+				continue;
+			if (!sock_flag(sk, SOCK_DEAD))
+				continue;
+			if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx))
+				continue;
+			sock_hold(sk);
+			spin_unlock_bh(lock);
+
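+			/*
+			 * Mark the socket as owned by a user context, so
+			 * softirq paths queue incoming packets to the
+			 * backlog while the socket is being dumped.
+			 */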
+			local_bh_disable();
+			bh_lock_sock(sk);
+			if (sock_owned_by_user(sk))
+				eprintk_ctx("BUG: sk locked by whom?\n");
+			sk->sk_lock.owned = 1;
+			bh_unlock_sock(sk);
+			local_bh_enable();
+
+			err = cpt_dump_socket(NULL, sk, -1, -1, ctx);
+
+			local_bh_disable();
+			bh_lock_sock(sk);
+			sk->sk_lock.owned = 0;
+			clear_backlog(sk);
+			tcp_done(sk);
+			bh_unlock_sock(sk);
+			local_bh_enable();
+			sock_put(sk);
+
+			if (err)
+				return err;
+
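+			/*
+			 * The bucket lock was dropped while dumping, so
+			 * the chain may have changed under us: rescan it
+			 * from the head.
+			 */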
+			goto retry;
+		}
+		spin_unlock_bh(lock);
+	}
+	cpt_close_section(ctx);
+	return err;
+}
+
+static int can_dump(struct sock *sk, cpt_context_t *ctx)
+{
+	switch (sk->sk_family) {
+	case AF_NETLINK:
+		if (((struct netlink_sock *)sk)->cb) {
+			eprintk_ctx("netlink socket has active callback\n");
+			return 0;
+		}
+		break;
+	}
+	return 1;
+}
+
+/* We are not going to block suspend when we have external AF_UNIX connections.
+ * But we cannot stop the feed of new packets/connections to our
+ * environment from outside. Taking into account that it is intrinsically
+ * unreliable, we collect some amount of data, but when checkpointing/
+ * restoring we drop everything that does not make sense to keep: skbs
+ * sent by outside processes, connections from outside, etc.
+ */
+
+/* The first pass. When we see a socket referenced by a file, we just
+ * add it to the socket table. */
+int cpt_collect_socket(struct file *file, cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+	struct socket *sock;
+	struct sock *sk;
+
+	if (!S_ISSOCK(file->f_dentry->d_inode->i_mode))
+		return -ENOTSOCK;
+	sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket;
+	sk = sock->sk;
+	if (!can_dump(sk, ctx))
+		return -EAGAIN;
+	if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL)
+		return -ENOMEM;
+	obj->o_parent = file;
+
+	return 0;
+}
+
+/*
+ * We should end up with a table containing:
+ *  * all sockets opened by our processes;
+ *  * all the sockets queued in the listening queues of _our_ listening
+ *    sockets which are connected to our opened sockets.
+ */
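+/*
+ * Illustrative example: a listening socket L opened by one of our
+ * processes, a client S connected to L by another of our processes,
+ * and a not-yet-accepted child C sitting in L's queue. Pass one finds
+ * L and S through their files; collect_one_unix_listening_sock()
+ * below then adds C, because C's peer S is already in the table.
+ */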
+
+static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx)
+{
+	struct sock *sk = obj->o_obj;
+	cpt_object_t *cobj;
+	struct sk_buff *skb;
+
+	skb = skb_peek(&sk->sk_receive_queue);
+	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
+		struct sock *lsk = skb->sk;
+		if (unix_peer(lsk) &&
+		    lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) {
+			if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL)
+				return -ENOMEM;
+			cobj->o_parent = obj->o_parent;
+		}
+		spin_lock_irq(&sk->sk_receive_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&sk->sk_receive_queue.lock);
+	}
+
+	return 0;
+}
+
+int cpt_index_sockets(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+	unsigned long index = 0;
+
+	/* Collect not-yet-accepted children of listening sockets. */
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+
+		if (sk->sk_state != TCP_LISTEN)
+			continue;
+
+		if (sk->sk_family == AF_UNIX)
+			collect_one_unix_listening_sock(obj, ctx);
+	}
+
+	/* Assign indices to all the sockets. */
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+		cpt_obj_setindex(obj, index++, ctx);
+
+		if (sk->sk_socket && sk->sk_socket->file) {
+			cpt_object_t *tobj;
+			tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx);
+			if (tobj)
+				cpt_obj_setindex(tobj, obj->o_index, ctx);
+		}
+	}
+
+	return 0;
+}
+
+void cpt_unlock_sockets(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+
+	lockdep_off();
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+		if (sk && obj->o_lock) {
+			if (sk->sk_socket)
+				release_sock(sk);
+		}
+	}
+	lockdep_on();
+}
+
+void cpt_kill_sockets(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+		if (sk && obj->o_lock) {
+			struct ve_struct *old_env;
+			old_env = set_exec_env(sk->owner_env);
+			cpt_kill_socket(sk, ctx);
+			if (sk->sk_socket)
+				release_sock_nobacklog(sk);
+			set_exec_env(old_env);
+		}
+	}
+}
+
+__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx)
+{
+	struct fasync_struct *fa;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct socket *sock;
+
+	sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+
+	for (fa = sock->fasync_list; fa; fa = fa->fa_next) {
+		if (fa->fa_file == file)
+			return fa->fa_fd;
+	}
+	return -1;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_socket.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_socket.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_socket.h	2015-01-21 12:02:48.227093553 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_socket.h	2015-01-21 12:02:50.860023656 +0300
@@ -0,0 +1,61 @@
+struct sock;
+
+int cpt_collect_passedfds(cpt_context_t *);
+int cpt_index_sockets(cpt_context_t *);
+int cpt_collect_socket(struct file *, cpt_context_t *);
+int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx);
+int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx);
+int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx);
+int rst_sockets(struct cpt_context *ctx);
+int rst_sockets_complete(struct cpt_context *ctx);
+void rst_rollback_sockets(struct cpt_context *ctx);
+int cpt_dump_orphaned_sockets(struct cpt_context *ctx);
+
+int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx);
+struct sk_buff * rst_skb(struct sock *sk, loff_t *pos_p, __u32 *owner,
+			 __u32 *queue, struct cpt_context *ctx);
+
+void cpt_unlock_sockets(cpt_context_t *);
+void cpt_kill_sockets(cpt_context_t *);
+
+
+int cpt_kill_socket(struct sock *, cpt_context_t *);
+int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*);
+int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx);
+int rst_listen_socket_in(struct sock *sk, struct cpt_sock_image *si,
+			 loff_t pos, struct cpt_context *ctx);
+__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx);
+int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *);
+int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx);
+int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx);
+int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct sock *sk,
+		 struct cpt_context *ctx);
+int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx);
+
+int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v,
+		       loff_t pos, cpt_context_t *ctx);
+int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v,
+			loff_t pos, cpt_context_t *ctx);
+
+int rst_delay_unix_bind(struct sock *,
+			struct cpt_sock_image *, cpt_context_t *);
+
+struct unix_bind_info {
+	struct sock *sk;
+	char path[128];
+	int path_off;
+	u32 uid, gid;
+	umode_t i_mode;
+	struct unix_bind_info *next;
+};
+
+int rebind_unix_socket(struct vfsmount *rmnt, struct unix_bind_info *bii,
+			int flags);
+void rst_put_delayed_sockets(cpt_context_t *);
+
+void sock_packet_cpt_attr(struct sock *sk, struct cpt_sock_packet_image *v);
+int sock_packet_rst_attr(struct sock *sk, struct cpt_sock_packet_image *v);
+void *sock_packet_cpt_one_mc(struct sock *sk,
+		struct cpt_sock_packet_mc_image *mi, void *prev);
+int sock_packet_rst_one_mc(struct sock *sk,
+		struct cpt_sock_packet_mc_image *mi);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_socket_in.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_socket_in.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_socket_in.c	2015-01-21 12:02:48.227093553 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_socket_in.c	2015-01-21 12:02:50.593030744 +0300
@@ -0,0 +1,458 @@
+/*
+ *
+ *  kernel/cpt/cpt_socket_in.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/if_inet6.h>
+#include <linux/igmp.h>
+#include <linux/ipv6.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_socket.h"
+#include "cpt_kernel.h"
+
+static inline __u32 jiffies_export(unsigned long tmo)
+{
+	__s32 delta = (long)(tmo - jiffies);
+	return delta;
+}
+
+static inline __u32 tcp_jiffies_export(__u32 tmo)
+{
+	__s32 delta = tmo - tcp_time_stamp;
+	return delta;
+}
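+
+/*
+ * Both helpers export timeouts as signed deltas from "now" (jiffies or
+ * tcp_time_stamp) rather than as absolute values, so the timers can be
+ * re-armed on restore under a different clock origin; an already
+ * expired timeout simply comes out negative.
+ */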
+
+int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx)
+{
+	struct sk_buff *skb;
+	struct tcp_sock *tp;
+
+	if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP)
+		return 0;
+
+	tp = tcp_sk(sk);
+
+	skb = skb_peek(&tp->out_of_order_queue);
+	while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) {
+		int err;
+
+		err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, sk, ctx);
+		if (err)
+			return err;
+
+		spin_lock_irq(&tp->out_of_order_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&tp->out_of_order_queue.lock);
+	}
+	return 0;
+}
+
+static inline int sk_ipv6_mapped(struct sock *sk)
+{
+	const struct inet_connection_sock_af_ops *ops;
+
+	ops = inet_csk(sk)->icsk_af_ops;
+
+	BUILD_BUG_ON(sizeof(struct iphdr) == sizeof(struct ipv6hdr));
+	BUILD_BUG_ON(sizeof(struct sockaddr_in) == sizeof(struct sockaddr_in6));
+
+	return sk->sk_family == AF_INET6 &&
+		ops->net_header_len == sizeof(struct iphdr) &&
+		ops->sockaddr_len == sizeof(struct sockaddr_in6);
+}
+
+static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk,
+			       struct cpt_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	si->cpt_pred_flags = tp->pred_flags;
+	si->cpt_rcv_nxt = tp->rcv_nxt;
+	si->cpt_snd_nxt = tp->snd_nxt;
+	si->cpt_snd_una = tp->snd_una;
+	si->cpt_snd_sml = tp->snd_sml;
+	si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp);
+	si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime);
+	si->cpt_tcp_header_len = tp->tcp_header_len;
+	si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending;
+	si->cpt_quick = inet_csk(sk)->icsk_ack.quick;
+	si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong;
+	si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked;
+	si->cpt_ato = inet_csk(sk)->icsk_ack.ato;
+	si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout);
+	si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime);
+	si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size;
+	si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss;
+	si->cpt_snd_wl1 = tp->snd_wl1;
+	si->cpt_snd_wnd = tp->snd_wnd;
+	si->cpt_max_window = tp->max_window;
+	si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie;
+	si->cpt_mss_cache = tp->mss_cache;
+	si->cpt_mss_cache_std = tp->mss_cache; /* FIXME: was tp->mss_cache_std */
+	si->cpt_mss_clamp = tp->rx_opt.mss_clamp;
+	si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len;
+	si->cpt_ext2_header_len = 0;
+	si->cpt_ca_state = inet_csk(sk)->icsk_ca_state;
+	si->cpt_retransmits = inet_csk(sk)->icsk_retransmits;
+	si->cpt_reordering = tp->reordering;
+	si->cpt_frto_counter = tp->frto_counter;
+	si->cpt_frto_highmark = tp->frto_highmark;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+	// // si->cpt_adv_cong = tp->adv_cong;
+#endif
+	si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept;
+	si->cpt_backoff = inet_csk(sk)->icsk_backoff;
+	si->cpt_srtt = tp->srtt;
+	si->cpt_mdev = tp->mdev;
+	si->cpt_mdev_max = tp->mdev_max;
+	si->cpt_rttvar = tp->rttvar;
+	si->cpt_rtt_seq = tp->rtt_seq;
+	si->cpt_rto = inet_csk(sk)->icsk_rto;
+	si->cpt_packets_out = tp->packets_out;
+	si->cpt_left_out = tp->sacked_out + tp->lost_out;
+	si->cpt_retrans_out = tp->retrans_out;
+	si->cpt_lost_out = tp->lost_out;
+	si->cpt_sacked_out = tp->sacked_out;
+	si->cpt_fackets_out = tp->fackets_out;
+	si->cpt_snd_ssthresh = tp->snd_ssthresh;
+	si->cpt_snd_cwnd = tp->snd_cwnd;
+	si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt;
+	si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp;
+	si->cpt_snd_cwnd_used = tp->snd_cwnd_used;
+	si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp);
+	si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout);
+	si->cpt_ka_timeout = 0;
+	si->cpt_rcv_wnd = tp->rcv_wnd;
+	si->cpt_rcv_wup = tp->rcv_wup;
+	si->cpt_write_seq = tp->write_seq;
+	si->cpt_pushed_seq = tp->pushed_seq;
+	si->cpt_copied_seq = tp->copied_seq;
+	si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok;
+	si->cpt_wscale_ok = tp->rx_opt.wscale_ok;
+	si->cpt_sack_ok = tp->rx_opt.sack_ok;
+	si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp;
+	si->cpt_snd_wscale = tp->rx_opt.snd_wscale;
+	si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale;
+	si->cpt_nonagle = tp->nonagle;
+	si->cpt_keepalive_probes = tp->keepalive_probes;
+	si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval;
+	si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr;
+	si->cpt_ts_recent = tp->rx_opt.ts_recent;
+	si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+	si->cpt_user_mss = tp->rx_opt.user_mss;
+	si->cpt_dsack = tp->rx_opt.dsack;
+	si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq;
+	si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq;
+	si->cpt_sack_array[2] = tp->selective_acks[0].start_seq;
+	si->cpt_sack_array[3] = tp->selective_acks[0].end_seq;
+	si->cpt_sack_array[4] = tp->selective_acks[1].start_seq;
+	si->cpt_sack_array[5] = tp->selective_acks[1].end_seq;
+	si->cpt_sack_array[6] = tp->selective_acks[2].start_seq;
+	si->cpt_sack_array[7] = tp->selective_acks[2].end_seq;
+	si->cpt_sack_array[8] = tp->selective_acks[3].start_seq;
+	si->cpt_sack_array[9] = tp->selective_acks[3].end_seq;
+	si->cpt_window_clamp = tp->window_clamp;
+	si->cpt_rcv_ssthresh = tp->rcv_ssthresh;
+	si->cpt_probes_out = inet_csk(sk)->icsk_probes_out;
+	si->cpt_num_sacks = tp->rx_opt.num_sacks;
+	si->cpt_advmss = tp->advmss;
+	si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries;
+	si->cpt_ecn_flags = tp->ecn_flags;
+	si->cpt_prior_ssthresh = tp->prior_ssthresh;
+	si->cpt_high_seq = tp->high_seq;
+	si->cpt_retrans_stamp = tp->retrans_stamp;
+	si->cpt_undo_marker = tp->undo_marker;
+	si->cpt_undo_retrans = tp->undo_retrans;
+	si->cpt_urg_seq = tp->urg_seq;
+	si->cpt_urg_data = tp->urg_data;
+	si->cpt_pending = inet_csk(sk)->icsk_pending;
+	si->cpt_snd_up = tp->snd_up;
+	si->cpt_keepalive_time = tp->keepalive_time;
+	si->cpt_keepalive_intvl = tp->keepalive_intvl;
+	si->cpt_linger2 = tp->linger2;
+
+	if (sk->sk_state != TCP_LISTEN &&
+	    sk->sk_state != TCP_CLOSE &&
+	    sock_flag(sk, SOCK_KEEPOPEN)) {
+		si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires);
+	}
+
+	if (sk_ipv6_mapped(sk))
+		si->cpt_mapped = 1;
+	return 0;
+}
+
+
+int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk,
+		       struct cpt_context *ctx)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	if (sk->sk_family == AF_INET) {
+		struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr);
+		sin->sin_family = AF_INET;
+		sin->sin_port = inet->sport;
+		sin->sin_addr.s_addr = inet->rcv_saddr;
+		si->cpt_laddrlen = sizeof(*sin);
+	} else if (sk->sk_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr);
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = inet->sport;
+		memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16);
+		si->cpt_laddrlen = sizeof(*sin6);
+	}
+	if (!inet->num)
+		si->cpt_laddrlen = 0;
+
+	si->cpt_daddr = inet->daddr;
+	si->cpt_dport = inet->dport;
+	si->cpt_saddr = inet->saddr;
+	si->cpt_rcv_saddr = inet->rcv_saddr;
+	si->cpt_sport = inet->sport;
+	si->cpt_uc_ttl = inet->uc_ttl;
+	si->cpt_tos = inet->tos;
+	si->cpt_cmsg_flags = inet->cmsg_flags;
+	si->cpt_mc_index = inet->mc_index;
+	si->cpt_mc_addr = inet->mc_addr;
+	si->cpt_hdrincl = inet->hdrincl;
+	si->cpt_mc_ttl = inet->mc_ttl;
+	si->cpt_mc_loop = inet->mc_loop;
+	si->cpt_pmtudisc = inet->pmtudisc;
+	si->cpt_recverr = inet->recverr;
+	si->cpt_freebind = inet->freebind;
+	si->cpt_idcounter = inet->id;
+
+	si->cpt_cork_flags = inet->cork.flags;
+	si->cpt_cork_fragsize = 0;
+	si->cpt_cork_length = inet->cork.length;
+	si->cpt_cork_addr = inet->cork.addr;
+	si->cpt_cork_saddr = inet->cork.fl.fl4_src;
+	si->cpt_cork_daddr = inet->cork.fl.fl4_dst;
+	si->cpt_cork_oif = inet->cork.fl.oif;
+	if (inet->cork.dst) {
+		struct rtable *rt = (struct rtable *)inet->cork.dst;
+		si->cpt_cork_fragsize = inet->cork.fragsize;
+		si->cpt_cork_saddr = rt->fl.fl4_src;
+		si->cpt_cork_daddr = rt->fl.fl4_dst;
+		si->cpt_cork_oif = rt->fl.oif;
+	}
+
+	if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) {
+		struct udp_sock *up = udp_sk(sk);
+		si->cpt_udp_pending  = up->pending;
+		si->cpt_udp_corkflag  = up->corkflag;
+		si->cpt_udp_encap  = up->encap_type;
+		si->cpt_udp_len  = up->len;
+	}
+
+	if (sk->sk_family == AF_INET6) {
+		memcpy(si->cpt_saddr6, &np->saddr, 16);
+		memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16);
+		memcpy(si->cpt_daddr6, &np->daddr, 16);
+		si->cpt_flow_label6 = np->flow_label;
+		si->cpt_frag_size6 = np->frag_size;
+		si->cpt_hop_limit6 = np->hop_limit;
+		si->cpt_mcast_hops6 = np->mcast_hops;
+		si->cpt_mcast_oif6 = np->mcast_oif;
+		si->cpt_rxopt6 = np->rxopt.all;
+		si->cpt_mc_loop6 = np->mc_loop;
+		si->cpt_recverr6 = np->recverr;
+		si->cpt_sndflow6 = np->sndflow;
+		si->cpt_pmtudisc6 = np->pmtudisc;
+		si->cpt_ipv6only6 = np->ipv6only;
+		si->cpt_mapped = 0;
+	}
+
+	if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
+		cpt_dump_socket_tcp(si, sk, ctx);
+
+	return 0;
+}
+
+int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx)
+{
+	struct request_sock *req;
+
+	for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next) {
+		int err = cpt_dump_socket(NULL, req->sk, -1, index, ctx);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+
+static int dump_openreq(struct request_sock *req, struct sock *sk, int index,
+			struct cpt_context *ctx)
+{
+	struct cpt_openreq_image *v = cpt_get_buf(ctx);
+
+	cpt_open_object(NULL, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_OPENREQ;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn;
+	v->cpt_snt_isn = tcp_rsk(req)->snt_isn;
+	v->cpt_rmt_port = inet_rsk(req)->rmt_port;
+	v->cpt_mss = req->mss;
+	v->cpt_family = req->rsk_ops->family;
+	v->cpt_retrans = req->retrans;
+	v->cpt_snd_wscale = inet_rsk(req)->snd_wscale;
+	v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale;
+	v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok;
+	v->cpt_sack_ok = inet_rsk(req)->sack_ok;
+	v->cpt_wscale_ok = inet_rsk(req)->wscale_ok;
+	v->cpt_ecn_ok = inet_rsk(req)->ecn_ok;
+	v->cpt_acked = inet_rsk(req)->acked;
+	v->cpt_window_clamp = req->window_clamp;
+	v->cpt_rcv_wnd = req->rcv_wnd;
+	v->cpt_ts_recent = req->ts_recent;
+	v->cpt_expires = jiffies_export(req->expires);
+
+	if (v->cpt_family == AF_INET) {
+		memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4);
+		memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4);
+	} else {
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16);
+		memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16);
+		v->cpt_iif = inet6_rsk(req)->iif;
+#endif
+	}
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx)
+{
+	struct inet_connection_sock *icsk;
+	struct listen_sock *lopt;
+	struct request_sock *req;
+	int nr_entries;
+	int i;
+
+	icsk = inet_csk(sk);
+	lopt = icsk->icsk_accept_queue.listen_opt;
+	nr_entries = icsk->icsk_accept_queue.listen_opt->nr_table_entries;
+
+	for (i=0; i < nr_entries; i++) {
+		for (req=lopt->syn_table[i]; req; req=req->dl_next) {
+			loff_t saved_obj;
+			cpt_push_object(&saved_obj, ctx);
+			dump_openreq(req, sk, index, ctx);
+			cpt_pop_object(&saved_obj, ctx);
+		}
+	}
+	return 0;
+}
+
+
+int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx)
+{
+	if (sk->sk_state != TCP_CLOSE &&
+	    (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) &&
+	    sk->sk_protocol == IPPROTO_TCP) {
+		if (sk->sk_state != TCP_LISTEN)
+			tcp_set_state(sk, TCP_CLOSE);
+		else
+			sk->sk_prot->disconnect(sk, 0);
+	}
+	return 0;
+}
+
+int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *iml;
+
+	for (iml = inet->mc_list; iml; iml = iml->next) {
+		struct cpt_sockmc_image smi;
+		int scnt = 0;
+		int i;
+
+		if (iml->sflist)
+			scnt = iml->sflist->sl_count*16;
+
+		smi.cpt_next = sizeof(smi) + scnt;
+		smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
+		smi.cpt_hdrlen = sizeof(smi);
+		smi.cpt_content = CPT_CONTENT_DATA;
+
+		smi.cpt_family = AF_INET;
+		smi.cpt_mode = iml->sfmode;
+		smi.cpt_ifindex = iml->multi.imr_ifindex;
+		memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr));
+		smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr;
+
+		ctx->write(&smi, sizeof(smi), ctx);
+
+		/* scnt counts bytes, one 16-byte slot per source */
+		for (i = 0; i < scnt / 16; i++) {
+			u32 addr[4];
+			memset(&addr, 0, sizeof(addr));
+			addr[0] = iml->sflist->sl_addr[i];
+			ctx->write(&addr, sizeof(addr), ctx);
+		}
+	}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_mc_socklist *mcl;
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) {
+			struct cpt_sockmc_image smi;
+			int scnt = 0;
+			int i;
+
+			if (mcl->sflist)
+				scnt = mcl->sflist->sl_count*16;
+
+			smi.cpt_next = sizeof(smi) + scnt;
+			smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
+			smi.cpt_hdrlen = sizeof(smi);
+			smi.cpt_content = CPT_CONTENT_DATA;
+
+			smi.cpt_family = AF_INET6;
+			smi.cpt_mode = mcl->sfmode;
+			smi.cpt_ifindex = mcl->ifindex;
+			memcpy(&smi.cpt_mcaddr, &mcl->addr, sizeof(smi.cpt_mcaddr));
+
+			ctx->write(&smi, sizeof(smi), ctx);
+			/* scnt counts bytes, one 16-byte address per source */
+			for (i = 0; i < scnt / 16; i++)
+				ctx->write(&mcl->sflist->sl_addr[i], 16, ctx);
+		}
+	}
+#endif
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_syscalls.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_syscalls.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_syscalls.h	2015-01-21 12:02:48.228093526 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_syscalls.h	2015-01-21 12:02:50.328037778 +0300
@@ -0,0 +1,99 @@
+#include <linux/unistd.h>
+#include <linux/syscalls.h>
+#include <linux/fs.h>
+#include <asm/uaccess.h>
+
+#define WRAP(c, args) return sys_##c args
+#define WRAP2(c, args) int err; mm_segment_t oldfs; \
+	               oldfs = get_fs(); set_fs(KERNEL_DS); \
+                       err = sys_##c args ;\
+                       set_fs(oldfs); \
+                       return err
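+
+/*
+ * WRAP2() is for syscalls that take __user pointers: set_fs(KERNEL_DS)
+ * temporarily widens the address-space limit so that such calls accept
+ * kernel buffers. A minimal usage sketch (kernel buffer, not __user):
+ *
+ *	char name[] = "/tmp/x";
+ *	int err = sc_unlink(name);
+ */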
+
+static inline int sc_close(int fd)
+{
+	WRAP(close, (fd));
+}
+
+static inline int sc_dup2(int fd1, int fd2)
+{
+	WRAP(dup2, (fd1, fd2));
+}
+
+static inline int sc_unlink(char *name)
+{
+	WRAP2(unlink, (name));
+}
+
+static inline int sc_pipe(int *pfd)
+{
+	return do_pipe_flags(pfd, 0);
+}
+
+static inline int sc_mknod(char *name, int mode, int dev)
+{
+	WRAP2(mknod, (name, mode, dev));
+}
+
+static inline int sc_chmod(char *name, int mode)
+{
+	WRAP2(chmod, (name, mode));
+}
+
+static inline int sc_chown(char *name, int uid, int gid)
+{
+	WRAP2(chown, (name, uid, gid));
+}
+
+static inline int sc_mkdir(char *name, int mode)
+{
+	WRAP2(mkdir, (name, mode));
+}
+
+static inline int sc_rmdir(char *name)
+{
+	WRAP2(rmdir, (name));
+}
+
+static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags)
+{
+	WRAP2(mount, (mntdev ? : "none", mntpnt, type, flags, NULL));
+}
+
+static inline int sc_mprotect(unsigned long start, size_t len,
+			      unsigned long prot)
+{
+	WRAP(mprotect, (start, len, prot));
+}
+
+static inline int sc_mlock(unsigned long start, size_t len)
+{
+	WRAP(mlock, (start, len));
+}
+
+static inline int sc_munlock(unsigned long start, size_t len)
+{
+	WRAP(munlock, (start, len));
+}
+
+static inline int sc_remap_file_pages(unsigned long start, size_t len,
+				      unsigned long prot, unsigned long pgoff,
+				      unsigned long flags)
+{
+	WRAP(remap_file_pages, (start, len, prot, pgoff, flags));
+}
+
+static inline int sc_waitx(int pid, int opt, int *stat_addr)
+{
+	WRAP(wait4, (pid, stat_addr, opt, NULL));
+}
+
+static inline int sc_flock(int fd, int flags)
+{
+	WRAP(flock, (fd, flags));
+}
+
+static inline int sc_open(char* path, int flags, int mode)
+{
+	WRAP(open, (path, flags, mode));
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_sysvipc.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_sysvipc.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_sysvipc.c	2015-01-21 12:02:48.228093526 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_sysvipc.c	2015-01-21 12:02:49.871049910 +0300
@@ -0,0 +1,423 @@
+/*
+ *
+ *  kernel/cpt/cpt_sysvipc.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/shm.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_image.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_kernel.h"
+
+struct _warg {
+	struct file			*file;
+	struct cpt_sysvshm_image	*v;
+};
+
+static int dump_one_shm(struct shmid_kernel *shp, void *arg)
+{
+	struct _warg *warg = arg;
+	struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v;
+
+	if (shp->shm_file != warg->file)
+		return 0;
+
+	v->cpt_key = shp->shm_perm.key;
+	v->cpt_uid = shp->shm_perm.uid;
+	v->cpt_gid = shp->shm_perm.gid;
+	v->cpt_cuid = shp->shm_perm.cuid;
+	v->cpt_cgid = shp->shm_perm.cgid;
+	v->cpt_mode = shp->shm_perm.mode;
+	v->cpt_seq = shp->shm_perm.seq;
+
+	v->cpt_id = shp->shm_perm.id;
+	v->cpt_segsz = shp->shm_segsz;
+	v->cpt_atime = shp->shm_atim;
+	v->cpt_ctime = shp->shm_ctim;
+	v->cpt_dtime = shp->shm_dtim;
+	v->cpt_creator = shp->shm_cprid;
+	v->cpt_last = shp->shm_lprid;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+	v->cpt_mlockuser = shp->mlock_user ? shp->mlock_user->uid : -1;
+#else
+	v->cpt_mlockuser = -1;
+#endif
+	return 1;
+}
+
+int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx)
+{
+	struct cpt_sysvshm_image *v = cpt_get_buf(ctx);
+	struct _warg warg;
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_SYSV_SHM;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	warg.file = file;
+	warg.v = v;
+	if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) {
+		cpt_release_buf(ctx);
+		return -ESRCH;
+	}
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+	return 0;
+}
+
+
+int match_sem(int id, struct sem_array *sema, void *arg)
+{
+	if (id != (unsigned long)arg)
+		return 0;
+	return sema->sem_nsems + 1;
+}
+
+static int get_sem_nsem(int id, cpt_context_t *ctx)
+{
+	int res;
+	res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id);
+	if (res > 0)
+		return res - 1;
+	eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id);
+	return -ESRCH;
+}
+
+static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx)
+{
+	struct cpt_sysvsem_undo_image v;
+	loff_t saved_obj;
+
+	cpt_open_object(NULL, ctx);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_SEMUNDO;
+	v.cpt_id = su->semid;
+	v.cpt_nsem = get_sem_nsem(su->semid, ctx);
+	if ((int)v.cpt_nsem < 0)
+		return -ESRCH;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx);
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+struct sem_warg {
+	int				last_id;
+	struct cpt_sysvsem_image	*v;
+};
+
+static int dump_one_sem(int id, struct sem_array *sma, void *arg)
+{
+	struct sem_warg * warg = (struct sem_warg *)arg;
+	struct cpt_sysvsem_image *v = warg->v;
+	int i;
+
+	if (warg->last_id != -1) {
+		if ((id % IPCMNI) <= warg->last_id)
+			return 0;
+	}
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_SYSV_SEM;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_SEMARRAY;
+
+	v->cpt_key = sma->sem_perm.key;
+	v->cpt_uid = sma->sem_perm.uid;
+	v->cpt_gid = sma->sem_perm.gid;
+	v->cpt_cuid = sma->sem_perm.cuid;
+	v->cpt_cgid = sma->sem_perm.cgid;
+	v->cpt_mode = sma->sem_perm.mode;
+	v->cpt_seq = sma->sem_perm.seq;
+
+	v->cpt_id = id;
+	v->cpt_ctime = sma->sem_ctime;
+	v->cpt_otime = sma->sem_otime;
+
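+	/*
+	 * Variable-length record: one {semval, sempid} pair per semaphore
+	 * is appended right after the header, and cpt_next grows to the
+	 * total size, bounded by the one-page dump buffer.
+	 */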
+	for (i=0; i<sma->sem_nsems; i++) {
+		struct {
+			__u32 semval;
+			__u32 sempid;
+		} *s = (void*)v + v->cpt_next;
+		if (v->cpt_next >= PAGE_SIZE - sizeof(*s))
+			return -EINVAL;
+		s->semval = sma->sem_base[i].semval;
+		s->sempid = sma->sem_base[i].sempid;
+		v->cpt_next += sizeof(*s);
+	}
+
+	warg->last_id = id % IPCMNI;
+	return 1;
+}
+
+
+int cpt_dump_sysvsem(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	struct sem_warg warg;
+
+	/* Dumping semaphores is quite tricky because we cannot
+	 * write to the dump file under the lock inside sysvipc_walk_sem().
+	 */
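+	/*
+	 * So dump_one_sem() only fills the buffer under the lock and
+	 * returns 1; the write happens here, unlocked, and warg.last_id
+	 * makes each walk resume after the last id already dumped.
+	 */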
+	cpt_open_section(ctx, CPT_SECT_SYSV_SEM);
+	warg.last_id = -1;
+	warg.v = cpt_get_buf(ctx);
+	for (;;) {
+		if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0)
+			break;
+		ctx->write(warg.v, warg.v->cpt_next, ctx);
+	}
+	cpt_release_buf(ctx);
+	cpt_close_section(ctx);
+
+	cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO);
+	for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
+		struct sem_undo_list *semu = obj->o_obj;
+		struct sem_undo *su;
+		struct cpt_object_hdr v;
+		loff_t saved_obj;
+
+		cpt_open_object(obj, ctx);
+
+		v.cpt_next = CPT_NULL;
+		v.cpt_object = CPT_OBJ_SYSVSEM_UNDO;
+		v.cpt_hdrlen = sizeof(v);
+		v.cpt_content = CPT_CONTENT_ARRAY;
+
+		ctx->write(&v, sizeof(v), ctx);
+
+		cpt_push_object(&saved_obj, ctx);
+		list_for_each_entry(su, &semu->list_proc, list_proc) {
+			if (su->semid != -1) {
+				int err;
+				err = dump_one_semundo(su, ctx);
+				if (err < 0)
+					return err;
+			}
+		}
+		cpt_pop_object(&saved_obj, ctx);
+
+		cpt_close_object(ctx);
+	}
+	cpt_close_section(ctx);
+	return 0;
+}
+
+struct msg_warg {
+	int				last_id;
+	struct msg_queue		*msq;
+	struct cpt_sysvmsg_image	*v;
+};
+
+static int dump_one_msg(int id, struct msg_queue *msq, void *arg)
+{
+	struct msg_warg * warg = (struct msg_warg *)arg;
+	struct cpt_sysvmsg_image *v = warg->v;
+
+	if (warg->last_id != -1) {
+		if ((id % IPCMNI) <= warg->last_id)
+			return 0;
+	}
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_SYSVMSG;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_key = msq->q_perm.key;
+	v->cpt_uid = msq->q_perm.uid;
+	v->cpt_gid = msq->q_perm.gid;
+	v->cpt_cuid = msq->q_perm.cuid;
+	v->cpt_cgid = msq->q_perm.cgid;
+	v->cpt_mode = msq->q_perm.mode;
+	v->cpt_seq = msq->q_perm.seq;
+
+	v->cpt_id = id;
+	v->cpt_stime = msq->q_stime;
+	v->cpt_rtime = msq->q_rtime;
+	v->cpt_ctime = msq->q_ctime;
+	v->cpt_last_sender = msq->q_lspid;
+	v->cpt_last_receiver = msq->q_lrpid;
+	v->cpt_qbytes = msq->q_qbytes;
+
+	warg->msq = msq;
+	warg->last_id = id % IPCMNI;
+	return 1;
+}
+
+static int do_store(void * src, int len, int offset, void * data)
+{
+	cpt_context_t * ctx = data;
+	ctx->write(src, len, ctx);
+	return 0;
+}
+
+static void cpt_dump_one_sysvmsg(struct msg_msg *m, cpt_context_t * ctx)
+{
+	loff_t saved_obj;
+	struct cpt_sysvmsg_msg_image mv;
+
+	cpt_open_object(NULL, ctx);
+	mv.cpt_next = CPT_NULL;
+	mv.cpt_object = CPT_OBJ_SYSVMSG_MSG;
+	mv.cpt_hdrlen = sizeof(mv);
+	mv.cpt_content = CPT_CONTENT_DATA;
+
+	mv.cpt_type = m->m_type;
+	mv.cpt_size = m->m_ts;
+
+	ctx->write(&mv, sizeof(mv), ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	sysv_msg_store(m, do_store, m->m_ts, ctx);
+	cpt_pop_object(&saved_obj, ctx);
+	cpt_close_object(ctx);
+}
+
+int cpt_dump_sysvmsg(struct cpt_context *ctx)
+{
+	struct msg_warg warg;
+
+	/* Dumping msg queues is tricky because we cannot
+	 * write to the dump file under the lock inside sysvipc_walk_msg().
+	 *
+	 * And even worse, we have to access msg list in an unserialized
+	 * context. It is fragile. But VE is still frozen, remember?
+	 */
+	cpt_open_section(ctx, CPT_SECT_SYSV_MSG);
+	warg.last_id = -1;
+	warg.v = cpt_get_buf(ctx);
+	for (;;) {
+		loff_t saved_obj;
+		struct msg_msg * m;
+
+		if (sysvipc_walk_msg(dump_one_msg, &warg) <= 0)
+			break;
+
+		cpt_open_object(NULL, ctx);
+
+		ctx->write(warg.v, warg.v->cpt_next, ctx);
+
+		cpt_push_object(&saved_obj, ctx);
+		list_for_each_entry(m, &warg.msq->q_messages, m_list) {
+			cpt_dump_one_sysvmsg(m, ctx);
+		}
+		cpt_pop_object(&saved_obj, ctx);
+
+		cpt_close_object(ctx);
+	}
+	cpt_release_buf(ctx);
+	cpt_close_section(ctx);
+	return 0;
+}
+
+static int cpt_collect_sysvsem_undo(cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		if (tsk->exit_state) {
+			/* ipc/sem.c forgets to clear tsk->sysvsem.undo_list
+			 * on exit. Grrr... */
+			continue;
+		}
+		if (tsk->sysvsem.undo_list &&
+		    cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL)
+			return -ENOMEM;
+	}
+
+	for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
+		struct sem_undo_list *semu = obj->o_obj;
+
+		if (atomic_read(&semu->refcnt) != obj->o_count) {
+			eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt));
+			return -EBUSY;
+		}
+	}
+	return 0;
+}
+
+static int collect_one_shm(struct shmid_kernel *shp, void *arg)
+{
+	cpt_context_t *ctx = arg;
+	cpt_object_t *obj;
+
+	obj = __cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx);
+	if (!obj)
+		return -ENOMEM;
+	obj->o_flags |= CPT_FILE_SYSVIPC;
+	return 0;
+}
+
+int cpt_collect_sysvshm(cpt_context_t * ctx)
+{
+	int err;
+
+	err = sysvipc_walk_shm(collect_one_shm, ctx);
+
+	return err < 0 ? err : 0;
+}
+
+static int cpt_check_posix_mqueue(cpt_context_t * ctx)
+{
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+
+	if (!list_is_singular(&ipc_ns->mq_mnt->mnt_sb->s_inodes)) {
+		eprintk_ctx("posix message queues are not supported\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+int cpt_collect_sysv(cpt_context_t * ctx)
+{
+	int err;
+
+	err = cpt_check_posix_mqueue(ctx);
+	if (err)
+		return err;
+	err = cpt_collect_sysvsem_undo(ctx);
+	if (err)
+		return err;
+	err = cpt_collect_sysvshm(ctx);
+	if (err)
+		return err;
+
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_tty.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_tty.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_tty.c	2015-01-21 12:02:48.228093526 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_tty.c	2015-01-21 12:02:50.515032815 +0300
@@ -0,0 +1,225 @@
+/*
+ *
+ *  kernel/cpt/cpt_tty.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/tty.h>
+#include <linux/nsproxy.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_image.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+
+#include "cpt_process.h"
+
+/* We must support at least N_TTY. */
+
+int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx)
+{
+	struct tty_struct *tty = file_tty(file);
+	cpt_object_t *obj;
+	struct cpt_obj_ref o;
+	loff_t saved_pos;
+
+	obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx);
+	if (!obj)
+		return -EINVAL;
+
+	cpt_push_object(&saved_pos, ctx);
+
+	o.cpt_next = sizeof(o);
+	o.cpt_object = CPT_OBJ_REF;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_VOID;
+	o.cpt_pos = obj->o_pos;
+	ctx->write(&o, sizeof(o), ctx);
+
+	cpt_pop_object(&saved_pos, ctx);
+
+	return 0;
+}
+
+int cpt_collect_tty(struct file *file, cpt_context_t * ctx)
+{
+	struct tty_struct *tty = file_tty(file);
+	cpt_object_t *obj;
+	dev_t dev = file->f_dentry->d_inode->i_rdev;
+
+	if (tty) {
+		obj = cpt_object_add(CPT_OBJ_TTY, tty, ctx);
+		if (obj == NULL)
+			return -ENOMEM;
+		if (MAJOR(dev) == TTY_MAJOR || dev == MKDEV(TTYAUX_MAJOR, 1)) {
+			obj->o_flags |= CPT_TTY_NOPAIR;
+		} else if (tty->link) {
+			obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx);
+			if (obj == NULL)
+				return -ENOMEM;
+			/* Undo o_count, tty->link is not a reference */
+			obj->o_count--;
+		}
+	}
+	return 0;
+}
+
+int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct tty_struct *tty = obj->o_obj;
+	struct cpt_tty_image *v;
+
+	if (tty->link) {
+		if (!(obj->o_flags & CPT_TTY_NOPAIR) &&
+		    lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) {
+			eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE);
+			return -EINVAL;
+		}
+		if (tty->link->link != tty) {
+			eprintk_ctx("bad pty pair\n");
+			return -EINVAL;
+		}
+		if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+		    tty->driver->subtype == PTY_TYPE_SLAVE &&
+		    tty->link->count)
+			obj->o_count++;
+		if (test_bit(TTY_EXTRA_REFERENCE, &tty->flags))
+			obj->o_count++;
+	}
+	if (obj->o_count != tty->count) {
+		eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count);
+		return -EBUSY;
+	}
+
+	cpt_open_object(obj, ctx);
+
+	v = cpt_get_buf(ctx);
+	v->cpt_next = -1;
+	v->cpt_object = CPT_OBJ_TTY;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_index = tty->index;
+	v->cpt_link = -1;
+	if (tty->link)
+		v->cpt_link = tty->link->index;
+	v->cpt_drv_type = tty->driver->type;
+	v->cpt_drv_subtype = tty->driver->subtype;
+	v->cpt_drv_flags = tty->driver->flags;
+	v->cpt_packet = tty->packet;
+	v->cpt_stopped = tty->stopped;
+	v->cpt_hw_stopped = tty->hw_stopped;
+	v->cpt_flow_stopped = tty->flow_stopped;
+	v->cpt_flags = tty->flags;
+	v->cpt_ctrl_status = tty->ctrl_status;
+	v->cpt_canon_data = tty->canon_data;
+	v->cpt_canon_head = tty->canon_head - tty->read_tail;
+	v->cpt_canon_column = tty->canon_column;
+	v->cpt_column = tty->column;
+	v->cpt_erasing = tty->erasing;
+	v->cpt_lnext = tty->lnext;
+	v->cpt_icanon = tty->icanon;
+	v->cpt_raw = tty->raw;
+	v->cpt_real_raw = tty->real_raw;
+	v->cpt_closing = tty->closing;
+	v->cpt_minimum_to_wake = tty->minimum_to_wake;
+	v->cpt_pgrp = 0;
+	if (tty->pgrp) {
+		v->cpt_pgrp = cpt_pid_nr(tty->pgrp);
+		if ((int)v->cpt_pgrp < 0) {
+			dprintk_ctx("cannot map tty->pgrp %d -> %d\n", cpt_pid_nr(tty->pgrp), (int)v->cpt_pgrp);
+			v->cpt_pgrp = -1;
+		}
+	}
+	v->cpt_session = 0;
+	if (tty->session) {
+		v->cpt_session = cpt_pid_nr(tty->session);
+		if ((int)v->cpt_session < 0) {
+			eprintk_ctx("cannot map tty->session %d -> %d\n", pid_nr(tty->session), (int)v->cpt_session);
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	memcpy(v->cpt_name, tty->name, 64);
+	v->cpt_ws_row = tty->winsize.ws_row;
+	v->cpt_ws_col = tty->winsize.ws_col;
+	v->cpt_ws_prow = tty->winsize.ws_ypixel;
+	v->cpt_ws_pcol = tty->winsize.ws_xpixel;
+	if (tty->termios == NULL) {
+		eprintk_ctx("NULL termios\n");
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+	v->cpt_c_line = tty->termios->c_line;
+	v->cpt_c_iflag = tty->termios->c_iflag;
+	v->cpt_c_oflag = tty->termios->c_oflag;
+	v->cpt_c_cflag = tty->termios->c_cflag;
+	v->cpt_c_lflag = tty->termios->c_lflag;
+	memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS);
+	if (NCCS < 32)
+		memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS);
+	memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags));
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	if (tty->read_buf && tty->read_cnt) {
+		struct cpt_obj_bits *v = cpt_get_buf(ctx);
+		loff_t saved_pos;
+
+		cpt_push_object(&saved_pos, ctx);
+		cpt_open_object(NULL, ctx);
+		v->cpt_next = CPT_NULL;
+		v->cpt_object = CPT_OBJ_BITS;
+		v->cpt_hdrlen = sizeof(*v);
+		v->cpt_content = CPT_CONTENT_DATA;
+		v->cpt_size = tty->read_cnt;
+		ctx->write(v, sizeof(*v), ctx);
+		cpt_release_buf(ctx);
+
+		if (tty->read_cnt) {
+			int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail);
+			ctx->write(tty->read_buf + tty->read_tail, n, ctx);
+			if (tty->read_cnt > n)
+				ctx->write(tty->read_buf, tty->read_cnt-n, ctx);
+			ctx->align(ctx);
+		}
+
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_pos, ctx);
+	}
+
+	cpt_close_object(ctx);
+
+	return 0;
+}
+
+__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx)
+{
+	struct tty_struct * tty;
+	struct fasync_struct *fa;
+
+	tty = (struct tty_struct *)file_tty(file);
+
+	for (fa = tty->fasync; fa; fa = fa->fa_next) {
+		if (fa->fa_file == file)
+			return fa->fa_fd;
+	}
+	return -1;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_ubc.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_ubc.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_ubc.c	2015-01-21 12:02:48.228093526 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_ubc.c	2015-01-21 12:02:49.996046592 +0300
@@ -0,0 +1,139 @@
+/*
+ *
+ *  kernel/cpt/cpt_ubc.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/types.h>
+#include <bc/beancounter.h>
+#include <asm/signal.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+
+cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx);
+	if (obj != NULL) {
+		if (obj->o_count == 1)
+			get_beancounter(bc);
+	}
+	return obj;
+}
+
+__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx);
+	if (obj == NULL) {
+		eprintk("CPT: unknown ub %u (%p)\n", bc->ub_uid, bc);
+		dump_stack();
+		return CPT_NULL;
+	}
+	return obj->o_pos;
+}
+
+static void dump_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm,
+		int held)
+{
+	dmp->barrier = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL);
+	dmp->limit = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL);
+	dmp->held = (held ? prm->held : CPT_NULL);
+	dmp->maxheld = prm->maxheld;
+	dmp->minheld = prm->minheld;
+	dmp->failcnt = prm->failcnt;
+}
+
+static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct user_beancounter *bc;
+	struct cpt_beancounter_image *v;
+	int i;
+
+	bc = obj->o_obj;
+	ub_update_resources(bc);
+	v = cpt_get_buf(ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_UBC;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	if (obj->o_parent != NULL)
+		v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos;
+	else
+		v->cpt_parent = CPT_NULL;
+	v->cpt_id = (obj->o_parent != NULL) ? bc->ub_uid : 0;
+	v->cpt_ub_resources = UB_RESOURCES;
+	BUILD_BUG_ON(ARRAY_SIZE(v->cpt_parms) < UB_RESOURCES * 2);
+
+	if (bc->ub_store == NULL)
+		v->cpt_ub_flags = CPT_UB_NOSTORE;
+	else
+		v->cpt_ub_flags = 0;
+
+	for (i = 0; i < UB_RESOURCES; i++) {
+		dump_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0);
+		if (bc->ub_store != NULL)
+			dump_one_bc_parm(v->cpt_parms + i * 2 + 1, bc->ub_store + i, 1);
+	}
+	memset(v->cpt_parms + UB_RESOURCES * 2, 0,
+			sizeof(v->cpt_parms)
+				- UB_RESOURCES * 2 * sizeof(v->cpt_parms[0]));
+
+	cpt_open_object(obj, ctx);
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_close_object(ctx);
+
+	cpt_release_buf(ctx);
+	return 0;
+}
+
+int cpt_dump_ubc(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	int skipped;
+	int top;
+
+	cpt_open_section(ctx, CPT_SECT_UBC);
+
+	do {
+		skipped = 0;
+		top = 0;
+		for_each_object(obj, CPT_OBJ_UBC) {
+			if (obj->o_parent == NULL)
+				top++;
+			if (obj->o_pos != CPT_NULL)
+				continue;
+			if (obj->o_parent != NULL &&
+			    ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL)
+				skipped++;
+			else
+				dump_one_bc(obj, ctx);
+		}
+	} while (skipped && (top < 2));
+
+	cpt_close_section(ctx);
+	if (top > 1) {
+		eprintk_ctx("More than one top level ub exist\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void cpt_finish_ubc(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_UBC)
+		put_beancounter(obj->o_obj);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_ubc.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_ubc.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_ubc.h	2015-01-21 12:02:48.228093526 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_ubc.h	2015-01-21 12:02:50.398035919 +0300
@@ -0,0 +1,54 @@
+#ifdef CONFIG_BEANCOUNTERS
+cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
+__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
+int cpt_dump_ubc(struct cpt_context *ctx);
+
+struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx);
+int rst_undump_ubc(struct cpt_context *ctx);
+
+void cpt_finish_ubc(struct cpt_context *ctx);
+void rst_finish_ubc(struct cpt_context *ctx);
+
+static inline void set_ubc_unlimited(struct cpt_context *ctx,
+				     struct user_beancounter *bc)
+{
+	int i;
+
+	spin_lock_irq(&bc->ub_lock);
+	for ( i = 0 ; i < UB_RESOURCES ; i++ ) {
+		ctx->saved_ubc[i] = bc->ub_parms[i];
+		bc->ub_parms[i].barrier = bc->ub_parms[i].limit = UB_MAXVALUE;
+	}
+	spin_unlock_irq(&bc->ub_lock);
+}
+
+static inline void restore_ubc_limits(struct cpt_context *ctx,
+				      struct user_beancounter *bc)
+{
+	int i;
+
+	spin_lock_irq(&bc->ub_lock);
+	for ( i = 0 ; i < UB_RESOURCES ; i++ ) {
+		bc->ub_parms[i].barrier = ctx->saved_ubc[i].barrier;
+		bc->ub_parms[i].limit   = ctx->saved_ubc[i].limit;
+		bc->ub_parms[i].maxheld = max(ctx->saved_ubc[i].maxheld,
+					      bc->ub_parms[i].maxheld);
+		bc->ub_parms[i].minheld = min(ctx->saved_ubc[i].minheld,
+					      bc->ub_parms[i].minheld);
+		bc->ub_parms[i].failcnt = max(ctx->saved_ubc[i].failcnt,
+					      bc->ub_parms[i].failcnt);
+	}
+	spin_unlock_irq(&bc->ub_lock);
+}
+
+#else
+static int inline cpt_dump_ubc(struct cpt_context *ctx)
+{ return 0; }
+static int inline rst_undump_ubc(struct cpt_context *ctx)
+{ return 0; }
+static void inline cpt_finish_ubc(struct cpt_context *ctx)
+{ return; }
+static void inline rst_finish_ubc(struct cpt_context *ctx)
+{ return; }
+#endif
+
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_x8664.S linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_x8664.S
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/cpt_x8664.S	2015-01-21 12:02:48.228093526 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/cpt_x8664.S	2015-01-21 12:02:49.348063793 +0300
@@ -0,0 +1,67 @@
+#define ASSEMBLY 1
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/errno.h>
+
+	.code64
+
+	.macro FAKE_STACK_FRAME child_rip
+	/* push in order ss, rsp, eflags, cs, rip */
+	xorq %rax, %rax
+	pushq $__KERNEL_DS /* ss */
+	pushq %rax /* rsp */
+	pushq $(1<<9) /* eflags - interrupts on */
+	pushq $__KERNEL_CS /* cs */
+	pushq \child_rip /* rip */
+	pushq	%rax /* orig rax */
+	.endm
+
+	.macro UNFAKE_STACK_FRAME
+	addq $8*6, %rsp
+	.endm
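+
+# The fake frame mimics what an interrupt entry would have pushed
+# (ss, rsp, eflags, cs, rip, plus orig_rax), so that the child can be
+# set up and resumed through the normal SAVE_ALL/RESTORE_ALL pt_regs
+# path.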
+
+ENTRY(asm_kernel_thread)
+	CFI_STARTPROC
+	FAKE_STACK_FRAME $child_rip
+	SAVE_ALL
+
+	# rdi: flags, rsi: usp, rdx: will be &pt_regs
+	movq %rdx,%rdi
+	orq  $0x00800000,%rdi
+	movq $-1, %rsi
+	movq %rsp, %rdx
+
+	xorl %r8d,%r8d
+	xorl %r9d,%r9d
+	pushq %rcx
+	call do_fork_pid
+	addq $8, %rsp
+	movq %rax,RAX(%rsp)
+	xorl %edi,%edi
+	RESTORE_ALL
+	UNFAKE_STACK_FRAME
+	ret
+	CFI_ENDPROC
+ENDPROC(asm_kernel_thread)
+
+child_rip:
+	pushq $0		# fake return address
+	CFI_STARTPROC
+	movq %rdi, %rax
+	movq %rsi, %rdi
+	call *%rax
+	movq %rax, %rdi
+	call do_exit
+	CFI_ENDPROC
+ENDPROC(child_rip)
+
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_cgroup.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_cgroup.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_cgroup.c	2015-01-21 12:02:49.274065759 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_cgroup.c	2015-01-21 12:02:49.753053042 +0300
@@ -0,0 +1,150 @@
+#include <linux/cgroup.h>
+#include <linux/mount.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_files.h"
+
+static int rst_restore_one_cgroup_mnt(loff_t *pos, struct cpt_context * ctx)
+{
+	struct cpt_object_hdr gi;
+	struct cgroup *cgrp;
+	cpt_object_t *pobj, *obj;
+	loff_t endpos;
+	char *name, *mntdata;
+	int err, first = 1;
+	struct vfsmount *mnt;
+
+	err = rst_get_object(CPT_OBJ_CGROUPS, *pos, &gi, ctx);
+	if (err)
+		return err;
+
+	endpos = *pos + gi.cpt_next;
+	*pos += gi.cpt_hdrlen;
+
+	mntdata = __rst_get_name(pos, ctx);
+	if (!mntdata)
+		return -EINVAL;
+
+	mnt = vfs_kern_mount(&cgroup_fs_type, 0, cgroup_fs_type.name, mntdata);
+	rst_put_name(mntdata, ctx);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	obj = cpt_object_add(CPT_OBJ_CGROUPS, mnt->mnt_sb, ctx);
+	if (!obj) {
+		mntput(mnt);
+		return -ENOMEM;
+	}
+	obj->o_parent = mnt;
+
+	cgrp = mnt->mnt_root->d_fsdata;
+
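+	/*
+	 * The record body is a sequence of CPT_OBJ_CGROUP images: the
+	 * mount's root cgroup comes first (cpt_parent == -1), and every
+	 * later record names an already-restored parent by index.
+	 */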
+	while (*pos < endpos) {
+		struct cpt_cgroup_image ci;
+		loff_t p;
+
+		err = rst_get_object(CPT_OBJ_CGROUP, *pos, &ci, ctx);
+		if (err)
+			return err;
+
+		/* The root cgroup should be first */
+		if (first) {
+			if (ci.cpt_parent != -1)
+				return -EINVAL;
+			first = 0;
+		} else if (ci.cpt_parent == -1)
+			return -EINVAL;
+
+		p = *pos + ci.cpt_hdrlen;
+		*pos += ci.cpt_next;
+
+		if (ci.cpt_parent != -1) {
+			pobj = lookup_cpt_obj_byindex(CPT_OBJ_CGROUP, ci.cpt_parent, ctx);
+			if (!pobj)
+				return -ENOENT;
+
+			name = __rst_get_name(&p, ctx);
+			cgrp = cgroup_kernel_open(pobj->o_obj, CGRP_CREAT, name);
+			rst_put_name(name, ctx);
+			if (IS_ERR(cgrp)) {
+				return PTR_ERR(cgrp);
+			}
+		} else
+			__cgroup_kernel_open(cgrp);
+
+		if (ci.cpt_flags & CPT_CGRP_NOTIFY_ON_RELEASE)
+			set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+		if (ci.cpt_flags & CPT_CGRP_SELF_DESTRUCTION)
+			set_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
+
+		obj = cpt_object_add(CPT_OBJ_CGROUP, cgrp, ctx);
+		if (obj) {
+			cpt_obj_setindex(obj, ci.cpt_index, ctx);
+			cpt_obj_setpos(obj, p, ctx);
+		} else
+			return -ENOMEM;
+	}
+
+	return err;
+}
+
+int rst_cgroups(struct cpt_context *ctx)
+{
+	int err = 0;
+	loff_t sec = ctx->sections[CPT_SECT_CGROUPS];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_CGROUPS || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec && !err)
+		err = rst_restore_one_cgroup_mnt(&sec, ctx);
+
+	return err;
+}
+
+int rst_cgroup_task(struct cpt_context * ctx)
+{
+	cpt_object_t *obj;
+	struct task_struct *tsk;
+	u32 pid;
+	struct cgroup *cgrp;
+
+	for_each_object(obj, CPT_OBJ_CGROUP) {
+		cgrp = obj->o_obj;
+		ctx->file->f_pos = obj->o_pos;
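+		/*
+		 * o_pos points at the zero-terminated list of member pids
+		 * recorded for this cgroup; reattach each task in turn.
+		 */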
+		while (1) {
+			ctx->read(&pid, sizeof(pid), ctx);
+			if (!pid)
+				break;
+			tsk = find_task_by_vpid(pid);
+			if (!tsk) {
+				eprintk_ctx("can't get task with pid %d\n", pid);
+				return -ENOENT;
+			}
+			cgroup_kernel_attach(cgrp, tsk);
+		}
+	}
+	return 0;
+}
+
+void rst_cgroup_close(struct cpt_context * ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_CGROUP)
+		cgroup_kernel_close(obj->o_obj);
+
+	for_each_object(obj, CPT_OBJ_CGROUPS)
+		mntput(obj->o_parent);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_conntrack.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_conntrack.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_conntrack.c	2015-01-21 12:02:48.229093499 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_conntrack.c	2015-01-21 12:02:51.077017894 +0300
@@ -0,0 +1,490 @@
+/*
+ *
+ *  kernel/cpt/rst_conntrack.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/unistd.h>
+#include <linux/ve.h>
+#include <linux/vzcalluser.h>
+#include <linux/cpt_image.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+
+#if defined(CONFIG_VE_IPTABLES) && \
+    (defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE))
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+#define ASSERT_READ_LOCK(x) do { } while (0)
+#define ASSERT_WRITE_LOCK(x) do { } while (0)
+
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+
+struct ct_holder
+{
+	struct ct_holder *next;
+	struct nf_conn *ct;
+	int index;
+};
+
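+/*
+ * Fill an nf_conntrack_tuple from its image representation.  Only
+ * IPv4 addresses are restored, and the direction recorded in the
+ * image must match the one the caller expects.
+ */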
+static int decode_tuple(struct cpt_ipct_tuple *v,
+			 struct nf_conntrack_tuple *tuple, int dir,
+			 cpt_context_t *ctx)
+{
+	tuple->dst.u3.ip = v->cpt_dst;
+	tuple->dst.u3.all[1] = 0;
+	tuple->dst.u3.all[2] = 0;
+	tuple->dst.u3.all[3] = 0;
+
+	tuple->dst.u.all = v->cpt_dstport;
+	tuple->dst.protonum = v->cpt_protonum;
+
+	tuple->dst.dir = v->cpt_dir;
+	if (dir != tuple->dst.dir) {
+		eprintk_ctx("dir != tuple->dst.dir\n");
+		return -EINVAL;
+	}
+
+	if (ctx->image_version < CPT_VERSION_32)
+		tuple->src.l3num = AF_INET;
+	else
+		tuple->src.l3num = v->cpt_l3num;
+
+	tuple->src.u3.ip = v->cpt_src;
+	tuple->src.u3.all[1] = 0;
+	tuple->src.u3.all[2] = 0;
+	tuple->src.u3.all[3] = 0;
+
+	tuple->src.u.all = v->cpt_srcport;
+	return 0;
+}
+
+static int decode_tuple_mask(struct cpt_ipct_tuple *v,
+			 struct nf_conntrack_tuple_mask *tuple, int dir,
+			 cpt_context_t *ctx)
+{
+	tuple->src.u3.ip = v->cpt_src;
+	tuple->src.u.all = v->cpt_srcport;
+	return 0;
+}
+
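+/*
+ * Convert an expectation image written before CPT_VERSION_32 in
+ * place: the compat tuple layout had no cpt_l3num, so the tuples are
+ * widened, their l3num is forced to AF_INET and the trailing fields
+ * are moved to their new offsets.
+ */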
+static void convert_connexpect_image(struct cpt_ip_connexpect_image *ci)
+{
+	struct cpt_ip_connexpect_image_compat img;
+	void *po, *pt;
+	unsigned long size;
+
+	memcpy(&img, ci, sizeof(struct cpt_ip_connexpect_image_compat));
+
+	/* skip cpt_ct_tuple */
+	po = &img.cpt_ct_tuple;
+	size = (long) po - (long) &img;
+	memcpy(&img, ci, size);
+
+	/* convert cpt_tuple and cpt_mask */
+	pt = &ci->cpt_tuple;
+	po = &img.cpt_tuple;
+	memcpy(pt, po, sizeof(struct cpt_ipct_tuple_compat));
+	ci->cpt_tuple.cpt_l3num = AF_INET;
+	pt = &ci->cpt_mask;
+	po = &img.cpt_mask;
+	memcpy(pt, po, sizeof(struct cpt_ipct_tuple_compat));
+	ci->cpt_mask.cpt_l3num = AF_INET;
+
+	pt = &ci->cpt_dir;
+	po = &img.cpt_dir;
+	size = sizeof(struct cpt_ip_connexpect_image) + (long) ci - (long) pt;
+	memcpy(pt, po, size);
+}
+
+static int undump_expect_list(struct nf_conn *ct,
+			      struct cpt_ip_conntrack_image *ci,
+			      loff_t pos, struct ct_holder *ct_list,
+			      cpt_context_t *ctx)
+{
+	loff_t end;
+	int err;
+
+	end = pos + ci->cpt_next;
+	pos += ci->cpt_hdrlen;
+	while (pos < end) {
+		struct cpt_ip_connexpect_image v;
+		struct nf_conntrack_expect *exp;
+		struct nf_conn *sibling;
+
+		err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx);
+		if (err)
+			return err;
+
+		if (ctx->image_version < CPT_VERSION_32)
+			convert_connexpect_image(&v);
+
+		sibling = NULL;
+		if (v.cpt_sibling_conntrack) {
+			struct ct_holder *c;
+
+			for (c = ct_list; c; c = c->next) {
+				if (c->index == v.cpt_sibling_conntrack) {
+					sibling = c->ct;
+					break;
+				}
+			}
+			if (!sibling) {
+				eprintk_ctx("lost sibling of expectation\n");
+				return -EINVAL;
+			}
+		}
+
+		/* This is possible: the helper module could have just been
+		 * unregistered; if the expectation were on the list, it
+		 * would have been destroyed. */
+		if (nfct_help(ct) == NULL) {
+			dprintk_ctx("conntrack: no helper and non-trivial expectation\n");
+			pos += v.cpt_next;
+			continue;
+		}
+
+		exp = nf_ct_expect_alloc(ct);
+		if (exp == NULL)
+			return -ENOMEM;
+
+		if (cpt_object_has(&v, cpt_class))
+			exp->class = v.cpt_class;
+		else
+			exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
+
+		exp->flags = v.cpt_flags;
+		exp->dir = 0;
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+		exp->saved_proto.all = v.cpt_manip_proto;
+		exp->dir = v.cpt_dir;
+#endif
+
+		if (decode_tuple(&v.cpt_tuple, &exp->tuple, exp->dir, ctx) ||
+		    decode_tuple_mask(&v.cpt_mask, &exp->mask, exp->dir, ctx)) {
+			nf_ct_expect_put(exp);
+			return -EINVAL;
+		}
+
+		nf_conntrack_get(&ct->ct_general);
+		if (nf_ct_expect_related(exp)) {
+			nf_ct_expect_put(exp);
+			nf_ct_put(ct);
+			return -EINVAL;
+		}
+
+		spin_lock_bh(&nf_conntrack_lock);
+#if 0
+		if (sibling) {
+			exp->sibling = sibling;
+			sibling->master = exp;
+			LIST_DELETE(&ve_ip_conntrack_expect_list, exp);
+			ct->expecting--;
+			nf_conntrack_get(&master_ct(sibling)->infos[0]);
+		} else
+#endif
+		if (del_timer(&exp->timeout)) {
+			exp->timeout.expires = jiffies + v.cpt_timeout;
+			add_timer(&exp->timeout);
+		}
+		spin_unlock_bh(&nf_conntrack_lock);
+
+		nf_ct_expect_put(exp);
+		nf_ct_put(ct);
+
+		pos += v.cpt_next;
+	}
+	return 0;
+}
+
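+/*
+ * Restore a single conntrack: decode both tuples, allocate the
+ * conntrack, replay status/proto/helper/NAT state and insert it into
+ * the hash.  The ct_holder list keeps restored entries addressable by
+ * image index so expectations can find their siblings.
+ */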
+static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos,
+			 struct ct_holder **ct_list, cpt_context_t *ctx)
+{
+	int err = 0;
+	struct nf_conn *ct;
+	struct ct_holder *c;
+	struct nf_conntrack_tuple orig, repl;
+	struct nf_conn_nat *nat;
+
+	/*
+	 * We do not support ipv6 conntracks (we don't save dst.u3.all[1-3],
+	 * and they restore wrong).
+	 */
+	if (ctx->image_version >= CPT_VERSION_32 &&
+	    ci->cpt_tuple[0].cpt_l3num == AF_INET6)
+		return 0;
+
+	c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL);
+	if (c == NULL)
+		return -ENOMEM;
+
+	if (decode_tuple(&ci->cpt_tuple[0], &orig, 0, ctx) ||
+	    decode_tuple(&ci->cpt_tuple[1], &repl, 1, ctx)) {
+		kfree(c);
+		return -EINVAL;
+	}
+
+	ct = nf_conntrack_alloc(get_exec_env()->ve_netns, &orig, &repl,
+						get_exec_ub(), GFP_KERNEL);
+	if (!ct || IS_ERR(ct)) {
+		kfree(c);
+		return -ENOMEM;
+	}
+
+	c->ct = ct;
+	c->next = *ct_list;
+	*ct_list = c;
+	c->index = ci->cpt_index;
+
+	rcu_read_lock();
+	/* try an implicit helper assignment */
+	err = __nf_ct_try_assign_helper(ct, GFP_ATOMIC);
+	if (err < 0)
+		goto err2;
+
+	ct->status = ci->cpt_status;
+
+	memcpy(&ct->proto, ci->cpt_proto_data, sizeof(ct->proto));
+	if (nfct_help(ct))
+		memcpy(&nfct_help(ct)->help, ci->cpt_help_data,
+					sizeof(nfct_help(ct)->help));
+
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+	ct->mark = ci->cpt_mark;
+#endif
+
+	nat = nfct_nat(ct);
+
+	if (ct->status & IPS_NAT_DONE_MASK) {
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL) {
+			eprintk_ctx("conntrack: failed to add NAT extension\n");
+			err = -ENOMEM;
+			goto err2;
+		}
+#ifdef CONFIG_NF_NAT_NEEDED
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+	defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+		nat->masq_index = ci->cpt_masq_index;
+#endif
+		nat->seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos;
+		nat->seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before;
+		nat->seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after;
+		nat->seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos;
+		nat->seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before;
+		nat->seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after;
+
+		nf_nat_hash_conntrack(get_exec_env()->ve_netns, ct);
+#endif
+	}
+
+	ct->timeout.expires = jiffies + ci->cpt_timeout;
+	err = nf_conntrack_hash_check_insert(ct);
+
+	if (err < 0)
+		goto err2;
+
+	if (ci->cpt_next > ci->cpt_hdrlen)
+		err = undump_expect_list(ct, ci, pos, *ct_list, ctx);
+	rcu_read_unlock();
+	/*
+	 * nf_conntrack_hash_check_insert() sets ct->ct_general.use to 2,
+	 * because it thinks that the caller holds a reference to this
+	 * object and will put it.
+	 */
+	nf_ct_put(ct);
+
+	return err;
+err2:
+	rcu_read_unlock();
+	nf_conntrack_free(ct);
+	return err;
+}
+
+struct ip_ct_tcp_state_compat /*2.6.18*/
+{
+	u_int32_t	td_end;		/* max of seq + len */
+	u_int32_t	td_maxend;	/* max of ack + max(win, 1) */
+	u_int32_t	td_maxwin;	/* max(win) */
+	u_int8_t	td_scale;	/* window scale factor */
+	u_int8_t	loose;		/* used when connection picked up from the middle */
+	u_int8_t	flags;		/* per direction options */
+};
+
+struct ip_ct_tcp_compat /*2.6.18*/
+{
+	struct ip_ct_tcp_state_compat seen[2];	/* connection parameters per direction */
+	u_int8_t	state;		/* state of the connection (enum tcp_conntrack) */
+	/* For detecting stale connections */
+	u_int8_t	last_dir;	/* Direction of the last packet (enum ip_conntrack_dir) */
+	u_int8_t	retrans;	/* Number of retransmitted packets */
+	u_int8_t	last_index;	/* Index of the last packet */
+	u_int32_t	last_seq;	/* Last sequence number seen in dir */
+	u_int32_t	last_ack;	/* Last sequence number seen in opposite dir */
+	u_int32_t	last_end;	/* Last seq + len */
+	u_int16_t	last_win;	/* Last window advertisement seen in dir */
+};
+
+void convert_proto_data_tcp_state(struct ip_ct_tcp_state *state)
+{
+	struct ip_ct_tcp_state_compat img;
+	memcpy(&img, state, sizeof(struct ip_ct_tcp_state_compat));
+	memset(state, 0, sizeof(struct ip_ct_tcp_state));
+	state->td_end = img.td_end;
+	state->td_maxend = img.td_maxend;
+	state->td_maxwin = img.td_maxwin;
+	state->td_scale = img.td_scale;
+	state->flags = img.flags;
+}
+
+void convert_proto_data_tcp(struct ip_ct_tcp *data)
+{
+	struct ip_ct_tcp_compat img;
+
+	memcpy(&img, data, sizeof(struct ip_ct_tcp_compat));
+	memset(data, 0, sizeof(struct ip_ct_tcp));
+
+	memcpy(&data->seen[0], &img.seen[0], sizeof(struct ip_ct_tcp_state_compat));
+	convert_proto_data_tcp_state(&data->seen[0]);
+	memcpy(&data->seen[1], &img.seen[1], sizeof(struct ip_ct_tcp_state_compat));
+	convert_proto_data_tcp_state(&data->seen[1]);
+	data->state = img.state;
+
+	data->last_dir = img.last_dir;
+	data->retrans = img.retrans;
+	data->last_index = img.last_index;
+	data->last_seq = img.last_seq;
+	data->last_ack = img.last_ack;
+	data->last_end = img.last_end;
+	data->last_win = img.last_win;
+}
+
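+/*
+ * Convert a conntrack image written before CPT_VERSION_32 in place,
+ * expanding the compat tuples and shifting the fields that follow
+ * them to their new offsets.
+ */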
+static void convert_conntrack_image(struct cpt_ip_conntrack_image *ci)
+{
+	struct cpt_ip_conntrack_image_compat img;
+	void *po, *pt;
+	long size, n = sizeof(struct cpt_ip_conntrack_image);
+
+	memcpy(&img, ci, sizeof(struct cpt_ip_conntrack_image_compat));
+
+	/* convert cpt_tuple */
+	pt = &ci->cpt_tuple[0].cpt_l3num;
+	size = n - ((long)pt - (long)ci);
+	memset(pt, 0, size);
+	ci->cpt_tuple[0].cpt_l3num = AF_INET;
+
+	pt = &ci->cpt_tuple[1];
+	po = &img.cpt_tuple[1];
+	memcpy(pt, po, sizeof(struct cpt_ipct_tuple_compat));
+	ci->cpt_tuple[1].cpt_l3num = AF_INET;
+
+	/* fix cpt_proto_data */
+	pt = &ci->cpt_status;
+	po = &img.cpt_status;
+	size = (long) &img.cpt_help_data - (long) po;
+	memcpy(pt, po, size);
+
+	if (ci->cpt_tuple[0].cpt_protonum == IPPROTO_TCP)
+		convert_proto_data_tcp((struct ip_ct_tcp *)ci->cpt_proto_data);
+
+	/* fix cpt_help_data */
+	pt = &ci->cpt_help_data;
+	po = &img.cpt_help_data;
+	size = (long) &img.cpt_initialized - (long) po;
+	memcpy(pt, po, size);
+
+	/* skip cpt_initialized, cpt_num_manips, cpt_nat_manips */
+	pt = &ci->cpt_nat_seq;
+	po = &img.cpt_nat_seq;
+	size = n - ((long)pt - (long)ci);
+	memcpy(pt, po, size);
+}
+
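+/*
+ * Walk the CPT_SECT_NET_CONNTRACK section and undump every conntrack
+ * in it, converting old-format images on the fly.
+ */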
+int rst_restore_ip_conntrack(struct cpt_context * ctx)
+{
+	int err = 0;
+	loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_ip_conntrack_image ci;
+	struct ct_holder *c;
+	struct ct_holder *ct_list = NULL;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	BUILD_BUG_ON(sizeof(ci.cpt_proto_data) < sizeof(union nf_conntrack_proto));
+	BUILD_BUG_ON(sizeof(ci.cpt_help_data) < sizeof(union nf_conntrack_help));
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx);
+		if (err)
+			break;
+
+		if (ctx->image_version < CPT_VERSION_32)
+			convert_conntrack_image(&ci);
+
+		err = undump_one_ct(&ci, sec, &ct_list, ctx);
+		if (err) {
+			eprintk_ctx("Can't undump ct\n");
+			break;
+		}
+		sec += ci.cpt_next;
+	}
+
+	while ((c = ct_list) != NULL) {
+		ct_list = c->next;
+		kfree(c);
+	}
+
+	return err;
+}
+
+#else
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+
+int rst_restore_ip_conntrack(struct cpt_context * ctx)
+{
+	if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL)
+		return -EINVAL;
+	return 0;
+}
+
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_context.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_context.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_context.c	2015-01-21 12:02:48.229093499 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_context.c	2015-01-21 12:02:49.778052379 +0300
@@ -0,0 +1,395 @@
+/*
+ *
+ *  kernel/cpt/rst_context.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/mmgang.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_files.h"
+
+#ifdef CONFIG_PRAM
+int rst_open_pram(cpt_context_t *ctx)
+{
+	int err = 0;
+
+	if (cpt_pram_ops)
+		err = cpt_pram_ops->rst_open(ctx);
+	if (err)
+		eprintk_ctx("rst_open_pram: %d\n", err);
+	return err;
+}
+
+void rst_close_pram(cpt_context_t *ctx)
+{
+	if (cpt_pram_ops)
+		cpt_pram_ops->rst_close(ctx);
+}
+
+int rst_undump_pram(struct mm_struct *mm,
+		unsigned long start, unsigned long end,
+		loff_t pos, struct cpt_context *ctx)
+{
+	int err = -ENOSYS;
+
+	if (cpt_pram_ops)
+		err = cpt_pram_ops->rst_undump(mm, start, end, pos, ctx);
+	if (err)
+		eprintk_ctx("rst_undump_pram: %d\n", err);
+	return err;
+}
+#endif
+
+static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx)
+{
+	mm_segment_t oldfs;
+	ssize_t err = -EBADF;
+	struct file *file = ctx->file;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	if (file)
+		err = file->f_op->read(file, addr, count, &file->f_pos);
+	set_fs(oldfs);
+	if (err != count)
+		return err >= 0 ? -EIO : err;
+	return 0;
+}
+
+static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos)
+{
+	mm_segment_t oldfs;
+	ssize_t err = -EBADF;
+	struct file *file = ctx->file;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	if (file)
+		err = file->f_op->read(file, addr, count, &pos);
+	set_fs(oldfs);
+	if (err != count) {
+		eprintk_ctx("%s: read failed - addr: 0x%p, count: %ld, pos: %Ld, read: %ld\n",
+				__func__, addr, count, pos, err);
+		return err >= 0 ? -EIO : err;
+	}
+	return 0;
+}
+
+static void file_align(struct cpt_context *ctx)
+{
+	struct file *file = ctx->file;
+
+	if (file)
+		file->f_pos = CPT_ALIGN(file->f_pos);
+}
+
+int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end)
+{
+	struct cpt_section_hdr hdr;
+	int err;
+	loff_t pos;
+
+	pos = ctx->sections[type];
+	*start = *end = pos;
+
+	if (pos != CPT_NULL) {
+		if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0)
+			return err;
+		if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr))
+			return -EINVAL;
+		*start = pos + hdr.cpt_hdrlen;
+		*end = pos + hdr.cpt_next;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(rst_get_section);
+
+void rst_context_init(struct cpt_context *ctx)
+{
+	int i;
+
+	memset(ctx, 0, sizeof(*ctx));
+
+	init_MUTEX(&ctx->main_sem);
+	ctx->refcount = 1;
+
+	ctx->current_section = -1;
+	ctx->current_object = -1;
+	ctx->pagesize = PAGE_SIZE;
+	ctx->read = file_read;
+	ctx->pread = file_pread;
+	ctx->align = file_align;
+	for (i = 0; i < CPT_SECT_MAX; i++)
+		ctx->sections[i] = CPT_NULL;
+	cpt_object_init(ctx);
+}
+
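+/*
+ * Scan the section headers between start and end, validate them and
+ * record each section's offset in ctx->sections[].
+ */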
+static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx)
+{
+	struct cpt_section_hdr h;
+
+	while (start < end) {
+		int err;
+
+		err = ctx->pread(&h, sizeof(h), ctx, start);
+		if (err)
+			return err;
+		if (h.cpt_hdrlen < sizeof(h) ||
+		    h.cpt_next < h.cpt_hdrlen ||
+		    start + h.cpt_next > end)
+			return -EINVAL;
+		if (h.cpt_section >= CPT_SECT_MAX)
+			return -EINVAL;
+		ctx->sections[h.cpt_section] = start;
+		start += h.cpt_next;
+	}
+	return 0;
+}
+
+int rst_image_acceptable(unsigned long version)
+{
+	switch (CPT_VERSION_MAJOR(version)) {
+		case CPT_VERSION_18:
+			if (version >= CPT_VERSION_18_3)
+				return 1;
+			break;
+		case CPT_VERSION_32:
+			if (CPT_VERSION_MINOR(version) <=
+				CPT_VERSION_MINOR(CPT_CURRENT_VERSION))
+				return 1;
+			break;
+		default:
+			break;
+	}
+
+	return 0;
+}
+
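+/*
+ * Validate the dump file: check the major header and tail signatures,
+ * HZ and image version, then build the section offset table.
+ */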
+int rst_open_dumpfile(struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_major_tail *v;
+	struct cpt_major_hdr  h;
+	unsigned long size;
+
+	err = -EBADF;
+	if (!ctx->file)
+		goto err_out;
+
+	err = -ENOMEM;
+	ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL);
+	if (ctx->tmpbuf == NULL)
+		goto err_out;
+	__cpt_release_buf(ctx);
+
+	size = ctx->file->f_dentry->d_inode->i_size;
+
+	if (size & 7) {
+		err = -EINVAL;
+		goto err_out;
+	}
+	if (size < sizeof(struct cpt_major_hdr) +
+	    sizeof(struct cpt_major_tail)) {
+		err = -EINVAL;
+		goto err_out;
+	}
+	err = ctx->pread(&h, sizeof(h), ctx, 0);
+	if (err) {
+		eprintk_ctx("too short image 1 %d\n", err);
+		goto err_out;
+	}
+	if (h.cpt_signature[0] != CPT_SIGNATURE0 ||
+	    h.cpt_signature[1] != CPT_SIGNATURE1 ||
+	    h.cpt_signature[2] != CPT_SIGNATURE2 ||
+	    h.cpt_signature[3] != CPT_SIGNATURE3) {
+		err = -EINVAL;
+		goto err_out;
+	}
+	if (h.cpt_hz != HZ) {
+		err = -EINVAL;
+		eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ);
+		goto err_out;
+	}
+	ctx->virt_jiffies64 = h.cpt_start_jiffies64;
+	ctx->start_time.tv_sec = h.cpt_start_sec;
+	ctx->start_time.tv_nsec = h.cpt_start_nsec;
+	ctx->kernel_config_flags = h.cpt_kernel_config[0];
+	ctx->iptables_mask = h.cpt_iptables_mask;
+	if (!rst_image_acceptable(h.cpt_image_version)) {
+		eprintk_ctx("Unknown image version: %x. Can't restore.\n",
+				h.cpt_image_version);
+		err = -EINVAL;
+		goto err_out;
+	}
+	ctx->image_version = h.cpt_image_version;
+	ctx->features = ((__u64)h.cpt_ve_features2 << 32) | h.cpt_ve_features;
+	ctx->image_arch = h.cpt_os_arch;
+
+	v = cpt_get_buf(ctx);
+	err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v));
+	if (err) {
+		eprintk_ctx("too short image 2 %d\n", err);
+		cpt_release_buf(ctx);
+		goto err_out;
+	}
+	if (v->cpt_signature[0] != CPT_SIGNATURE0 ||
+	    v->cpt_signature[1] != CPT_SIGNATURE1 ||
+	    v->cpt_signature[2] != CPT_SIGNATURE2 ||
+	    v->cpt_signature[3] != CPT_SIGNATURE3 ||
+	    v->cpt_nsect != CPT_SECT_MAX_INDEX) {
+		err = -EINVAL;
+		cpt_release_buf(ctx);
+		goto err_out;
+	}
+	if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) < 0) {
+		cpt_release_buf(ctx);
+		goto err_out;
+	}
+	ctx->tasks64 = v->cpt_64bit;
+	cpt_release_buf(ctx);
+	return 0;
+
+err_out:
+	if (ctx->tmpbuf) {
+		free_page((unsigned long)ctx->tmpbuf);
+		ctx->tmpbuf = NULL;
+	}
+	return err;
+}
+
+void rst_close_dumpfile(struct cpt_context *ctx)
+{
+	if (ctx->file) {
+		fput(ctx->file);
+		ctx->file = NULL;
+	}
+	if (ctx->tmpbuf) {
+		free_page((unsigned long)ctx->tmpbuf);
+		ctx->tmpbuf = NULL;
+	}
+}
+
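+/*
+ * Read an object header at pos, validate its type and length fields,
+ * then read up to cpt_hdrlen bytes of the object into the caller's
+ * buffer, zeroing any tail the image does not provide.
+ */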
+int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_object_hdr *hdr = tmp;
+	err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos);
+	if (err) {
+		eprintk_ctx("%s: dump file read failed: %d @%lld\n",
+				__func__, err, pos);
+		return err;
+	}
+	if (type > 0 && type != hdr->cpt_object) {
+		eprintk_ctx("%s: wrong object type: %d (expected: %d) @%lld\n",
+				__func__, type, hdr->cpt_object, pos);
+		return -EINVAL;
+	}
+	if (hdr->cpt_hdrlen > hdr->cpt_next) {
+		eprintk_ctx("%s: bad image object size: %d (next object in %Ld)\n",
+				__func__, hdr->cpt_hdrlen, hdr->cpt_next);
+		return -EINVAL;
+	}
+	if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr)) {
+		eprintk_ctx("%s: bad image header length: %d (object size: %Ld)\n",
+			__func__, hdr->cpt_hdrlen, hdr->cpt_next);
+		return -EINVAL;
+	}
+	if (size < sizeof(*hdr)) {
+		eprintk_ctx("%s: buffer is too small: %d (required: %ld) @%lld\n",
+				__func__, size, sizeof(*hdr), pos);
+		return -EINVAL;
+	}
+	if (size > hdr->cpt_hdrlen) {
+		memset((char *)tmp + hdr->cpt_hdrlen, 0, size - hdr->cpt_hdrlen);
+		size = hdr->cpt_hdrlen;
+	}
+	if (size > sizeof(*hdr))
+		err = ctx->pread(hdr+1, size - sizeof(*hdr),
+				 ctx, pos + sizeof(*hdr));
+	return err;
+}
+EXPORT_SYMBOL(_rst_get_object);
+
+void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx)
+{
+	int err;
+	void *tmp;
+	struct cpt_object_hdr hdr;
+	err = ctx->pread(&hdr, sizeof(hdr), ctx, pos);
+	if (err)
+		return NULL;
+	if (type > 0 && type != hdr.cpt_object)
+		return NULL;
+	if (hdr.cpt_hdrlen > hdr.cpt_next)
+		return NULL;
+	if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr))
+		return NULL;
+	tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL);
+	if (!tmp)
+		return NULL;
+	err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos);
+	if (!err)
+		return tmp;
+	kfree(tmp);
+	return NULL;
+}
+EXPORT_SYMBOL(__rst_get_object);
+
+__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_object_hdr hdr;
+	__u8 *name;
+
+	err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx);
+	if (err)
+		return NULL;
+	if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE)
+		return NULL;
+	name = (void*)__get_free_page(GFP_KERNEL);
+	if (!name)
+		return NULL;
+	err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen,
+		   ctx, *pos_p + hdr.cpt_hdrlen);
+	if (err) {
+		free_page((unsigned long)name);
+		return NULL;
+	}
+	*pos_p += hdr.cpt_next;
+	return name;
+}
+
+__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx)
+{
+	return __rst_get_name(&pos, ctx);
+}
+
+void rst_put_name(__u8 *name, struct cpt_context *ctx)
+{
+	unsigned long addr = (unsigned long)name;
+
+	if (addr)
+		free_page(addr&~(PAGE_SIZE-1));
+}
+
+struct rst_ops rst_ops = {
+	.get_object = _rst_get_object,
+	.rst_file = rst_file,
+};
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_delayfs.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_delayfs.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_delayfs.c	2015-01-21 12:02:48.488086622 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_delayfs.c	2015-01-21 12:02:51.095017417 +0300
@@ -0,0 +1,1888 @@
+/*
+ *  kernel/cpt/rst_delayfs.c
+ *
+ *  Copyright (C) 2009 Parallels
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ *  TODO:
+ *	- handling of a case when top mount got broken
+ *	- FIXMEs below
+ *	- do_coredump (filp_open, do_truncate)
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/socket.h>
+#include <linux/nfs_mount.h>
+#include <linux/sched.h>
+#include <linux/ve_nfs.h>
+#include <linux/fs_struct.h>
+#include <linux/fdtable.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/seq_file.h>
+#include <net/af_unix.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_socket.h"
+#include "cpt_syscalls.h"
+
+#define DEBUG
+
+#define D(FMT, ...)	dprintk( \
+		"delayfs %d(%s) %s:%d " FMT "\n", \
+		current->pid, current->comm, \
+		__func__, __LINE__, ##__VA_ARGS__)
+
+enum {
+	SB_INITIAL = 0,
+	SB_LOCKED,
+	SB_FINISHED,
+	SB_BROKEN
+};
+
+struct delayfs_file_private {
+	struct delayed_flock_info *dfi;
+	struct file *real_fs_file;
+};
+
+struct delay_sb_info {
+	int state;
+	wait_queue_head_t blocked_tasks;
+
+	struct file_system_type *hidden_type;
+	void *data;
+	struct vfsmount *real;
+	spinlock_t file_lock;
+
+	struct unix_bind_info *bi_list;
+
+	unsigned long delay_tmo;
+	void (*handle_mount_failure)(struct delay_sb_info *si);
+	void (*restore_mount_params)(struct delay_sb_info *si);
+
+	/* NFS original mount options */
+	int nfs_mnt_soft;
+	int nfs_delay_tmo;
+	int nfs_mnt_retrans;
+};
+
+#define FNAME(file) ((file)->f_dentry->d_name.name)
+
+/* mm */
+
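+/*
+ * Swap a vma backed by a fake delayfs file over to the real file:
+ * unlink it from the fake mapping, call the real ->mmap() and relink.
+ * Returns VM_FAULT_RETRY on success so the fault is retried against
+ * the real mapping.
+ */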
+static int delay_remmap(struct vm_area_struct *vma,
+		struct file* fake, struct file *real)
+{
+	struct address_space *mapping;
+
+	if (vma->vm_file != fake)
+		return VM_FAULT_RETRY;
+
+	if (IS_ERR(real))
+		return VM_FAULT_OOM;
+
+	if ((vma->vm_flags & VM_DENYWRITE) && deny_write_access(real))
+		return VM_FAULT_SIGBUS;
+
+	unlink_file_vma(vma);
+	vma->vm_file = real;
+	if (real->f_op->mmap(real, vma)) {
+		vma->vm_file = fake;
+		mapping = fake->f_mapping;
+		spin_lock(&mapping->i_mmap_lock);
+		__vma_link_file(vma);
+		spin_unlock(&mapping->i_mmap_lock);
+		if (vma->vm_flags & VM_DENYWRITE)
+			allow_write_access(real);
+
+		return VM_FAULT_SIGBUS;
+	}
+
+	mapping = real->f_mapping;
+	spin_lock(&mapping->i_mmap_lock);
+	__vma_link_file(vma);
+	vma->vm_truncate_count = mapping->truncate_count;
+	spin_unlock(&mapping->i_mmap_lock);
+	get_file(real);
+	vma->vm_flags &= ~VM_DONTEXPAND;
+	fput(fake);
+	if (vma->vm_flags & VM_DENYWRITE)
+		allow_write_access(real);
+
+	return VM_FAULT_RETRY;
+}
+
+/*
+ * NOTE: Called with mmap_sem held for read.
+ */
+static int delay_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct delay_sb_info *si;
+	struct file *fake, **real;
+	static DEFINE_MUTEX(lock); /* protect cross-thread remmap */
+	int ret = 0;
+	pgoff_t offset = vmf->pgoff;
+	struct delayfs_file_private *priv;
+
+	mutex_lock(&lock);
+	if (vma->vm_ops->fault != delay_fault) {
+		mutex_unlock(&lock);
+		ret = VM_FAULT_RETRY;	/* race with other thread */
+		goto out;
+	}
+	fake = vma->vm_file;
+	get_file(fake);
+	mutex_unlock(&lock);
+
+	si = fake->f_dentry->d_sb->s_fs_info;
+	priv = fake->private_data;
+	real = &priv->real_fs_file;
+
+	D("addr:%p mnt:%p file:%p(%s)", (void *)offset, fake->f_vfsmnt, fake, FNAME(fake));
+	if (debug_level > 3)
+		dump_stack();
+
+	if (si->state == SB_INITIAL) {
+		if (vma->vm_flags & VM_SHARED) {
+			ret = VM_FAULT_SIGBUS;
+			goto out_put;
+		}
+		/* special case for restoring private mappings */
+		vmf->page = ZERO_PAGE((unsigned long)vmf->virtual_address);
+		get_page(vmf->page);
+		goto out_put;
+	}
+
+	if (!wait_event_timeout(si->blocked_tasks, *real, si->delay_tmo)) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_put;
+	}
+
+	mutex_lock(&lock);
+	ret = delay_remmap(vma, fake, *real);
+	mutex_unlock(&lock);
+out_put:
+	fput(fake);
+out:
+	if (ret == VM_FAULT_RETRY)
+		up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+static struct vm_operations_struct delay_vma_ops = {
+	.fault = delay_fault,
+};
+
+static int delay_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	D("mnt:%p file:%p(%s) offset:%lu range:%p-%p", file->f_vfsmnt, file,
+			FNAME(file), vma->vm_pgoff,
+			(void *)vma->vm_start, (void *)vma->vm_end);
+	vma->vm_ops = &delay_vma_ops;
+	vma->vm_flags |= VM_DONTEXPAND;
+	return 0;
+}
+
+/* switch */
+
+static void delay_switch_mm(struct mm_struct *mm, struct super_block *sb)
+{
+	struct vm_area_struct *vma;
+	struct file *fake, *real, *exe;
+	struct delayfs_file_private *priv;
+
+	down_write(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		fake = vma->vm_file;
+		if (!fake || fake->f_vfsmnt->mnt_sb != sb)
+			continue;
+		priv = vma->vm_file->private_data;
+		real = priv->real_fs_file;
+		if (real)
+			delay_remmap(vma, fake, real);
+	}
+	exe = mm->exe_file;
+	if (exe && exe->f_vfsmnt->mnt_sb == sb) {
+		priv = exe->private_data;
+		real = priv->real_fs_file;
+		if (real && !IS_ERR(real)) {
+			get_file(real);
+			fput(exe);
+			mm->exe_file = real;
+		}
+	}
+	up_write(&mm->mmap_sem);
+}
+
+struct delayed_flock_info {
+	struct file_lock *fl;
+	int svid;
+	struct delayed_flock_info *next;
+};
+
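+/*
+ * Re-apply a file lock that was delayed at restore time: restore the
+ * lock owner by svid and re-take the lock on the real file.  The lock
+ * and its holder structure are consumed in any case.
+ */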
+static void delayed_flock(struct delayed_flock_info *dfi, struct file *file)
+{
+	int err;
+	struct file_lock *fl = dfi->fl;
+
+	err = nlmclnt_set_lockowner(file->f_dentry->d_inode, fl, dfi->svid);
+	if (err)
+		goto out;
+
+	fl->fl_file = file;
+	fl->fl_flags |= FL_LOCAL;
+
+	if (fl->fl_flags & FL_FLOCK)
+		err = file->f_op->flock(file, F_SETLK, fl);
+	else
+		err = file->f_op->lock(file, F_SETLK, fl);
+
+out:
+	locks_free_lock(fl);
+	dfi->fl = NULL;
+	kfree(dfi);
+
+	if (err)
+		eprintk("oh shit :( can't lock file back in %d:%s (%d)\n",
+				get_exec_env()->veid,
+				file->f_dentry->d_name.name, err);
+}
+
+static void apply_delayed_locks(struct delayed_flock_info *dfi, struct file *real)
+{
+	while (dfi) {
+		struct delayed_flock_info *next = dfi->next;
+
+		/* delayed_flock() frees dfi, so grab the next pointer first */
+		delayed_flock(dfi, real);
+		dfi = next;
+	}
+}
+
+static void delay_switch_fd(struct files_struct *files, struct super_block *sb)
+{
+	struct fdtable *fdt;
+	int i;
+	struct file *fake, *real;
+	struct delayfs_file_private *priv;
+
+	i = 0;
+restart:
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	for (; i < fdt->max_fds; i++) {
+		struct delayed_flock_info *dfi;
+
+		fake = fdt->fd[i];
+		if (!fake || fake->f_vfsmnt->mnt_sb != sb)
+			continue;
+
+		priv = fake->private_data;
+		real = priv->real_fs_file;
+		if (!real || IS_ERR(real))
+			continue;
+
+		get_file(real);
+		rcu_assign_pointer(fdt->fd[i], real);
+
+		/*
+		 * Flock application has to be done only once per file;
+		 * that's why we drop the link.  And since the file can be
+		 * shared between processes, holding files->file_lock alone
+		 * is not enough.
+		 */
+		spin_lock(&fake->f_lock);
+		dfi = priv->dfi;
+		priv->dfi = NULL;
+		spin_unlock(&fake->f_lock);
+		spin_unlock(&files->file_lock);
+
+		apply_delayed_locks(dfi, real);
+
+		fput(fake);
+		goto restart;
+	}
+	spin_unlock(&files->file_lock);
+}
+
+static void delay_switch_fs(struct fs_struct *fs, struct super_block *sb)
+{
+	struct file *filp;
+	struct path old_root = { .dentry = NULL, .mnt = NULL };
+	struct path old_pwd  = { .dentry = NULL, .mnt = NULL };
+
+	spin_lock(&fs->lock);
+
+	if (fs->root.mnt->mnt_sb == sb) {
+		filp = fs->root.dentry->d_fsdata;
+		if (!IS_ERR_OR_NULL(filp)) {
+			old_root = fs->root;
+			fs->root = filp->f_path;
+			path_get(&fs->root);
+		}
+	}
+
+	if (fs->pwd.mnt->mnt_sb == sb) {
+		filp = fs->pwd.dentry->d_fsdata;
+		if (!IS_ERR_OR_NULL(filp)) {
+			old_pwd = fs->pwd;
+			fs->pwd = filp->f_path;
+			path_get(&fs->pwd);
+		}
+	}
+
+	spin_unlock(&fs->lock);
+
+	path_put(&old_root);
+	path_put(&old_pwd);
+}
+
+static void delay_switch_current(struct super_block *sb)
+{
+	delay_switch_fs(current->fs, sb);
+	delay_switch_fd(current->files, sb);
+	delay_switch_mm(current->mm, sb);
+}
+
+static void delay_switch_one(struct task_struct *p, struct vfsmount *mnt)
+{
+	struct files_struct *files;
+	struct fs_struct *fs;
+	struct mm_struct *mm;
+
+	D("mnt:%p task:%d(%s)", mnt, p->pid, p->comm);
+	task_lock(p);
+	fs = p->fs;
+	if (fs) {
+		int kill;
+
+		spin_lock(&fs->lock);
+		fs->users++;
+		spin_unlock(&fs->lock);
+		task_unlock(p);
+
+		delay_switch_fs(fs, mnt->mnt_sb);
+
+		spin_lock(&fs->lock);
+		kill = !--fs->users;
+		spin_unlock(&fs->lock);
+		if (kill)
+			free_fs_struct(fs);
+	} else
+		task_unlock(p);
+
+	files = get_files_struct(p);
+	if (files) {
+		delay_switch_fd(files, mnt->mnt_sb);
+		put_files_struct(files);
+	}
+
+	mm = get_task_mm(p);
+	if (mm) {
+		delay_switch_mm(mm, mnt->mnt_sb);
+		mmput(mm);
+	}
+}
+
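+/*
+ * Rotate through every task in the VE (moving each one to the head of
+ * vetask_auxlist so it is visited exactly once) and switch its fs
+ * root/pwd, fd table and mappings from the fake mount to the real one.
+ */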
+static void delayfs_switch_all(struct vfsmount *mnt)
+{
+	struct ve_struct *env;
+	struct task_struct *p;
+
+	env = get_exec_env();
+
+	write_lock_irq(&tasklist_lock);
+	do {
+		if (list_empty(&env->vetask_auxlist))
+			break;
+
+		p = list_entry(env->vetask_auxlist.prev,
+				struct task_struct, ve_task_info.aux_list);
+		list_del(&VE_TASK_INFO(p)->aux_list);
+		list_add(&VE_TASK_INFO(p)->aux_list, &env->vetask_auxlist);
+
+		get_task_struct(p);
+		write_unlock_irq(&tasklist_lock);
+
+		delay_switch_one(p, mnt);
+
+		put_task_struct(p);
+
+		cond_resched();
+
+		write_lock_irq(&tasklist_lock);
+	} while (p != __first_task_ve(env));
+	write_unlock_irq(&tasklist_lock);
+}
+
+/* wait */
+
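+/*
+ * Force the interrupted syscall to be restarted once the real mount
+ * is in place: fake a pending signal and return -ERESTARTSYS, or give
+ * up with -EINTR if a real signal is already pending.
+ */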
+static int delayfs_restart(void)
+{
+	if (signal_pending(current))
+		return -EINTR;
+
+	set_tsk_thread_flag(current, TIF_SIGPENDING);
+	return -ERESTARTSYS;
+}
+
+static int delayfs_wait_mnt(struct super_block *sb)
+{
+	struct delay_sb_info *si = sb->s_fs_info;
+	long res;
+
+	if (si->state == SB_INITIAL) {
+		WARN_ON(1);
+		return -EDEADLK;
+	}
+
+	if (si->state == SB_BROKEN)
+		return -EIO;
+
+	D("si:%p from:%p", si, __builtin_return_address(0));
+	if (debug_level > 3)
+		dump_stack();
+
+	res = wait_event_interruptible_timeout(si->blocked_tasks,
+						si->state >= SB_FINISHED,
+						si->delay_tmo);
+	if (!res)
+		return -EIO;
+	if (res < 0)
+		return -EINTR;
+
+	delay_switch_current(sb);
+
+	return delayfs_restart();
+}
+
+static int delayfs_preopen(struct file *fake, struct delay_sb_info *si);
+
+static int delayfs_wait_file(struct file *fake)
+{
+	struct delay_sb_info *si = fake->f_dentry->d_sb->s_fs_info;
+	struct delayfs_file_private *priv = fake->private_data;
+	struct file **real = &priv->real_fs_file;
+	long res;
+
+	if (si->state == SB_INITIAL) {
+		WARN_ON(1);
+		return -EDEADLK;
+	}
+
+	D("mnt:%p file:%p(%s) from:%p", fake->f_vfsmnt, fake, FNAME(fake),
+			__builtin_return_address(0));
+	if (debug_level > 3)
+		dump_stack();
+
+	if (S_ISFIFO(fake->f_dentry->d_inode->i_mode) &&
+		((fake->f_mode & (FMODE_READ|FMODE_WRITE)) !=
+				 (FMODE_READ|FMODE_WRITE)))
+		res = wait_event_interruptible_timeout(si->blocked_tasks,
+						*real, si->delay_tmo);
+	else
+		res = wait_event_interruptible_timeout(si->blocked_tasks,
+					si->real, si->delay_tmo);
+	if (!res)
+		return -EIO;
+	if (res < 0)
+		return -EINTR;
+
+	if (!*real) {
+		if (delayfs_preopen(fake, si))
+			return -EIO;
+	}
+
+	delay_switch_current(fake->f_vfsmnt->mnt_sb);
+
+	if (IS_ERR(*real))
+		return -EIO;
+
+	return delayfs_restart();
+}
+
+/* stubs */
+
+static int delay_permission(struct inode *inode, int mask)
+{
+	return delayfs_wait_mnt(inode->i_sb);
+}
+
+static int delay_getattr(struct vfsmount *mnt, struct dentry *d, struct kstat *stat)
+{
+	return delayfs_wait_mnt(mnt->mnt_sb);
+}
+
+#ifdef DEBUG
+
+static int delay_create (struct inode *dir, struct dentry *dentry,
+		int mode, struct nameidata *nd)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static struct dentry *delay_lookup(struct inode *dir,
+			struct dentry *dentry, struct nameidata *nd)
+{
+	WARN_ON(1);
+	return ERR_PTR(-EIO);
+}
+
+static int delay_link (struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static int delay_unlink(struct inode *dir, struct dentry *dentry)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static int delay_symlink (struct inode *dir, struct dentry *dentry,
+		const char *symname)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static int delay_mkdir(struct inode *dir, struct dentry *dentry,
+			int mode)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static int delay_rmdir (struct inode *dir, struct dentry *dentry)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static int delay_mknod (struct inode *dir, struct dentry *dentry,
+			int mode, dev_t rdev)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static int delay_rename (struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static void delay_truncate (struct inode *inode)
+{
+	WARN_ON(1);
+}
+
+static int delay_setattr(struct dentry *dentry, struct iattr *attrs)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static int delay_setxattr(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static ssize_t delay_getxattr(struct dentry *dentry, const char *name,
+			void *buffer, size_t size)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static ssize_t delay_listxattr(struct dentry *dentry, char *buffer,
+			size_t buffer_size)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static int delay_removexattr(struct dentry *dentry, const char *name)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+
+static void delay_truncate_range(struct inode *inode, loff_t start, loff_t stop)
+{
+	WARN_ON(1);
+}
+
+#endif /* DEBUG */
+
+static struct inode_operations delay_dir_iops = {
+	/*
+	 * It's a hack - all the lookups happen with
+	 * permission checks, thus we can safely freeze
+	 * the tasks in this call
+	 */
+	.permission = delay_permission,
+	.getattr = delay_getattr,
+#ifdef DEBUG
+	.create		= delay_create,
+	.lookup		= delay_lookup,
+	.link		= delay_link,
+	.unlink		= delay_unlink,
+	.symlink	= delay_symlink,
+	.mkdir		= delay_mkdir,
+	.rmdir		= delay_rmdir,
+	.mknod		= delay_mknod,
+	.rename		= delay_rename,
+	/* .readlink	- EINVAL on root and sleep on permissions */
+	/* .follow_link	- must be no-op
+	   .put_link	*/
+	.truncate	= delay_truncate,
+	.setattr	= delay_setattr,
+	.setxattr	= delay_setxattr,
+	.getxattr	= delay_getxattr,
+	.listxattr	= delay_listxattr,
+	.removexattr	= delay_removexattr,
+	.truncate_range = delay_truncate_range, /* exists only in shm */
+#endif /* DEBUG */
+};
+
+static long delay_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	return delayfs_wait_file(filp);
+}
+
+static loff_t delay_llseek(struct file *filp, loff_t offset, int origin)
+{
+	return delayfs_wait_file(filp);
+}
+
+static ssize_t delay_read(struct file *filp, char __user *buf,
+			size_t size, loff_t *ppos)
+{
+	return delayfs_wait_file(filp);
+}
+
+static ssize_t delay_write(struct file *filp, const char __user *buf,
+			size_t siz, loff_t *ppos)
+{
+	return delayfs_wait_file(filp);
+}
+
+static int delay_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	return delayfs_wait_file(filp);
+}
+
+static int delay_fsync(struct file *filp, struct dentry *dentry,
+			int datasync)
+{
+	/* nothing to sync and there is no reason to block */
+	return 0;
+}
+
+static int delay_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	return delayfs_wait_file(filp);
+}
+
+/* see do_sendfile, generic_file_sendfile and file_send_actor*/
+static ssize_t delay_sendpage(struct file *filp, struct page *page,
+			int off, size_t len, loff_t *pos, int more)
+{
+	return delayfs_wait_file(filp);
+}
+
+static int delay_flock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	return delayfs_wait_file(filp);
+}
+
+static ssize_t delay_splice_write(struct pipe_inode_info *pipe,
+			struct file *filp, loff_t *ppos, size_t len,
+			unsigned int flags)
+{
+	return delayfs_wait_file(filp);
+}
+
+static ssize_t delay_splice_read(struct file *filp, loff_t *ppos,
+			struct pipe_inode_info *pipe, size_t len,
+			unsigned int flags)
+{
+	return delayfs_wait_file(filp);
+}
+
+static int delay_release(struct inode *ino, struct file *f)
+{
+	struct delayed_flock_info *dfi;
+	struct delayfs_file_private *priv;
+
+	priv = f->private_data;
+
+	while (priv->dfi) {
+		dfi = priv->dfi;
+		priv->dfi = dfi->next;
+
+		if (dfi->fl)
+			locks_free_lock(dfi->fl);
+		kfree(dfi);
+	}
+
+	if (!IS_ERR_OR_NULL(priv->real_fs_file))
+		fput(priv->real_fs_file);
+
+	if (S_ISFIFO(ino->i_mode))
+		pipe_release(ino, (f->f_mode & FMODE_READ) != 0,
+				  (f->f_mode & FMODE_WRITE) != 0);
+	kfree(f->private_data);
+
+	return 0;
+}
+
+static int delay_open(struct inode *inode, struct file *file)
+{
+	file->private_data = kzalloc(sizeof(struct delayfs_file_private), GFP_KERNEL);
+	if (!file->private_data)
+		return -ENOMEM;
+
+	if (S_ISFIFO(inode->i_mode)) {
+		mutex_lock(&inode->i_mutex);
+		if (!inode->i_pipe) {
+			inode->i_pipe = alloc_pipe_info(inode);
+			if (!inode->i_pipe) {
+				mutex_unlock(&inode->i_mutex);
+				kfree(file->private_data);
+				eprintk("%s: failed to allocate pipe buffer\n", __func__);
+				return -ENOMEM;
+			}
+			inode->i_private = (void *)1; /* need pipe data swap */
+		}
+		inode->i_pipe->readers += ((file->f_mode & FMODE_READ) != 0);
+		inode->i_pipe->writers += ((file->f_mode & FMODE_WRITE) != 0);
+		mutex_unlock(&inode->i_mutex);
+	}
+	return 0;
+}
+
+static struct file_operations delay_dir_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl = delay_ioctl,
+	.compat_ioctl	= delay_ioctl,
+	.mmap = delay_mmap,
+	.open		= delay_open,
+	.release	= delay_release,
+	.llseek		= delay_llseek,
+	.read		= delay_read,
+	.write		= delay_write,
+	.readdir	= delay_readdir,
+	/* .poll	- not required. by default return DEFAULT_POLLMASK */
+	/* .flush	- not required */
+	.fsync		= delay_fsync, /* non-blocked */
+	/* .fasync	- not required */
+	.lock		= delay_lock,
+	.sendpage	= delay_sendpage,
+	/* .get_unmapped_area - not required. for NOMMU only? */
+	/* .check_flags		FIXME problem with O_NOATIME O_DIRECT in setfl */
+	.flock		= delay_flock,
+	.splice_write	= delay_splice_write,
+	.splice_read	= delay_splice_read,
+	/* .aio_read	- aio banned. sys_io_submit return -EINVAL
+	   .aio_write
+	   .aio_fsync	*/
+};
+
+static void delayfs_release_dentry(struct dentry *dentry)
+{
+	struct file *real = dentry->d_fsdata;
+
+	D("de:%p name:%s real:%p", dentry, dentry->d_name.name, real);
+	if (real && !IS_ERR(real))
+		fput(real);
+}
+
+struct dentry_operations delay_dir_dops = {
+	.d_release = delayfs_release_dentry,
+};
+
+static void delayfs_show_type(struct seq_file *seq, struct super_block *sb)
+{
+	struct delay_sb_info *si = sb->s_fs_info;
+
+	seq_escape(seq, si->hidden_type->name, " \t\n\\");
+}
+
+static struct super_operations delay_super_ops = {
+	.show_type = delayfs_show_type,
+};
+
+static int delay_fill_sb(struct super_block *sb, void *data, int silent)
+{
+	struct inode *rinode;
+	struct delay_sb_info *si;
+
+	si = kzalloc(sizeof(struct delay_sb_info), GFP_KERNEL);
+	if (!si)
+		goto err;
+
+	init_waitqueue_head(&si->blocked_tasks);
+	spin_lock_init(&si->file_lock);
+
+	sb->s_fs_info = si;
+	sb->s_op = &delay_super_ops;
+
+	rinode = new_inode(sb);
+	if (!rinode)
+		goto err_free;
+
+	rinode->i_ino = 1;
+	rinode->i_mtime = rinode->i_atime = rinode->i_ctime = CURRENT_TIME;
+	rinode->i_blocks = 0;
+	rinode->i_uid = rinode->i_gid = 0;
+	rinode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
+	rinode->i_op = &delay_dir_iops;
+	rinode->i_fop = &delay_dir_fops;
+	rinode->i_nlink = 2;
+
+	sb->s_root = d_alloc_root(rinode);
+	if (!sb->s_root)
+		goto err_iput;
+
+	D("sb:%p si:%p ino:%p root:%p", sb, si, rinode, sb->s_root);
+	return 0;
+
+err_iput:
+	iput(rinode);
+err_free:
+	kfree(si);
+err:
+	return -ENOMEM;
+}
+
+static int delay_get_sb(struct file_system_type *type, int flags,
+		const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	D();
+	return get_sb_nodev(type, flags, data, delay_fill_sb, mnt);
+}
+
+static void delay_kill_sb(struct super_block *s)
+{
+	struct delay_sb_info *si = s->s_fs_info;
+
+	D("si:%p", si);
+	BUG_ON(waitqueue_active(&si->blocked_tasks));
+
+	while (si->bi_list) {
+		struct unix_bind_info *i;
+
+		i = si->bi_list;
+		si->bi_list = i->next;
+
+		sock_put(i->sk);
+		kfree(i);
+	}
+
+	mntput(si->real);
+	if (si->hidden_type)
+		put_filesystem(si->hidden_type);
+	free_pages((unsigned long )si->data, 1);
+	kfree(si);
+	kill_anon_super(s);
+}
+
+struct file_system_type delayfs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "delayfs",
+	.get_sb		= delay_get_sb,
+	.kill_sb	= delay_kill_sb,
+	.fs_flags	= FS_VIRTUALIZED,
+};
+
+static int create_delayed_context(cpt_context_t *ctx)
+{
+	int i;
+	struct cpt_delayed_context *dctx;
+
+	if (ctx->dctx != NULL)
+		return 0;
+
+	dctx = kzalloc(sizeof(*dctx), GFP_KERNEL);
+	if (dctx == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < CPT_DOBJ_MAX; i++)
+		INIT_LIST_HEAD(&dctx->object_array[i]);
+	dctx->ve_id = ctx->ve_id;
+
+	ctx->dctx = dctx;
+	return 0;
+}
+
+#define DELAYFS_INITIAL_RETRY_TIMEOUT (16 * HZ)
+static int delay_max_timeout = 120 * HZ;
+
+static void delayfs_nfs_handle_mount_failure(struct delay_sb_info *si)
+{
+	struct nfs_mount_data_dump *mount_data = si->data;
+
+	if (si->delay_tmo < delay_max_timeout)
+		si->delay_tmo <<= 1;
+
+	if (mount_data->version == NFS_MOUNT_MIGRATED) {
+		if (mount_data->timeo < delay_max_timeout)
+			mount_data->timeo <<= 1;
+	} else {
+		struct nfs_mount_data *old_data = si->data;
+
+		if (old_data->timeo < delay_max_timeout)
+			old_data->timeo <<= 1;
+	}
+}
+
+static void delayfs_nfs_restore_mount_params(struct delay_sb_info *si)
+{
+	nfs_change_server_params(si->real->mnt_sb->s_fs_info,
+				 si->nfs_delay_tmo, si->nfs_mnt_retrans);
+}
+
+static void delayfs_prepare_for_remount_loop(struct delay_sb_info *si)
+{
+	if (!strcmp(si->hidden_type->name, "nfs")) {
+		struct nfs_mount_data_dump *mount_data = si->data;
+
+		if (mount_data->version == NFS_MOUNT_MIGRATED) {
+			/*
+			 * Save real NFS mount parameters for further replacement.
+			 */
+			si->nfs_mnt_soft = mount_data->flags & NFS_MOUNT_SOFT;
+			si->nfs_delay_tmo = mount_data->timeo;
+			si->nfs_mnt_retrans = mount_data->retrans;
+			/*
+			 * Hack NFS mount options to avoid hanging during remount.
+			 */
+
+			mount_data->timeo = 1;
+			mount_data->retrans = 1;
+		} else {
+			struct nfs_mount_data *old_data = si->data;
+
+			/*
+			 * Save real NFS mount parameters for further replacement.
+			 */
+			si->nfs_delay_tmo = old_data->timeo;
+			si->nfs_mnt_retrans = old_data->retrans;
+			/*
+			 * Hack NFS mount options to avoid hanging during remount.
+			 */
+
+			old_data->timeo = 1;
+			old_data->retrans = 1;
+		}
+		/*
+		 * Set DFS parameters used during remount procedure.
+		 */
+		si->delay_tmo = (si->nfs_mnt_soft ?
+				(si->nfs_delay_tmo * si->nfs_mnt_retrans * HZ) :
+				MAX_SCHEDULE_TIMEOUT);
+		si->handle_mount_failure = delayfs_nfs_handle_mount_failure;
+		si->restore_mount_params = delayfs_nfs_restore_mount_params;
+	} else {
+		si->delay_tmo = MAX_SCHEDULE_TIMEOUT;
+		si->handle_mount_failure = NULL;
+		si->restore_mount_params = NULL;
+	}
+}
+
+static void *check_fs_supported(char *type, void *data)
+{
+	struct file_system_type *fs;
+
+	fs = get_fs_type(type);
+	if (!fs) {
+		eprintk("DelayFS: unknown file system type '%s'\n", type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!strcmp(fs->name, "nfs4") && !nfs_enable_v4_in_ct) {
+		eprintk("DelayFS: Can't restore mount: NFSv4 is disabled.\n");
+		put_filesystem(fs);
+		return ERR_PTR(-ENODEV);
+	}
+	return fs;
+}
+
+/* first stage */
+
+struct vfsmount *rst_mount_delayfs(char *type, int flags,
+		char *name, void *data, cpt_context_t *ctx)
+{
+	struct vfsmount *mnt;
+	struct delay_sb_info *si;
+	int err;
+	void *fs;
+
+	fs = check_fs_supported(type, data);
+	if (IS_ERR(fs))
+		return fs;
+
+	err = create_delayed_context(ctx);
+	if (err)
+		goto out;
+
+	mnt = vfs_kern_mount(&delayfs_type, flags, name, NULL);
+	err = PTR_ERR(mnt);
+	if (IS_ERR(mnt))
+		goto out;
+
+	si = mnt->mnt_sb->s_fs_info;
+
+	err = -ENOMEM;
+	/*
+	 * We need more than one page since NFS4 mount data is huge...
+	 */
+	si->data = (void *) __get_free_pages(GFP_KERNEL, 1);
+	if (!si->data)
+		goto out_put;
+	memcpy(si->data, data, PAGE_SIZE << 1);
+
+	si->hidden_type = fs;
+
+	delayfs_prepare_for_remount_loop(si);
+
+	return mnt;
+
+out_put:
+	kern_umount(mnt);
+out:
+	put_filesystem(fs);
+	return ERR_PTR(err);
+}
+
+struct file *rst_delayfs_screw(struct vfsmount *mnt,
+		char *name, int flags, loff_t offset, unsigned int mode)
+{
+	struct dentry *dentry;
+	struct inode *inode = NULL;
+	struct file *filp;
+	int err;
+
+	err = -EFAULT;
+	if (mnt->mnt_sb->s_type != &delayfs_type)
+		goto out;
+
+	err = -ENOMEM;
+	inode = new_inode(mnt->mnt_sb);
+	if (!inode)
+		goto out;
+	inode->i_op = &delay_dir_iops;
+	inode->i_fop = &delay_dir_fops;
+	inode->i_mode = mode & S_IFMT;
+
+	dentry = d_alloc_name(mnt->mnt_root, name);
+	err = -ENOMEM;
+	if (!dentry)
+		goto out;
+
+	dentry->d_op = &delay_dir_dops;
+	d_instantiate(dentry, inode);
+	inode = NULL;
+
+	mntget(mnt);
+	filp = dentry_open(dentry, mnt, flags, current_cred());
+	err = PTR_ERR(filp);
+	if (IS_ERR(filp))
+		goto out;
+
+	filp->f_pos = offset;
+	filp->f_heavy = 1;
+
+	D("mnt:%p file:%p de:%p ino:%p name:%s flags:%x offset:%lld",
+			mnt, filp, dentry, dentry->d_inode, name, flags, offset);
+	return filp;
+
+out:
+	D("mnt:%p name:%s flags:%x err:%d", mnt, name, flags, err);
+	iput(inode);
+	return ERR_PTR(err);
+}
+
+static int mknod_by_mntref(const char __user *filename, int mode,
+				unsigned dev, struct vfsmount *mnt)
+{
+	struct dentry * dentry;
+	struct nameidata nd;
+	int error = 0;
+
+	if (S_ISDIR(mode))
+		return -EPERM;
+
+	error = rst_path_lookup_at(mnt,  mnt->mnt_root, filename, LOOKUP_PARENT |
+			LOOKUP_DIVE, &nd);
+	if (error)
+		return error;
+
+	dentry = lookup_create(&nd, 0);
+	error = PTR_ERR(dentry);
+
+	if (!IS_POSIXACL(nd.path.dentry->d_inode))
+		mode &= ~current->fs->umask;
+	if (!IS_ERR(dentry)) {
+		switch (mode & S_IFMT) {
+		case 0: case S_IFREG:
+			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
+			break;
+		case S_IFCHR: case S_IFBLK:
+			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,
+					new_decode_dev(dev));
+			break;
+		case S_IFIFO: case S_IFSOCK:
+			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
+			break;
+		case S_IFDIR:
+			error = -EPERM;
+			break;
+		default:
+			error = -EINVAL;
+		}
+		dput(dentry);
+	}
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+	path_put(&nd.path);
+
+	return error;
+}
+
+/* second stage */
+int rebind_unix_socket(struct vfsmount *rmnt, struct unix_bind_info *bi,
+	       		int flags)
+{
+	int err;
+	struct nameidata nd;
+	char *name = ((char *)bi->path) + bi->path_off;
+
+	if (rst_path_lookup_at(rmnt,  rmnt->mnt_root, name, flags, &nd) < 0) {
+		err = mknod_by_mntref(name, S_IFSOCK | (bi->i_mode & S_IALLUGO),
+			       		0, rmnt);
+		if (err) {
+			printk("%s: mknod [%s] err %d\n", __func__, name, err);
+			return err;
+		}
+
+		err = rst_path_lookup_at(rmnt,  rmnt->mnt_root, name, flags, &nd);
+		if (err < 0) {
+			printk("%s: lookup [%s] err %d\n", __func__, name, err);
+			return err;
+		}
+
+		if (bi->uid != -1 && bi->gid != -1)
+			sc_chown(bi->path, bi->uid, bi->gid);
+	}
+
+	if (!S_ISSOCK(nd.path.dentry->d_inode->i_mode)) {
+		printk("%s: not a socket dentry %s\n", __func__, name);
+		path_put(&nd.path);
+		return -EINVAL;
+	}
+
+	err = unix_bind_path(bi->sk, nd.path.dentry, nd.path.mnt);
+	if (err < 0)
+		printk("%s: bind-path [%s] err %d\n", __func__, name, err);
+
+	return err;
+}
+
+static void rebind_unix_sockets(struct vfsmount *rmnt,
+		struct delay_sb_info *si)
+{
+	struct unix_bind_info *bi;
+
+	while ((bi = si->bi_list) != NULL) {
+		si->bi_list = bi->next;
+
+		rebind_unix_socket(rmnt, bi, 0);
+
+		sock_put(bi->sk);
+		kfree(bi);
+	}
+}
+
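+/*
+ * Second stage proper: mount the hidden filesystem type with the
+ * saved mount data, swap it in place of the delayfs stub via
+ * replace_mount() and re-bind any unix sockets that live on it.
+ */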
+static int rst_remount_delayfs(struct vfsmount *mnt)
+{
+	struct delay_sb_info *si = mnt->mnt_sb->s_fs_info;
+	struct vfsmount *real_mnt;
+
+	if (si->real)
+		return -EBUSY;
+
+	real_mnt = vfs_kern_mount(si->hidden_type, mnt->mnt_sb->s_flags | MS_CPTMOUNT,
+			mnt->mnt_devname, si->data);
+
+	if (IS_ERR(real_mnt))
+		return PTR_ERR(real_mnt);
+
+	D("fake: %p(%s)", mnt, mnt->mnt_sb->s_type->name);
+	D("real: %p(%s)", real_mnt, real_mnt->mnt_sb->s_type->name);
+	D("prnt: %p(%s)", mnt->mnt_parent, mnt->mnt_parent->mnt_sb->s_type->name);
+
+	si->real = mntget(real_mnt);
+	real_mnt->mnt_flags = mnt->mnt_flags & MNT_BEHAVIOR_FLAGS;
+
+	replace_mount(real_mnt, mnt);
+
+	rebind_unix_sockets(real_mnt, si);
+
+	return 0;
+}
+
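+/*
+ * Rebuild the O_* open flags for reopening the real file from the
+ * fake file's f_mode and f_flags.
+ */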
+static int make_flags(struct file *filp)
+{
+	int flags = O_NOFOLLOW|O_NONBLOCK|O_NOCTTY;
+
+	switch (filp->f_mode & (FMODE_READ|FMODE_WRITE)) {
+		case FMODE_READ|FMODE_WRITE:
+			flags |= O_RDWR; break;
+		case FMODE_WRITE:
+			flags |= O_WRONLY; break;
+		case FMODE_READ:
+			flags |= O_RDONLY; break;
+		default: break;
+	}
+	flags |= filp->f_flags & ~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC);
+	return flags;
+}
+
+static int delayfs_lookup_file(const unsigned char *fname, int open_flags,
+			       int special_flags,
+			       struct nameidata *nd,
+			       struct vfsmount *mnt)
+{
+	struct file *real;
+	int flag = open_to_namei_flags(open_flags);
+	int err;
+
+	real = get_empty_filp();
+	if (real == NULL)
+		return -ENFILE;
+
+	real->f_flags = open_flags;
+
+	nd->intent.open.file = real;
+	nd->intent.open.flags = flag;
+	nd->intent.open.create_mode = 0;
+
+	err = rst_path_lookup_at(mnt, mnt->mnt_root, fname,
+				 lookup_flags(flag) | special_flags, nd);
+	if (IS_ERR(nd->intent.open.file)) {
+		if (err == 0) {
+			err = PTR_ERR(nd->intent.open.file);
+			path_put(&nd->path);
+		}
+	} else if (err)
+		release_open_intent(nd);
+	return err;
+}
+
+static struct file *delayfs_open_real_pipe(struct file *fake,
+					   int open_flag,
+					   struct vfsmount *mnt,
+					   struct nameidata *nd)
+{
+	struct file *real;
+	int err;
+
+	if (fake->f_mode & FMODE_READ) {
+		err = delayfs_lookup_file(FNAME(fake), open_flag, 0, nd, mnt);
+		if (err)
+			return ERR_PTR(err);
+		nd->intent.open.file->f_flags |= O_NONBLOCK;
+		real = nameidata_to_filp(nd);
+	} else {
+		struct file *tmp;
+		struct nameidata tmp_nd;
+
+		err = delayfs_lookup_file(FNAME(fake), O_RDWR|O_NONBLOCK, 0,
+					  &tmp_nd, mnt);
+		if (err)
+			return ERR_PTR(err);
+
+		tmp_nd.intent.open.file->f_flags |= O_NONBLOCK;
+		tmp = nameidata_to_filp(&tmp_nd);
+		if (IS_ERR(tmp))
+			return tmp;
+
+		real = dentry_open(dget(tmp->f_dentry),
+				   mntget(tmp->f_vfsmnt),
+				   open_flag, current_cred());
+		fput(tmp);
+	}
+
+	if (!IS_ERR(real)) {
+		int need_pipe_swap;
+		struct inode *inode = fake->f_dentry->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		need_pipe_swap = (long)inode->i_private;
+		inode->i_private = (void *)0;
+		mutex_unlock(&inode->i_mutex);
+
+		if (need_pipe_swap)
+			swap_pipe_info(real->f_dentry->d_inode,
+					fake->f_dentry->d_inode);
+	}
+
+	return real;
+}
+
+static struct file *delayfs_open_real_file(struct file *fake,
+					   struct vfsmount *mnt)
+{
+	struct nameidata nd;
+	int err;
+	int open_flags = make_flags(fake);
+	int lookup_flags = 0;
+
+	D("fake:%p(%s) flags:%d pos:%lld real_mnt:%p",
+			fake, FNAME(fake), open_flags,
+			(long long)fake->f_pos, mnt);
+
+	switch (fake->f_dentry->d_inode->i_mode & S_IFMT) {
+		case S_IFIFO:
+			return delayfs_open_real_pipe(fake, open_flags,
+						      mnt, &nd);
+		case S_IFREG:
+		case S_IFDIR:
+			lookup_flags = LOOKUP_OPEN;
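+			/* fall through */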
+		default:
+			err = delayfs_lookup_file(FNAME(fake), open_flags,
+						  lookup_flags, &nd, mnt);
+			break;
+	}
+	if (err)
+		return ERR_PTR(err);
+	return nameidata_to_filp(&nd);
+}
+
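+/*
+ * Open the real counterpart of a fake delayfs file on the now-mounted
+ * real filesystem, seek it to the saved position and publish it in
+ * the file's private data (and the dentry's d_fsdata) under
+ * si->file_lock.
+ */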
+static int delayfs_preopen(struct file *fake, struct delay_sb_info *si)
+{
+	struct file *real;
+	int err;
+	struct delayfs_file_private *priv = fake->private_data;
+
+	real = delayfs_open_real_file(fake, si->real);
+	BUG_ON(real == NULL);
+	err = PTR_ERR(real);
+	if (IS_ERR(real))
+		goto out;
+
+	D("real:%p mnt:%p de:%p ino:%p", real, real->f_vfsmnt, real->f_dentry,
+			real->f_dentry->d_inode);
+
+	real->f_flags = fake->f_flags;
+	if (fake->f_pos != real->f_pos) {
+		loff_t off;
+
+		off = vfs_llseek(real, fake->f_pos, 0);
+		if (off < 0) {
+			eprintk("%s llseek:%d\n", __func__, (int)off);
+			real->f_pos = fake->f_pos;
+		}
+	}
+
+	spin_lock(&si->file_lock);
+	if (!priv->real_fs_file) {
+		priv->real_fs_file = real;
+		/* We need this assignment for restoring fs root and pwd */
+		if (fake->f_dentry->d_fsdata == NULL) {
+			fake->f_dentry->d_fsdata = real;
+			get_file(real);
+		} else if (!IS_ERR(fake->f_dentry->d_fsdata))
+			WARN_ON(real->f_dentry !=
+				((struct file *)fake->f_dentry->d_fsdata)->f_dentry);
+		real = NULL;
+	}
+	spin_unlock(&si->file_lock);
+
+	if (real)
+		fput(real);
+
+	err = 0;
+out:
+	D("file:%p(%s) err:%d", fake, fake->f_dentry->d_name.name, err);
+
+	return err;
+}
+
+static void delayfs_break(struct file *fake)
+{
+	struct delayfs_file_private *priv = fake->private_data;
+	struct delay_sb_info *si = fake->f_vfsmnt->mnt_sb->s_fs_info;
+
+	spin_lock(&si->file_lock);
+	if (priv->real_fs_file == NULL) {
+		priv->real_fs_file = ERR_PTR(-EIO);
+		fake->f_dentry->d_fsdata = ERR_PTR(-EIO);
+	}
+	spin_unlock(&si->file_lock);
+}
+
+static int delayfs_sillyrename(struct file *fake);
+
+static void delay_break_all(struct cpt_delayed_context *ctx)
+{
+	cpt_object_t *obj;
+	struct file *file;
+	struct vfsmount *mnt;
+	struct delay_sb_info *si;
+	struct delayfs_file_private *priv;
+
+	for_each_object(obj, CPT_DOBJ_FILE) {
+		file = obj->o_obj;
+		priv = file->private_data;
+		if (priv->real_fs_file == NULL)
+			delayfs_break(file);
+		else if (obj->o_flags & CPT_FILE_SILLYRENAME)
+			delayfs_sillyrename(file);
+	}
+
+	for_each_object(obj, CPT_DOBJ_VFSMOUNT_REF) {
+		mnt = obj->o_obj;
+
+		si = mnt->mnt_sb->s_fs_info;
+		si->state = SB_BROKEN;
+		wake_up_all(&si->blocked_tasks);
+	}
+}
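+
+/*
+ * Editorial note (inferred from the code above): breaking is one-way.
+ * Once real_fs_file is set to ERR_PTR(-EIO), later users of the fake
+ * file see a failed open, and SB_BROKEN plus wake_up_all() lets tasks
+ * parked on blocked_tasks observe the failure instead of sleeping
+ * forever.
+ */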
+
+static void dctx_release_objects(struct cpt_delayed_context *ctx)
+{
+	cpt_object_t *obj, *nobj;
+
+	for_each_object_safe(obj, nobj, CPT_DOBJ_VFSMOUNT_REF) {
+		list_del(&obj->o_list);
+		mntput(obj->o_obj);
+		kfree(obj->o_image);
+		kfree(obj);
+	}
+
+	synchronize_rcu(); /* wait till fget_light gets the reference */
+
+	for_each_object_safe(obj, nobj, CPT_DOBJ_FILE) {
+		list_del(&obj->o_list);
+		fput(obj->o_obj);
+		kfree(obj->o_image);
+		kfree(obj);
+	}
+}
+
+void destroy_delayed_context(struct cpt_delayed_context *dctx)
+{
+	delay_break_all(dctx);
+	dctx_release_objects(dctx);
+	kfree(dctx);
+}
+
+static int delayfs_sillyrename(struct file *fake)
+{
+	struct delayfs_file_private *priv = fake->private_data;
+	struct file *real = priv->real_fs_file;
+	int err;
+
+	if (!real || IS_ERR(real))
+		return -ENODEV;
+
+	dget(real->f_dentry); /* see nfs_unlink */
+	mutex_lock_nested(&real->f_dentry->d_parent->d_inode->i_mutex, I_MUTEX_PARENT);
+	err = vfs_unlink(real->f_dentry->d_parent->d_inode, real->f_dentry);
+	mutex_unlock(&real->f_dentry->d_parent->d_inode->i_mutex);
+	dput(real->f_dentry);
+
+	D("file:%p(%s) ret:%d", fake, fake->f_dentry->d_name.name, err);
+	return err;
+}
+
+/* wire */
+
+int rst_freeze_delayfs(cpt_context_t *ctx)
+{
+	cpt_object_t *obj, *nobj;
+	struct vfsmount *mnt;
+	struct delay_sb_info *si;
+	/* dctx must not be NULL if any delayed object exists */
+	struct cpt_delayed_context *dctx = ctx->dctx;
+
+	for_each_object_safe(obj, nobj, CPT_OBJ_VFSMOUNT_REF) {
+		if (!(obj->o_flags & CPT_VFSMOUNT_DELAYFS))
+			continue;
+
+		list_move(&obj->o_list,
+				&dctx->object_array[CPT_DOBJ_VFSMOUNT_REF]);
+		ctx->objcount--;
+		mnt = obj->o_obj;
+		si = mnt->mnt_sb->s_fs_info;
+		si->state = SB_LOCKED;
+	}
+
+	for_each_object_safe(obj, nobj, CPT_OBJ_FILE)
+		if (obj->o_flags & CPT_FILE_DELAYFS) {
+			list_move(&obj->o_list,
+					&dctx->object_array[CPT_DOBJ_FILE]);
+			ctx->objcount--;
+		}
+	return 0;
+}
+
+static void delayfs_resume(struct cpt_delayed_context *ctx,
+		struct list_head *broken_mounts)
+{
+	int ret;
+	struct delay_sb_info *si;
+	cpt_object_t *obj, *nobj;
+	struct vfsmount *mnt;
+	struct file *file;
+	struct delayfs_file_private *priv;
+
+	/* mount */
+	for_each_object_safe(obj, nobj, CPT_DOBJ_VFSMOUNT_REF) {
+		BUG_ON(!(obj->o_flags & CPT_VFSMOUNT_DELAYFS));
+
+		mnt = obj->o_obj;
+		si = mnt->mnt_sb->s_fs_info;
+		ret = rst_remount_delayfs(mnt);
+		if (ret) {
+			if (si->handle_mount_failure)
+				si->handle_mount_failure(si);
+			list_move(&obj->o_list, broken_mounts);
+		}
+	}
+
+	/* restore mount parameters */
+	for_each_object(obj, CPT_DOBJ_VFSMOUNT_REF) {
+		mnt = obj->o_obj;
+		si = mnt->mnt_sb->s_fs_info;
+		if (si->restore_mount_params)
+			si->restore_mount_params(si);
+		wake_up_all(&si->blocked_tasks);
+	}
+
+	/* preopen */
+	for_each_object(obj, CPT_DOBJ_FILE) {
+		BUG_ON(!(obj->o_flags & CPT_FILE_DELAYFS));
+
+		file = obj->o_obj;
+		si = file->f_vfsmnt->mnt_sb->s_fs_info;
+		/* mount is broken or already reopened */
+		priv = file->private_data;
+		if (!si->real || priv->real_fs_file != NULL)
+			continue;
+
+		ret = delayfs_preopen(file, si);
+		if (ret) {
+			printk("%s: preopen %s err %d\n", __func__,
+					FNAME(file), ret);
+			delayfs_break(file);
+		}
+	}
+
+	/* wakeup */
+	for_each_object(obj, CPT_DOBJ_VFSMOUNT_REF) {
+		mnt = obj->o_obj;
+
+		D("wakeup %p", mnt);
+
+		si = mnt->mnt_sb->s_fs_info;
+		si->state = SB_FINISHED;
+		wake_up_all(&si->blocked_tasks);
+	}
+
+	/*
+	 * All files are preopened or broken -- now no one blocks the
+	 * mmap_sem write lock.
+	 */
+
+	/* switch */
+	for_each_object(obj, CPT_DOBJ_VFSMOUNT_REF) {
+		mnt = obj->o_obj;
+		delayfs_switch_all(mnt);
+	}
+}
+
+static int delay_first_timeout = 1 * HZ;
+
+struct ctl_table delayfs_table[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "first_timeout",
+		.data		= &delay_first_timeout,
+		.maxlen		= sizeof(delay_first_timeout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "max_timeout",
+		.data		= &delay_max_timeout,
+		.maxlen		= sizeof(delay_max_timeout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
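+
+/*
+ * Editorial sketch (assuming this table is registered under a delayfs
+ * sysctl directory elsewhere in the patch): both knobs are plain
+ * integers in jiffies handled by proc_dointvec, so they can be tuned
+ * at runtime with a write to the corresponding /proc/sys file, e.g.
+ * "echo 500 > .../first_timeout".
+ */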
+
+static int delayfs_resume_fn(void *d)
+{
+	struct cpt_delayed_context *dctx = d;
+	int retry_timeout = DELAYFS_INITIAL_RETRY_TIMEOUT;
+	unsigned long abort_timeout;
+	LIST_HEAD(broken_mounts);
+	LIST_HEAD(live_mounts);
+	int ve_id = -1;
+
+	dctx->dfs_daemon = current;
+
+	abort_timeout = jiffies + (unsigned long)300 * HZ;
+
+	daemonize("dfs_resume/%d", dctx->ve_id);
+	ve_printk(VE_LOG_BOTH, "DFS%d: resuming daemon started\n", dctx->ve_id);
+
+	allow_signal(SIGKILL);
+
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	complete(&dctx->dfs_notify);
+	/* Waiting for delayed context to be filled by resume process */
+	schedule();
+
+	schedule_timeout_interruptible(delay_first_timeout);
+
+try_again:
+	if (signal_pending(current)) {
+		ve_printk(VE_LOG_BOTH, "DFS%d: Got kill signal\n", dctx->ve_id);
+		goto out_splice;
+	}
+
+	if (abort_timeout && time_after(jiffies, abort_timeout)) {
+		ve_printk(VE_LOG_BOTH, "DFS%d: Timed out\n", dctx->ve_id);
+		goto out_splice;
+	}
+
+	delayfs_resume(dctx, &broken_mounts);
+
+	list_splice_init(&dctx->object_array[CPT_DOBJ_VFSMOUNT_REF],
+			&live_mounts);
+
+	if (!list_empty(&broken_mounts)) {
+		list_splice_init(&broken_mounts,
+				&dctx->object_array[CPT_DOBJ_VFSMOUNT_REF]);
+
+		ve_printk(VE_LOG_BOTH, "DFS%d: Retrying delayed mount in %d seconds\n",
+					dctx->ve_id, retry_timeout / HZ);
+		schedule_timeout_interruptible(retry_timeout);
+		if (retry_timeout < delay_max_timeout)
+			retry_timeout <<= 1;
+
+		goto try_again;
+	}
+	ve_id = dctx->ve_id;
+out_splice:
+	list_splice(&live_mounts, &dctx->object_array[CPT_DOBJ_VFSMOUNT_REF]);
+	destroy_delayed_context(dctx);
+	if (ve_id >= 0)
+		ve_printk(VE_LOG_BOTH, "DFS%d: Delayed mounts successfully resumed\n",
+					ve_id);
+	module_put_and_exit(0);
+}
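+
+/*
+ * Illustrative retry timeline (a sketch, assuming
+ * DELAYFS_INITIAL_RETRY_TIMEOUT is 1*HZ): while broken mounts remain,
+ * the daemon above retries after 1s, then 2s, 4s, 8s, ... doubling up
+ * to delay_max_timeout, and gives up once the 300-second abort_timeout
+ * expires or a SIGKILL is delivered.
+ */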
+
+int rst_init_delayfs_daemon(cpt_context_t *ctx)
+{
+	int pid;
+	struct cpt_delayed_context *dctx = ctx->dctx;
+
+	if (dctx == NULL)
+		return 0;
+
+	__module_get(THIS_MODULE);
+
+	init_completion(&dctx->dfs_notify);
+
+	pid = kernel_thread(delayfs_resume_fn, dctx,
+			CLONE_FS | CLONE_FILES | CLONE_VM | SIGCHLD);
+	if (pid < 0) {
+		eprintk_ctx("%d: Failed to start delayfs daemon (err: %d)\n",
+				dctx->ve_id, pid);
+		destroy_delayed_context(dctx);
+		ctx->dctx = NULL;
+		module_put(THIS_MODULE);
+		return pid;
+	}
+
+	wait_for_completion(&dctx->dfs_notify);
+
+	return 0;
+}
+
+int rst_delay_flock(struct file *f, struct cpt_flock_image *fli,
+		cpt_context_t *ctx)
+{
+	int err;
+	struct delayed_flock_info *dfi;
+	struct file_lock *fl;
+	struct delayfs_file_private *priv;
+
+	err = -EINVAL;
+	if (!cpt_object_has(fli, cpt_svid) ||
+			fli->cpt_svid == CPT_NOINDEX) {
+		eprintk_ctx("No SVID for flock\n");
+		goto out;
+	}
+
+	err = -ENOMEM;
+	dfi = kmalloc(sizeof(*dfi), GFP_KERNEL);
+	if (dfi == NULL)
+		goto out;
+
+	err = -ENOMEM;
+	fl = locks_alloc_lock(1);
+	if (fl == NULL)
+		goto out1;
+
+	if (fli->cpt_flags & FL_FLOCK) {
+		fl->fl_flags = FL_FLOCK;
+		fl->fl_start = 0;
+		fl->fl_end = OFFSET_MAX;
+		fl->fl_pid = 0;
+		fl->fl_type = fli->cpt_type;
+	} else {
+		cpt_object_t *obj;
+
+		fl->fl_flags = fli->cpt_flags & ~FL_SLEEP;
+		fl->fl_end = fli->cpt_end;
+		fl->fl_start = fli->cpt_start;
+		fl->fl_type = fli->cpt_type;
+
+		err = -EINVAL;
+		obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES,
+				fli->cpt_owner, ctx);
+		if (!obj) {
+			eprintk_ctx("unknown lock owner %d\n",
+					(int)fli->cpt_owner);
+			goto out2;
+		}
+		fl->fl_owner = obj->o_obj;
+		if (fl->fl_owner == NULL)
+			eprintk_ctx("no lock owner\n");
+
+		fl->fl_pid = vpid_to_pid(fli->cpt_pid);
+		if (fl->fl_pid < 0) {
+			eprintk_ctx("unknown lock pid %d\n", fl->fl_pid);
+			goto out2;
+		}
+	}
+
+	priv = f->private_data;
+
+	dfi->fl = fl;
+	dfi->svid = fli->cpt_svid;
+	dfi->next = priv->dfi;
+
+	priv->dfi = dfi;
+	return 0;
+
+out2:
+	locks_free_lock(fl);
+out1:
+	kfree(dfi);
+out:
+	return err;
+}
+
+void rst_put_delayed_sockets(cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_VFSMOUNT_REF) {
+		struct vfsmount *mnt = obj->o_obj;
+		struct delay_sb_info *si;
+
+		if (mnt->mnt_sb->s_op != &delay_super_ops)
+			continue;
+
+		si = mnt->mnt_sb->s_fs_info;
+		while (si->bi_list) {
+			struct unix_bind_info *i;
+
+			i = si->bi_list;
+			si->bi_list = i->next;
+
+			sock_put(i->sk);
+		}
+	}
+}
+
+int rst_delay_unix_bind(struct sock *sk, struct cpt_sock_image *v,
+		cpt_context_t *ctx)
+{
+	int err;
+	cpt_object_t *mntobj;
+	struct vfsmount *mnt;
+	struct super_block *sb;
+	struct unix_bind_info *dbi;
+	struct delay_sb_info *sbi;
+
+	BUG_ON(v->cpt_sockflags & CPT_SOCK_DELETED);
+
+	mntobj = lookup_cpt_obj_bypos(CPT_OBJ_VFSMOUNT_REF,
+			v->cpt_vfsmount_ref, ctx);
+	if (mntobj == NULL) {
+		eprintk_ctx("can't find vfsmount for unix socket\n");
+		return -EINVAL;
+	}
+
+	mnt = mntobj->o_obj;
+	sb = mnt->mnt_sb;
+	BUG_ON(sb->s_op != &delay_super_ops);
+
+	if (v->cpt_laddrlen - 2 <= mntobj->o_lock) {
+		eprintk_ctx("unix socket with too sort name (%d %s)\n",
+				mntobj->o_lock, (char *)v->cpt_laddr);
+		return -EINVAL;
+	}
+
+	err = unix_attach_addr(sk, (struct sockaddr_un *)v->cpt_laddr,
+			v->cpt_laddrlen);
+	if (err) {
+		eprintk_ctx("can't attach unix address %d\n", err);
+		return err;
+	}
+
+	dbi = kzalloc(sizeof(*dbi), GFP_KERNEL);
+	if (dbi == NULL)
+		return -ENOMEM;
+
+	sock_hold(sk);
+	dbi->sk = sk;
+	strcpy(dbi->path, ((char *)v->cpt_laddr) + 2);
+	dbi->path_off = mntobj->o_lock;
+
+	if (cpt_object_has(v, cpt_i_mode))
+		dbi->i_mode = v->cpt_i_mode;
+	dbi->uid = v->cpt_peer_uid;
+	dbi->gid = v->cpt_peer_gid;
+	if (cpt_object_has(v, cpt_i_uid) && cpt_object_has(v, cpt_i_gid)) {
+		dbi->uid = v->cpt_i_uid;
+		dbi->gid = v->cpt_i_gid;
+	}
+
+	sbi = sb->s_fs_info;
+	dbi->next = sbi->bi_list;
+	sbi->bi_list = dbi;
+
+	return 0;
+}
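+
+/*
+ * Editorial note: the bind itself is not performed here -- the socket
+ * is only attached to its address and queued on the superblock's
+ * bi_list (with a reference held via sock_hold), presumably so that
+ * the path can be re-created once the delayed mount appears;
+ * rst_put_delayed_sockets() above drops the queued references on
+ * teardown.
+ */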
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_epoll.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_epoll.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_epoll.c	2015-01-21 12:02:48.229093499 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_epoll.c	2015-01-21 12:02:49.996046592 +0300
@@ -0,0 +1,163 @@
+/*
+ *
+ *  kernel/cpt/rst_epoll.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/eventpoll.h>
+#include <linux/cpt_image.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+struct file *cpt_open_epolldev(struct cpt_file_image *fi,
+			       unsigned flags,
+			       struct cpt_context *ctx)
+{
+	struct file *file;
+	int efd;
+
+	/* Argument "size" is ignored, use just 1 */
+	efd = sys_epoll_create(1);
+	if (efd < 0)
+		return ERR_PTR(efd);
+
+	file = fget(efd);
+	sys_close(efd);
+	return file;
+}
+
+static int restore_one_epoll(cpt_object_t *obj,
+			     loff_t pos,
+			     struct cpt_epoll_image *ebuf,
+			     cpt_context_t *ctx)
+{
+	int err = 0;
+	loff_t endpos;
+	struct file *file = obj->o_obj;
+	struct eventpoll *ep;
+
+	if (file->f_op != &eventpoll_fops) {
+		eprintk_ctx("bad epoll file\n");
+		return -EINVAL;
+	}
+
+	ep = file->private_data;
+
+	if (unlikely(ep == NULL)) {
+		eprintk_ctx("bad epoll device\n");
+		return -EINVAL;
+	}
+
+	endpos = pos + ebuf->cpt_next;
+	pos += ebuf->cpt_hdrlen;
+	while (pos < endpos) {
+		struct cpt_epoll_file_image efi;
+		struct epoll_event epds;
+
+		cpt_object_t *tobj;
+
+		err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx);
+		if (err)
+			return err;
+		tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx);
+		if (!tobj) {
+			eprintk_ctx("epoll file not found\n");
+			return -EINVAL;
+		}
+		epds.events = efi.cpt_events;
+		epds.data = efi.cpt_data;
+		mutex_lock(&epmutex);
+		mutex_lock(&ep->mtx);
+		err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd, 1);
+		clear_tfile_check_list();
+		if (!err) {
+			struct epitem *epi;
+			epi = ep_find(ep, tobj->o_obj, efi.cpt_fd);
+			if (epi) {
+				if (efi.cpt_ready) {
+					unsigned long flags;
+					spin_lock_irqsave(&ep->lock, flags);
+					if (list_empty(&epi->rdllink))
+						list_add_tail(&epi->rdllink, &ep->rdllist);
+					spin_unlock_irqrestore(&ep->lock, flags);
+				}
+			}
+		}
+		mutex_unlock(&ep->mtx);
+		mutex_unlock(&epmutex);
+		if (err)
+			break;
+		pos += efi.cpt_next;
+	}
+	return err;
+}
+
+int rst_eventpoll(cpt_context_t *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_EPOLL];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		cpt_object_t *obj;
+		struct cpt_epoll_image *ebuf = cpt_get_buf(ctx);
+		err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+		obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx);
+		if (obj == NULL) {
+			eprintk_ctx("cannot find epoll file object\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		err = restore_one_epoll(obj, sec, ebuf, ctx);
+		cpt_release_buf(ctx);
+		if (err)
+			return err;
+		sec += ebuf->cpt_next;
+	}
+	return 0;
+}
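+
+/*
+ * Editorial sketch of the image layout walked above: a CPT_SECT_EPOLL
+ * section is a header followed by CPT_OBJ_EPOLL records, each embedding
+ * CPT_OBJ_EPOLL_FILE records. cpt_next is always the byte distance to
+ * the next sibling and cpt_hdrlen the offset of the first child, so
+ * "pos += hdrlen; while (pos < endpos)" is the canonical traversal.
+ */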
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_files.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_files.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_files.c	2015-01-21 12:02:48.230093472 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_files.c	2015-01-21 12:02:57.987834448 +0300
@@ -0,0 +1,2630 @@
+/*
+ *
+ *  kernel/cpt/rst_files.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/nsproxy.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/fs_struct.h>
+#include <linux/mman.h>
+#include <linux/mount.h>
+#include <linux/tty.h>
+#include <linux/namei.h>
+#include <linux/vmalloc.h>
+#include <linux/smp_lock.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <asm/uaccess.h>
+#include <bc/kmem.h>
+#include <linux/cpt_image.h>
+#include <linux/mnt_namespace.h>
+#include <linux/fdtable.h>
+#include <linux/shm.h>
+#include <linux/signalfd.h>
+#include <linux/proc_fs.h>
+#include <linux/init_task.h>
+#include <linux/anon_inodes.h>
+#include <linux/timerfd.h>
+#include <linux/cgroup.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/ve_proto.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+
+#include "cpt_syscalls.h"
+
+
+struct filejob {
+	struct filejob *next;
+	int	pid;
+	loff_t	fdi;
+};
+
+static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx)
+{
+	struct filejob *j;
+
+	j = kmalloc(sizeof(*j), GFP_KERNEL);
+	if (j == NULL)
+		return -ENOMEM;
+	j->pid = current->pid;
+	j->fdi = pos;
+	j->next = ctx->filejob_queue;
+	ctx->filejob_queue = j;
+	return 0;
+}
+
+static void _anon_pipe_buf_release(struct pipe_inode_info *pipe,
+				  struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	/*
+	 * If nobody else uses this page, and we don't already have a
+	 * temporary page, let's keep track of it as a one-deep
+	 * allocation cache. (Otherwise just release our reference to it)
+	 */
+	if (page_count(page) == 1 && !pipe->tmp_page)
+		pipe->tmp_page = page;
+	else
+		page_cache_release(page);
+
+	module_put(THIS_MODULE);
+}
+
+static void *_anon_pipe_buf_map(struct pipe_inode_info *pipe,
+			   struct pipe_buffer *buf, int atomic)
+{
+	if (atomic) {
+		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
+		return kmap_atomic(buf->page, KM_USER0);
+	}
+
+	return kmap(buf->page);
+}
+
+static void _anon_pipe_buf_unmap(struct pipe_inode_info *pipe,
+			    struct pipe_buffer *buf, void *map_data)
+{
+	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
+		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
+		kunmap_atomic(map_data, KM_USER0);
+	} else
+		kunmap(buf->page);
+}
+
+static int _anon_pipe_buf_steal(struct pipe_inode_info *pipe,
+			   struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	if (page_count(page) == 1) {
+		lock_page(page);
+		return 0;
+	}
+
+	return 1;
+}
+
+static void _anon_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+	page_cache_get(buf->page);
+}
+
+static int _anon_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+	return 0;
+}
+
+static struct pipe_buf_operations _anon_pipe_buf_ops = {
+	.can_merge = 1,
+	.map = _anon_pipe_buf_map,
+	.unmap = _anon_pipe_buf_unmap,
+	.release = _anon_pipe_buf_release,
+	.confirm = _anon_pipe_buf_confirm,
+	.get = _anon_pipe_buf_get,
+	.steal = _anon_pipe_buf_steal,
+};
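+
+/*
+ * Editorial note: these ops mirror the generic anonymous pipe buffer
+ * ops, except that _anon_pipe_buf_release() also drops the module
+ * reference taken per buffer in fixup_pipe_data() below, so the module
+ * cannot be unloaded while restored pipe buffers are still in flight.
+ */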
+
+/* Sorta ugly... Multiple readers/writers of a named pipe rewrite the
+ * buffer many times. We need to mark it in the CPT_OBJ_INODE table in
+ * some way.
+ */
+static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi,
+			   struct cpt_context *ctx)
+{
+	struct inode *ino = file->f_dentry->d_inode;
+	struct cpt_inode_image ii;
+	struct cpt_object_hdr hdr;
+	struct cpt_obj_bits b;
+	struct pipe_inode_info *info;
+	int err;
+	int count;
+	__u64 pos;
+
+	if (!S_ISFIFO(ino->i_mode)) {
+		eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", (long long)fi->cpt_inode);
+		return -EINVAL;
+	}
+	if (fi->cpt_inode == CPT_NULL)
+		return 0;
+
+	err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
+	if (err)
+		return err;
+
+	if (ii.cpt_next <= ii.cpt_hdrlen)
+		return 0;
+
+	pos = fi->cpt_inode + ii.cpt_hdrlen;
+
+	/*
+	 * The inode object can be followed by either a CPT_OBJ_NAME object
+	 * or a CPT_OBJ_BITS object. So here we read the header and check
+	 * its object type.
+	 */
+	err = rst_get_object(0, pos, &hdr, ctx);
+	if (err)
+		return err;
+	if (hdr.cpt_object == CPT_OBJ_NAME) {
+		/*
+		 * Inode object is followed by CPT_OBJ_NAME. I.e. original
+		 * inode dentry was unlinked on source node and here is it's
+		 * alias name.
+		 * Name object can be followed by CPT_OBJ_BITS (with pipe
+		 * buffers content).
+		 */
+		if (ii.cpt_next <= ii.cpt_hdrlen + hdr.cpt_next)
+			return 0;
+
+		pos += hdr.cpt_next;
+	}
+
+	err = rst_get_object(CPT_OBJ_BITS, pos, &b, ctx);
+	if (err)
+		return err;
+
+	if (b.cpt_size == 0)
+		return 0;
+
+	mutex_lock(&ino->i_mutex);
+	info = ino->i_pipe;
+	if (info->nrbufs) {
+		mutex_unlock(&ino->i_mutex);
+		eprintk("pipe buffer is restored already\n");
+		return -EINVAL;
+	}
+	info->curbuf = 0;
+	count = 0;
+	while (count < b.cpt_size) {
+		struct pipe_buffer *buf = info->bufs + info->nrbufs;
+		void * addr;
+		int chars;
+
+		chars = b.cpt_size - count;
+		if (chars > PAGE_SIZE)
+			chars = PAGE_SIZE;
+		if (!try_module_get(THIS_MODULE)) {
+			err = -EBUSY;
+			break;
+		}
+
+		buf->page = alloc_page(GFP_HIGHUSER);
+		if (buf->page == NULL) {
+			err = -ENOMEM;
+			break;
+		}
+		buf->ops = &_anon_pipe_buf_ops;
+		buf->offset = 0;
+		buf->len = chars;
+		info->nrbufs++;
+		addr = kmap(buf->page);
+		err = ctx->pread(addr, chars, ctx,
+				 pos + b.cpt_hdrlen + count);
+		if (err)
+			break;
+		count += chars;
+	}
+	mutex_unlock(&ino->i_mutex);
+
+	return err;
+}
+
+static int make_flags(struct cpt_file_image *fi)
+{
+	int flags = O_NOFOLLOW;
+	switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) {
+	case FMODE_READ|FMODE_WRITE:
+		flags |= O_RDWR; break;
+	case FMODE_WRITE:
+		flags |= O_WRONLY; break;
+	case FMODE_READ:
+		flags |= O_RDONLY; break;
+	default: break;
+	}
+	flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC);
+	flags |= O_NONBLOCK|O_NOCTTY;
+	return flags;
+}
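+
+/*
+ * Illustrative sketch (hypothetical image values): for a file
+ * checkpointed with cpt_mode = FMODE_READ|FMODE_WRITE and
+ * cpt_flags = O_APPEND, make_flags() yields
+ *
+ *	O_NOFOLLOW | O_RDWR | O_APPEND | O_NONBLOCK | O_NOCTTY
+ *
+ * O_CREAT/O_TRUNC/O_EXCL/FASYNC are masked out so that reopening
+ * cannot create or truncate the restored file; FASYNC is
+ * re-established later by fixup_file_flags().
+ */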
+
+static struct file *open_fake_file(u32 cpt_i_mode, struct cpt_context *ctx)
+{
+	struct file *file;
+
+	file = anon_inode_getfile(FAKE_FILE_NAME, &bad_file_ops, NULL, 0);
+
+	if (IS_ERR_OR_NULL(file))
+		eprintk_ctx("Can't open fake file\n");
+
+	return file;
+}
+
+struct file *rst_open_file(cpt_object_t *mntobj, char *name,
+			      struct cpt_file_image *fi,
+			      unsigned flags,
+			      struct cpt_context *ctx)
+{
+	struct nameidata nd;
+	int err;
+
+	if (mntobj && (mntobj->o_flags & CPT_VFSMOUNT_DELAYFS)) {
+		struct vfsmount *mnt = mntobj->o_obj;
+
+		if (fi->cpt_lflags & CPT_DENTRY_ROOT)
+			name = "";
+		else if (strlen(name) > mntobj->o_lock)
+			name = name + mntobj->o_lock + 1;
+		else {
+			eprintk_ctx("name %s to short for mnt %d\n", name, mntobj->o_lock);
+			return ERR_PTR(-EINVAL);
+		}
+		return rst_delayfs_screw(mnt, name, flags, fi->cpt_pos, fi->cpt_i_mode);
+	}
+
+	err = rst_path_lookup(mntobj, name, LOOKUP_FOLLOW, &nd);
+	if (err) {
+		eprintk_ctx("%s: failed to lookup path '%s': %d\n", __func__, name, err);
+		return ERR_PTR(err);
+	}
+
+	return dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
+}
+
+static struct file *open_pipe(cpt_object_t *mntobj, char *name,
+			      struct cpt_file_image *fi,
+			      unsigned flags,
+			      struct cpt_context *ctx)
+{
+	int err;
+	cpt_object_t *obj;
+	struct cpt_inode_image ii;
+	struct file *rf, *wf;
+
+	err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
+	if (err)
+		return ERR_PTR(err);
+
+	if (ii.cpt_sb == FSMAGIC_PIPEFS) {
+		int pfd[2];
+
+		if ((err = sc_pipe(pfd)) < 0)
+			return ERR_PTR(err);
+
+		rf = fcheck(pfd[0]);
+		wf = fcheck(pfd[1]);
+		get_file(rf);
+		get_file(wf);
+		sc_close(pfd[0]);
+		sc_close(pfd[1]);
+
+		if (fi->cpt_mode&FMODE_READ) {
+			struct file *tf;
+			tf = wf; wf = rf; rf = tf;
+		}
+	} else {
+		if (fi->cpt_mode&FMODE_READ) {
+			rf = rst_open_file(mntobj, name, fi, flags, ctx);
+			if (IS_ERR(rf)) {
+				dprintk_ctx("filp_open\n");
+				return rf;
+			}
+			dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current),
+				    (long long)fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode);
+			return rf;
+		}
+
+		dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), (long long)fi->cpt_inode);
+
+		rf = rst_open_file(mntobj, name, fi, O_RDWR|O_NONBLOCK, ctx);
+		if (IS_ERR(rf))
+			return rf;
+		wf = dentry_open(dget(rf->f_dentry),
+				 mntget(rf->f_vfsmnt), flags, current_cred());
+		if (IS_ERR(wf)) {
+			fput(rf);
+			return wf;
+		}
+	}
+
+	/* Add pipe inode to obj table. */
+	obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx);
+	if (obj == NULL) {
+		fput(rf); fput(wf);
+		return ERR_PTR(-ENOMEM);
+	}
+	cpt_obj_setpos(obj, fi->cpt_inode, ctx);
+	obj->o_parent = rf;
+
+	/* Add the other side of the pipe to the obj table. It will not be
+	 * used (o_pos = CPT_NULL); other processes opening the pipe will
+	 * find the inode and open it with dentry_open(). */
+	obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx);
+	if (obj == NULL) {
+		fput(wf);
+		return ERR_PTR(-ENOMEM);
+	}
+	return wf;
+}
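+
+/*
+ * Editorial note: for a pipefs pair the peer file becomes o_parent of
+ * the CPT_OBJ_INODE entry, so a later rst_file() for the other end
+ * finds the inode by image position and reopens the very same pipe via
+ * dentry_open() instead of creating a new one (see the "Easy way"
+ * branch in rst_file() below).
+ */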
+
+static struct file *open_special(cpt_object_t *mntobj, char *name,
+				 struct cpt_file_image *fi,
+				 unsigned flags,
+				 int deleted,
+				 struct cpt_context *ctx)
+{
+	struct cpt_inode_image *ii;
+	struct file *file;
+
+	/* Directories and named pipes are not special actually */
+	if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode))
+		return NULL;
+
+	/* No support for block devices at the moment. */
+	if (S_ISBLK(fi->cpt_i_mode))
+		return ERR_PTR(-EINVAL);
+
+	if (S_ISSOCK(fi->cpt_i_mode)) {
+		eprintk_ctx("bug: socket is not open\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Support only (some) character devices at the moment. */
+	if (!S_ISCHR(fi->cpt_i_mode))
+		return ERR_PTR(-EINVAL);
+
+	ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx);
+	if (ii == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Do not worry about this right now: /dev/null, /dev/zero and
+	 * /dev/*random are here. Should we prohibit at least /dev/mem?
+	 */
+	if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) {
+		kfree(ii);
+		return NULL;
+	}
+
+	/* /dev/net/tun will be opened by caller */
+	if (fi->cpt_lflags & CPT_DENTRY_TUNTAP) {
+		kfree(ii);
+		return NULL;
+	}
+
+	file = rst_open_tty(mntobj, name, fi, ii, flags, ctx);
+	kfree(ii);
+	return file;
+}
+
+#define for_each_lock(inode, lockp) \
+	for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
+
+void fixup_lock_pid(struct inode *inode, unsigned int cpt_pid, cpt_context_t *ctx)
+{
+	struct file_lock **loop;
+
+	lock_kernel();
+	for_each_lock(inode, loop) {
+		struct pid *pid = (*loop)->fl_nspid;
+		struct ve_struct *ve;
+
+		if (pid != task_tgid(current))
+			continue;
+
+		ve = __find_ve_by_id(ctx->ve_id);
+		BUG_ON(!ve);
+
+		put_pid(pid);
+
+		rcu_read_lock();
+		pid = find_pid_ns(cpt_pid, ve->ve_ns->pid_ns);
+		(*loop)->fl_nspid = get_pid(pid);
+		(*loop)->fl_pid = cpt_pid;
+		rcu_read_unlock();
+	}
+
+	unlock_kernel();
+}
+
+static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli,
+		cpt_context_t *ctx)
+{
+	struct file_lock lock;
+	cpt_object_t *obj;
+	int err;
+
+	/* Deleted delayed files are restored on the root fs, so there is
+	 * no need to use a delayed flock */
+	if ((fli->cpt_flags & CPT_FLOCK_DELAYED) &&
+	    file->f_dentry->d_op == &delay_dir_dops)
+		return rst_delay_flock(file, fli, ctx);
+
+	memset(&lock, 0, sizeof(lock));
+	lock.fl_type = fli->cpt_type;
+	lock.fl_flags = fli->cpt_flags & ~FL_SLEEP;
+	lock.fl_start = fli->cpt_start;
+	lock.fl_end = fli->cpt_end;
+	obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx);
+	if (!obj) {
+		eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner);
+		return -EINVAL;
+	}
+	lock.fl_owner = obj->o_obj;
+	lock.fl_pid = fli->cpt_pid;
+	if (lock.fl_pid < 0) {
+		eprintk_ctx("unknown lock pid %d\n", lock.fl_pid);
+		return -EINVAL;
+	}
+	lock.fl_file = file;
+
+	if (lock.fl_owner == NULL)
+		eprintk_ctx("no lock owner\n");
+	err = posix_lock_file(file, &lock, NULL);
+	if (err < 0) {
+		eprintk_ctx("can't lock file\n");
+		return err;
+	}
+
+	fixup_lock_pid(file->f_path.dentry->d_inode, fli->cpt_pid, ctx);
+	return 0;
+}
+
+static int restore_flock(struct file *file, struct cpt_flock_image *fli,
+		cpt_context_t *ctx)
+{
+	int cmd, err, fd;
+
+	/* Deleted delayed files are restored on the root fs, so there is
+	 * no need to use a delayed flock */
+	if ((fli->cpt_flags & CPT_FLOCK_DELAYED) &&
+	    file->f_dentry->d_op == &delay_dir_dops)
+		return rst_delay_flock(file, fli, ctx);
+
+	fd = get_unused_fd();
+	if (fd < 0) {
+		eprintk_ctx("BSD flock cannot be restored\n");
+		return fd;
+	}
+	get_file(file);
+	fd_install(fd, file);
+	if (fli->cpt_type & LOCK_MAND) {
+		cmd = fli->cpt_type;
+	} else if (fli->cpt_type == F_RDLCK) {
+		cmd = LOCK_SH;
+	} else if (fli->cpt_type == F_WRLCK) {
+		cmd = LOCK_EX;
+	} else {
+		eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type);
+		sc_close(fd);
+		return -EINVAL;
+	}
+
+	err = sc_flock(fd, LOCK_NB | cmd);
+	sc_close(fd);
+	if (err)
+		return err;
+
+	fixup_lock_pid(file->f_path.dentry->d_inode, fli->cpt_pid, ctx);
+	return 0;
+}
+
+static int fixup_posix_locks(struct file *file,
+			     struct cpt_file_image *fi,
+			     loff_t pos, struct cpt_context *ctx)
+{
+	int err;
+	loff_t end;
+	struct cpt_flock_image fli;
+
+	end = pos + fi->cpt_next;
+	pos += fi->cpt_hdrlen;
+	while (pos < end) {
+		err = rst_get_object(-1, pos, &fli, ctx);
+		if (err)
+			return err;
+		if (fli.cpt_object == CPT_OBJ_FLOCK &&
+		    (fli.cpt_flags&FL_POSIX)) {
+			err = restore_posix_lock(file, &fli, ctx);
+			if (err)
+				return err;
+			dprintk_ctx("posix lock restored\n");
+		}
+		pos += fli.cpt_next;
+	}
+	return 0;
+}
+
+int rst_posix_locks(struct cpt_context *ctx)
+{
+	int err;
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+		struct cpt_file_image fi;
+
+		if (obj->o_pos == CPT_NULL)
+			continue;
+
+		err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx);
+		if (err < 0)
+			return err;
+		if (fi.cpt_next > fi.cpt_hdrlen)
+			fixup_posix_locks(file, &fi, obj->o_pos, ctx);
+	}
+	return 0;
+}
+
+static int fixup_flocks(struct file *file,
+			struct cpt_file_image *fi,
+			loff_t pos, struct cpt_context *ctx)
+{
+	int err;
+	loff_t end;
+	struct cpt_flock_image fli;
+
+	end = pos + fi->cpt_next;
+	pos += fi->cpt_hdrlen;
+	while (pos < end) {
+		err = rst_get_object(-1, pos, &fli, ctx);
+		if (err)
+			return err;
+		if (fli.cpt_object == CPT_OBJ_FLOCK &&
+		    (fli.cpt_flags&FL_FLOCK)) {
+			err = restore_flock(file, &fli, ctx);
+			if (err)
+				return err;
+			dprintk_ctx("bsd lock restored\n");
+		}
+		pos += fli.cpt_next;
+	}
+	return 0;
+}
+
+static int restore_reg_chunk(struct file *file, loff_t pos,
+		struct cpt_page_block * pgb, cpt_context_t *ctx)
+{
+	int err;
+	loff_t opos;
+	loff_t ipos;
+	int count;
+
+	ipos = pos + pgb->cpt_hdrlen;
+	opos = pgb->cpt_start;
+	count = pgb->cpt_end-pgb->cpt_start;
+	while (count > 0) {
+		mm_segment_t oldfs;
+		int copy = count;
+
+		if (copy > PAGE_SIZE)
+			copy = PAGE_SIZE;
+		(void)cpt_get_buf(ctx);
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos);
+		set_fs(oldfs);
+		if (err) {
+			__cpt_release_buf(ctx);
+			goto out;
+		}
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		ipos += copy;
+		err = file->f_op->write(file, ctx->tmpbuf, copy, &opos);
+		set_fs(oldfs);
+		__cpt_release_buf(ctx);
+		if (err != copy) {
+			if (err >= 0)
+				err = -EIO;
+			goto out;
+		}
+		count -= copy;
+	}
+	err = 0;
+out:
+	return err;
+}
+
+static int fixup_reg_data(struct file *file, loff_t pos, loff_t end,
+			  struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_page_block pgb;
+
+	if (file->f_op->write == NULL) {
+		eprintk_ctx("no write method. Cannot restore contents of the file.\n");
+		return -EINVAL;
+	}
+
+	atomic_long_inc(&file->f_count);
+
+	while (pos < end) {
+		err = rst_get_object(-1, pos, &pgb, ctx);
+		if (err)
+			goto out;
+		dprintk_ctx("restoring file data block: %08x-%08x\n",
+		       (__u32)pgb.cpt_start, (__u32)pgb.cpt_end);
+
+		switch (pgb.cpt_object) {
+			case CPT_OBJ_PAGES:
+				if (!(file->f_mode & FMODE_WRITE) ||
+				    (file->f_flags&O_DIRECT)) {
+					fput(file);
+					file = dentry_open(dget(file->f_dentry),
+							   mntget(file->f_vfsmnt),
+							   O_WRONLY | O_LARGEFILE,
+							   current_cred());
+					if (IS_ERR(file))
+						return PTR_ERR(file);
+				}
+				err = restore_reg_chunk(file, pos, &pgb, ctx); 
+				if (err)
+					goto out;
+				break;
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+			case CPT_OBJ_ITERPAGES:
+			case CPT_OBJ_ITERYOUNGPAGES:
+				err = -EINVAL;
+				if (file->f_vfsmnt != get_exec_env()->shmem_mnt)
+					goto out;
+				err = rst_iter_chunk(file, pos, &pgb, ctx);
+				if (err)
+					goto out;
+				break;
+#endif
+			default:
+				eprintk_ctx("unsupported page type: %d.\n", 
+						pgb.cpt_object);
+				err = -EINVAL;
+				break;
+		}
+		pos += pgb.cpt_next;
+	}
+	err = 0;
+
+out:
+	fput(file);
+	return err;
+}
+
+
+static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi,
+			      struct cpt_inode_image *ii,
+			      struct cpt_context *ctx)
+{
+	int err;
+	struct file *file = *file_p;
+	struct iattr newattrs;
+
+	if (!S_ISREG(fi->cpt_i_mode))
+		return 0;
+
+	if (file == NULL) {
+		file = shmem_file_setup("dev/zero", ii->cpt_size, 0);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+		*file_p = file;
+	}
+
+	if (ii->cpt_next > ii->cpt_hdrlen) {
+		struct cpt_object_hdr hdr;
+		err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ctx, fi->cpt_inode+ii->cpt_hdrlen);
+		if (err)
+			return err;
+		if ((hdr.cpt_object == CPT_OBJ_PAGES)
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+			|| (hdr.cpt_object == CPT_OBJ_ITERPAGES)
+#endif
+		) {
+			err = fixup_reg_data(file, fi->cpt_inode+ii->cpt_hdrlen,
+					fi->cpt_inode+ii->cpt_next, ctx);
+			if (err)
+				return err;
+		}
+	}
+
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
+	/* stage 1 - update size like do_truncate does */
+	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+	newattrs.ia_size = ii->cpt_size;
+	cpt_timespec_import(&newattrs.ia_ctime, ii->cpt_ctime);
+	err = notify_change(file->f_dentry, &newattrs);
+	if (err)
+		goto out;
+
+	/* stage 2 - update times, owner and mode */
+	newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME |
+		ATTR_ATIME_SET | ATTR_MTIME_SET |
+		ATTR_MODE | ATTR_UID | ATTR_GID;
+	newattrs.ia_uid = ii->cpt_uid;
+	newattrs.ia_gid = ii->cpt_gid;
+	newattrs.ia_mode = file->f_dentry->d_inode->i_mode & S_IFMT;
+	newattrs.ia_mode |= (ii->cpt_mode & ~S_IFMT);
+	cpt_timespec_import(&newattrs.ia_atime, ii->cpt_atime);
+	cpt_timespec_import(&newattrs.ia_mtime, ii->cpt_mtime);
+	err = notify_change(file->f_dentry, &newattrs);
+
+out:
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+	return err;
+}
+
+static int fixup_file_flags(struct file *file, const struct cred *cred,
+			    struct cpt_file_image *fi,
+			    int was_dentry_open, loff_t pos,
+			    cpt_context_t *ctx)
+{
+	if (fi->cpt_pos != file->f_pos) {
+		int err = -ESPIPE;
+		if (file->f_op->llseek)
+			err = file->f_op->llseek(file, fi->cpt_pos, 0);
+		if (err < 0) {
+			dprintk_ctx("file %Ld lseek %Ld - %Ld\n",
+				    (long long)pos,
+				    (long long)file->f_pos,
+				    (long long)fi->cpt_pos);
+			file->f_pos = fi->cpt_pos;
+		}
+	}
+
+	if (cred->uid != fi->cpt_uid || cred->gid != fi->cpt_gid)
+		wprintk_ctx("fixup_file_flags: oops... creds mismatch\n");
+
+	/*
+	 * this is wrong. but with current cpt_file_image there's
+	 * nothing we can do
+	 */
+
+	put_cred(file->f_cred);
+	file->f_cred = get_cred(cred);
+
+	file->f_owner.pid = 0;
+	if (fi->cpt_fown_pid != CPT_FOWN_STRAY_PID) {
+		file->f_owner.pid = find_get_pid(fi->cpt_fown_pid);
+		if (file->f_owner.pid == NULL) {
+			wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n",
+					fi->cpt_fown_pid);
+			return -EINVAL;
+		}
+	}
+	file->f_owner.uid = fi->cpt_fown_uid;
+	file->f_owner.euid = fi->cpt_fown_euid;
+	file->f_owner.signum = fi->cpt_fown_signo;
+
+	if (file->f_mode != fi->cpt_mode) {
+		if (was_dentry_open &&
+		    ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) {
+			file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK);
+			file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK);
+		}
+		if (file->f_mode != fi->cpt_mode)
+			wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode);
+	}
+	if (file->f_flags != fi->cpt_flags) {
+		if (!(fi->cpt_flags&O_NOFOLLOW))
+			file->f_flags &= ~O_NOFOLLOW;
+		if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) {
+			file->f_flags &= ~O_NONBLOCK;
+			file->f_flags |= fi->cpt_flags&O_NONBLOCK;
+		}
+		if ((file->f_flags ^ fi->cpt_flags) & O_LARGEFILE) {
+			file->f_flags &= ~O_LARGEFILE;
+			file->f_flags |= fi->cpt_flags & O_LARGEFILE;
+		}
+		if (fi->cpt_flags&FASYNC) {
+			if (fi->cpt_fown_fd == -1) {
+				wprintk_ctx("No fd for FASYNC\n");
+				return -EINVAL;
+			} else if (file->f_op && file->f_op->fasync) {
+				if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) {
+					wprintk_ctx("FASYNC problem\n");
+					return -EINVAL;
+				} else {
+					file->f_flags |= FASYNC;
+				}
+			}
+		}
+		if (file->f_dentry->d_sb->s_magic != ANON_INODE_FS_MAGIC) {
+			if (file->f_flags != fi->cpt_flags) {
+				eprintk_ctx("file %ld flags mismatch %08x %08x\n", 
+						(long)pos, file->f_flags, fi->cpt_flags);
+				return -EINVAL;
+			}
+		}
+	}
+	return 0;
+}
+
+static struct file *
+open_deleted(char *name, unsigned flags, struct cpt_file_image *fi,
+	     struct cpt_inode_image *ii, cpt_context_t *ctx)
+{
+	struct file * file;
+	char *suffix = NULL;
+	int attempt = 0;
+	int tmp_pass = 0;
+	mode_t mode = fi->cpt_i_mode;
+
+	/* Strip (deleted) part... */
+	if (strlen(name) > strlen(" (deleted)")) {
+		if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) {
+			suffix = &name[strlen(name) - strlen(" (deleted)")];
+			*suffix = 0;
+		} else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) {
+			memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1);
+			suffix = name + strlen(name);
+		}
+	}
+
+try_again:
+	for (;;) {
+		if (attempt) {
+			if (attempt > 1000) {
+				eprintk_ctx("open_deleted: failed after %d attempts\n", attempt);
+				return ERR_PTR(-EEXIST);
+			}
+			if (suffix == NULL) {
+				eprintk_ctx("open_deleted: no suffix\n");
+				return ERR_PTR(-EEXIST);
+			}
+			sprintf(suffix, ".%08x", (unsigned)((__current_kernel_time().tv_nsec>>10)+attempt));
+		}
+		attempt++;
+
+		if (S_ISFIFO(mode)) {
+			int err;
+			err = sc_mknod(name, S_IFIFO|(mode&017777), 0);
+			if (err == -EEXIST)
+				continue;
+			if (err < 0 && !tmp_pass)
+				goto change_dir;
+			if (err < 0)
+				return ERR_PTR(err);
+			file = open_pipe(NULL, name, fi, flags, ctx);
+			sc_unlink(name);
+		} else if (S_ISCHR(mode)) {
+			int err;
+			err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev));
+			if (err == -EEXIST)
+				continue;
+			if (err < 0 && !tmp_pass)
+				goto change_dir;
+			if (err < 0)
+				return ERR_PTR(err);
+			file = filp_open(name, flags, mode&017777);
+			sc_unlink(name);
+		} else if (S_ISDIR(mode)) {
+			int err;
+			err = sc_mkdir(name, mode&017777);
+			if (err == -EEXIST)
+				continue;
+			if (err < 0 && !tmp_pass)
+				goto change_dir;
+			if (err < 0)
+				return ERR_PTR(err);
+			file = filp_open(name, flags, mode&017777);
+			sc_rmdir(name);
+		} else {
+			unsigned int open_flags = flags | O_EXCL;
+
+			if (!(fi->cpt_lflags & CPT_DENTRY_SILLYRENAME))
+				open_flags |= O_CREAT;
+
+			file = filp_open(name, open_flags, mode&017777);
+			if (IS_ERR(file)) {
+				if (PTR_ERR(file) == -EEXIST)
+					continue;
+				if (!tmp_pass)
+					goto change_dir;
+			} else {
+				sc_unlink(name);
+			}
+		}
+		break;
+	}
+
+	if (IS_ERR(file)) {
+		eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file));
+		return file;
+	} else {
+		dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode);
+	}
+	return file;
+
+change_dir:
+	sprintf(name, "/tmp/rst%u", current->pid);
+	suffix = name + strlen(name);
+	attempt = 1;
+	tmp_pass = 1;
+	goto try_again;
+}
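+
+/*
+ * Illustrative sketch (hypothetical name): a checkpointed entry
+ * "/var/log/app.log (deleted)" is first re-created as
+ * "/var/log/app.log"; on EEXIST the suffix ".%08x" (derived from the
+ * current time plus the attempt counter) is appended and creation is
+ * retried; if creation in the original directory fails for any other
+ * reason, the whole procedure restarts under "/tmp/rst<pid>".
+ */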
+
+#ifdef CONFIG_SIGNALFD
+static struct file *open_signalfd(struct cpt_file_image *fi, int flags, struct cpt_context *ctx)
+{
+	sigset_t mask;
+	mm_segment_t old_fs;
+	int fd;
+	struct file *file;
+
+	cpt_sigset_import(&mask, fi->cpt_priv);
+
+	old_fs = get_fs(); set_fs(KERNEL_DS);
+	fd = do_signalfd(-1, &mask, flags & (O_CLOEXEC | O_NONBLOCK));
+	set_fs(old_fs);
+
+	if (fd < 0)
+		return ERR_PTR(fd);
+
+	file = fget(fd);
+	sys_close(fd);
+
+	return file;
+}
+#else
+static struct file *open_signalfd(struct cpt_file_image *fi, int flags, struct cpt_context *ctx)
+{
+	return ERR_PTR(-EINVAL);
+}
+#endif
+
+static struct file * open_timerfd(struct cpt_file_image *fi, int flags, struct cpt_context *ctx, loff_t *pos)
+{
+	mm_segment_t old_fs;
+	int fd;
+	struct file *file;
+	struct cpt_timerfd_image o;
+	struct itimerspec utmr;
+	struct itimerspec otmr;
+	struct timerfd_ctx *timerfd_ctx;
+	int err;
+
+	err = rst_get_object(CPT_OBJ_TIMERFD, *pos, &o, ctx);
+	if (err)
+		return ERR_PTR(err);
+	*pos += o.cpt_next;
+
+	cpt_timespec_import(&utmr.it_value, o.cpt_it_value);
+	cpt_timespec_import(&utmr.it_interval, o.cpt_it_interval);
+
+	old_fs = get_fs(); set_fs(KERNEL_DS);
+
+	fd = sys_timerfd_create(o.cpt_clockid,
+					flags & (O_CLOEXEC | O_NONBLOCK));
+	if (fd < 0) {
+		set_fs(old_fs);
+		return ERR_PTR(fd);
+	}
+	err = sys_timerfd_settime(fd, 0, &utmr, &otmr);
+
+	set_fs(old_fs);
+
+	if (err) {
+		sys_close(fd);
+		file = ERR_PTR(err);
+		goto out;
+	} else
+		file = fget(fd);
+	sys_close(fd);
+
+	timerfd_ctx = file->private_data;
+
+	spin_lock_irq(&timerfd_ctx->wqh.lock);
+	if (o.cpt_expired)
+		timerfd_ctx->expired = 1;
+	timerfd_ctx->ticks += o.cpt_ticks;
+	spin_unlock_irq(&timerfd_ctx->wqh.lock);
+out:
+	return file;
+}
+
+static struct file * open_eventfd(struct cpt_file_image *fi, int flags, struct cpt_context *ctx, loff_t *pos)
+{
+	mm_segment_t old_fs;
+	int fd;
+	struct file *file;
+	struct cpt_eventfd_image o;
+	int err;
+
+	err = rst_get_object(CPT_OBJ_EVENTFD, *pos, &o, ctx);
+	if (err)
+		return ERR_PTR(err);
+
+	old_fs = get_fs(); set_fs(KERNEL_DS);
+	fd = sys_eventfd2(o.cpt_count, o.cpt_flags);
+	set_fs(old_fs);
+	if (fd < 0)
+		return ERR_PTR(fd);
+	file = fget(fd);
+	sys_close(fd);
+	return file;
+}
+
+struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx)
+{
+	int err;
+	int was_dentry_open = 0;
+	cpt_object_t *obj;
+	cpt_object_t *iobj;
+	struct cpt_file_image fi;
+	__u8 *name = NULL;
+	struct file *file;
+	struct proc_dir_entry *proc_dead_file;
+	int flags;
+	loff_t pos2;
+	cpt_object_t *mntobj = NULL;
+	const struct cred *cred_origin;
+
+	/*
+	 * It may happen that a process which created a file changed its
+	 * UID afterwards (keeping the file opened/referenced with write
+	 * permissions for the owner only); as a result we might be unable
+	 * to read it at restore time due to a credentials mismatch. To
+	 * break this tie we temporarily take init_cred credentials, and
+	 * as soon as the file has been read into memory we restore the
+	 * original credentials.
+	 *
+	 * If, between the credentials raise/restore, you need the former
+	 * credentials (for fixups or whatever) -- use cred_origin for
+	 * that.
+	 */
+
+	cred_origin = override_creds(get_exec_env()->init_cred);
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx);
+	if (obj) {
+		file = obj->o_obj;
+		if (obj->o_index >= 0) {
+			dprintk_ctx("file is attached to a socket\n");
+			err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx);
+			if (err < 0)
+				goto err_out;
+			fixup_file_flags(file, cred_origin, &fi, 0, pos, ctx);
+		}
+		get_file(file);
+		revert_creds(cred_origin);
+		return file;
+	}
+
+	err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx);
+	if (err < 0) {
+		eprintk_ctx("%s: failed to get file object: %d\n", __func__, err);
+		goto err_out;
+	}
+
+	flags = make_flags(&fi);
+
+	pos2 = pos + fi.cpt_hdrlen;
+	if (fi.cpt_next > fi.cpt_hdrlen)
+		name = __rst_get_name(&pos2, ctx);
+
+	if (!name) {
+		eprintk_ctx("no name for file?\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (cpt_object_has(&fi, cpt_vfsmount) && fi.cpt_vfsmount != CPT_NULL) {
+		mntobj = lookup_cpt_obj_bypos(CPT_OBJ_VFSMOUNT_REF,
+				fi.cpt_vfsmount, ctx);
+		if (!mntobj && lookup_cpt_obj_bypos(CPT_OBJ_VFSMOUNT_MISSED_REF,
+						    fi.cpt_vfsmount, ctx)) {
+			file = open_fake_file(fi.cpt_i_mode, ctx);
+			if (!IS_ERR_OR_NULL(file))
+				goto map_file;
+			else {
+				err = PTR_ERR(file);
+				goto err_out;
+			}
+		}
+
+		if (!mntobj) {
+			eprintk_ctx("no vfsmount found for %s: %Ld\n", name, fi.cpt_vfsmount);
+			err = -ENODEV;
+			goto err_out;
+		}
+	}
+
+	if ((fi.cpt_lflags & CPT_DENTRY_DELETED) &&
+	    !(fi.cpt_lflags & CPT_DENTRY_SILLYRENAME) &&
+	    mntobj && (mntobj->o_flags & CPT_VFSMOUNT_DELAYFS)) {
+		sprintf(name, "/tmp/rst.%lu", jiffies);
+		mntobj = NULL;
+	}
+
+	/* Easy way, inode has been already open. */
+	if (fi.cpt_inode != CPT_NULL &&
+	    !(fi.cpt_lflags & CPT_DENTRY_CLONING) &&
+	    (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL &&
+	    iobj->o_parent) {
+		struct file *filp = iobj->o_parent;
+		file = dentry_open(dget(filp->f_dentry),
+				   mntget(filp->f_vfsmnt), flags, current_cred());
+		dprintk_ctx("rst_file: file obtained by dentry_open\n");
+		was_dentry_open = 1;
+		goto map_file;
+	}
+
+	if (fi.cpt_lflags & CPT_DENTRY_DELETED) {
+		struct cpt_inode_image ii;
+		if (fi.cpt_inode == CPT_NULL) {
+			eprintk_ctx("deleted file and no inode.\n");
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		err = rst_get_object(CPT_OBJ_INODE, fi.cpt_inode, &ii, ctx);
+		if (err) {
+			eprintk_ctx("%s: failed to get file inode object (cpt_inode: %Ld): %d\n", __func__, fi.cpt_inode, err);
+			goto err_out;
+		}
+
+		if (ii.cpt_next > ii.cpt_hdrlen) {
+			struct cpt_object_hdr hdr;
+			err = ctx->pread(&hdr, sizeof(hdr), ctx,
+					fi.cpt_inode + ii.cpt_hdrlen);
+			if (err) {
+				eprintk_ctx("%s: failed to read file inode (cpt_inode: %Ld): %d\n", __func__, fi.cpt_inode, err);
+				goto err_out;
+			}
+			if (hdr.cpt_object == CPT_OBJ_NAME) {
+				rst_put_name(name, ctx);
+				name = rst_get_name(fi.cpt_inode+ii.cpt_hdrlen,
+						ctx);
+				if (!name) {
+					eprintk_ctx("no name for link?\n");
+					err = -EINVAL;
+					goto err_out;
+				}
+				if (cpt_object_has(&ii, cpt_vfsmount) &&
+						ii.cpt_vfsmount != CPT_NULL) {
+					mntobj = lookup_cpt_obj_bypos(CPT_OBJ_VFSMOUNT_REF,
+							ii.cpt_vfsmount, ctx);
+					if (!mntobj) {
+						eprintk_ctx("no vfsmount found: %s\n", name);
+						err = -ENODEV;
+						goto err_out;
+					}
+				}
+
+				if ((fi.cpt_lflags & CPT_DENTRY_HARDLINKED) &&
+				    !ctx->hardlinked_on) {
+					eprintk_ctx("Open hardlinked is off\n");
+					err = -EPERM;
+					goto err_out;
+				}
+
+				if (!(fi.cpt_lflags & CPT_DENTRY_SILLYRENAME)) {
+					if (mntobj && (mntobj->o_flags & CPT_VFSMOUNT_DELAYFS)) {
+						sprintf(name, "/tmp/rst.%lu", jiffies);
+						mntobj = NULL;
+					} else
+						goto open_file;
+				}
+				/*
+				 * We can be here ONLY if we are going to open
+				 * and unlink a SILLY-RENAMED file on an NFS
+				 * private mount, which is also marked by the
+				 * CPT_DENTRY_SILLYRENAME flag.
+				 */
+			}
+		}
+
+		/* One very special case... */
+		if (S_ISREG(fi.cpt_i_mode) &&
+		   (!name[0] || (strcmp(name, "/dev/zero (deleted)") == 0)
+			     || (strcmp(name, " (deleted)/dev/zero") == 0))) {
+
+			/* MAP_ANON|MAP_SHARED mapping.
+			 * The kernel handles this in a damn ugly way: the
+			 * file the user passed to mmap does not match the
+			 * file finally attached to the VMA. Ok, rst_mm
+			 * has to take care of this. Otherwise, it will fail.
+			 */
+			file = NULL;
+		} else if (S_ISREG(fi.cpt_i_mode) ||
+			   S_ISCHR(fi.cpt_i_mode) ||
+			   S_ISFIFO(fi.cpt_i_mode) ||
+			   S_ISDIR(fi.cpt_i_mode)) {
+			if (S_ISCHR(fi.cpt_i_mode)) {
+				file = open_special(mntobj, name, &fi, flags, 1, ctx);
+				if (file != NULL)
+					goto map_file;
+			}
+			file = open_deleted(name, flags, &fi, &ii, ctx);
+			if (IS_ERR(file)) {
+				eprintk_ctx("%s: failed to open deleted file '%s': %d\n", __func__, name, err);
+				goto out;
+			}
+		} else {
+			eprintk_ctx("not a regular deleted file.\n");
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		err = fixup_file_content(&file, &fi, &ii, ctx);
+		if (err) {
+			eprintk_ctx("%s: failed to fix up file content: %d\n", __func__, err);
+			goto err_put;
+		}
+		goto map_file;
+	} else {
+open_file:
+		if (!name[0]) {
+			eprintk_ctx("empty name for file?\n");
+			err = -EINVAL;
+			goto err_out;
+		}
+		if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) &&
+		    (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL)
+			goto map_file;
+#ifdef CONFIG_INOTIFY_USER
+		if ((fi.cpt_lflags & CPT_DENTRY_INOTIFY) &&
+		    (file = rst_open_inotify(&fi, flags, ctx)) != NULL)
+			goto map_file;
+#else
+		if (fi.cpt_lflags & CPT_DENTRY_INOTIFY) {
+			err = -EINVAL;
+			goto err_out;
+		}
+#endif
+		if ((fi.cpt_lflags & CPT_DENTRY_SIGNALFD) &&
+			(file = open_signalfd(&fi, flags, ctx)) != NULL)
+			goto map_file;
+		if ((fi.cpt_lflags & CPT_DENTRY_TIMERFD) &&
+			(file = open_timerfd(&fi, flags, ctx, &pos2)) != NULL)
+			goto map_file;
+		if ((fi.cpt_lflags & CPT_DENTRY_EVENTFD) &&
+			(file = open_eventfd(&fi, flags, ctx, &pos2)) != NULL)
+			goto map_file;
+		if ((fi.cpt_lflags & CPT_DENTRY_FAKEFILE) &&
+			(file = open_fake_file(fi.cpt_mode, ctx)) != NULL)
+			goto map_file;
+		if (S_ISFIFO(fi.cpt_i_mode) &&
+		    (file = open_pipe(mntobj, name, &fi, flags, ctx)) != NULL)
+			goto map_file;
+		if (!S_ISREG(fi.cpt_i_mode) &&
+		    (file = open_special(mntobj, name, &fi, flags, 0, ctx)) != NULL) {
+			if (S_ISBLK(fi.cpt_i_mode)) {
+				/* The only case in which we support block
+				 * devices is when they are watched using
+				 * inotify. Do not print an error in this
+				 * case. */
+				goto out;
+			}
+			goto map_file;
+		}
+	}
+
+	/* This hook is needed to open a file /proc/<pid>/<somefile>
+	 * when there is no process with pid <pid>.
+	 */
+	proc_dead_file = NULL;
+	if (fi.cpt_lflags & CPT_DENTRY_PROCPID_DEAD) {
+		sprintf(name, "/proc/rst_dead_pid_file_%d", task_pid_vnr(current));
+
+		proc_dead_file = create_proc_entry(name + 6, S_IRUGO|S_IWUGO,
+						   get_exec_env()->proc_root);
+		if (!proc_dead_file) {
+			eprintk_ctx("can't create proc entry %s\n", name);
+			err = -ENOMEM;
+			goto err_out;
+		}
+#ifdef CONFIG_PROC_FS
+		proc_dead_file->proc_fops = &dummy_proc_pid_file_operations;
+		proc_dead_file->data = &dummy_proc_pid_file_operations;
+#endif
+	}
+
+	file = rst_open_file(mntobj, name, &fi, flags, ctx);
+
+	if (proc_dead_file) {
+		remove_proc_entry(proc_dead_file->name,
+				  get_exec_env()->proc_root);
+		if (!IS_ERR(file))
+			d_drop(file->f_dentry);
+	}
+map_file:
+	if (!IS_ERR(file)) {
+		fixup_file_flags(file, cred_origin, &fi, was_dentry_open, pos, ctx);
+
+		if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) {
+			err = fixup_pipe_data(file, &fi, ctx);
+			if (err) {
+				eprintk_ctx("%s: failed to fixup file '%s' pipe data: %d\n", __func__, name, err);
+				goto err_put;
+			}
+		}
+
+		/* This is a very special hack. Logically, cwd/root are
+		 * nothing but open directories. Nevertheless, this causes
+		 * restore failures when the number of open files in the VE
+		 * is close to the limit. So, if this is rst_file() of
+		 * cwd/root (fd == -2) and the directory is not deleted, we
+		 * skip adding the file to the object table. If the directory
+		 * is not unlinked, this cannot cause any problems.
+		 */
+		if (fd != -2 ||
+		    !S_ISDIR(file->f_dentry->d_inode->i_mode) ||
+		    (fi.cpt_lflags & CPT_DENTRY_DELETED) ||
+		    (mntobj && (mntobj->o_flags & CPT_VFSMOUNT_DELAYFS))) {
+			obj = cpt_object_get(CPT_OBJ_FILE, file, ctx);
+			if (!obj) {
+				obj = cpt_object_add(CPT_OBJ_FILE, file, ctx);
+				if (obj)
+					get_file(file);
+			}
+			if (obj) {
+				cpt_obj_setpos(obj, pos, ctx);
+				if (mntobj && (mntobj->o_flags & CPT_VFSMOUNT_DELAYFS))
+					obj->o_flags |= CPT_FILE_DELAYFS;
+				if (fi.cpt_lflags & CPT_DENTRY_SILLYRENAME)
+					obj->o_flags |= CPT_FILE_SILLYRENAME;
+			}
+
+			obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
+			if (obj) {
+				cpt_obj_setpos(obj, fi.cpt_inode, ctx);
+				if (!obj->o_parent || !(fi.cpt_lflags & CPT_DENTRY_DELETED))
+					obj->o_parent = file;
+			}
+		}
+
+		if (fi.cpt_next > fi.cpt_hdrlen) {
+			err = fixup_flocks(file, &fi, pos, ctx);
+			if (err) {
+				eprintk_ctx("%s: failed to fixup file '%s' flocks: %d\n", __func__, name, err);
+				goto err_put;
+			}
+		}
+	} else {
+		if ((fi.cpt_lflags & CPT_DENTRY_PROC) &&
+		    !(fi.cpt_lflags & CPT_DENTRY_PROCPID_DEAD)) {
+			dprintk_ctx("rst_file /proc delayed\n");
+			file = NULL;
+		} else if (name)
+			eprintk_ctx("can't open file %s\n", name);
+	}
+
+out:
+	if (name)
+		rst_put_name(name, ctx);
+	revert_creds(cred_origin);
+	return file;
+
+err_put:
+	if (file)
+		fput(file);
+err_out:
+	if (name)
+		rst_put_name(name, ctx);
+	revert_creds(cred_origin);
+	return ERR_PTR(err);
+}
+
+
+__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	__u32 flag = 0;
+
+	if (ti->cpt_files == CPT_NULL ||
+	    lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx))
+		flag |= CLONE_FILES;
+	if (ti->cpt_fs == CPT_NULL ||
+	    lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx))
+		flag |= CLONE_FS;
+	return flag;
+}
+
+static void local_close_files(struct files_struct * files)
+{
+	int i, j;
+
+	j = 0;
+	for (;;) {
+		unsigned long set;
+		i = j * __NFDBITS;
+		if (i >= files->fdt->max_fds)
+			break;
+		set = files->fdt->open_fds->fds_bits[j];
+		while (set) {
+			if (set & 1) {
+				struct file * file = xchg(&files->fdt->fd[i], NULL);
+				if (file)
+					filp_close(file, files);
+			}
+			i++;
+			set >>= 1;
+		}
+		files->fdt->open_fds->fds_bits[j] = 0;
+		files->fdt->close_on_exec->fds_bits[j] = 0;
+		j++;
+	}
+}
+
+int rst_files(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	struct cpt_files_struct_image fi;
+	struct files_struct *f = current->files;
+	cpt_object_t *obj;
+	loff_t pos, endpos;
+	int err;
+
+	if (ti->cpt_files == CPT_NULL) {
+		current->files = NULL;
+		if (f)
+			put_files_struct(f);
+		return 0;
+	}
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx);
+	if (obj) {
+		if (obj->o_obj != f) {
+			put_files_struct(f);
+			f = obj->o_obj;
+			atomic_inc(&f->count);
+			current->files = f;
+		}
+		return 0;
+	}
+
+	err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx);
+	if (err)
+		return err;
+
+	local_close_files(f);
+
+	if (fi.cpt_max_fds > f->fdt->max_fds) {
+		spin_lock(&f->file_lock);
+		err = expand_fdtable(f, fi.cpt_max_fds-1);
+		spin_unlock(&f->file_lock);
+		if (err < 0)
+			return err;
+	}
+
+	pos = ti->cpt_files + fi.cpt_hdrlen;
+	endpos = ti->cpt_files + fi.cpt_next;
+	while (pos < endpos) {
+		struct cpt_fd_image fdi;
+		struct file *filp;
+
+		err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx);
+		if (err)
+			return err;
+
+		filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx);
+		if (IS_ERR(filp)) {
+			eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp),
+				    (long long)fdi.cpt_file);
+			return PTR_ERR(filp);
+		}
+		if (filp == NULL) {
+			int err = rst_filejob_queue(pos, ctx);
+			if (err)
+				return err;
+		} else {
+			BUG_ON(fdi.cpt_fd >= f->fdt->max_fds);
+			f->fdt->fd[fdi.cpt_fd] = filp;
+			FD_SET(fdi.cpt_fd, f->fdt->open_fds);
+			if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC)
+				FD_SET(fdi.cpt_fd, f->fdt->close_on_exec);
+		}
+
+		pos += fdi.cpt_next;
+	}
+	f->next_fd = fi.cpt_next_fd;
+
+	obj = cpt_object_add(CPT_OBJ_FILES, f, ctx);
+	if (obj) {
+		cpt_obj_setpos(obj, ti->cpt_files, ctx);
+		cpt_obj_setindex(obj, fi.cpt_index, ctx);
+	}
+	return 0;
+}
+
+int rst_do_filejobs(cpt_context_t *ctx)
+{
+	struct filejob *j;
+
+	while ((j = ctx->filejob_queue) != NULL) {
+		int err;
+		struct task_struct *tsk;
+		struct cpt_fd_image fdi;
+		struct file *filp;
+
+		read_lock(&tasklist_lock);
+		tsk = find_task_by_vpid(j->pid);
+		if (tsk)
+			get_task_struct(tsk);
+		read_unlock(&tasklist_lock);
+		if (!tsk)
+			return -EINVAL;
+
+		err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx);
+		if (err) {
+			put_task_struct(tsk);
+			return err;
+		}
+
+		BUG_ON(fdi.cpt_fd >= tsk->files->fdt->max_fds);
+		if (tsk->files->fdt->fd[fdi.cpt_fd] ||
+		    FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) {
+			eprintk_ctx("doing filejob %Ld: fd is busy\n", j->fdi);
+			put_task_struct(tsk);
+			return -EBUSY;
+		}
+
+		filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx);
+		if (IS_ERR(filp)) {
+			eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), (unsigned long long)fdi.cpt_file);
+			put_task_struct(tsk);
+			return PTR_ERR(filp);
+		}
+		BUG_ON(fdi.cpt_fd >= tsk->files->fdt->max_fds);
+		tsk->files->fdt->fd[fdi.cpt_fd] = filp;
+		FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds);
+		if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC)
+			FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec);
+
+		dprintk_ctx("filejob %Ld done\n", j->fdi);
+
+		put_task_struct(tsk);
+		ctx->filejob_queue = j->next;
+		kfree(j);
+	}
+	return 0;
+}
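+
+/*
+ * Editorial sketch of the filejob flow: when rst_file() legitimately
+ * returns NULL (a delayed /proc file), rst_files() queues the fd image
+ * position together with the caller's pid; rst_do_filejobs() replays
+ * that queue later in the restore -- presumably once the tasks those
+ * /proc paths refer to exist -- installing each file into the owning
+ * task's fdtable.
+ */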
+
+void rst_flush_filejobs(cpt_context_t *ctx)
+{
+	struct filejob *j;
+
+	while ((j = ctx->filejob_queue) != NULL) {
+		ctx->filejob_queue = j->next;
+		kfree(j);
+	}
+}
+
+int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	struct fs_struct *f = current->fs;
+	cpt_object_t *obj;
+
+	if (ti->cpt_fs == CPT_NULL) {
+		exit_fs(current);
+		return 0;
+	}
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx);
+	if (obj) {
+		if (obj->o_obj != f) {
+			exit_fs(current);
+			f = obj->o_obj;
+			spin_lock(&f->lock);
+			f->users++;
+			spin_unlock(&f->lock);
+			current->fs = f;
+		}
+		return 0;
+	}
+
+	/* Do _not_ restore root. The image contains absolute pathnames,
+	 * so we fix it up in the context of the rst process.
+	 */
+
+	obj = cpt_object_add(CPT_OBJ_FS, f, ctx);
+	if (obj)
+		cpt_obj_setpos(obj, ti->cpt_fs, ctx);
+
+	return 0;
+}
+
+/*
+ * Read the device's UUID from its ext4 superblock and compare it with
+ * the given one. Returns a device string like "/dev/ploopXXXp1" on success.
+ */
+static char *compare_mntdev_uuid(dev_t dev, u8 *uuid, struct cpt_context *ctx)
+{
+	struct block_device *bdev;
+	char buf[36 + 1]; /* a hexadecimal UUID string is 36 characters */
+	char *mntdev = NULL;
+	unsigned long long logical_sb_block, sb_block = 1;
+	unsigned long offset = 0;
+	struct buffer_head *bh;
+	void *es;
+	const u8 *u;
+	int blocksize, err;
+
+	bdev = open_by_devnum(dev, FMODE_READ);
+	if (IS_ERR(bdev)) {
+		eprintk_ctx("Can't get UUID: open_by_devnum(%d:%d) failed with %ld\n",
+			    MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
+		return (void *)bdev;
+	}
+
+	err = bd_claim(bdev, get_exec_env());
+	if (err) {
+		/* Already claimed by somebody */
+		goto put;
+	}
+
+#define EXT4_MIN_BLOCK_SIZE	1024
+#define EXT4_UUID_OFFSET	0x68
+	blocksize = EXT4_MIN_BLOCK_SIZE;
+	if (blocksize < bdev_logical_block_size(bdev))
+		blocksize = bdev_logical_block_size(bdev);
+
+	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+		offset = do_div(logical_sb_block, blocksize);
+	} else {
+		logical_sb_block = sb_block;
+	}
+
+	set_blocksize(bdev, blocksize);
+	bh = __bread(bdev, logical_sb_block, blocksize);
+	if (!bh) {
+		eprintk_ctx("Can't get UUID: bread(%d:%d) failed with %ld\n",
+			    MAJOR(dev), MINOR(dev),PTR_ERR(bdev));
+		mntdev = ERR_PTR(-EIO);
+		goto release;
+	}
+
+	/* start of ext4 superblock */
+	es = (((char *)bh->b_data) + offset);
+	/* UUID address */
+	u = es + EXT4_UUID_OFFSET;
+
+	uuid_bytes_to_hex(buf, u);
+	if (strcmp(buf, (char *)uuid) == 0) {
+		/* We reuse this buffer for mntdev */
+		mntdev = (char *)uuid;
+		/* This relies on a ploop device having only one partition */
+		sprintf(mntdev, "/dev/%sp1", bdev->bd_disk->disk_name);
+	}
+	brelse(bh);
+release:
+	bd_release(bdev);
+put:
+	blkdev_put(bdev, FMODE_READ);
+	return mntdev;
+}
+
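+/*
+ * Map a UUID recorded in the image back to a live ploop device: walk the
+ * VE's devmnt list and probe each device's superblock via
+ * compare_mntdev_uuid(). On a miss the UUID string itself is returned
+ * and *missed is set, so the caller can decide whether the mount may be
+ * skipped.
+ */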
+static char *rst_get_mntdev_by_uuid(loff_t *pos_p, bool *missed, struct cpt_context *ctx)
+{
+	struct ve_struct *ve = get_exec_env();
+	char *uuid = __rst_get_name(pos_p, ctx);
+	struct ve_devmnt *devmnt;
+	char *mntdev = NULL;
+
+	if (!uuid) {
+		eprintk_ctx("Can't get mntdev UUID\n");
+		return NULL;
+	}
+
+	mutex_lock(&ve->devmnt_mutex);
+	list_for_each_entry(devmnt, &ve->devmnt_list, link) {
+		mntdev = compare_mntdev_uuid(devmnt->dev, (u8 *)uuid, ctx);
+		if (IS_ERR(mntdev))
+			continue;
+		else if (mntdev)
+			break;
+	}
+	mutex_unlock(&ve->devmnt_mutex);
+
+	if (IS_ERR_OR_NULL(mntdev)) {
+		/* Fall back to the UUID string itself so a non-NULL string is returned */
+		mntdev = uuid;
+		*missed = true;
+	}
+
+	return mntdev;
+}
+
+int rst_get_dentry(struct dentry **dp, struct vfsmount **mp,
+		   loff_t *pos, struct cpt_context *ctx)
+{
+	struct cpt_file_image fi;
+	struct file * file;
+	int err;
+
+	err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx);
+	if (err)
+		return err;
+
+	file = rst_file(*pos, -2, ctx);
+	if (IS_ERR(file)) {
+		if (PTR_ERR(file) == -EINVAL && (S_ISLNK(fi.cpt_i_mode) ||
+						 S_ISBLK(fi.cpt_i_mode))) {
+			/* Two special cases: inotify on symlink and on bdevs */
+			struct nameidata nd;
+			__u8 *name = NULL;
+
+			if (fi.cpt_next > fi.cpt_hdrlen)
+				name = rst_get_name(*pos + fi.cpt_hdrlen, ctx);
+			if (!name) {
+				eprintk_ctx("can't get name for file\n");
+				return -EINVAL;
+			}
+			if ((err = path_lookup(name, 0, &nd)) != 0) {
+				eprintk_ctx("path_lookup %s: %d\n", name, err);
+				rst_put_name(name, ctx);
+				return -EINVAL;
+			}
+			*dp = nd.path.dentry;
+			*mp = nd.path.mnt;
+			*pos += fi.cpt_next;
+			rst_put_name(name, ctx);
+			return 0;
+		}
+		return PTR_ERR(file);
+	}
+
+	*dp = dget(file->f_dentry);
+	*mp = mntget(file->f_vfsmnt);
+	*pos += fi.cpt_next;
+	fput(file);
+	return 0;
+}
+
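+/*
+ * Local variants of set_fs_root()/set_fs_pwd(): swap the path under
+ * fs->lock and drop the old references only after the spinlock is
+ * released, so that dput()/mntput(), which may sleep, happen outside
+ * the lock.
+ */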
+static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt,
+			  struct dentry *dentry)
+{
+	struct dentry *old_root;
+	struct vfsmount *old_rootmnt;
+	spin_lock(&fs->lock);
+	old_root = fs->root.dentry;
+	old_rootmnt = fs->root.mnt;
+	fs->root.mnt = mnt;
+	fs->root.dentry = dentry;
+	spin_unlock(&fs->lock);
+	if (old_root) {
+		dput(old_root);
+		mntput(old_rootmnt);
+	}
+}
+
+static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
+			 struct dentry *dentry)
+{
+	struct dentry *old_pwd;
+	struct vfsmount *old_pwdmnt;
+
+	spin_lock(&fs->lock);
+	old_pwd = fs->pwd.dentry;
+	old_pwdmnt = fs->pwd.mnt;
+	fs->pwd.mnt = mnt;
+	fs->pwd.dentry = dentry;
+	spin_unlock(&fs->lock);
+
+	if (old_pwd) {
+		dput(old_pwd);
+		mntput(old_pwdmnt);
+	}
+}
+
+
+int rst_restore_fs(struct cpt_context *ctx)
+{
+	loff_t pos;
+	cpt_object_t *obj;
+	int err = 0;
+
+	for_each_object(obj, CPT_OBJ_FS) {
+		struct cpt_fs_struct_image fi;
+		struct fs_struct *fs = obj->o_obj;
+		int i;
+		struct dentry *d[3];
+		struct vfsmount *m[3];
+
+		err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx);
+		if (err)
+			return err;
+
+		fs->umask = fi.cpt_umask;
+
+		pos = obj->o_pos + fi.cpt_hdrlen;
+		d[0] = d[1] = d[2] = NULL;
+		m[0] = m[1] = m[2] = NULL;
+		i = 0;
+		while (pos < obj->o_pos + fi.cpt_next && i<3) {
+			err = rst_get_dentry(d+i, m+i, &pos, ctx);
+			if (err) {
+				eprintk_ctx("cannot get_dir: %d\n", err);
+				for (--i; i >= 0; i--) {
+					if (d[i])
+						dput(d[i]);
+					if (m[i])
+						mntput(m[i]);
+				}
+				return err;
+			}
+			i++;
+		}
+		if (d[0])
+			__set_fs_root(fs, m[0], d[0]);
+		if (d[1])
+			__set_fs_pwd(fs, m[1], d[1]);
+		if (d[2])
+			wprintk_ctx("altroot arrived...\n");
+	}
+	return err;
+}
+
+int rst_path_lookup_at(struct vfsmount *mnt, struct dentry *dentry,
+		const char *path, unsigned int flags, struct nameidata *nd)
+{
+	struct filename filename = { .name = path };
+
+	nd->flags = flags;
+	nd->last_type = LAST_ROOT;
+	nd->depth = 0;
+	nd->path.dentry = dget(dentry);
+	nd->path.mnt = mntget(mnt);
+
+	return path_walk(&filename, nd);
+}
+
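+/*
+ * Note: mntobj->o_lock is (ab)used to hold the length of the mount point
+ * path prefix (set in restore_one_vfsmount()); lookups relative to the
+ * mount strip that prefix before walking from the mount root.
+ */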
+int rst_path_lookup(cpt_object_t *mntobj, const char *path,
+		unsigned int flags, struct nameidata *nd)
+{
+	struct vfsmount *mnt;
+
+	if (!mntobj)
+		return path_lookup(path, flags, nd);
+
+	if (strlen(path) < mntobj->o_lock) {
+		eprintk("path %s to short for mnt pos:%lu len:%d\n",
+				path, (unsigned long)mntobj->o_pos, mntobj->o_lock);
+		return -EINVAL;
+	}
+
+	mnt = mntobj->o_obj;
+	return rst_path_lookup_at(mnt, mnt->mnt_root,
+			path + mntobj->o_lock, flags | LOOKUP_DIVE, nd);
+}
+
+void rst_finish_vfsmount_ref(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_NAMESPACE) {
+		if (obj->o_obj)
+			put_mnt_ns(obj->o_obj);
+		if (obj->o_parent)
+			put_nsproxy(obj->o_parent);
+	}
+
+	for_each_object(obj, CPT_OBJ_VFSMOUNT_REF)
+		mntput(obj->o_obj);
+}
+
+struct vfsmount *rst_kern_mount(const char *fstype, int flags,
+		const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
+	put_filesystem(type);
+	return mnt;
+}
+
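+/*
+ * tmpfs contents are restored by spawning a kernel thread which execs
+ * /bin/tar with stdin redirected to a pipe; rst_restore_tmpfs() then
+ * pumps the archived data from the image into that pipe.
+ */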
+struct tar_args
+{
+	int pfd;
+	struct vfsmount *mnt;
+};
+
+static int undumptmpfs(void *arg)
+{
+	struct tar_args *args = arg;
+	int i;
+	int fd1, fd2, err;
+	char *argv[] = { "tar", "x", "-C", "/", "-S", NULL };
+	char *argv_pwd[] = { "tar", "x", "-S", NULL };
+
+	if (args->pfd != 0)
+		sc_dup2(args->pfd, 0);
+
+	if (args->mnt) {
+		struct path pwd = {
+			.mnt = args->mnt,
+			.dentry = args->mnt->mnt_root,
+		};
+		set_fs_pwd(current->fs, &pwd);
+	}
+
+	set_fs(KERNEL_DS);
+	fd1 = sc_open("/dev/null", O_WRONLY, 0);
+	fd2 = sc_open("/dev/null", O_WRONLY, 0);
+try:
+	if (fd1 < 0 || fd2 < 0) {
+		if (fd1 == -ENOENT && fd2 == -ENOENT) {
+			err = sc_mknod("/dev/null", S_IFCHR|0666,
+					new_encode_dev((MEM_MAJOR<<MINORBITS)|3));
+			if (err < 0) {
+				eprintk("can't create /dev/null: %d\n", err);
+				module_put(THIS_MODULE);
+				return 255 << 8;
+			}
+			fd1 = sc_open("/dev/null", O_WRONLY, 0666);
+			fd2 = sc_open("/dev/null", O_WRONLY, 0666);
+			sc_unlink("/dev/null");
+			goto try;
+		}
+		eprintk("can not open /dev/null for tar: %d %d\n", fd1, fd2);
+		module_put(THIS_MODULE);
+		return 255 << 8;
+	}
+	if (fd1 != 1)
+		sc_dup2(fd1, 1);
+	if (fd2 != 2)
+		sc_dup2(fd2, 2);
+
+	for (i = 3; i < current->files->fdt->max_fds; i++)
+		sc_close(i);
+
+	module_put(THIS_MODULE);
+
+	i = kernel_execve("/bin/tar", args->mnt ? argv_pwd : argv, NULL);
+	eprintk("failed to exec /bin/tar: %d\n", i);
+	return 255 << 8;
+}
+
+static int rst_restore_tmpfs(loff_t *pos, struct vfsmount *mnt,
+			     struct cpt_context * ctx)
+{
+	int err;
+	int pfd[2];
+	struct file *f;
+	struct cpt_obj_tar v;
+	int n;
+	loff_t end;
+	int pid;
+	int status;
+	mm_segment_t oldfs;
+	sigset_t ignore, blocked;
+	struct tar_args args;
+
+	err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx);
+	if (err < 0)
+		return err;
+
+	err = sc_pipe(pfd);
+	if (err < 0)
+		return err;
+	args.pfd = pfd[0];
+	args.mnt = mnt;
+	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+	sigprocmask(SIG_BLOCK, &ignore, &blocked);
+	pid = err = local_kernel_thread(undumptmpfs, (void*)&args, SIGCHLD, 0);
+	if (err < 0) {
+		eprintk_ctx("tmpfs local_kernel_thread: %d\n", err);
+		goto out;
+	}
+	f = fget(pfd[1]);
+	sc_close(pfd[1]);
+	sc_close(pfd[0]);
+
+	ctx->file->f_pos = *pos + sizeof(v);
+	end = ctx->file->f_pos + v.cpt_len;
+	if (v.cpt_content != CPT_CONTENT_DATA) {
+		/*
+		 * Old kernels: before 042stab054.
+		 */
+		ctx->file->f_pos = *pos + sizeof(struct cpt_object_hdr);
+		end = *pos + v.cpt_next;
+	}
+	*pos += v.cpt_next;
+	do {
+		char buf[16];
+
+		n = end - ctx->file->f_pos;
+		if (n > sizeof(buf))
+			n = sizeof(buf);
+
+		if (ctx->read(buf, n, ctx))
+			break;
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		f->f_op->write(f, buf, n, &f->f_pos);
+		set_fs(oldfs);
+	} while (ctx->file->f_pos < end);
+
+	fput(f);
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if ((err = sc_waitx(pid, 0, &status)) < 0)
+		eprintk_ctx("wait4: %d\n", err);
+	else if ((status & 0x7f) == 0) {
+		err = (status & 0xff00) >> 8;
+		if (err != 0) {
+			eprintk_ctx("tar exited with %d\n", err);
+			err = -EINVAL;
+		}
+	} else {
+		eprintk_ctx("tar terminated\n");
+		err = -EINVAL;
+	}
+	set_fs(oldfs);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+	return err;
+
+out:
+	if (pfd[1] >= 0)
+		sc_close(pfd[1]);
+	if (pfd[0] >= 0)
+		sc_close(pfd[0]);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+	return err;
+}
+
+struct vfsmount *rst_lookup_ext_mount(char *mntpnt, char *mnttype, struct cpt_context *ctx)
+{
+	struct mnt_namespace *n = current->nsproxy->mnt_ns;
+	struct path root = current->fs->root;
+	struct list_head *p;
+	struct vfsmount *t, *mnt;
+	char *path, *path_buf;
+
+	mnt = ERR_PTR(-ENOENT);
+	path_buf = cpt_get_buf(ctx);
+	down_read(&namespace_sem);
+	list_for_each(p, &n->list) {
+		struct path pt;
+		t = list_entry(p, struct vfsmount, mnt_list);
+		pt.dentry = t->mnt_root;
+		pt.mnt = t;
+		path = d_path(&pt, path_buf, PAGE_SIZE);
+		if (IS_ERR(path) || strcmp(path, mntpnt))
+			continue;
+		/* Allow changing fs type only for root filesystem */
+		if (!strcmp(t->mnt_sb->s_type->name, mnttype) ||
+		    (pt.mnt == root.mnt && pt.dentry == root.dentry)) {
+			mnt = mntget(t);
+			break;
+		}
+	}
+	up_read(&namespace_sem);
+	__cpt_release_buf(ctx);
+	return mnt;
+}
+
+static int missed_mount_allowed(unsigned mntflags)
+{
+	return (CPT_MNT_PLOOP & mntflags) != 0;
+}
+
+static __u8 *rst_get_mount_data(loff_t *pos_p, struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_object_hdr hdr;
+	__u8 *name;
+
+	err = rst_get_object(CPT_OBJ_MOUNT_DATA, *pos_p, &hdr, ctx);
+	if (err)
+		return NULL;
+	if (hdr.cpt_next - hdr.cpt_hdrlen > (PAGE_SIZE << 1))
+		return NULL;
+	name = (void*)__get_free_pages(GFP_KERNEL, 1);
+	if (!name)
+		return NULL;
+	err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen,
+		   ctx, *pos_p + hdr.cpt_hdrlen);
+	if (err) {
+		free_pages((unsigned long)name, 1);
+		return NULL;
+	}
+	*pos_p += hdr.cpt_next;
+	return name;
+}
+
+static char *restore_get_mount_data(loff_t *pos, struct cpt_context *ctx, int *type)
+{
+	char *data;
+
+	*type = CPT_OBJ_MOUNT_DATA;
+	data = rst_get_mount_data(pos, ctx);
+	if (!data) {
+		/* Old image? */
+		*type = CPT_OBJ_NAME;
+		data = __rst_get_name(pos, ctx);
+	}
+	return data;
+}
+
+static void rst_put_mount_data(__u8 *name, struct cpt_context *ctx)
+{
+	unsigned long addr = (unsigned long)name;
+
+	if (addr)
+		free_pages(addr&~(PAGE_SIZE-1), 1);
+}
+
+static void restore_put_mount_data(char *data, struct cpt_context *ctx, int type)
+{
+	if (type == CPT_OBJ_MOUNT_DATA)
+		rst_put_mount_data(data, ctx);
+	else if (type == CPT_OBJ_NAME)
+		rst_put_name(data, ctx);
+	else
+		BUG();
+}
+
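+/*
+ * Each vfsmount record carries the device name (or ploop UUID), the mount
+ * point, the fs type, an optional bind source and optional mount data.
+ * The cpt_mntflags then select the restore strategy: delayed mount,
+ * external mount lookup, ploop remount, bind mount, cgroup bind, rootfs
+ * alias, or a plain kern_mount.
+ */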
+int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos,
+			 cpt_object_t *ns_obj, struct cpt_context *ctx)
+{
+	int err = 0;
+	loff_t endpos;
+	loff_t mntpos = pos;
+	struct vfsmount *mnt, *shared, *master;
+	cpt_object_t *mntobj, *bindobj, *parent;
+
+	endpos = pos + mi->cpt_next;
+	pos += mi->cpt_hdrlen;
+
+	while (pos < endpos && !err) {
+		char *mntdev;
+		char *mntpnt;
+		char *mnttype;
+		char *mntbind = NULL;
+		char *mntdata = NULL;
+		bool missed_ploop = false;
+		int is_cgroup;
+		int is_tmpfs = 0;
+		int is_ro_tmpfs = 0;
+		int data_type = 0;
+
+		if (!(mi->cpt_mntflags & CPT_MNT_PLOOP))
+			mntdev = __rst_get_name(&pos, ctx);
+		else
+			mntdev = rst_get_mntdev_by_uuid(&pos, &missed_ploop, ctx);
+
+		mntpnt = __rst_get_name(&pos, ctx);
+		mnttype = __rst_get_name(&pos, ctx);
+
+		err = -EINVAL;
+		if (!mntdev || !mntpnt || !mnttype)
+			goto out_err;
+
+		is_cgroup = strcmp(mnttype, "cgroup") == 0;
+
+		if (mi->cpt_mntflags & CPT_MNT_BIND) {
+			mntbind = __rst_get_name(&pos, ctx);
+			if (!mntbind)
+				goto out_err;
+		}
+
+		/* legacy workarounds for images from ancient kernels */
+		if (!cpt_object_has(mi, cpt_mnt_parent)) {
+			/* erroneous root-bindmount */
+			if (mntbind && (!strcmp(mntbind, "/") ||
+					!strcmp(mntbind, "")))
+				mi->cpt_mntflags &= ~CPT_MNT_BIND;
+
+			/* non-external root-mount. skip it. */
+			if (!(mi->cpt_mntflags & CPT_MNT_EXT) &&
+					!strcmp(mntpnt, "/")) {
+				err = 0;
+				goto out_err;
+			}
+		}
+
+		if (mi->cpt_mntflags & CPT_MNT_DELAYFS) {
+			mntdata = restore_get_mount_data(&pos, ctx, &data_type);
+			if (!mntdata) {
+				eprintk_ctx("failed to get mount data\n");
+				goto out_err;
+			}
+		}
+
+		bindobj = NULL;
+		if (cpt_object_has(mi, cpt_mnt_bind) &&
+				mi->cpt_mnt_bind != CPT_NULL) {
+			if (is_cgroup)
+				bindobj = lookup_cpt_obj_byindex(CPT_OBJ_CGROUP,
+					mi->cpt_mnt_bind, ctx);
+			else
+				bindobj = lookup_cpt_obj_bypos(CPT_OBJ_VFSMOUNT_REF,
+					mi->cpt_mnt_bind, ctx);
+			if (!bindobj) {
+				eprintk_ctx("bind mount source not found: %s\n",
+						mntbind);
+				err = -ENODEV;
+				goto out_err;
+			}
+		}
+
+		parent = NULL;
+		if (cpt_object_has(mi, cpt_mnt_parent) &&
+				mi->cpt_mnt_parent != CPT_NULL) {
+			parent = lookup_cpt_obj_bypos(CPT_OBJ_VFSMOUNT_REF,
+					mi->cpt_mnt_parent, ctx);
+			if (!parent) {
+				err = -ENOLINK;
+				goto out_err;
+			}
+		}
+
+		shared = NULL;
+		if (cpt_object_has(mi, cpt_mnt_shared) &&
+				mi->cpt_mnt_shared != CPT_NULL) {
+			cpt_object_t *shared_obj;
+
+			shared_obj = lookup_cpt_obj_bypos(CPT_OBJ_VFSMOUNT_REF,
+					mi->cpt_mnt_shared, ctx);
+			if (!shared_obj || !shared_obj->o_obj) {
+				err = -ENOLINK;
+				goto out_err;
+			}
+			shared = shared_obj->o_obj;
+		}
+
+		master = NULL;
+		if (cpt_object_has(mi, cpt_mnt_master) &&
+				mi->cpt_mnt_master != CPT_NULL) {
+			cpt_object_t *master_obj;
+
+			master_obj = lookup_cpt_obj_bypos(CPT_OBJ_VFSMOUNT_REF,
+					mi->cpt_mnt_master, ctx);
+			if (!master_obj || !master_obj->o_obj) {
+				err = -ENOLINK;
+				goto out_err;
+			}
+			master = master_obj->o_obj;
+		}
+
+		mntobj = alloc_cpt_object(GFP_KERNEL, ctx);
+		if (!mntobj) {
+			err = -ENOMEM;
+			goto out_err;
+		}
+		cpt_obj_setpos(mntobj, mntpos, ctx);
+		mntobj->o_lock = strlen(mntpnt);
+
+		if (mi->cpt_mntflags & CPT_MNT_DELAYFS) {
+			mnt = rst_mount_delayfs(mnttype, mi->cpt_flags,
+					mntdev, mntdata, ctx);
+			mntobj->o_flags |= CPT_VFSMOUNT_DELAYFS;
+		} else if (mi->cpt_mntflags & CPT_MNT_EXT) {
+			mnt = rst_lookup_ext_mount(mntpnt, mnttype, ctx);
+			if (IS_ERR(mnt))
+				eprintk_ctx("mount point is missing: %s\n", mntpnt);
+		} else if (mi->cpt_mntflags & CPT_MNT_PLOOP) {
+			mnt = NULL;
+			if (!missed_ploop) {
+				unsigned sb_flags = mi->cpt_flags & ~MS_KERNMOUNT;
+				/*
+				 * rst_kern_mount() is for in-kernel filesystems, which do
+				 * not require the BKL. We take lock_kernel() here only
+				 * because we use it for ext4.
+				 */
+				lock_kernel();
+				mnt = rst_kern_mount(mnttype, sb_flags, mntdev, NULL);
+				unlock_kernel();
+			}
+
+			if (IS_ERR_OR_NULL(mnt)) {
+				eprintk_ctx("restore of ploop mount was failed\n");
+				if (missed_mount_allowed(mi->cpt_mntflags)) {
+					eprintk_ctx("restore, ignoring %s\n", mntdev);
+					cpt_obj_setobj(mntobj, NULL, ctx);
+					intern_cpt_object(CPT_OBJ_VFSMOUNT_MISSED_REF,
+							  mntobj, ctx);
+					err = 0;
+					goto out_err;
+				}
+			}
+		} else if (mi->cpt_mntflags & CPT_MNT_BIND) {
+			struct nameidata nd;
+
+			err = rst_path_lookup(bindobj, mntbind,
+					LOOKUP_FOLLOW, &nd);
+			if (err) {
+				eprintk_ctx("bindmount lookup failed: @%lld %s\n",
+						bindobj ? bindobj->o_pos : 0, mntpnt);
+				goto out_err;
+			}
+
+			mnt = vfs_bind_mount(nd.path.mnt, nd.path.dentry);
+			path_put(&nd.path);
+		} else if (is_cgroup) {
+			struct cgroup *cgrp;
+
+			if (bindobj == NULL) {
+				err = -EINVAL;
+				goto out_err;
+			}
+
+			cgrp = bindobj->o_obj;
+
+			bindobj = lookup_cpt_object(CPT_OBJ_CGROUPS, cgrp->dentry->d_sb, ctx);
+			if (!bindobj) {
+				err = -ENODEV;
+				goto out_err;
+			}
+
+			mnt = vfs_bind_mount(bindobj->o_parent, cgrp->dentry);
+		} else if (!strcmp(mnttype, "rootfs")) {
+			mnt = current->nsproxy->mnt_ns->root;
+			mnt = vfs_bind_mount(mnt, mnt->mnt_root);
+		} else {
+			unsigned sb_flags;
+
+			if (!strcmp(mnttype, "tmpfs") ||
+			    !strcmp(mnttype, "devtmpfs")) {
+				is_tmpfs = 1;
+				if (mi->cpt_flags & MS_RDONLY) {
+					/* tar can't extract to R/O fs */
+					mi->cpt_flags &= ~MS_RDONLY;
+					is_ro_tmpfs = 1;
+				}
+			}
+			sb_flags = mi->cpt_flags & ~MS_KERNMOUNT;
+			mnt = rst_kern_mount(mnttype, sb_flags, mntdev, NULL);
+		}
+
+		if (IS_ERR_OR_NULL(mnt)) {
+			/* PTR_ERR(NULL) is 0, which would silently skip the mount */
+			err = mnt ? PTR_ERR(mnt) : -EINVAL;
+			free_cpt_object(mntobj, ctx);
+			goto out_err;
+		}
+
+		err = 0;
+		cpt_obj_setobj(mntobj, mnt, ctx);
+		intern_cpt_object(CPT_OBJ_VFSMOUNT_REF, mntobj, ctx);
+
+		if (!ns_obj->o_obj) {
+			struct mnt_namespace *mnt_ns;
+
+			mnt_ns = create_mnt_ns(mntget(mnt));
+			if (IS_ERR(mnt_ns)) {
+				err = PTR_ERR(mnt_ns);
+				goto out_err;
+			}
+			cpt_obj_setobj(ns_obj, mnt_ns, ctx);
+		}
+
+		if (!mnt->mnt_ns) {
+			struct nameidata nd;
+			unsigned mntflags;
+
+			err = rst_path_lookup(parent, mntpnt, LOOKUP_FOLLOW, &nd);
+			if (err) {
+				eprintk_ctx("Failed to lookup path '%s'\n", mntpnt);
+				goto out_err;
+			}
+			mntflags = MNT_CPT | (mi->cpt_mntflags & ~(CPT_MNT_BIND |
+				   CPT_MNT_PLOOP | CPT_MNT_EXT | CPT_MNT_DELAYFS));
+			if (is_ro_tmpfs)
+				mntflags &= ~MNT_READONLY;
+			err = do_add_mount(mntget(mnt), &nd.path, mntflags, NULL);
+			path_put(&nd.path);
+			if (err)
+				goto out_err;
+
+			if (shared || master) {
+				down_write(&namespace_sem);
+				if (master) {
+					mnt->mnt_master = master;
+					list_add(&mnt->mnt_slave,
+						 &master->mnt_slave_list);
+				}
+				if (shared)
+					list_add(&mnt->mnt_share,
+						 &shared->mnt_share);
+				up_write(&namespace_sem);
+			}
+
+			if (is_tmpfs) {
+				if (ns_obj->o_flags & CPT_NAMESPACE_MAIN)
+					err = rst_restore_tmpfs(&pos, NULL, ctx);
+				else
+					err = rst_restore_tmpfs(&pos, mnt, ctx);
+			}
+		}
+
+		if (!err && is_ro_tmpfs) {
+			struct path path = {	.mnt = mnt,
+						.dentry = mnt->mnt_root, };
+			/* We don't support fs-specific options, so last arg is NULL */
+			err = do_remount(&path, mnt->mnt_sb->s_flags | MS_RDONLY,
+					 mnt->mnt_flags | MNT_READONLY, NULL);
+			if (err)
+				eprintk_ctx("Can't remount fs read-only\n");
+		}
+out_err:
+		if (err)
+			eprintk_ctx("Failed to restore mount point @%lld"
+					" dev '%s', type '%s', path '%s'\n",
+					mntpos, mntdev, mnttype, mntpnt);
+		if (mntdev)
+			rst_put_name(mntdev, ctx);
+		if (mntpnt)
+			rst_put_name(mntpnt, ctx);
+		if (mnttype)
+			rst_put_name(mnttype, ctx);
+		if (mntbind)
+			rst_put_name(mntbind, ctx);
+		if (mntdata)
+			restore_put_mount_data(mntdata, ctx, data_type);
+	}
+	return err;
+}
+
+int restore_one_namespace(cpt_object_t *obj, loff_t pos, loff_t endpos,
+			  struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_vfsmount_image mi;
+
+	while (pos < endpos) {
+		err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx);
+		if (err)
+			return err;
+		err = restore_one_vfsmount(&mi, pos, obj, ctx);
+		if (err)
+			return err;
+		pos += mi.cpt_next;
+	}
+	return 0;
+}
+
+int rst_task_namespace(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	struct nsproxy *ns;
+
+	if (ti->cpt_namespace == CPT_NULL)
+		return 0;
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_NAMESPACE, ti->cpt_namespace, ctx);
+	if (!obj) {
+		eprintk_ctx("namespace not found @%lld\n", ti->cpt_namespace);
+		return -ENOLINK;
+	}
+
+	if (current->nsproxy->mnt_ns == obj->o_obj)
+		return 0;
+
+	ns = obj->o_parent;
+	if (!ns) {
+		ns = duplicate_nsproxy(current->nsproxy);
+		if (!ns)
+			return -ENOMEM;
+		put_mnt_ns(ns->mnt_ns);
+		ns->mnt_ns = obj->o_obj;
+		get_mnt_ns(ns->mnt_ns);
+		obj->o_parent = ns;
+	}
+	switch_task_namespaces(current, get_nsproxy(ns));
+	return 0;
+}
+
+int rst_root_namespace(struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_NAMESPACE];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_object_hdr sbuf;
+	cpt_object_t *obj;
+	struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx);
+		if (err)
+			return err;
+
+		obj = cpt_object_add(CPT_OBJ_NAMESPACE, mnt_ns, ctx);
+		if (!obj)
+			return -ENOMEM;
+		cpt_obj_setpos(obj, sec, ctx);
+		if (mnt_ns) {
+			obj->o_flags |= CPT_NAMESPACE_MAIN;
+			get_mnt_ns(mnt_ns);
+			mnt_ns = NULL;
+		}
+
+		err = restore_one_namespace(obj, sec + sbuf.cpt_hdrlen,
+					    sec + sbuf.cpt_next, ctx);
+		if (err)
+			return err;
+		if (!obj->o_obj)
+			return -ENOLINK;
+		sec += sbuf.cpt_next;
+	}
+
+	return 0;
+}
+
+int rst_stray_files(struct cpt_context *ctx)
+{
+	int err = 0;
+	loff_t sec = ctx->sections[CPT_SECT_FILES];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		struct cpt_object_hdr sbuf;
+		cpt_object_t *obj;
+
+		err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx);
+		if (err)
+			break;
+
+		obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx);
+		if (!obj) {
+			struct file *file;
+
+			dprintk_ctx("stray file %Ld\n", sec);
+
+			file = rst_sysv_shm_itself(sec, ctx);
+
+			if (IS_ERR(file)) {
+				eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file));
+				return PTR_ERR(file);
+			} else {
+				fput(file);
+			}
+		}
+		sec += sbuf.cpt_next;
+	}
+
+	return err;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_inotify.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_inotify.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_inotify.c	2015-01-21 12:02:48.230093472 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_inotify.c	2015-01-21 12:02:50.790025514 +0300
@@ -0,0 +1,179 @@
+/*
+ *
+ *  kernel/cpt/rst_inotify.c
+ *
+ *  Copyright (C) 2000-2007  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/inotify.h>
+#include <linux/cpt_image.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+struct file *rst_open_inotify(struct cpt_file_image *fi,
+			      unsigned flags,
+			      struct cpt_context *ctx)
+{
+	return inotify_create(O_DIRECT);
+}
+
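+/*
+ * Re-create the watches recorded for one inotify file. Queued events are
+ * not replayed (the #if 0 block below keeps the old attempt); they are
+ * dropped with a warning instead.
+ */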
+static int restore_one_inotify(cpt_object_t *obj,
+			       loff_t pos,
+			       struct cpt_inotify_image *ibuf,
+			       cpt_context_t *ctx)
+{
+	int err = 0;
+	loff_t endpos;
+	struct file *file = obj->o_obj;
+	struct fsnotify_group *group;
+
+	if (file->f_op != &inotify_fops) {
+		eprintk_ctx("bad inotify file\n");
+		return -EINVAL;
+	}
+
+	group = file->private_data;
+
+	if (unlikely(group == NULL)) {
+		eprintk_ctx("bad inotify device\n");
+		return -EINVAL;
+	}
+
+	endpos = pos + ibuf->cpt_next;
+	pos += ibuf->cpt_hdrlen;
+	while (pos < endpos) {
+		union {
+			struct cpt_inotify_wd_image wi;
+			struct cpt_inotify_ev_image ei;
+		} u;
+
+		err = rst_get_object(-1, pos, &u, ctx);
+		if (err) {
+			eprintk_ctx("rst_get_object: %d\n", err);
+			return err;
+		}
+		if (u.wi.cpt_object == CPT_OBJ_INOTIFY_WATCH) {
+			struct path p;
+			loff_t fpos = pos + u.wi.cpt_hdrlen;
+
+			err = rst_get_dentry(&p.dentry, &p.mnt, &fpos, ctx);
+			if (err) {
+				eprintk_ctx("rst_get_dentry: %d\n", err);
+				return err;
+			}
+
+			err = __inotify_new_watch(group, &p, u.wi.cpt_mask, u.wi.cpt_wd);
+			path_put(&p);
+			if (err < 0)
+				break;
+
+			err = 0; /* for a proper return value */
+		} else if (u.wi.cpt_object == CPT_OBJ_INOTIFY_EVENT) {
+#if 0
+			struct inotify_user_watch dummy_watch;
+			struct inotify_watch *w;
+			char *name = NULL;
+
+			if (u.ei.cpt_namelen) {
+				name = kmalloc(u.ei.cpt_namelen+1, GFP_KERNEL);
+				if (name == NULL) {
+					err = -ENOMEM;
+					break;
+				}
+				name[u.ei.cpt_namelen] = 0;
+				err = ctx->pread(name, u.ei.cpt_namelen, ctx, pos + u.ei.cpt_hdrlen);
+				if (err) {
+					kfree(name);
+					break;
+				}
+			}
+
+			w = &dummy_watch.wdata;
+			dummy_watch.dev = dev;
+			atomic_set(&w->count, 2);
+
+			/* Trick to avoid destruction due to exit event */
+			if (u.ei.cpt_mask & (IN_IGNORED | IN_ONESHOT))
+				atomic_inc(&w->count);
+			dev->ih->in_ops->handle_event(w, u.ei.cpt_wd, u.ei.cpt_mask,
+						      u.ei.cpt_cookie, name, NULL);
+			if (name)
+				kfree(name);
+#endif
+			wprintk_ctx("inotify events dropped\n");
+		} else {
+			eprintk_ctx("bad object: %u\n", u.wi.cpt_object);
+			err = -EINVAL;
+			break;
+		}
+		pos += u.wi.cpt_next;
+	}
+	return err;
+}
+
+int rst_inotify(cpt_context_t *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_INOTIFY];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_INOTIFY || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		cpt_object_t *obj;
+		struct cpt_inotify_image ibuf;
+
+		err = rst_get_object(CPT_OBJ_INOTIFY, sec, &ibuf, ctx);
+		if (err)
+			return err;
+		obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ibuf.cpt_file, ctx);
+		if (obj == NULL) {
+			eprintk_ctx("cannot find inotify file object\n");
+			return -EINVAL;
+		}
+		err = restore_one_inotify(obj, sec, &ibuf, ctx);
+		if (err)
+			return err;
+		sec += ibuf.cpt_next;
+	}
+
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_iterative.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_iterative.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_iterative.c	2015-01-21 12:02:48.693081182 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_iterative.c	2015-01-21 12:02:50.865023523 +0300
@@ -0,0 +1,519 @@
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/errno.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/uio.h>
+#ifndef __ia64__
+#include <asm/ldt.h>
+#endif
+#include <asm/mmu.h>
+#include <asm/tlb.h>
+#include <linux/swapops.h>
+#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/cpt_image.h>
+#include <linux/rbtree.h>
+#include <linux/mmgang.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_pagein.h"
+
+/* TODO:
+ * 1. Error handling and recovery
+ */
+
+struct swp_node
+{
+	swp_entry_t		ent;
+	struct anon_vma		*anon;
+	u64			pfn;
+	struct rb_node		rb_hash;
+	/*
+	 * When set, the swap entry is not cleaned when
+	 * rst_drop_iter_rbtree() is executed. This is faster
+	 * than checking every swap entry for whether it
+	 * belongs to shared memory.
+	 */
+	int			keep;
+};
+
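+/*
+ * ctx->iter_rb_root maps pfns from the iterative-migration stream to the
+ * swap entries allocated for them on this side; rst_iter() later wires
+ * those entries into the restored page tables.
+ */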
+static inline struct swp_node * rb_lookup_pfn(u64 pfn, cpt_context_t *ctx)
+{
+	struct rb_node *n = ctx->iter_rb_root.rb_node;
+	struct swp_node *pd;
+
+	while (n)
+	{
+		pd = rb_entry(n, struct swp_node, rb_hash);
+
+		if (pfn < pd->pfn)
+			n = n->rb_left;
+		else if (pfn > pd->pfn)
+			n = n->rb_right;
+		else
+			return pd->ent.val ? pd : NULL;
+	}
+	return NULL;
+}
+
+static inline int rb_insert_pfn(u64 pfn, swp_entry_t ent, cpt_context_t *ctx)
+{
+	struct rb_node **p = &ctx->iter_rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct swp_node *pd;
+
+	while (*p)
+	{
+		parent = *p;
+		pd = rb_entry(parent, struct swp_node, rb_hash);
+
+		if (pfn < pd->pfn)
+			p = &(*p)->rb_left;
+		else if (pfn > pd->pfn)
+			p = &(*p)->rb_right;
+		else
+			goto out;
+	}
+
+	pd = kzalloc(sizeof(struct swp_node), GFP_KERNEL);
+	if (pd == NULL)
+		return -ENOMEM;
+	rb_link_node(&pd->rb_hash, parent, p);
+	rb_insert_color(&pd->rb_hash, &ctx->iter_rb_root);
+out:
+	pd->pfn = pfn;
+	pd->ent = ent;
+	pd->anon = NULL;
+	return 0;
+}
+
+static int iter_clone(struct mm_struct * mm,
+		      unsigned long addr,
+		      struct page *src_page,
+		      cpt_context_t * ctx)
+{
+	int err;
+	struct page *page;
+	void *dst, *src;
+
+	err = get_user_pages(current, mm, addr,
+			     1, 1, 1, &page, NULL);
+	if (err == 0)
+		err = -EFAULT;
+	if (err < 0) {
+		eprintk_ctx("iter_clone: get_user_pages: %d\n", err);
+		return err;
+	}
+
+	dst = kmap(page);
+	src = kmap(src_page);
+	memcpy(dst, src, PAGE_SIZE);
+	kunmap(src_page);
+	kunmap(page);
+
+	page_cache_release(page);
+	return 0;
+}
+
+/* Insert a swap pte for an iterated page; cf. handle_mm_fault() */
+int rst_iter(struct vm_area_struct *vma, u64 pfn,
+	     unsigned long addr, cpt_context_t * ctx)
+{
+	int err = -EFAULT;
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	spinlock_t *ptl;
+	struct swp_node *swn;
+
+	swn = rb_lookup_pfn(pfn, ctx);
+	if (swn == NULL) {
+		eprintk_ctx("rst_iter: missing pfn %lx\n", (unsigned long)pfn);
+		return -EINVAL;
+	}
+
+	if (swn->anon && swn->anon != vma->anon_vma) {
+		struct page * page;
+		err = -ENOMEM;
+		page = read_swap_cache_async(swn->ent, GFP_HIGHUSER, vma, addr);
+		if (page) {
+			err = -EIO;
+			wait_on_page_locked(page);
+			if (PageUptodate(page))
+				err = iter_clone(mm, addr, page, ctx);
+			page_cache_release(page);
+		}
+		wprintk("cloning iter page due to anon vma mismatch %d\n", err);
+		return err;
+	}
+
+	pgd = pgd_offset(mm, addr);
+	pud = pud_alloc(mm, pgd, addr);
+	if (unlikely(!pud))
+		return -ENOMEM;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (unlikely(!pmd))
+		return -ENOMEM;
+
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	if (unlikely(!pte))
+		return -ENOMEM;
+
+	if (pte_none(*pte)) {
+		if (swap_duplicate(swn->ent) < 0)
+			BUG();
+		set_pte(pte, swp_entry_to_pte(swn->ent));
+		inc_mm_counter(mm, swap_usage);
+		if (list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			if (list_empty(&mm->mmlist))
+				list_add(&mm->mmlist, &init_mm.mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+		swn->anon = vma->anon_vma;
+		err = 0;
+	} else {
+		eprintk_ctx("rst_iter for populated pte: 0x%lx %lx\n",
+			    addr, (unsigned long)pfn);
+	}
+	pte_unmap_unlock(pte, ptl);
+
+	return err;
+}
+
+int
+rst_iter_chunk(struct file *file, loff_t pos,
+	       struct cpt_page_block * pgb,
+	       cpt_context_t *ctx)
+{
+	unsigned long ptr = pgb->cpt_start;
+	u64 page_pos[16];
+	int err;
+
+	err = ctx->pread(&page_pos,
+			 8*(pgb->cpt_end-ptr)/PAGE_SIZE,
+			 ctx,
+			 pos + pgb->cpt_hdrlen);
+	if (err) {
+		eprintk_ctx("Oops\n");
+		return -EINVAL;
+	}
+
+	while (ptr < pgb->cpt_end) {
+		unsigned long pfn = page_pos[(ptr-pgb->cpt_start)/PAGE_SIZE];
+		struct swp_node *swn;
+
+		swn = rb_lookup_pfn(pfn, ctx);
+		if (swn == NULL) {
+			eprintk_ctx("rst_iter_shmem: missing pfn %lx\n", pfn);
+			return -EINVAL;
+		}
+		if (swn->anon) {
+			eprintk_ctx("rst_iter_shmem: creepy anon?\n");
+			return -EINVAL;
+		}
+		err = shmem_insertpage(file->f_dentry->d_inode,
+				       ptr/PAGE_SIZE, swn->ent);
+		if (err) {
+			eprintk_ctx("rst_iter_shmem: failed to insert?\n");
+			return err;
+		}
+		swn->keep = 1;
+		ptr += PAGE_SIZE;
+	}
+	if (i_size_read(file->f_dentry->d_inode) < ptr)
+		i_size_write(file->f_dentry->d_inode, ptr);
+	return 0;
+}
+
+static int nread(struct file *file, char *buf, int len)
+{
+	int offset = 0;
+
+	while (offset < len) {
+		int res;
+		mm_segment_t oldfs;
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		res = vfs_read(file, buf+offset, len-offset, &file->f_pos);
+		set_fs(oldfs);
+		if (res < 0)
+			return res;
+		if (res == 0)
+			return -EIO;
+		offset += res;
+	}
+	return 0;
+}
+
+
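+/*
+ * Despite living in the swap cache, the page is never read from swap: it
+ * is filled with data taken from the pagein stream and marked dirty,
+ * hence the name.
+ */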
+static struct page *dontread_swap_cache(swp_entry_t entry, struct file *file,
+					struct user_beancounter *ub)
+{
+	struct page *found_page, *new_page = NULL;
+	int err = 0;
+	void *dst;
+
+	do {
+		found_page = find_get_page(&swapper_space, entry.val);
+		if (found_page)
+			break;
+
+		/*
+		 * Get a new page to read into from swap.
+		 */
+		if (!new_page) {
+			new_page = alloc_page(GFP_HIGHUSER);
+			if (!new_page)
+				break;		/* Out of memory */
+			if (gang_add_user_page(new_page, get_ub_gs(ub), GFP_KERNEL))
+				break;
+		}
+
+		lock_page(new_page);
+		SetPageSwapBacked(new_page);
+		err = add_to_swap_cache(new_page, entry, GFP_KERNEL);
+		if (!err) {
+			lru_cache_add_anon(new_page);
+			goto dirty_page;
+		}
+		unlock_page(new_page);
+	} while (err != -ENOENT && err != -ENOMEM);
+
+	if (new_page) {
+		if (page_gang(new_page))
+			gang_del_user_page(new_page);
+		page_cache_release(new_page);
+	}
+	if (found_page) {
+		lock_page(found_page);
+		new_page = found_page;
+		goto dirty_page;
+	}
+	return NULL;
+
+dirty_page:
+	dst = kmap(new_page);
+	err = nread(file, dst, PAGE_SIZE);
+	kunmap(new_page);
+	SetPageDirty(new_page);
+	SetPageUptodate(new_page);
+	unlock_page(new_page);
+	if (err) {
+		page_cache_release(new_page);
+		return NULL;
+	}
+	return new_page;
+}
+
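+/*
+ * Main loop of iterative page-in: read struct pgin_reply records from
+ * pagein_file_in. handle == 0 carries control codes (ITER_PASS /
+ * ITER_STOP); otherwise the handle is a pfn whose page data follows in
+ * the stream and is parked in swap until rst_iter() maps it.
+ */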
+int rst_iteration(cpt_context_t *ctx)
+{
+	int err = 0;
+	struct file * file = ctx->pagein_file_in;
+	mm_segment_t oldfs;
+	struct user_beancounter *ub;
+
+#ifdef ITER_DEBUG
+	if (!file) {
+		file = filp_open("/var/tmp/dmp_", O_RDONLY, 0);
+		if (IS_ERR(file))
+			file = NULL;
+		ctx->pagein_file_in = file;
+	}
+#endif
+	if (file == NULL)
+		return -EBADF;
+#ifndef ITER_DEBUG
+	if (ctx->pagein_file_out == NULL)
+		return -EBADF;
+#endif
+
+	ub = ctx->iter_ub;
+	if (ub == NULL) {
+		if (ctx->ve_id == 0) {
+			ub = get_beancounter_longterm(mm_ub(&init_mm));
+		} else {
+			ub = get_beancounter_byuid(ctx->ve_id, 1);
+			err = -ENOMEM;
+			if (ub == NULL)
+				goto out;
+		}
+		ctx->iter_ub = ub;
+	}
+	get_beancounter(ub);
+
+	for (;;) {
+		struct swp_node * swn;
+		swp_entry_t ent;
+		void *dst;
+		struct page * page;
+		struct pgin_reply rep;
+
+		err = nread(file, (void*)&rep, sizeof(rep));
+		if (err) {
+#ifdef ITER_DEBUG
+			err = 0;
+#endif
+			break;
+		}
+
+		if (rep.rmid != PGIN_RMID) {
+			err = -EINVAL;
+			eprintk_ctx("iter stream corrupt\n");
+			break;
+		}
+
+		if (rep.handle == 0) {
+			switch (rep.error) {
+			case ITER_PASS:
+				continue;
+			case ITER_STOP:
+				break;
+			default:
+				eprintk_ctx("iter stream corrupt: unknown control code %d\n", rep.error);
+				err = -EINVAL;
+			}
+			break;
+		}
+
+		err = -ENOMEM;
+
+		swn = rb_lookup_pfn(rep.handle, ctx);
+		if (swn) {
+			page = dontread_swap_cache(swn->ent, file, ub);
+			if (page == NULL) {
+				eprintk_ctx("Found swap entry without page\n");
+				break;
+			}
+			page_cache_release(page);
+			continue;
+		}
+
+		if (nr_swap_pages*4 < total_swap_pages) {
+			eprintk_ctx("Swap pages barrier\n");
+			break;
+		}
+
+		page = alloc_page(GFP_HIGHUSER);
+		if (page == NULL) {
+			eprintk_ctx("Failed to alloc page\n");
+			break;
+		}
+
+		err = gang_add_user_page(page, get_ub_gs(ub), GFP_KERNEL);
+		if (err) {
+			eprintk_ctx("Failed to charge page\n");
+			page_cache_release(page);
+			break;
+		}
+
+		dst = kmap(page);
+		err = nread(file, dst, PAGE_SIZE);
+		kunmap(page);
+
+		if (err) {
+			eprintk_ctx("Failed to read page\n");
+			gang_del_user_page(page);
+			page_cache_release(page);
+			break;
+		}
+
+		lock_page(page);
+		SetPageUptodate(page);
+		SetPageSwapBacked(page);
+		if (add_to_swap(page, ub)) {
+			lru_cache_add_anon(page);
+			ent.val = page->private;
+		} else {
+			unlock_page(page);
+			gang_del_user_page(page);
+			page_cache_release(page);
+			eprintk_ctx("Failed to add page to swap\n");
+			err = -ENOMEM;
+			break;
+		}
+		unlock_page(page);
+		page_cache_release(page);
+
+		err = swap_duplicate(ent);
+		if (err) {
+			eprintk_ctx("Failed to duplicate page in swap\n");
+			break;
+		}
+
+		err = rb_insert_pfn(rep.handle, ent, ctx);
+		if (err) {
+			eprintk_ctx("Failed to add swap enry to tree\n");
+			free_swap_and_cache(ent);
+			break;
+		}
+	}
+	put_beancounter(ub);
+
+out:
+#ifndef ITER_DEBUG
+	if (!err) {
+		struct pgin_request req;
+		req.rmid = PGIN_RMID;
+		req.size = PGIN_STOP;
+		req.index = 0;
+		req.handle = 0;
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = vfs_write(ctx->pagein_file_out, (void*)&req, sizeof(req),
+				&ctx->pagein_file_out->f_pos);
+		set_fs(oldfs);
+		if (err != sizeof(req)) {
+			if (err >= 0)
+				err = -EIO;
+		} else {
+			err = 0;
+		}
+	}
+#endif
+	if (err) {
+		fput(ctx->pagein_file_out);
+		ctx->pagein_file_out = NULL;
+		fput(ctx->pagein_file_in);
+		ctx->pagein_file_in = NULL;
+		rst_drop_iter_rbtree(ctx);
+	}
+	return err;
+}
+
+void rst_drop_iter_rbtree(cpt_context_t *ctx)
+{
+	struct swp_node *pd;
+	struct rb_node *node;
+
+	if (ctx->iter_rb_root.rb_node == NULL)
+		goto free_ub;
+
+	while ((node = ctx->iter_rb_root.rb_node) != NULL) {
+		pd = rb_entry(node, struct swp_node, rb_hash);
+		if (pd->ent.val && !pd->keep)
+			free_swap_and_cache(pd->ent);
+		rb_erase(node, &ctx->iter_rb_root);
+		kfree(pd);
+	}
+
+free_ub:
+	if (ctx->iter_ub) {
+		put_beancounter_longterm(ctx->iter_ub);
+		ctx->iter_ub = NULL;
+	}
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_mm.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_mm.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_mm.c	2015-01-21 12:02:48.231093445 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_mm.c	2015-01-21 12:02:50.947021346 +0300
@@ -0,0 +1,1300 @@
+/*
+ *
+ *  kernel/cpt/rst_mm.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/mmgang.h>
+#include <linux/hugetlb.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+#include <linux/vmalloc.h>
+#include <linux/rmap.h>
+#include <linux/hash.h>
+#include <linux/binfmts.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#ifdef CONFIG_X86
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#endif
+#include <asm/mmu_context.h>
+#include <asm/vsyscall.h>
+#include <linux/cpt_image.h>
+
+#ifdef CONFIG_VE
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+#endif
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_files.h"
+#include "cpt_ubc.h"
+#include "cpt_mm.h"
+#include "cpt_kernel.h"
+
+#include "cpt_syscalls.h"
+
+#define __PAGE_NX (1ULL<<63)
+
+#ifdef CONFIG_IA32_EMULATION
+extern struct linux_binfmt compat_elf_format;
+#else
+extern struct linux_binfmt elf_format;
+#endif
+
+static unsigned long make_prot(struct cpt_vma_image *vmai)
+{
+	unsigned long prot = 0;
+
+	if (vmai->cpt_flags&VM_READ)
+		prot |= PROT_READ;
+	if (vmai->cpt_flags&VM_WRITE)
+		prot |= PROT_WRITE;
+	if (vmai->cpt_flags&VM_EXEC)
+		prot |= PROT_EXEC;
+	if (vmai->cpt_flags&VM_GROWSDOWN)
+		prot |= PROT_GROWSDOWN;
+	if (vmai->cpt_flags&VM_GROWSUP)
+		prot |= PROT_GROWSUP;
+	return prot;
+}
+
+static unsigned long make_flags(struct cpt_vma_image *vmai)
+{
+	unsigned long flags = MAP_FIXED | MAP_CPT;
+
+	if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE))
+		flags |= MAP_SHARED;
+	else
+		flags |= MAP_PRIVATE;
+
+	if (vmai->cpt_file == CPT_NULL)
+		flags |= MAP_ANONYMOUS;
+	if (vmai->cpt_flags&VM_GROWSDOWN)
+		flags |= MAP_GROWSDOWN;
+#ifdef MAP_GROWSUP
+	if (vmai->cpt_flags&VM_GROWSUP)
+		flags |= MAP_GROWSUP;
+#endif
+	if (vmai->cpt_flags&VM_DENYWRITE)
+		flags |= MAP_DENYWRITE;
+	if (vmai->cpt_flags&VM_EXECUTABLE)
+		flags |= MAP_EXECUTABLE;
+	if (!(vmai->cpt_flags&VM_ACCOUNT))
+		flags |= MAP_NORESERVE;
+	return flags;
+}
+
+#ifdef CONFIG_X86
+#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) \
+				&& !defined(CONFIG_XEN)
+static int __alloc_ldt(mm_context_t *pc, int mincount)
+{
+	int oldsize, newsize, nr;
+
+	if (mincount <= pc->size)
+		return 0;
+	/*
+	 * LDT got larger - reallocate if necessary.
+	 */
+	oldsize = pc->size;
+	mincount = (mincount+511)&(~511);
+	newsize = mincount*LDT_ENTRY_SIZE;
+	for (nr = 0; nr * PAGE_SIZE < newsize; nr++) {
+		BUG_ON(nr * PAGE_SIZE >= 64*1024);
+		if (!pc->ldt_pages[nr]) {
+			pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC);
+			if (!pc->ldt_pages[nr])
+				goto nomem;
+			clear_highpage(pc->ldt_pages[nr]);
+		}
+	}
+	pc->size = mincount;
+	return 0;
+
+nomem:
+	while (--nr >= 0)
+		__free_page(pc->ldt_pages[nr]);
+	pc->size = 0;
+	return -ENOMEM;
+}
+
+static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx)
+{
+	struct mm_struct *mm = current->mm;
+	int i;
+	int err;
+	int size;
+
+	err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE);
+	if (err)
+		return err;
+
+	size = mm->context.size*LDT_ENTRY_SIZE;
+
+	for (i = 0; i < size; i += PAGE_SIZE) {
+		int nr = i / PAGE_SIZE, bytes;
+		char *kaddr = kmap(mm->context.ldt_pages[nr]);
+
+		bytes = size - i;
+		if (bytes > PAGE_SIZE)
+			bytes = PAGE_SIZE;
+		err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i);
+		kunmap(mm->context.ldt_pages[nr]);
+		if (err)
+			return err;
+	}
+
+	load_LDT(&mm->context);
+	return 0;
+}
+
+#else
+
+static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx)
+{
+	struct mm_struct *mm = current->mm;
+	int oldsize = mm->context.size;
+	void *oldldt;
+	void *newldt;
+	int err;
+
+	if (li->cpt_size > PAGE_SIZE)
+		newldt = ub_vmalloc(li->cpt_size);
+	else
+		newldt = kmalloc(li->cpt_size, GFP_KERNEL_UBC);
+
+	if (!newldt)
+		return -ENOMEM;
+
+	err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen);
+	if (err) {
+		if (li->cpt_size > PAGE_SIZE)
+			vfree(newldt);
+		else
+			kfree(newldt);
+		return err;
+	}
+
+	oldldt = mm->context.ldt;
+	mm->context.ldt = newldt;
+	mm->context.size = li->cpt_size/LDT_ENTRY_SIZE;
+
+	load_LDT(&mm->context);
+
+	if (oldsize) {
+		if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(oldldt);
+		else
+			kfree(oldldt);
+	}
+	return 0;
+}
+#endif
+#endif
+
+static int
+restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg)
+{
+	struct aio_ring_info *info = &aio_ctx->ring_info;
+	unsigned nr_events = aio_ctx->max_reqs;
+	unsigned long size;
+	int nr_pages;
+
+	/* We recalculate the parameters of the ring exactly as
+	 * fs/aio.c does and then compare the calculated values
+	 * with the ones stored in the dump. They must be the same. */
+
+	nr_events += 2;
+
+	size = sizeof(struct aio_ring);
+	size += sizeof(struct io_event) * nr_events;
+	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+	if (nr_pages != aimg->cpt_ring_pages)
+		return -EINVAL;
+
+	info->nr_pages = nr_pages;
+
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+
+	if (nr_events != aimg->cpt_nr)
+		return -EINVAL;
+
+	info->nr = 0;
+	info->ring_pages = info->internal_pages;
+	if (nr_pages > AIO_RING_PAGES) {
+		info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+		if (!info->ring_pages)
+			return -ENOMEM;
+		memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages);
+	}
+
+	info->mmap_size = nr_pages * PAGE_SIZE;
+
+	/* This piece of shit is not entirely my fault. Kernel aio.c does
+	 * something odd: it mmap()s some pages and then pins them.
+	 * I guess it is just some mud left over from a failed attempt to
+	 * show the ring to user space. The result is odd. :-) Immediately
+	 * after creation of an AIO context, the kernel shares those pages
+	 * with user space, which can read and even write there. But after
+	 * the first fork, the pages are marked COW, with evident
+	 * consequences. I remember making the same mistake in the first
+	 * version of the mmapped packet socket; luckily that crap never
+	 * reached mainstream.
+	 *
+	 * So, what are we going to do? I could simulate this odd behaviour
+	 * exactly, but I am not insane yet. For now just take the pages
+	 * from user space. Alternatively, we could keep a kernel copy
+	 * in the AIO context image, which would be more correct.
+	 *
+	 * What is wrong now? If the pages are COWed, the ring is
+	 * transferred incorrectly.
+	 */
+	down_read(&current->mm->mmap_sem);
+	info->mmap_base = aimg->cpt_mmap_base;
+	info->nr_pages = get_user_pages(current, current->mm,
+					info->mmap_base, nr_pages, 
+					1, 0, info->ring_pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (unlikely(info->nr_pages != nr_pages)) {
+		int i;
+
+		for (i=0; i<info->nr_pages; i++)
+			put_page(info->ring_pages[i]);
+		if (info->ring_pages && info->ring_pages != info->internal_pages)
+			kfree(info->ring_pages);
+		return -EFAULT;
+	}
+
+	aio_ctx->user_id = info->mmap_base;
+
+	info->nr = nr_events;
+	info->tail = aimg->cpt_tail;
+
+	return 0;
+}
+
+static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx)
+{
+	int err;
+	struct kioctx *aio_ctx;
+	struct ve_struct *ve;
+
+	aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
+	if (!aio_ctx)
+		return -ENOMEM;
+
+	memset(aio_ctx, 0, sizeof(*aio_ctx));
+	aio_ctx->max_reqs = aimg->cpt_max_reqs;
+
+	if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) {
+		kmem_cache_free(kioctx_cachep, aio_ctx);
+		eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err);
+		return err;
+	}
+
+	ve = get_exec_env();
+	aio_ctx->ve = get_ve(ve);
+	spin_lock(&ve->aio_nr_lock);
+	ve->aio_nr += aio_ctx->max_reqs;
+	spin_unlock(&ve->aio_nr_lock);
+
+	aio_ctx->mm = current->mm;
+	atomic_inc(&aio_ctx->mm->mm_count);
+	atomic_set(&aio_ctx->users, 1);
+	spin_lock_init(&aio_ctx->ctx_lock);
+	spin_lock_init(&aio_ctx->ring_info.ring_lock);
+	init_waitqueue_head(&aio_ctx->wait);
+	INIT_LIST_HEAD(&aio_ctx->active_reqs);
+	INIT_LIST_HEAD(&aio_ctx->run_list);
+	INIT_DELAYED_WORK(&aio_ctx->wq, aio_kick_handler);
+
+	spin_lock(&aio_ctx->mm->ioctx_lock);
+	hlist_add_head(&aio_ctx->list, &aio_ctx->mm->ioctx_list);
+	spin_unlock(&aio_ctx->mm->ioctx_lock);
+
+	return 0;
+}
+
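+/*
+ * Hash mapping anon_vma ids from the image to the anon_vma objects
+ * created during restore, so that vmas which shared an anon_vma on the
+ * source share one here as well (see verify_create_anonvma()).
+ */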
+struct anonvma_map
+{
+	struct hlist_node	list;
+	struct anon_vma		*avma;
+	__u64			id;
+};
+
+static int verify_create_anonvma(struct mm_struct *mm,
+				 struct cpt_vma_image *vmai,
+				 cpt_context_t *ctx)
+{
+	struct anon_vma *avma = NULL;
+	struct anon_vma *new_avma;
+	struct vm_area_struct *vma;
+	int h;
+
+	if (!ctx->anonvmas) {
+		if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE)
+			return -EINVAL;
+		if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL)
+			return -ENOMEM;
+		for (h = 0; h < CPT_ANONVMA_HSIZE; h++)
+			INIT_HLIST_HEAD(&ctx->anonvmas[h]);
+	} else {
+		struct anonvma_map *map;
+		struct hlist_node *elem;
+
+		h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
+		hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) {
+			if (map->id == vmai->cpt_anonvmaid) {
+				avma = map->avma;
+				break;
+			}
+		}
+	}
+
+	down_read(&mm->mmap_sem);
+	if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) {
+		up_read(&mm->mmap_sem);
+		return -ESRCH;
+	}
+	if (vma->vm_start != vmai->cpt_start) {
+		up_read(&mm->mmap_sem);
+		eprintk_ctx("vma start mismatch\n");
+		return -EINVAL;
+	}
+	if (vma->vm_pgoff != vmai->cpt_pgoff) {
+		dprintk_ctx("vma pgoff mismatch, fixing\n");
+		if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) {
+			eprintk_ctx("cannot fixup vma pgoff\n");
+			up_read(&mm->mmap_sem);
+			return -EINVAL;
+		}
+		vma->vm_pgoff = vmai->cpt_pgoff;
+	}
+
+	if (!vma->anon_vma) {
+		if (avma) {
+			vma->anon_vma = avma;
+			if (anon_vma_link(vma)) {
+				vma->anon_vma = NULL;
+				up_read(&mm->mmap_sem);
+				return -ENOMEM;
+			}
+		} else {
+			int err;
+
+			err = anon_vma_prepare(vma);
+
+			if (err) {
+				up_read(&mm->mmap_sem);
+				return err;
+			}
+		}
+	} else {
+		/* Note, we _can_ arrive at a situation where two different
+		 * anonvmaids point to one anon_vma; this happens e.g. when
+		 * mmap() merged a new area into a previous one, so they
+		 * share one anon_vma even if they did not on the
+		 * original host.
+		 *
+		 * IT IS OK. As far as I understand, we may merge all the
+		 * anon_vmas, and rmap can still scan the whole huge list of
+		 * vmas searching for a page. It is just "suboptimal".
+		 *
+		 * A real disaster would happen if the vma already got an
+		 * anon_vma with a different id. It is a very rare case: the
+		 * kernel does its best to merge anon_vmas even when some
+		 * attributes differ. In this case we fall back to copying
+		 * memory.
+		 */
+		if (avma && vma->anon_vma != avma) {
+			up_read(&mm->mmap_sem);
+			wprintk_ctx("anon_vma mismatch\n");
+			return 0;
+		}
+	}
+
+	new_avma = vma->anon_vma;
+	up_read(&mm->mmap_sem);
+
+	if (!avma) {
+		struct anonvma_map *map;
+
+		if (!new_avma)
+			return -EINVAL;
+
+		if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL)
+			return -ENOMEM;
+
+		map->id = vmai->cpt_anonvmaid;
+		map->avma = new_avma;
+		h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
+		hlist_add_head(&map->list, &ctx->anonvmas[h]);
+	}
+	return 0;
+}
+
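+/*
+ * Brute-force fallback for CPT_OBJ_COPYPAGES when anon_vma sharing
+ * cannot be reproduced: fault in the destination pages and copy the
+ * source mm's pages one by one.
+ */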
+static int copy_mm_pages(struct mm_struct *src, unsigned long start,
+			 unsigned long end)
+{
+	int err;
+
+	for (; start < end; start += PAGE_SIZE) {
+		struct page *page;
+		struct page *spage;
+		void *maddr, *srcaddr;
+
+		err = get_user_pages(current, current->mm,
+				     start, 1, 1, 1, &page, NULL);
+		if (err == 0)
+			err = -EFAULT;
+		if (err < 0)
+			return err;
+
+		err = get_user_pages(current, src,
+				     start, 1, 0, 1, &spage, NULL);
+
+		if (err == 0)
+			err = -EFAULT;
+		if (err < 0) {
+			page_cache_release(page);
+			return err;
+		}
+
+		srcaddr = kmap(spage);
+		maddr = kmap(page);
+		memcpy(maddr, srcaddr, PAGE_SIZE);
+		set_page_dirty_lock(page);
+		kunmap(page);
+		kunmap(spage);
+		page_cache_release(page);
+		page_cache_release(spage);
+	}
+	return 0;
+}
+
+#include <linux/proc_fs.h>
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+static int cpt_setup_vdso(unsigned long addr, int is_rhel5)
+{
+#ifdef CONFIG_COMPAT
+	if (test_thread_flag(TIF_IA32))
+		return compat_arch_setup_additional_pages(NULL, 0, addr);
+#endif
+#ifdef CONFIG_X86_64
+	if (is_rhel5)
+		return arch_setup_additional_pages_rhel5(NULL, 0, addr);
+#endif
+	return arch_setup_additional_pages(NULL, 0, addr);
+}
+#else
+#define cpt_setup_vdso(addr, is_rhel5)	(0)
+#endif
+
+static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos,
+		struct cpt_context *ctx)
+{
+	int err = 0;
+	unsigned long addr;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct file *file = NULL;
+	unsigned long prot;
+	int checked = 0;
+
+	if (vmai->cpt_type == CPT_VMA_VDSO || vmai->cpt_type == CPT_VMA_VDSO_OLD) {
+		if (ctx->vdso == NULL || !test_thread_flag(TIF_IA32)) {
+			int is_rhel5;
+
+			is_rhel5 = (ctx->image_version < CPT_VERSION_32 ||
+					vmai->cpt_type == CPT_VMA_VDSO_OLD);
+
+			err = cpt_setup_vdso(vmai->cpt_start, is_rhel5);
+			if (err)
+				eprintk_ctx("%s: failed to setup vdso: %Ld (rhel5: %d)\n", __func__,
+					(unsigned long long)vmai->cpt_start,
+					is_rhel5);
+			goto out;
+		}
+	}
+
+	prot = make_prot(vmai);
+
+	if (vmai->cpt_file != CPT_NULL) {
+		if (vmai->cpt_type == CPT_VMA_TYPE_0) {
+			file = rst_file(vmai->cpt_file, -1, ctx);
+			if (IS_ERR(file)) {
+				eprintk_ctx("do_rst_vma: rst_file: %Ld\n",
+						(unsigned long long)vmai->cpt_file);
+				return PTR_ERR(file);
+			}
+		} else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) {
+			file = rst_sysv_shm_vma(vmai, ctx);
+			if (IS_ERR(file)) {
+				eprintk_ctx("%s: rst_sysv_shm_vma failed: %ld\n",
+						__func__, PTR_ERR(file));
+				return PTR_ERR(file);
+			}
+		}
+	}
+
+	down_write(&mm->mmap_sem);
+
+	if ((make_flags(vmai) & VM_EXECUTABLE) && mm->exe_file != file)
+		set_mm_exe_file(mm, file);
+
+	addr = do_mmap_pgoff(file, vmai->cpt_start,
+			     vmai->cpt_end-vmai->cpt_start,
+			     prot, make_flags(vmai),
+			     vmai->cpt_pgoff);
+
+	if (addr != vmai->cpt_start) {
+		up_write(&mm->mmap_sem);
+
+		err = -EINVAL;
+		if (IS_ERR((void*)addr))
+			err = addr;
+		eprintk_ctx("cannot mmap vma %Ld\n", vmapos);
+		goto out;
+	}
+
+	vma = find_vma(mm, vmai->cpt_start);
+	if (vma == NULL) {
+		up_write(&mm->mmap_sem);
+		eprintk_ctx("cannot find mmapped vma\n");
+		err = -ESRCH;
+		goto out;
+	}
+
+	vma->vm_flags |= VM_NOHUGEPAGE;
+
+	/* do_mmap_pgoff() can merge the new area into the previous one (not
+	 * into the next: we mmap in order, the rest of the mm is still
+	 * unmapped). This can happen e.g. if flags are to be adjusted later,
+	 * or if we had different anon_vmas on two adjacent regions. Split it
+	 * by brute force. */
+	if (vma->vm_start != vmai->cpt_start) {
+		dprintk_ctx("vma %Ld merged, split\n", vmapos);
+		err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0);
+		if (err) {
+			up_write(&mm->mmap_sem);
+			eprintk_ctx("cannot split vma\n");
+			goto out;
+		}
+	}
+	up_write(&mm->mmap_sem);
+
+	if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) {
+		err = verify_create_anonvma(mm, vmai, ctx);
+		if (err) {
+			eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos);
+			goto out;
+		}
+	}
+
+	if (vmai->cpt_type == CPT_VMA_VDSO) {
+		struct page *page;
+		void *maddr;
+
+		down_read(&mm->mmap_sem);
+		err = get_user_pages(current, current->mm,
+				(unsigned long)vmai->cpt_start,
+				1, 1, 1, &page, NULL);
+		up_read(&mm->mmap_sem);
+		if (err == 0)
+			err = -EFAULT;
+		if (err < 0) {
+			eprintk_ctx("can't get vdso: get_user_pages: %d\n", err);
+			goto out;
+		}
+		err = 0;
+		maddr = kmap(page);
+		memcpy(maddr, ctx->vdso, PAGE_SIZE);
+		set_page_dirty_lock(page);
+		kunmap(page);
+		page_cache_release(page);
+		goto out;
+	}
+
+	if (vmai->cpt_next > vmai->cpt_hdrlen) {
+		loff_t offset = vmapos + vmai->cpt_hdrlen;
+
+		do {
+			union {
+				struct cpt_page_block pb;
+				struct cpt_remappage_block rpb;
+				struct cpt_copypage_block cpb;
+				struct cpt_lazypage_block lpb;
+				struct cpt_iterpage_block ipb;
+			} u;
+			loff_t pos;
+
+			err = rst_get_object(-1, offset, &u, ctx);
+			if (err) {
+				eprintk_ctx("vma fix object: %d\n", err);
+				goto out;
+			}
+			if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) {
+				err = sc_remap_file_pages(u.rpb.cpt_start,
+							  u.rpb.cpt_end-u.rpb.cpt_start,
+							  0, u.rpb.cpt_pgoff, 0);
+				if (err < 0) {
+					eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err,
+					       (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start), 
+					       (__u32)u.rpb.cpt_pgoff);
+					goto out;
+				}
+				offset += u.rpb.cpt_next;
+				continue;
+			} else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) {
+				err = -EINVAL;
+				goto out;
+			} else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) {
+				struct vm_area_struct *vma, *vma1;
+				struct mm_struct *src;
+				struct anon_vma *src_anon;
+				cpt_object_t *mobj;
+
+				if (!vmai->cpt_anonvmaid || !vmai->cpt_anonvma) {
+					err = -EINVAL;
+					eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n");
+					goto out;
+				}
+
+				mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx);
+				if (!mobj) {
+					eprintk_ctx("lost mm_struct to clone pages from\n");
+					err = -ESRCH;
+					goto out;
+				}
+				src = mobj->o_obj;
+
+				down_read(&src->mmap_sem);
+				src_anon = NULL;
+				vma1 = find_vma(src, u.cpb.cpt_start);
+				if (vma1)
+					src_anon = vma1->anon_vma;
+				up_read(&src->mmap_sem);
+
+				if (!vma1) {
+					eprintk_ctx("lost src vm_area_struct\n");
+					err = -ESRCH;
+					goto out;
+				}
+
+				down_read(&mm->mmap_sem);
+				if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) {
+					up_read(&mm->mmap_sem);
+					eprintk_ctx("lost vm_area_struct\n");
+					err = -ESRCH;
+					goto out;
+				}
+
+				if (!src_anon ||
+				    !vma->anon_vma ||
+				    vma->anon_vma != src_anon ||
+				    vma->vm_start - vma1->vm_start !=
+				    (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) {
+					up_read(&mm->mmap_sem);
+					wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos);
+					err = copy_mm_pages(mobj->o_obj,
+							    u.cpb.cpt_start,
+							    u.cpb.cpt_end);
+				} else {
+					err = __copy_page_range(vma, vma1,
+								u.cpb.cpt_start,
+								u.cpb.cpt_end-u.cpb.cpt_start);
+					up_read(&mm->mmap_sem);
+				}
+				if (err) {
+					eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err,
+						(__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start), 
+						(long)u.cpb.cpt_source);
+					goto out;
+				}
+
+				offset += u.cpb.cpt_next;
+				continue;
+			} else if (u.pb.cpt_object == CPT_OBJ_ITERPAGES ||
+				   u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES
+				   ) {
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+				unsigned long ptr = u.lpb.cpt_start;
+				u64 page_pos[16];
+				pos = offset + sizeof(u.pb);
+
+				err = ctx->pread(&page_pos,
+						 8*(u.lpb.cpt_end-ptr)/PAGE_SIZE,
+						 ctx,
+						 pos);
+				if (err) {
+					eprintk_ctx("%s: failed to read iter page positions\n", __func__);
+					goto out;
+				}
+
+				down_read(&mm->mmap_sem);
+				if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) {
+					up_read(&mm->mmap_sem);
+					eprintk_ctx("lost vm_area_struct\n");
+					err = -ESRCH;
+					goto out;
+				}
+				err = anon_vma_prepare(vma);
+				if (err) {
+					eprintk_ctx("%s: failed to prepare anon_vma\n", __func__);
+					up_read(&mm->mmap_sem);
+					goto out;
+				}
+				while (ptr < u.lpb.cpt_end) {
+					err = rst_iter(vma,
+						       page_pos[(ptr-u.lpb.cpt_start)/PAGE_SIZE],
+						       ptr,
+						       ctx);
+					if (err) {
+						eprintk_ctx("%s: rst_iter failed\n", __func__);
+						break;
+					}
+					ptr += PAGE_SIZE;
+				}
+				if (u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES) {
+					make_pages_present((unsigned long)u.lpb.cpt_start,
+							   (unsigned long)u.lpb.cpt_end);
+				}
+				up_read(&mm->mmap_sem);
+#else
+				err = -EINVAL;
+#endif
+				if (err)
+					goto out;
+				offset += u.cpb.cpt_next;
+				continue;
+			}
+			if (u.pb.cpt_object != CPT_OBJ_PAGES) {
+				eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object);
+				err = -EINVAL;
+				goto out;
+			}
+			pos = offset + sizeof(u.pb);
+			if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE) &&
+			    u.pb.cpt_content != CPT_CONTENT_PRAM) {
+				/* Most likely get_user_pages() dirtied these pages,
+				 * which happens e.g. when gdb inserts breakpoints.
+				 */
+				int i;
+				for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) {
+					struct page *page;
+					void *maddr;
+					err = get_user_pages(current, current->mm,
+							     (unsigned long)u.pb.cpt_start + i*PAGE_SIZE,
+							     1, 1, 1, &page, NULL);
+					if (err == 0)
+						err = -EFAULT;
+					if (err < 0) {
+						eprintk_ctx("get_user_pages: %d\n", err);
+						goto out;
+					}
+					err = 0;
+					maddr = kmap(page);
+					if (u.pb.cpt_content == CPT_CONTENT_VOID) {
+						memset(maddr, 0, PAGE_SIZE);
+					} else if (u.pb.cpt_content == CPT_CONTENT_DATA) {
+						err = ctx->pread(maddr, PAGE_SIZE,
+								 ctx, pos + i*PAGE_SIZE);
+						if (err)
+							eprintk_ctx("%s: ctx->pread failed\n", __func__);
+					} else {
+						eprintk_ctx("%s: unsupported cpt content (1): %d\n", __func__, u.pb.cpt_content);
+						err = -EINVAL;
+					}
+					if (!err)
+						set_page_dirty_lock(page);
+					kunmap(page);
+					page_cache_release(page);
+					if (err)
+						goto out;
+				}
+			} else {
+				if (!(prot&PROT_WRITE))
+					sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE);
+				if (u.pb.cpt_content == CPT_CONTENT_VOID) {
+					int i;
+					for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) {
+						err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i);
+						if (err) {
+							eprintk_ctx("__put_user 2 %d\n", err);
+							goto out;
+						}
+					}
+				} else if (u.pb.cpt_content == CPT_CONTENT_DATA) {
+					/*
+					 * If this is a socket buffer mapping, all pages must
+					 * already be there, so there is no need to optimize
+					 * out page faults.
+					 */
+					if ((vma->vm_file && !S_ISSOCK(vma->vm_file->f_dentry->d_inode->i_mode)) ||
+						((vma->vm_flags & VM_GROWSDOWN) && u.pb.cpt_start == vma->vm_start))
+					{
+						struct vm_area_struct *vma;
+						struct page *page;
+						unsigned long addr;
+
+						/* Fill the area with zero pages in order to avoid IO
+						 * caused by page faults.
+						 */
+						down_read(&mm->mmap_sem);
+						if ((vma = find_vma(mm, u.pb.cpt_start)) == NULL) {
+							up_read(&mm->mmap_sem);
+							eprintk_ctx("lost vm_area_struct\n");
+							err = -ESRCH;
+							goto out;
+						}
+						for (addr=u.pb.cpt_start; addr<u.pb.cpt_end; addr+=PAGE_SIZE) {
+							err = -ENOMEM;
+							page = alloc_zeroed_user_highpage_movable(vma, addr);
+							if (!page) {
+								eprintk_ctx("%s: failed to alloc zeroed high page\n", __func__);
+								break;
+							}
+							err = install_anon_page(mm, vma, addr, page);
+							if (err) {
+								eprintk_ctx("install_anon_page: %d\n", err);
+								put_page(page);
+								break;
+							}
+						}
+						up_read(&mm->mmap_sem);
+						if (err)
+							goto out;
+					}
+
+					err = ctx->pread(cpt_ptr_import(u.pb.cpt_start), 
+							 u.pb.cpt_end-u.pb.cpt_start,
+							 ctx, pos);
+					if (err) {
+						eprintk_ctx("%s: VMA context read failed: 0x%Lx - 0x%Lx\n", __func__, vmai->cpt_start, vmai->cpt_end);
+						goto out;
+					}
+				} else if (u.pb.cpt_content == CPT_CONTENT_PRAM) {
+					err = rst_undump_pram(mm, u.pb.cpt_start, u.pb.cpt_end, pos, ctx);
+					if (err) {
+						eprintk_ctx("%s: PRAM undump failed: start %Ld, end %Ld\n", __func__, u.pb.cpt_start, u.pb.cpt_end);
+						goto out;
+					}
+				} else {
+					err = -EINVAL;
+					eprintk_ctx("%s: unsupported cpt content (2): %d\n", __func__, u.pb.cpt_content);
+					goto out;
+				}
+				if (!(prot&PROT_WRITE))
+					sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot);
+			}
+			err = 0;
+			offset += u.pb.cpt_next;
+		} while (offset < vmapos + vmai->cpt_next);
+	}
+
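+/* Compare the restored VMA with the saved flags and protections and fix up
+ * what can be fixed (read hints, VM_LOCKED, VM_ACCOUNT); paths that have to
+ * drop mmap_sem restart from this label. */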
+check:
+	do {
+		struct vm_area_struct *vma;
+		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, addr);
+		if (vma) {
+
+			if (!(vmai->cpt_flags & VM_NOHUGEPAGE))
+				vma->vm_flags &= ~VM_NOHUGEPAGE;
+
+			if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) {
+				VM_ClearReadHint(vma);
+				vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK;
+			}
+			if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) {
+				dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos);
+				up_read(&mm->mmap_sem);
+				if (vma->vm_flags&VM_LOCKED)
+					err = __munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, false);
+				else {
+					int ret;
+					int should_set_cap;
+					unsigned long locked;
+					unsigned long lock_limit;
+
+					locked = ((vmai->cpt_end - vmai->cpt_start) >> PAGE_SHIFT) +
+					          current->mm->locked_vm;
+					lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+					should_set_cap = ((locked > lock_limit) && !capable(CAP_IPC_LOCK));
+					if (unlikely(should_set_cap)) {
+						if ((err = set_mlock_creds(1)) != 0) {
+							eprintk_ctx("set_mlock_creds: %d\n", err);
+							goto out;
+						}
+					}
+
+					ret = __mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, false);
+
+					if (unlikely(should_set_cap)) {
+						if ((err = set_mlock_creds(0)) != 0) {
+							eprintk_ctx("set_mlock_creds: %d\n", err);
+							goto out;
+						}
+					}
+					err = ret;
+				}
+				/* When mlock fails with EFAULT, it means that it
+				 * could not bring the pages in. This can happen
+				 * after mlock() on unreadable VMAs. The VMA is
+				 * nevertheless correctly locked, so this error
+				 * can be ignored. */
+				if (err == -EFAULT)
+					err = 0;
+				if (err) {
+					eprintk_ctx("%s: sc_m(un)lock failed\n", __func__);
+					goto out;
+				}
+				goto check;
+			}
+			if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX)
+				wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos,
+					    (unsigned long long)vma->vm_page_prot.pgprot,
+					    (unsigned long long)vmai->cpt_pgprot);
+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+			if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) &&
+			    (ctx->kernel_config_flags & (1 << CPT_KERNEL_CONFIG_PAE)))
+				wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos,
+				       (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot);
+#endif
+			if (vma->vm_flags != vmai->cpt_flags) {
+				unsigned long x = vma->vm_flags ^ vmai->cpt_flags;
+				if (x & VM_EXEC) {
+					/* On i386 a VM_EXEC mismatch is OK: the flag
+					 * cannot be controlled via mmap/mprotect, and
+					 * exec.c clears VM_EXEC on the stack. */
+					vma->vm_flags &= ~VM_EXEC;
+				} else if ((x & VM_ACCOUNT) && !checked) {
+					checked = 1;
+					if (!(prot&PROT_WRITE)) {
+						up_read(&mm->mmap_sem);
+						sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE);
+						sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot);
+						goto check;
+					}
+					wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos,
+					       (__u32)vma->vm_flags, (__u32)vmai->cpt_flags);
+				} else {
+					wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos,
+					       (__u32)vma->vm_flags, (__u32)vmai->cpt_flags);
+				}
+			}
+		} else {
+			wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos);
+		}
+		up_read(&mm->mmap_sem);
+	} while (0);
+
+out:
+	if (file)
+		fput(file);
+	return err;
+}
+
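+/* Restore the saved auxiliary vector into mm->saved_auxv, keeping the
+ * trailing terminator pair zeroed. */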
+static int do_rst_auxv(struct cpt_object_hdr *hdr, loff_t pos,
+		       cpt_context_t *ctx)
+{
+	struct mm_struct *mm = current->mm;
+	__u64 auxv[AT_VECTOR_SIZE];
+	unsigned idx, nwords;
+	int err;
+
+	nwords = (hdr->cpt_next - hdr->cpt_hdrlen) / sizeof(auxv[0]);
+	if (nwords > AT_VECTOR_SIZE - 2)
+		return -E2BIG;
+
+	err = ctx->pread(auxv, nwords * sizeof(auxv[0]), ctx,
+			pos + hdr->cpt_hdrlen);
+	if (!err) {
+		mm->saved_auxv[nwords] = 0;
+		mm->saved_auxv[nwords + 1] = 0;
+		for (idx = 0; idx < nwords; idx++)
+			mm->saved_auxv[idx] = auxv[idx];
+	}
+	return err;
+}
+
+#ifndef CONFIG_IA64
+#define TASK_UNMAP_START	0
+#else
+/* On IA64 the first page is a special VM_IO|VM_RESERVED mapping
+ * used to accelerate speculative dereferences of a NULL pointer. */
+#define TASK_UNMAP_START	PAGE_SIZE
+#endif
+
+static int do_rst_mm(struct cpt_mm_image *vmi, struct cpt_task_image *ti,
+		struct cpt_context *ctx)
+{
+	int err = 0;
+	unsigned int def_flags;
+	struct mm_struct *mm = current->mm;
+	struct ve_struct *ve = get_exec_env();
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *bc;
+#endif
+
+	down_write(&mm->mmap_sem);
+	do_munmap(mm, TASK_UNMAP_START, TASK_SIZE-TASK_UNMAP_START);
+
+#ifdef CONFIG_BEANCOUNTERS
+	/*
+	 * MM beancounter is usually correct from the fork time,
+	 * but not for init, for example.
+	 * Luckily, mm_ub can be changed for a completely empty MM.
+	 */
+	bc = rst_lookup_ubc(vmi->cpt_mmub, ctx);
+	put_beancounter(bc);
+#endif
+
+	mm->start_code = vmi->cpt_start_code;
+	mm->end_code = vmi->cpt_end_code;
+	mm->start_data = vmi->cpt_start_data;
+	mm->end_data = vmi->cpt_end_data;
+	mm->start_brk = vmi->cpt_start_brk;
+	mm->brk = vmi->cpt_brk;
+	mm->start_stack = vmi->cpt_start_stack;
+	mm->arg_start = vmi->cpt_start_arg;
+	mm->arg_end = vmi->cpt_end_arg;
+	mm->env_start = vmi->cpt_start_env;
+	mm->env_end = vmi->cpt_end_env;
+	mm->def_flags = 0;
+	def_flags = vmi->cpt_def_flags;
+
+#ifdef CONFIG_X86_64
+	if (!ti->cpt_64bit) {
+		set_thread_flag(TIF_IA32);
+		/*
+		 * Task forked from 64bit app and thus has wrong binfmt pointer
+		 */
+#ifdef CONFIG_IA32_EMULATION
+		set_binfmt(&compat_elf_format);
+#endif
+	} else if (test_thread_flag(TIF_IA32)) {
+		clear_thread_flag(TIF_IA32);
+		/*
+		 * Task forked from 32bit app and thus has wrong binfmt pointer
+		 */
+#ifdef CONFIG_IA32_EMULATION
+		set_binfmt(&compat_elf_format);
+#else
+		set_binfmt(&elf_format);
+#endif
+	}
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	arch_pick_mmap_layout(mm);
+#endif
+
+	if (cpt_object_has(vmi, cpt_mm_flags))
+		mm->flags = vmi->cpt_mm_flags;
+	else
+		set_dumpable(mm, vmi->cpt_dumpable);
+
+	mm->vps_dumpable = vmi->cpt_vps_dumpable;
+#ifndef CONFIG_IA64
+	if (ctx->image_version >= CPT_VERSION_9) {
+		mm->context.vdso = cpt_ptr_import(vmi->cpt_vdso);
+		current_thread_info()->sysenter_return = 
+			VDSO32_SYMBOL(mm->context.vdso, SYSENTER_RETURN);
+	}
+#endif
+
+#if 0 /* def CONFIG_HUGETLB_PAGE */
+	/* NB: hugetlb mappings are apparently not handled here yet */
+	int used_hugetlb;
+#endif
+	up_write(&mm->mmap_sem);
+
+	if (vmi->cpt_next > vmi->cpt_hdrlen) {
+		loff_t offset = ti->cpt_mm + vmi->cpt_hdrlen;
+		do {
+			union {
+				struct cpt_object_hdr hdr;
+				struct cpt_vma_image vmai;
+				struct cpt_aio_ctx_image aioi;
+				struct cpt_obj_bits bits;
+			} u;
+			err = rst_get_object(-1, offset, &u, ctx);
+			if (err)
+				goto out;
+			if (u.vmai.cpt_object == CPT_OBJ_VMA) {
+#ifdef CONFIG_IA64
+				/* Skip the special zero-page mapping for now */
+				if (u.vmai.cpt_start)
+#endif
+				err = do_rst_vma(&u.vmai, offset, ti->cpt_mm, ctx);
+				if (err) {
+					eprintk_ctx("%s: failed to restore vma 0x%08Lx-0x%08Lx: %d\n",
+							__func__, u.vmai.cpt_start, u.vmai.cpt_end, err);
+					goto out;
+				}
+#ifdef CONFIG_X86
+			} else if (u.bits.cpt_object == CPT_OBJ_BITS &&
+				   u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) {
+				err = do_rst_ldt(&u.bits, offset, ctx);
+				if (err) {
+					eprintk_ctx("%s: failed to restore ldt: %d\n",
+							__func__, err);
+					goto out;
+				}
+#endif
+			} else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) {
+				err = do_rst_aio(&u.aioi, offset, ctx);
+				if (err) {
+					eprintk_ctx("%s: failed to restore aio: %d\n",
+							__func__, err);
+					goto out;
+				}
+			} else if (u.hdr.cpt_object == CPT_OBJ_MM_AUXV) {
+				err = do_rst_auxv(&u.hdr, offset, ctx);
+				if (err) {
+					eprintk_ctx("%s: failed to restore auxv: %d\n",
+							__func__, err);
+					goto out;
+				}
+			} else {
+				eprintk_ctx("unknown object %u in mm image\n",
+						u.vmai.cpt_object);
+				err = -EINVAL;
+				goto out;
+			}
+			offset += u.vmai.cpt_next;
+		} while (offset < ti->cpt_mm + vmi->cpt_next);
+	}
+
+	down_write(&mm->mmap_sem);
+	mm->def_flags = def_flags;
+	up_write(&mm->mmap_sem);
+
+	if (ve->aio_nr > ve->aio_max_nr)
+		wprintk_ctx("aio-nr=%lu exceed aio-max-nr=%lu\n",
+				ve->aio_nr, ve->aio_max_nr);
+out:
+	return err;
+}
+
+extern void exit_mm(struct task_struct * tsk);
+
+int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	int err = 0;
+	cpt_object_t *mobj;
+	void *tmp = (void*)__get_free_page(GFP_KERNEL);
+	struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp;
+
+	if (!tmp)
+		return -ENOMEM;
+
+	if (ti->cpt_mm == CPT_NULL) {
+		if (current->mm)
+			exit_mm(current);
+		goto out;
+	}
+
+	mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx);
+	if (mobj) {
+		BUG_ON(current->mm != mobj->o_obj);
+		goto out;
+	}
+
+	if (current->mm == NULL) {
+		struct mm_struct *mm = mm_alloc();
+		if (mm == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = init_new_context(current, mm);
+		if (err) {
+			mmdrop(mm);
+			goto out;
+		}
+		current->mm = mm;
+	}
+
+	if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0)
+		goto out;
+	if ((err = do_rst_mm(vmi, ti, ctx)) != 0) {
+		eprintk_ctx("do_rst_mm %Ld\n", (unsigned long long)ti->cpt_mm);
+		goto out;
+	}
+	err = -ENOMEM;
+	mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx);
+	if (mobj != NULL) {
+		err = 0;
+		cpt_obj_setpos(mobj, ti->cpt_mm, ctx);
+	}
+
+out:
+	if (tmp)
+		free_page((unsigned long)tmp);
+	return err;
+}
+
+/* This is the part of mm setup that is done in the parent context. Mainly,
+ * this is where we graft the mm of another process onto the child.
+ */
+
+int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	struct task_struct *tsk = obj->o_obj;
+	cpt_object_t *mobj;
+
+	/* Task without mm. Just get rid of this. */
+	if (ti->cpt_mm == CPT_NULL) {
+		if (tsk->mm) {
+			mmput(tsk->mm);
+			tsk->mm = NULL;
+		}
+		return 0;
+	}
+
+	mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx);
+	if (mobj) {
+		struct mm_struct *newmm = mobj->o_obj;
+		/* Good, the MM is already created. */
+		if (newmm == tsk->mm) {
+			/* Already done by clone(). */
+			return 0;
+		}
+		mmput(tsk->mm);
+		atomic_inc(&newmm->mm_users);
+		tsk->mm = newmm;
+		tsk->active_mm = newmm;
+	}
+	return 0;
+}
+
+/* We use CLONE_VM when the mm of the child is going to be shared with the
+ * parent. Otherwise the mm is copied.
+ */
+
+__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	if (ti->cpt_mm == CPT_NULL ||
+	    lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx))
+		return CLONE_VM;
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_net.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_net.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_net.c	2015-01-21 12:02:48.231093445 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_net.c	2015-01-21 12:02:51.087017630 +0300
@@ -0,0 +1,826 @@
+/*
+ *
+ *  kernel/cpt/rst_net.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/addrconf.h>
+#include <linux/if_tun.h>
+#include <linux/veth.h>
+#include <linux/venet.h>
+#include <linux/fdtable.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/cpt_export.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_kernel.h"
+#include "cpt_net.h"
+#include "cpt_files.h"
+
+#include "cpt_syscalls.h"
+
+extern struct in_ifaddr *inet_alloc_ifa(void);
+extern int inet_insert_ifa(struct in_ifaddr *ifa);
+extern struct in_device *inetdev_init(struct net_device *dev);
+
+int rst_restore_ifaddr(struct cpt_context *ctx)
+{
+	struct net *net = get_exec_env()->ve_netns;
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_ifaddr_image di;
+	struct net_device *dev;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		int cindex = -1;
+		int err;
+		err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx);
+		if (err)
+			return err;
+		cindex = di.cpt_index;
+		rtnl_lock();
+		dev = __dev_get_by_index(net, cindex);
+		if (dev && di.cpt_family == AF_INET) {
+			struct in_device *in_dev;
+			struct in_ifaddr *ifa;
+			if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
+				in_dev = inetdev_init(dev);
+			ifa = inet_alloc_ifa();
+			if (ifa) {
+				ifa->ifa_local = di.cpt_address[0];
+				ifa->ifa_address = di.cpt_peer[0];
+				ifa->ifa_broadcast = di.cpt_broadcast[0];
+				ifa->ifa_prefixlen = di.cpt_masklen;
+				ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+				ifa->ifa_flags = di.cpt_flags;
+				ifa->ifa_scope = di.cpt_scope;
+				memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ);
+				in_dev_hold(in_dev);
+				ifa->ifa_dev   = in_dev;
+				err = inet_insert_ifa(ifa);
+				if (err && err != -EEXIST) {
+					rtnl_unlock();
+					eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label);
+					return err;
+				}
+			}
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		} else if (dev && di.cpt_family == AF_INET6) {
+			__u32 prefered_lft;
+			__u32 valid_lft;
+			struct net *net = get_exec_env()->ve_ns->net_ns;
+
+			if (!ipv6_is_enabled()) {
+				rtnl_unlock();
+				eprintk_ctx("IPv6 is disabled\n");
+				return -ENOTSUPP;
+			}
+
+			prefered_lft = (di.cpt_flags & IFA_F_DEPRECATED) ?
+				0 : di.cpt_prefered_lft;
+			valid_lft = (di.cpt_flags & IFA_F_PERMANENT) ?
+				0xFFFFFFFF : di.cpt_valid_lft;
+			err = inet6_addr_add(net, dev->ifindex,
+					     (struct in6_addr *)di.cpt_address,
+					     di.cpt_masklen, 0,
+					     prefered_lft,
+					     valid_lft);
+			if (err && err != -EEXIST) {
+				rtnl_unlock();
+				eprintk_ctx("add ifaddr6 err %d for %d %s\n", err, di.cpt_index, di.cpt_label);
+				return err;
+			}
+#endif
+		} else {
+			rtnl_unlock();
+			eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index);
+			return -EINVAL;
+		}
+		rtnl_unlock();
+		sec += di.cpt_next;
+	}
+	return 0;
+}
+
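+/* Classify a saved route message: 2 means the route must be skipped (the
+ * kernel re-creates such routes itself), 1 marks kernel-protocol routes for
+ * which -EEXIST replies are expected, 0 is a regular route to replay. */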
+static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx)
+{
+	int min_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+	struct rtmsg *rtm = NLMSG_DATA(nlh);
+	__u32 prefix0 = 0;
+
+	if (nlh->nlmsg_len > min_len) {
+		int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
+		struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len);
+
+		while (RTA_OK(rta, attrlen)) {
+			if (rta->rta_type == RTA_DST) {
+				prefix0 = *(__u32*)RTA_DATA(rta);
+			}
+			rta = RTA_NEXT(rta, attrlen);
+		}
+	}
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (rtm->rtm_family == AF_INET6) {
+		if (rtm->rtm_type == RTN_LOCAL)
+			return 2;
+		if (rtm->rtm_flags & RTM_F_CLONED)
+			return 2;
+		if (rtm->rtm_protocol == RTPROT_UNSPEC ||
+		    rtm->rtm_protocol == RTPROT_RA ||
+		    rtm->rtm_protocol == RTPROT_REDIRECT ||
+		    rtm->rtm_protocol == RTPROT_KERNEL)
+			return 2;
+		if (rtm->rtm_protocol == RTPROT_BOOT &&
+		    ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) ||
+		     (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000))))
+			return 2;
+	}
+#endif
+	return rtm->rtm_protocol == RTPROT_KERNEL;
+}
+
+int rst_restore_route(struct cpt_context *ctx)
+{
+	int err;
+	struct socket *sock;
+	struct msghdr msg;
+	struct iovec iov;
+	struct sockaddr_nl nladdr;
+	mm_segment_t oldfs;
+	loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_object_hdr v;
+	char *pg;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	if (h.cpt_hdrlen >= h.cpt_next)
+		return 0;
+
+	sec += h.cpt_hdrlen;
+	err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx);
+	if (err < 0)
+		return err;
+
+	err = sock_create(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock);
+	if (err)
+		return err;
+
+	pg = (char*)__get_free_page(GFP_KERNEL);
+	if (pg == NULL) {
+		err = -ENOMEM;
+		goto out_sock;
+	}
+
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+
+	endsec = sec + v.cpt_next;
+	sec += v.cpt_hdrlen;
+
+	while (sec < endsec) {
+		struct nlmsghdr *n;
+		struct nlmsghdr nh;
+		int kernel_flag;
+
+		if (endsec - sec < sizeof(nh))
+			break;
+
+		err = ctx->pread(&nh, sizeof(nh), ctx, sec);
+		if (err)
+			goto out_sock_pg;
+		if (nh.nlmsg_len < sizeof(nh) || nh.nlmsg_len > PAGE_SIZE ||
+		    endsec - sec < nh.nlmsg_len) {
+			err = -EINVAL;
+			goto out_sock_pg;
+		}
+		err = ctx->pread(pg, nh.nlmsg_len, ctx, sec);
+		if (err)
+			goto out_sock_pg;
+
+		n = (struct nlmsghdr*)pg;
+		n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE;
+
+		err = rewrite_rtmsg(n, ctx);
+		if (err < 0)
+			goto out_sock_pg;
+		kernel_flag = err;
+
+		if (kernel_flag == 2)
+			goto do_next;
+
+		iov.iov_base=n;
+		iov.iov_len=nh.nlmsg_len;
+		msg.msg_name=&nladdr;
+		msg.msg_namelen=sizeof(nladdr);
+		msg.msg_iov=&iov;
+		msg.msg_iovlen=1;
+		msg.msg_control=NULL;
+		msg.msg_controllen=0;
+		msg.msg_flags=MSG_DONTWAIT;
+
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = sock_sendmsg(sock, &msg, nh.nlmsg_len);
+		set_fs(oldfs);
+
+		if (err < 0)
+			goto out_sock_pg;
+		err = 0;
+
+		iov.iov_base=pg;
+		iov.iov_len=PAGE_SIZE;
+
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
+		set_fs(oldfs);
+		if (err != -EAGAIN) {
+			if (n->nlmsg_type == NLMSG_ERROR) {
+				struct nlmsgerr *e = NLMSG_DATA(n);
+				if (e->error != -EEXIST || !kernel_flag)
+					eprintk_ctx("NLMERR: %d\n", e->error);
+			} else {
+				eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type);
+			}
+		}
+do_next:
+		err = 0;
+		sec += NLMSG_ALIGN(nh.nlmsg_len);
+	}
+
+out_sock_pg:
+	free_page((unsigned long)pg);
+out_sock:
+	sock_release(sock);
+	return err;
+}
+
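+/* Re-enable networking of the VE once the restore is complete. */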
+int rst_resume_network(struct cpt_context *ctx)
+{
+	struct ve_struct *env;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (!env)
+		return -ESRCH;
+	env->disable_net = 0;
+	put_ve(env);
+	return 0;
+}
+
+static int rst_restore_netstats(loff_t pos, struct net_device *dev,
+			struct cpt_context * ctx)
+{
+	struct cpt_netstats_image *n;
+	struct net_device_stats *stats;
+	int err;
+
+	if (dev->netdev_ops->ndo_cpt == NULL) {
+		err = -ENODEV;
+		eprintk_ctx("Network device %s is not supported\n", dev->name);
+		return err;
+	}
+
+	n = cpt_get_buf(ctx);
+	err = rst_get_object(CPT_OBJ_NET_STATS, pos, n, ctx);
+	if (err)
+		goto out;
+	BUG_ON(sizeof(struct cpt_netstats_image) != n->cpt_hdrlen);
+	preempt_disable();
+
+	stats = &dev->s_stats;
+
+	stats->rx_packets = n->cpt_rx_packets;
+	stats->tx_packets = n->cpt_tx_packets;
+	stats->rx_bytes = n->cpt_rx_bytes;
+	stats->tx_bytes = n->cpt_tx_bytes;
+	stats->rx_errors = n->cpt_rx_errors;
+	stats->tx_errors = n->cpt_tx_errors;
+	stats->rx_dropped = n->cpt_rx_dropped;
+	stats->tx_dropped = n->cpt_tx_dropped;
+	stats->multicast = n->cpt_multicast;
+	stats->collisions = n->cpt_collisions;
+	stats->rx_length_errors = n->cpt_rx_length_errors;
+	stats->rx_over_errors = n->cpt_rx_over_errors;
+	stats->rx_crc_errors = n->cpt_rx_crc_errors;
+	stats->rx_frame_errors = n->cpt_rx_frame_errors;
+	stats->rx_fifo_errors = n->cpt_rx_fifo_errors;
+	stats->rx_missed_errors = n->cpt_rx_missed_errors;
+	stats->tx_aborted_errors = n->cpt_tx_aborted_errors;
+	stats->tx_carrier_errors = n->cpt_tx_carrier_errors;
+	stats->tx_fifo_errors = n->cpt_tx_fifo_errors;
+	stats->tx_heartbeat_errors = n->cpt_tx_heartbeat_errors;
+	stats->tx_window_errors = n->cpt_tx_window_errors;
+	stats->rx_compressed = n->cpt_rx_compressed;
+	stats->tx_compressed = n->cpt_tx_compressed;
+
+	preempt_enable();
+out:
+	cpt_release_buf(ctx);
+	return err;
+}
+
+static int rst_restore_idev_cnf(loff_t pos, struct net_device *dev,
+			struct cpt_context *ctx)
+{
+	struct cpt_idev_cnf_image *d;
+	struct in_device *in_dev;
+	int err;
+
+	d = cpt_get_buf(ctx);
+	err = rst_get_object(CPT_OBJ_NET_IDEV_CNF, pos, d, ctx);
+	if (err)
+		goto out;
+
+	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
+		if ((in_dev = inetdev_init(dev)) == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+	memcpy(in_dev->cnf.data, d->cpt_data, sizeof(d->cpt_data));
+out:
+	cpt_release_buf(ctx);
+	return err;
+}
+
+int rst_restore_netdev(struct cpt_context *ctx)
+{
+	struct net *net = get_exec_env()->ve_netns;
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_netdev_image di;
+	struct net_device *dev;
+
+	get_exec_env()->disable_net = 1;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		loff_t pos;
+		struct net_device *dev_new;
+		struct netdev_rst *ops;
+
+		err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx);
+		if (err)
+			return err;
+
+		rtnl_lock();
+		pos = sec + di.cpt_hdrlen;
+		if (di.cpt_next > sizeof(di)) {
+			struct cpt_object_hdr hdr;
+			err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr),
+					ctx, sec + di.cpt_hdrlen);
+			if (err)
+				goto out;
+
+			ops = NULL;
+			while (1) {
+				ops = netdev_find_rst(hdr.cpt_object, ops);
+				if (ops == NULL)
+					break;
+
+				err = ops->ndo_rst(sec, &di, &rst_ops, ctx);
+				if (!err) {
+					pos += hdr.cpt_next;
+					break;
+				} else if (err < 0) {
+					eprintk_ctx("netdev %d rst failed %d\n",
+							hdr.cpt_object, err);
+					goto out;
+				}
+			}
+		}
+
+		dev = __dev_get_by_name(net, di.cpt_name);
+		if (dev) {
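+			/* The saved ifindex must be preserved: if it differs,
+			 * move the device to it, swapping indices with any
+			 * device that currently occupies that slot. */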
+			if (dev->ifindex != di.cpt_index) {
+				dev_new = __dev_get_by_index(net, di.cpt_index);
+				if (!dev_new) {
+					write_lock_bh(&dev_base_lock);
+					hlist_del(&dev->index_hlist);
+					if (dev->iflink == dev->ifindex)
+						dev->iflink = di.cpt_index;
+					dev->ifindex = di.cpt_index;
+					hlist_add_head(&dev->index_hlist,
+							dev_index_hash(net, dev->ifindex));
+					write_unlock_bh(&dev_base_lock);
+				} else {
+					write_lock_bh(&dev_base_lock);
+					hlist_del(&dev->index_hlist);
+					hlist_del(&dev_new->index_hlist);
+					if (dev_new->iflink == dev_new->ifindex)
+						dev_new->iflink = dev->ifindex;
+					dev_new->ifindex = dev->ifindex;
+					if (dev->iflink == dev->ifindex)
+						dev->iflink = di.cpt_index;
+					dev->ifindex = di.cpt_index;
+					hlist_add_head(&dev->index_hlist,
+							dev_index_hash(net, dev->ifindex));
+					hlist_add_head(&dev_new->index_hlist,
+							dev_index_hash(net, dev_new->ifindex));
+					write_unlock_bh(&dev_base_lock);
+				}
+			}
+			if (di.cpt_flags^dev->flags) {
+				err = dev_change_flags(dev, di.cpt_flags);
+				if (err)
+					eprintk_ctx("dev_change_flags err: %d\n", err);
+			}
+			if (cpt_object_has(&di, cpt_mtu))
+				dev->mtu = di.cpt_mtu;
+			while (pos < sec + di.cpt_next) {
+				struct cpt_object_hdr hdr;
+				err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr),
+						ctx, pos);
+				if (err)
+					goto out;
+				if (hdr.cpt_object == CPT_OBJ_NET_HWADDR) {
+					/* Restore hardware address */
+					struct cpt_hwaddr_image hw;
+					err = rst_get_object(CPT_OBJ_NET_HWADDR,
+							pos, &hw, ctx);
+					if (err)
+						goto out;
+					BUILD_BUG_ON(sizeof(hw.cpt_dev_addr) !=
+							MAX_ADDR_LEN);
+					memcpy(dev->dev_addr, hw.cpt_dev_addr,
+							sizeof(hw.cpt_dev_addr));
+				} else if (hdr.cpt_object == CPT_OBJ_NET_STATS) {
+					err = rst_restore_netstats(pos, dev, ctx);
+					if (err) {
+						eprintk_ctx("rst stats %s: %d\n",
+								di.cpt_name, err);
+						goto out;
+					}
+				} else if (hdr.cpt_object == CPT_OBJ_NET_IDEV_CNF) {
+					err = rst_restore_idev_cnf(pos, dev, ctx);
+					if (err) {
+						eprintk_ctx("rst idev config %s: %d\n",
+						di.cpt_name, err);
+						goto out;
+					}
+				}
+				pos += hdr.cpt_next;
+			}
+		} else {
+			eprintk_ctx("unknown interface 2 %s\n", di.cpt_name);
+		}
+		rtnl_unlock();
+		sec += di.cpt_next;
+	}
+	return 0;
+out:
+	rtnl_unlock();
+	return err;
+}
+
+struct args_t
+{
+	int *pfd;
+	bool is_ipv6;
+};
+
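+/* Helper kernel thread body: wire the read end of the pipe to fd 0, close
+ * all other descriptors and exec iptables-restore/ip6tables-restore,
+ * trying /sbin first and falling back to /usr/sbin. */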
+static int dumpfn(void *arg)
+{
+	int i;
+	struct args_t *args = arg;
+	int *pfd = args->pfd;
+	char *argv[] = { "iptables-restore", "-c", NULL };
+	const char *path1, *path2;
+
+	if (!args->is_ipv6) {
+		path1 = "/sbin/iptables-restore";
+		path2 = "/usr/sbin/iptables-restore";
+	} else {
+		argv[0] = "ip6tables-restore";
+		path1 = "/sbin/ip6tables-restore";
+		path2 = "/usr/sbin/ip6tables-restore";
+	}
+
+	if (pfd[0] != 0)
+		sc_dup2(pfd[0], 0);
+
+	for (i=1; i<current->files->fdt->max_fds; i++)
+		sc_close(i);
+
+	module_put(THIS_MODULE);
+
+	set_fs(KERNEL_DS);
+	i = kernel_execve(path1, argv, NULL);
+	if (i == -ENOENT)
+		i = kernel_execve(path2, argv, NULL);
+	eprintk("failed to exec %s: %d\n", argv[0], i);
+	return 255 << 8;
+}
+
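+/* Replay one saved iptables/ip6tables dump: stream its text through a pipe
+ * into an iptables-restore child and check the child's exit status. */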
+static int rst_restore_xtables(struct cpt_context *ctx, loff_t *pos)
+{
+	int err;
+	int pfd[2];
+	struct file *f;
+	struct cpt_object_hdr v;
+	int n;
+	loff_t end;
+	int pid;
+	int status;
+	mm_segment_t oldfs;
+	sigset_t ignore, blocked;
+	struct args_t args;
+
+	err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx);
+	if (err < 0)
+		return err;
+
+	err = sc_pipe(pfd);
+	if (err < 0)
+		return err;
+	args.pfd = pfd;
+	args.is_ipv6 = (v.cpt_content != CPT_CONTENT_NAME);
+	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+	sigprocmask(SIG_BLOCK, &ignore, &blocked);
+	pid = err = local_kernel_thread(dumpfn, (void*)&args, SIGCHLD, 0);
+	if (err < 0) {
+		eprintk_ctx("iptables local_kernel_thread: %d\n", err);
+		goto out;
+	}
+	f = fget(pfd[1]);
+	sc_close(pfd[1]);
+	sc_close(pfd[0]);
+
+	ctx->file->f_pos = *pos + v.cpt_hdrlen;
+	end = *pos + v.cpt_next;
+	do {
+		char *p;
+		char buf[16];
+
+		n = end - ctx->file->f_pos;
+		if (n > sizeof(buf))
+			n = sizeof(buf);
+
+		if (ctx->read(buf, n, ctx))
+			break;
+		if ((p = memchr(buf, 0, n)) != NULL)
+			n = p - buf;
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		f->f_op->write(f, buf, n, &f->f_pos);
+		set_fs(oldfs);
+	} while (ctx->file->f_pos < end);
+
+	fput(f);
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if ((err = sc_waitx(pid, 0, &status)) < 0)
+		eprintk_ctx("wait4: %d\n", err);
+	else if ((status & 0x7f) == 0) {
+		err = (status & 0xff00) >> 8;
+		if (err != 0) {
+			eprintk_ctx("iptables-restore exited with %d\n", err);
+			eprintk_ctx("Most probably some iptables modules are not loaded\n");
+			eprintk_ctx("or the CT's iptables utilities are incompatible with this kernel (older than version 1.4.0)\n");
+			eprintk_ctx("(Offline migration and an iptables upgrade might help.)\n");
+			err = -EINVAL;
+		}
+	} else {
+		eprintk_ctx("iptables-restore terminated\n");
+		err = -EINVAL;
+	}
+	set_fs(oldfs);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+	*pos = end;
+
+	return err;
+
+out:
+	if (pfd[1] >= 0)
+		sc_close(pfd[1]);
+	if (pfd[0] >= 0)
+		sc_close(pfd[0]);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+	return err;
+}
+
+static int rst_restore_iptables(struct cpt_context *ctx)
+{
+	loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES];
+	struct cpt_section_hdr h;
+	loff_t pos;
+	int err;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	if (h.cpt_hdrlen == h.cpt_next)
+		return 0;
+	if (h.cpt_hdrlen > h.cpt_next)
+		return -EINVAL;
+	pos = sec + h.cpt_hdrlen;
+
+	err = rst_restore_xtables(ctx, &pos);
+	if (err)
+		return err;
+	else if (pos == sec + h.cpt_next)
+		return 0;
+
+	return rst_restore_xtables(ctx, &pos);
+}
+
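+/* Restore one block of SNMP statistics. Returns 0 when the section is
+ * exhausted, 1 when more blocks follow, and a negative value on error. */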
+static int rst_restore_snmp_stat(struct cpt_context *ctx, void *mib[], int n,
+		loff_t *ppos, loff_t endpos)
+{
+	int err, in, i;
+	struct cpt_object_hdr o;
+	__u32 *stats;
+
+	err = rst_get_object(CPT_OBJ_BITS, *ppos, &o, ctx);
+	if (err)
+		return err;
+
+	in = o.cpt_next - o.cpt_hdrlen;
+	if (in >= PAGE_SIZE - 4) {
+		eprintk_ctx("Too long SNMP buf (%d)\n", in);
+		return -EINVAL;
+	}
+
+	if (o.cpt_content != CPT_CONTENT_DATA) {
+		if (o.cpt_content == CPT_CONTENT_VOID)
+			return 1;
+
+		eprintk_ctx("Corrupted SNMP stats\n");
+		return -EINVAL;
+	}
+
+	stats = cpt_get_buf(ctx);
+	err = ctx->pread(stats, in, ctx, (*ppos) + o.cpt_hdrlen);
+	if (err)
+		goto out;
+	/*
+	 * IPv6 may not be loaded, or may be disabled.
+	 */
+	if (mib[0] == NULL)
+		goto out;
+
+	in /= sizeof(*stats);
+	if (in > n)
+		wprintk_ctx("SNMP stats trimmed\n");
+	else
+		n = in;
+
+	for (i = 0; i < n; i++)
+		*((unsigned long *)(per_cpu_ptr(mib[0], 0)) + i) = stats[i];
+
+	*ppos += o.cpt_next;
+	if (*ppos < endpos)
+		err = 1; /* go on restoring */
+out:
+	cpt_release_buf(ctx);
+	return err;
+}
+
+static int rst_restore_snmp(struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_SNMP_STATS];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct ve_struct *ve;
+	struct net *net;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_SNMP_STATS || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	ve = get_exec_env();
+	net = ve->ve_netns;
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	if (sec >= endsec)
+		goto out;
+
+	err = rst_restore_snmp_stat(ctx, (void **)&net->mib.net_statistics,
+			LINUX_MIB_MAX, &sec, endsec);
+	if (err <= 0)
+		goto out;
+	err = rst_restore_snmp_stat(ctx, (void **)&net->mib.ip_statistics,
+			IPSTATS_MIB_MAX, &sec, endsec);
+	if (err <= 0)
+		goto out;
+	err = rst_restore_snmp_stat(ctx, (void **)&net->mib.tcp_statistics,
+			TCP_MIB_MAX, &sec, endsec);
+	if (err <= 0)
+		goto out;
+	err = rst_restore_snmp_stat(ctx, (void **)&net->mib.udp_statistics,
+			UDP_MIB_MAX, &sec, endsec);
+	if (err <= 0)
+		goto out;
+	err = rst_restore_snmp_stat(ctx, (void **)&net->mib.icmp_statistics,
+			ICMP_MIB_MAX, &sec, endsec);
+	if (err <= 0)
+		goto out;
+	err = rst_restore_snmp_stat(ctx, (void **)&net->mib.icmpmsg_statistics,
+			ICMPMSG_MIB_MAX, &sec, endsec);
+	if (err <= 0)
+		goto out;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	err = rst_restore_snmp_stat(ctx, (void **)&net->mib.ipv6_statistics,
+			IPSTATS_MIB_MAX, &sec, endsec);
+	if (err <= 0)
+		goto out;
+	err = rst_restore_snmp_stat(ctx, (void **)&net->mib.udp_stats_in6,
+			UDP_MIB_MAX, &sec, endsec);
+	if (err <= 0)
+		goto out;
+	err = rst_restore_snmp_stat(ctx, (void **)&net->mib.icmpv6_statistics,
+			ICMP6_MIB_MAX, &sec, endsec);
+#endif
+	if (err == 1)
+		err = 0;
+out:
+	return err;
+}
+
+int rst_restore_net(struct cpt_context *ctx)
+{
+	int err;
+
+	err = rst_restore_netdev(ctx);
+	if (!err)
+		err = rst_restore_ifaddr(ctx);
+	if (!err)
+		err = rst_restore_route(ctx);
+	if (!err)
+		err = rst_restore_iptables(ctx);
+	if (!err)
+		err = rst_restore_ip_conntrack(ctx);
+	if (!err)
+		err = rst_restore_snmp(ctx);
+	return err;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_proc.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_proc.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_proc.c	2015-01-21 12:02:48.231093445 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_proc.c	2015-01-21 12:02:50.408035655 +0300
@@ -0,0 +1,606 @@
+/*
+ *
+ *  kernel/cpt/rst_proc.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_ioctl.h>
+#include <linux/kmod.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_dump.h"
+#include "cpt_files.h"
+#include "cpt_mm.h"
+#include "cpt_kernel.h"
+
+MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>");
+MODULE_LICENSE("GPL");
+
+/* List of contexts and lock protecting the list */
+static struct list_head cpt_context_list;
+static spinlock_t cpt_context_lock;
+
+static int proc_read(char *buffer, char **start, off_t offset,
+		     int length, int *eof, void *data)
+{
+	off_t pos = 0;
+	off_t begin = 0;
+	int len = 0;
+	cpt_context_t *ctx;
+
+	len += sprintf(buffer, "Ctx      Id       VE       State\n");
+
+	spin_lock(&cpt_context_lock);
+
+	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+		len += sprintf(buffer+len,"%p %08x %-8u %d",
+			       ctx,
+			       ctx->contextid,
+			       ctx->ve_id,
+			       ctx->ctx_state
+			       );
+
+		buffer[len++] = '\n';
+
+		pos = begin+len;
+		if (pos < offset) {
+			len = 0;
+			begin = pos;
+		}
+		if (pos > offset+length)
+			goto done;
+	}
+	*eof = 1;
+
+done:
+	spin_unlock(&cpt_context_lock);
+	*start = buffer + (offset - begin);
+	len -= (offset - begin);
+	if(len > length)
+		len = length;
+	if(len < 0)
+		len = 0;
+	return len;
+}
+
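+/* Final destruction of a context. Called with cpt_context_lock held; the
+ * lock is dropped for the actual teardown and re-acquired before return. */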
+void rst_context_release(cpt_context_t *ctx)
+{
+	list_del(&ctx->ctx_list);
+	spin_unlock(&cpt_context_lock);
+
+	if (ctx->ctx_state > 0)
+		rst_kill(ctx);
+	ctx->ctx_state = CPT_CTX_ERROR;
+
+	rst_close_dumpfile(ctx);
+
+	rst_close_pram(ctx);
+
+	if (ctx->anonvmas) {
+		int h;
+		for (h = 0; h < CPT_ANONVMA_HSIZE; h++) {
+			while (!hlist_empty(&ctx->anonvmas[h])) {
+				struct hlist_node *elem = ctx->anonvmas[h].first;
+				hlist_del(elem);
+				kfree(elem);
+			}
+		}
+		free_page((unsigned long)ctx->anonvmas);
+	}
+	cpt_flush_error(ctx);
+	if (ctx->errorfile) {
+		fput(ctx->errorfile);
+		ctx->errorfile = NULL;
+	}
+	if (ctx->error_msg) {
+		free_page((unsigned long)ctx->error_msg);
+		ctx->error_msg = NULL;
+	}
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+	rst_drop_iter_rbtree(ctx);
+	if (ctx->pagein_file_out)
+		fput(ctx->pagein_file_out);
+	if (ctx->pagein_file_in)
+		fput(ctx->pagein_file_in);
+#endif
+	if (ctx->filejob_queue)
+		rst_flush_filejobs(ctx);
+	if (ctx->vdso)
+		free_page((unsigned long)ctx->vdso);
+	if (ctx->objcount)
+		eprintk_ctx("%d objects leaked\n", ctx->objcount);
+	kfree(ctx);
+
+	spin_lock(&cpt_context_lock);
+}
+
+static void __cpt_context_put(cpt_context_t *ctx)
+{
+	if (!--ctx->refcount)
+		rst_context_release(ctx);
+}
+
+static void cpt_context_put(cpt_context_t *ctx)
+{
+	spin_lock(&cpt_context_lock);
+	__cpt_context_put(ctx);
+	spin_unlock(&cpt_context_lock);
+}
+
+cpt_context_t * rst_context_open(void)
+{
+	cpt_context_t *ctx;
+
+	if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) {
+		rst_context_init(ctx);
+		spin_lock(&cpt_context_lock);
+		list_add_tail(&ctx->ctx_list, &cpt_context_list);
+		spin_unlock(&cpt_context_lock);
+		ctx->error_msg = (char*)__get_free_page(GFP_KERNEL);
+		if (ctx->error_msg != NULL)
+			ctx->error_msg[0] = 0;
+	}
+	return ctx;
+}
+
+void rst_report_error(int err, cpt_context_t *ctx)
+{
+	if (ctx->statusfile) {
+		mm_segment_t oldfs;
+		int status = 7 /* VZ_ENVCREATE_ERROR */;
+
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		if (ctx->statusfile->f_op && ctx->statusfile->f_op->write)
+			ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos);
+		set_fs(oldfs);
+		fput(ctx->statusfile);
+		ctx->statusfile = NULL;
+	}
+}
+
+
+static cpt_context_t * cpt_context_lookup(unsigned int ctxid)
+{
+	cpt_context_t *ctx;
+
+	spin_lock(&cpt_context_lock);
+	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+		if (ctx->contextid == ctxid) {
+			ctx->refcount++;
+			spin_unlock(&cpt_context_lock);
+			return ctx;
+		}
+	}
+	spin_unlock(&cpt_context_lock);
+	return NULL;
+}
+
+static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg)
+{
+	int err = 0;
+	cpt_context_t *ctx;
+	struct file *dfile = NULL;
+
+	unlock_kernel();
+
+	request_module("vzcptpram");
+
+	if (cmd == CPT_TEST_CAPS) {
+		err = test_cpu_caps_and_features();
+		goto out_lock;
+	}
+
+	if (cmd == CPT_TEST_VERSION) {
+		err = rst_image_acceptable(arg);
+		goto out_lock;
+	}
+
+	if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) {
+		cpt_context_t *old_ctx;
+
+		ctx = NULL;
+		if (cmd == CPT_JOIN_CONTEXT) {
+			err = -ENOENT;
+			ctx = cpt_context_lookup(arg);
+			if (!ctx)
+				goto out_lock;
+		}
+
+		spin_lock(&cpt_context_lock);
+		old_ctx = (cpt_context_t*)file->private_data;
+		file->private_data = ctx;
+
+		if (old_ctx) {
+			if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) {
+				old_ctx->sticky = 0;
+				old_ctx->refcount--;
+			}
+			__cpt_context_put(old_ctx);
+		}
+		spin_unlock(&cpt_context_lock);
+		err = 0;
+		goto out_lock;
+	}
+
+	spin_lock(&cpt_context_lock);
+	ctx = (cpt_context_t*)file->private_data;
+	if (ctx)
+		ctx->refcount++;
+	spin_unlock(&cpt_context_lock);
+
+	if (!ctx) {
+		cpt_context_t *old_ctx;
+
+		err = -ENOMEM;
+		ctx = rst_context_open();
+		if (!ctx)
+			goto out_lock;
+
+		spin_lock(&cpt_context_lock);
+		old_ctx = (cpt_context_t*)file->private_data;
+		if (!old_ctx) {
+			ctx->refcount++;
+			file->private_data = ctx;
+		} else {
+			old_ctx->refcount++;
+		}
+		if (old_ctx) {
+			__cpt_context_put(ctx);
+			ctx = old_ctx;
+		}
+		spin_unlock(&cpt_context_lock);
+	}
+
+	if (cmd == CPT_GET_CONTEXT) {
+		unsigned int contextid = (unsigned int)arg;
+
+		err = -EINVAL;
+		if (ctx->contextid && ctx->contextid != contextid)
+			goto out_nosem;
+		if (!ctx->contextid) {
+			cpt_context_t *c1 = cpt_context_lookup(contextid);
+			if (c1) {
+				cpt_context_put(c1);
+				err = -EEXIST;
+				goto out_nosem;
+			}
+			ctx->contextid = contextid;
+		}
+		spin_lock(&cpt_context_lock);
+		if (!ctx->sticky) {
+			ctx->sticky = 1;
+			ctx->refcount++;
+		}
+		spin_unlock(&cpt_context_lock);
+		err = 0;
+		goto out_nosem;
+	}
+
+	down(&ctx->main_sem);
+
+	err = -EBUSY;
+	if (ctx->ctx_state < 0)
+		goto out;
+
+	err = 0;
+	switch (cmd) {
+	case CPT_SET_DUMPFD:
+		if (ctx->ctx_state > 0) {
+			err = -EBUSY;
+			break;
+		}
+		if (arg >= 0) {
+			err = -EBADF;
+			dfile = fget(arg);
+			if (dfile == NULL)
+				break;
+			if (dfile->f_op == NULL ||
+			    dfile->f_op->read == NULL) {
+				fput(dfile);
+				break;
+			}
+			err = 0;
+		}
+		if (ctx->file)
+			fput(ctx->file);
+		ctx->file = dfile;
+		break;
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+	case CPT_SET_PAGEINFDIN:
+		if (ctx->ctx_state > 0) {
+			err = -EBUSY;
+			break;
+		}
+		if (arg >= 0) {
+			dfile = fget(arg);
+			if (dfile == NULL) {
+				err = -EBADF;
+				break;
+			}
+		}
+		if (ctx->pagein_file_in)
+			fput(ctx->pagein_file_in);
+		ctx->pagein_file_in = dfile;
+		break;
+	case CPT_SET_PAGEINFDOUT:
+		if (ctx->ctx_state > 0) {
+			err = -EBUSY;
+			break;
+		}
+		if (arg >= 0) {
+			dfile = fget(arg);
+			if (dfile == NULL) {
+				err = -EBADF;
+				break;
+			}
+		}
+		if (ctx->pagein_file_out)
+			fput(ctx->pagein_file_out);
+		ctx->pagein_file_out = dfile;
+		break;
+	case CPT_ITER:
+		err = rst_iteration(ctx);
+		break;
+#endif
+	case CPT_SET_LOCKFD:
+	case CPT_SET_LOCKFD2:
+		if (ctx->ctx_state > 0) {
+			err = -EBUSY;
+			break;
+		}
+		if (arg >= 0) {
+			dfile = fget(arg);
+			if (dfile == NULL) {
+				err = -EBADF;
+				break;
+			}
+		}
+		if (ctx->lockfile)
+			fput(ctx->lockfile);
+		ctx->lockfile = dfile;
+		ctx->lockfile_new = (cmd == CPT_SET_LOCKFD2);
+		break;
+	case CPT_SET_STATUSFD:
+		if (ctx->ctx_state > 0) {
+			err = -EBUSY;
+			break;
+		}
+		if (arg >= 0) {
+			dfile = fget(arg);
+			if (dfile == NULL) {
+				err = -EBADF;
+				break;
+			}
+		}
+		if (ctx->statusfile)
+			fput(ctx->statusfile);
+		ctx->statusfile = dfile;
+		break;
+	case CPT_SET_ERRORFD:
+		if (arg >= 0) {
+			dfile = fget(arg);
+			if (dfile == NULL) {
+				err = -EBADF;
+				break;
+			}
+		}
+		if (ctx->errorfile)
+			fput(ctx->errorfile);
+		ctx->errorfile = dfile;
+		break;
+	case CPT_HARDLNK_ON:
+		ctx->hardlinked_on = 1;
+		break;
+	case CPT_SET_VEID:
+		if (ctx->ctx_state > 0) {
+			err = -EBUSY;
+			break;
+		}
+		ctx->ve_id = arg;
+		break;
+	case CPT_UNDUMP:
+		if (ctx->ctx_state > 0) {
+			err = -ENOENT;
+			break;
+		}
+		ctx->ctx_state = CPT_CTX_UNDUMPING;
+#ifdef ITER_DEBUG
+		rst_iteration(ctx);
+#endif
+		err = vps_rst_undump(ctx);
+		if (err) {
+			int ret;
+
+			rst_report_error(err, ctx);
+
+			ret = rst_kill(ctx);
+			if (ret == 0 || ret == -ESRCH)
+				ctx->ctx_state = CPT_CTX_IDLE;
+			else
+				ctx->ctx_state = CPT_CTX_ERROR;
+		} else {
+			ctx->ctx_state = CPT_CTX_UNDUMPED;
+			printk(KERN_INFO "CT: %d: restored\n", ctx->ve_id);
+		}
+		break;
+	case CPT_RESUME:
+		if (ctx->ctx_state != CPT_CTX_UNDUMPED) {
+			err = -ENOENT;
+			break;
+		}
+		err = rst_resume(ctx);
+		if (!err)
+			ctx->ctx_state = CPT_CTX_IDLE;
+		break;
+	case CPT_KILL:
+		if (!ctx->ctx_state) {
+			err = -ENOENT;
+			break;
+		}
+		err = rst_kill(ctx);
+		if (!err)
+			ctx->ctx_state = CPT_CTX_IDLE;
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+out:
+	cpt_flush_error(ctx);
+	up(&ctx->main_sem);
+out_nosem:
+	cpt_context_put(ctx);
+out_lock:
+	lock_kernel();
+	if (err == -ERESTARTSYS || err == -ERESTARTNOINTR ||
+	    err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK)
+		err = -EINTR;
+	return err;
+}
+
+static int rst_open(struct inode * inode, struct file * file)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int rst_release(struct inode * inode, struct file * file)
+{
+	cpt_context_t *ctx;
+
+	spin_lock(&cpt_context_lock);
+	ctx = (cpt_context_t*)file->private_data;
+	file->private_data = NULL;
+	if (ctx)
+		__cpt_context_put(ctx);
+	spin_unlock(&cpt_context_lock);
+
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+static struct file_operations rst_fops =
+{
+	.owner		= THIS_MODULE,
+	.ioctl		= rst_ioctl,
+	.open		= rst_open,
+	.release	= rst_release,
+};
+
+
+static struct proc_dir_entry *proc_ent;
+extern void *schedule_tail_p;
+extern void schedule_tail_hook(void);
+extern struct ctl_table delayfs_table[];
+
+static struct ctl_table_header *ctl_header;
+
+static ctl_table debug_table[] = {
+	{
+		.procname	= "rst",
+		.data		= &debug_level,
+		.maxlen		= sizeof(debug_level),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "delayfs",
+		.mode		= 0555,
+		.child		= delayfs_table,
+	},
+	{ .ctl_name = 0 }
+};
+static ctl_table root_table[] = {
+	{
+		.ctl_name	= CTL_DEBUG,
+		.procname	= "debug",
+		.mode		= 0555,
+		.child		= debug_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static int __init init_rst(void)
+{
+	int err;
+
+	err = register_filesystem(&delayfs_type);
+	if (err)
+		goto err_fs;
+
+	err = -ENOMEM;
+	ctl_header = register_sysctl_table(root_table);
+	if (!ctl_header)
+		goto err_mon;
+
+	spin_lock_init(&cpt_context_lock);
+	INIT_LIST_HEAD(&cpt_context_list);
+
+	err = -EINVAL;
+	proc_ent = proc_create("rst", 0600, NULL, NULL);
+	if (!proc_ent)
+		goto err_out;
+
+	rst_fops.read = proc_ent->proc_fops->read;
+	rst_fops.write = proc_ent->proc_fops->write;
+	rst_fops.llseek = proc_ent->proc_fops->llseek;
+	proc_ent->proc_fops = &rst_fops;
+
+	proc_ent->read_proc = proc_read;
+	proc_ent->data = NULL;
+	return 0;
+
+err_out:
+	unregister_sysctl_table(ctl_header);
+err_mon:
+	unregister_filesystem(&delayfs_type);
+err_fs:
+	return err;
+}
+module_init(init_rst);
+
+static void __exit exit_rst(void)
+{
+	remove_proc_entry("rst", NULL);
+	unregister_sysctl_table(ctl_header);
+
+	spin_lock(&cpt_context_lock);
+	while (!list_empty(&cpt_context_list)) {
+		cpt_context_t *ctx;
+		ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list);
+
+		if (!ctx->sticky)
+			ctx->refcount++;
+		ctx->sticky = 0;
+
+		BUG_ON(ctx->refcount != 1);
+
+		__cpt_context_put(ctx);
+	}
+	spin_unlock(&cpt_context_lock);
+	unregister_filesystem(&delayfs_type);
+}
+module_exit(exit_rst);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_process.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_process.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_process.c	2015-01-21 12:02:48.232093419 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_process.c	2015-01-21 12:02:54.124936987 +0300
@@ -0,0 +1,1819 @@
+/*
+ *
+ *  kernel/cpt/rst_process.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/posix-timers.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/ptrace.h>
+#include <linux/tty.h>
+#include <linux/nsproxy.h>
+#include <linux/securebits.h>
+#ifdef CONFIG_X86
+#include <asm/desc.h>
+#include <asm/i387.h>
+#endif
+#include <asm/unistd.h>
+
+#include <bc/beancounter.h>
+#include <bc/misc.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_files.h"
+#include "cpt_mm.h"
+#include "cpt_ubc.h"
+#include "cpt_process.h"
+#include "cpt_kernel.h"
+#include "cpt_syscalls.h"
+
+
+#define HOOK_RESERVE	256
+
+struct resume_info
+{
+	asmlinkage void (*hook)(struct resume_info *);
+	unsigned long	hooks;
+#define HOOK_TID	0
+#define HOOK_CONT	1
+#define HOOK_LSI	2
+#define HOOK_RESTART	3
+	unsigned long	tid_ptrs[2];
+	siginfo_t	last_siginfo;
+};
+
+#ifdef CONFIG_X86_32
+
+#define IN_SYSCALL(regs)	((long)(regs)->orig_ax >= 0)
+#define IN_ERROR(regs)		((long)(regs)->ax < 0)
+#define SYSCALL_ERRNO(regs)	(-(long)((regs)->ax))
+#define SYSCALL_RETVAL(regs)	((regs)->ax)
+#define SYSCALL_NR(regs)	((regs)->orig_ax)
+
+#define SYSCALL_SETRET(regs,val)	do { (regs)->ax = (val); } while (0)
+
+#define SYSCALL_RESTART2(regs,new)	do { (regs)->ax = (new); \
+					     (regs)->ip -= 2; } while (0) 
+
+#define syscall_is(tsk,regs,name)	(SYSCALL_NR(regs) == __NR_##name)
+
+/* In new kernels task_pt_regs() is defined to something inappropriate */
+#undef task_pt_regs
+#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1)
+
+#elif defined(CONFIG_X86_64)
+
+#define IN_SYSCALL(regs)	((long)(regs)->orig_ax >= 0)
+#define IN_ERROR(regs)		((long)(regs)->ax < 0)
+#define SYSCALL_ERRNO(regs)	(-(long)((regs)->ax))
+#define SYSCALL_RETVAL(regs)	((regs)->ax)
+#define SYSCALL_NR(regs)	((regs)->orig_ax)
+
+#define SYSCALL_SETRET(regs,val)	do { (regs)->ax = (val); } while (0)
+
+#define SYSCALL_RESTART2(regs,new)	do { (regs)->ax = (new); \
+					     (regs)->ip -= 2; } while (0) 
+
+#define __NR32_restart_syscall	0
+#define __NR32_rt_sigtimedwait	177
+#define __NR32_pause		29
+#define __NR32_futex		240
+
+#define syscall_is(tsk,regs,name) ((!(task_thread_info(tsk)->flags&_TIF_IA32) && \
+				    SYSCALL_NR(regs) == __NR_##name) || \
+				   ((task_thread_info(tsk)->flags&_TIF_IA32) && \
+				    SYSCALL_NR(regs) == __NR32_##name))
+
+#elif defined (CONFIG_IA64)
+
+#define IN_SYSCALL(regs)	((long)(regs)->cr_ifs >= 0)
+#define IN_ERROR(regs)		((long)(regs)->r10 == -1)
+#define SYSCALL_ERRNO(regs)	((regs)->r10 == -1 ? (long)((regs)->r8) : 0)
+#define SYSCALL_RETVAL(regs)	((regs)->r8)
+#define SYSCALL_NR(regs)	((regs)->cr_ifs >= 0 ? (regs)->r15 : -1)
+
+#define SYSCALL_SETRET(regs,val)	do { (regs)->r8 = (val); } while (0)
+
+#define SYSCALL_RESTART2(regs,new)	do { (regs)->r15 = (new); \
+					     (regs)->r10 = 0; \
+					     ia64_decrement_ip(regs); } while (0) 
+
+#define syscall_is(tsk,regs,name)	(SYSCALL_NR(regs) == __NR_##name)
+
+#else
+
+#error This arch is not supported
+
+#endif
+
+#define SYSCALL_RESTART(regs) SYSCALL_RESTART2(regs, SYSCALL_NR(regs))
+
+pid_t vpid_to_pid(pid_t nr)
+{
+	pid_t vnr;
+	struct pid *pid;
+
+	rcu_read_lock();
+	pid = find_vpid(nr);
+	vnr = (pid == NULL ? -1 : pid->numbers[0].nr);
+	rcu_read_unlock();
+	return vnr;
+}
+
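+/* Convert a saved cpt_siginfo_image back into an in-kernel siginfo_t. */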
+static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si)
+{
+	memset(info, 0, sizeof(*info));
+
+	if (cpt_object_has(si, cpt_sifields)) {
+		memcpy(&info->_sifields, si->cpt_sifields, sizeof(si->cpt_sifields));
+		goto fill_common;
+	}
+
+	switch(si->cpt_code & __SI_MASK) {
+	case __SI_TIMER:
+		info->si_tid = si->cpt_pid;
+		info->si_overrun = si->cpt_uid;
+		info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval);
+		info->si_sys_private = si->cpt_utime;
+		break;
+	case __SI_POLL:
+		info->si_band = si->cpt_pid;
+		info->si_fd = si->cpt_uid;
+		break;
+	case __SI_FAULT:
+		info->si_addr = cpt_ptr_import(si->cpt_sigval);
+#ifdef __ARCH_SI_TRAPNO
+		info->si_trapno = si->cpt_pid;
+#endif
+		break;
+	case __SI_CHLD:
+		info->si_pid = si->cpt_pid;
+		info->si_uid = si->cpt_uid;
+		info->si_status = si->cpt_sigval;
+		info->si_stime = si->cpt_stime;
+		info->si_utime = si->cpt_utime;
+		break;
+	case __SI_KILL:
+	case __SI_RT:
+	case __SI_MESGQ:
+	default:
+		info->si_pid = si->cpt_pid;
+		info->si_uid = si->cpt_uid;
+		info->si_ptr = cpt_ptr_import(si->cpt_sigval);
+		break;
+	}
+
+fill_common:
+	info->si_signo = si->cpt_signo;
+	info->si_errno = si->cpt_errno;
+	info->si_code = si->cpt_code;
+}
+
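+/*
+ * Rebuild a pending-signal queue from the image.  [start, end) holds
+ * variable-length records chained by their cpt_next offsets; only
+ * CPT_OBJ_SIGINFO records are requeued, anything else in the range is
+ * skipped.
+ */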
+static int restore_sigqueue(struct task_struct *tsk,
+			    struct sigpending *queue, unsigned long start,
+			    unsigned long end)
+{
+	while (start < end) {
+		struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start;
+		if (si->cpt_object == CPT_OBJ_SIGINFO) {
+			struct sigqueue *q;
+
+			q = __sigqueue_alloc(tsk, GFP_KERNEL, 1);
+			if (!q)
+				return -ENOMEM;
+
+			/* Preallocated elements (posix timers) are
+			 * handled separately so this is OK */
+			decode_siginfo(&q->info, si);
+			list_add_tail(&q->list, &queue->list);
+		}
+		start += si->cpt_next;
+	}
+	return 0;
+}
+
+int rst_process_linkage(cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		struct cpt_task_image *ti = obj->o_image;
+
+		if (tsk == NULL) {
+			eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm);
+			return -EINVAL;
+		}
+
+		if (task_pgrp_vnr(tsk) != ti->cpt_pgrp) {
+			struct pid *pid;
+
+			pid = alloc_vpid_safe(ti->cpt_pgrp);
+			if (!pid) {
+				eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+
+			write_lock_irq(&tasklist_lock);
+			detach_pid(tsk, PIDTYPE_PGID);
+			if (thread_group_leader(tsk))
+				attach_pid(tsk, PIDTYPE_PGID, pid);
+			else
+				put_pid(pid);
+			write_unlock_irq(&tasklist_lock);
+
+			if (task_pgrp_vnr(tsk) != pid_vnr(pid)) {
+				eprintk_ctx("cannot set PGRP " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+		}
+		if (task_session_vnr(tsk) != ti->cpt_session) {
+			struct pid *pid;
+
+			pid = alloc_vpid_safe(ti->cpt_session);
+			if (!pid) {
+				eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+
+			write_lock_irq(&tasklist_lock);
+			detach_pid(tsk, PIDTYPE_SID);
+			if (thread_group_leader(tsk))
+				attach_pid(tsk, PIDTYPE_SID, pid);
+			else
+				put_pid(pid);
+			write_unlock_irq(&tasklist_lock);
+
+			if (task_session_vnr(tsk) != pid_vnr(pid)) {
+				eprintk_ctx("cannot set SID " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+		}
+		if (ti->cpt_old_pgrp > 0 && !tsk->signal->tty_old_pgrp) {
+			struct pid *pid;
+
+			pid = find_get_pid(ti->cpt_old_pgrp);
+			if (!pid) {
+				eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+			tsk->signal->tty_old_pgrp = pid;
+		}
+	}
+
+	return 0;
+}
+
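+/*
+ * Allocate a struct pid with a fixed (virtual) number.  If the number
+ * is already in use, e.g. attached by an earlier restore step, fall
+ * back to taking a reference on the existing struct pid.
+ */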
+struct pid *alloc_vpid_safe(pid_t vnr)
+{
+	struct pid *pid;
+
+	pid = alloc_pid(current->nsproxy->pid_ns, vnr);
+	if (!pid)
+		pid = find_get_pid(vnr);
+	return pid;
+}
+
+struct pid *alloc_dummy_vpid(pid_t vnr)
+{
+	struct pid *pid;
+
+	pid = find_get_pid(vnr);
+	if (pid)
+		return pid;
+	/*
+	 * The pid belongs to a dead process. Allocate a new detached pid:
+	 * take a reference on it and then free_pid() it, which clears the
+	 * pidmap entries while our reference keeps the struct pid alive.
+	 */
+	pid = alloc_pid(current->nsproxy->pid_ns, vnr);
+	if (pid) {
+		get_pid(pid);
+		free_pid(pid);
+	}
+	return pid;
+}
+
+int restore_signal_struct(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx)
+{
+	int err;
+	struct cpt_signal_image *si;
+
+	if (!thread_group_leader(current))
+		return 0;
+
+	si = cpt_get_buf(ctx);
+
+	tty_kref_put(current->signal->tty);
+	current->signal->tty = NULL;
+
+	err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx);
+	if (err) {
+		cpt_release_buf(ctx);
+		return err;
+	}
+
+#if 0 /* this should have been restored in rst_process_linkage */
+	if (task_pgrp_vnr(current) != si->cpt_pgrp) {
+		struct pid * pid = NULL, *free = NULL;
+
+		rcu_read_lock();
+		if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) {
+#if 0
+			if (!is_virtual_pid(si->cpt_pgrp)) {
+				eprintk_ctx("external process group " CPT_FID, CPT_TID(current));
+				cpt_release_buf(ctx);
+				return -EINVAL;
+			}
+#endif
+			pid = alloc_vpid_safe(si->cpt_pgrp);
+			free = pid;
+		}
+		write_lock_irq(&tasklist_lock);
+		if (pid != NULL) {
+			if (task_pgrp_nr(current) != pid_nr(pid)) {
+				detach_pid(current, PIDTYPE_PGID);
+				if (thread_group_leader(current)) {
+					attach_pid(current, PIDTYPE_PGID, pid);
+					free = NULL;
+				}
+			}
+		}
+		write_unlock_irq(&tasklist_lock);
+		if (free != NULL)
+			free_pid(free);
+		rcu_read_unlock();
+	}
+#endif
+
+	put_pid(current->signal->tty_old_pgrp);
+	current->signal->tty_old_pgrp = NULL;
+	if ((int)si->cpt_old_pgrp > 0) {
+		if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) {
+			current->signal->tty_old_pgrp =
+					alloc_pid(current->nsproxy->pid_ns, 0);
+			if (!current->signal->tty_old_pgrp) {
+				eprintk_ctx("failed to allocate stray tty_old_pgrp\n");
+				cpt_release_buf(ctx);
+				return -EINVAL;
+			}
+		} else {
+			struct pid *pid;
+
+			pid = alloc_vpid_safe(si->cpt_old_pgrp);
+			if (!pid)
+				dprintk_ctx("forward old tty PGID\n");
+			current->signal->tty_old_pgrp = pid;
+		}
+	}
+
+#if 0 /* this should have been restored in rst_process_linkage */
+	if (task_session_vnr(current) != si->cpt_session) {
+		struct pid * pid = NULL, *free = NULL;
+
+		rcu_read_lock();
+		if (si->cpt_session_type == CPT_PGRP_ORPHAN) {
+#if 0
+			if (!is_virtual_pid(si->cpt_session)) {
+				eprintk_ctx("external process session " CPT_FID, CPT_TID(current));
+				cpt_release_buf(ctx);
+				return -EINVAL;
+			}
+#endif
+			pid = alloc_vpid_safe(si->cpt_session);
+			free = pid;
+		}
+		write_lock_irq(&tasklist_lock);
+		if (pid == NULL)
+			pid = find_vpid(si->cpt_session);
+		if (pid != NULL) {
+			if (task_session_nr(current) != pid_nr(pid)) {
+				detach_pid(current, PIDTYPE_SID);
+				set_task_session(current, pid_nr(pid));
+				if (thread_group_leader(current)) {
+					attach_pid(current, PIDTYPE_SID, pid);
+					free = NULL;
+				}
+			}
+		}
+		write_unlock_irq(&tasklist_lock);
+		if (free != NULL)
+			free_pid(free);
+		rcu_read_unlock();
+	}
+#endif
+
+	flush_sigqueue(&current->signal->shared_pending);
+
+	cpt_sigset_import(&current->signal->shared_pending.signal, si->cpt_sigpending);
+	current->signal->leader = si->cpt_leader;
+	if (si->cpt_ctty != CPT_NULL) {
+		cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx);
+		if (obj) {
+			struct tty_struct *tty = obj->o_obj;
+
+			if (current->signal->tty) {
+				wprintk_ctx("strange, current->signal->tty == 0x%p for task '%s'\n",
+							current->signal->tty,
+							current->comm);
+				tty_kref_put(current->signal->tty);
+			}
+			current->signal->tty = tty_kref_get(tty);
+		} else {
+			wprintk_ctx("oops, can't find tty for task '%s' (si->cpt_ctty: %Ld)\n",
+						current->comm, si->cpt_ctty);
+		}
+	}
+
+	if (si->cpt_curr_target) {
+		current->signal->curr_target = find_task_by_vpid(si->cpt_curr_target);
+		if (current->signal->curr_target == NULL) {
+			wprintk_ctx("oops, curr_target=NULL, pid=%u\n", si->cpt_curr_target);
+			current->signal->curr_target = current;
+		}
+	}
+	current->signal->flags = 0;
+	if (cpt_object_has(si, cpt_flags)) {
+		if (si->cpt_flags & CPT_SIGNAL_STOP_STOPPED)
+			current->signal->flags |= SIGNAL_STOP_STOPPED;
+		if (si->cpt_flags & CPT_SIGNAL_STOP_CONTINUED)
+			current->signal->flags |= SIGNAL_STOP_CONTINUED;
+		if (si->cpt_flags & CPT_SIGNAL_CLD_STOPPED)
+			current->signal->flags |= SIGNAL_CLD_STOPPED;
+		if (si->cpt_flags & CPT_SIGNAL_CLD_CONTINUED)
+			current->signal->flags |= SIGNAL_CLD_CONTINUED;
+	}
+
+	*exiting = si->cpt_group_exit;
+	current->signal->group_exit_code = si->cpt_group_exit_code;
+	if (si->cpt_group_exit_task) {
+		current->signal->group_exit_task = find_task_by_vpid(si->cpt_group_exit_task);
+		if (current->signal->group_exit_task == NULL) {
+			eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task);
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	current->signal->notify_count = si->cpt_notify_count;
+	current->signal->group_stop_count = si->cpt_group_stop_count;
+
+	if (si->cpt_next > si->cpt_hdrlen) {
+		char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL);
+		if (buf == NULL) {
+			cpt_release_buf(ctx);
+			return -ENOMEM;
+		}
+		err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx,
+				 ti->cpt_signal + si->cpt_hdrlen);
+		if (err) {
+			kfree(buf);
+			cpt_release_buf(ctx);
+			return err;
+		}
+		restore_sigqueue(current,
+				 &current->signal->shared_pending, (unsigned long)buf,
+				 (unsigned long)buf + si->cpt_next - si->cpt_hdrlen);
+		kfree(buf);
+	}
+	cpt_release_buf(ctx);
+	return 0;
+}
+
+int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_sighand_image si;
+	int i;
+	loff_t pos, endpos;
+	
+	err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx);
+	if (err)
+		return err;
+
+	for (i=0; i<_NSIG; i++) {
+		current->sighand->action[i].sa.sa_handler = SIG_DFL;
+#ifndef CONFIG_IA64
+		current->sighand->action[i].sa.sa_restorer = 0;
+#endif
+		current->sighand->action[i].sa.sa_flags = 0;
+		memset(&current->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t));
+	}
+
+	pos = ti->cpt_sighand + si.cpt_hdrlen;
+	endpos = ti->cpt_sighand + si.cpt_next;
+	while (pos < endpos) {
+		struct cpt_sighandler_image shi;
+
+		err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx);
+		if (err)
+			return err;
+		current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler;
+#ifndef CONFIG_IA64
+		current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer;
+#endif
+		current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags;
+		cpt_sigset_import(&current->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask);
+		pos += shi.cpt_next;
+	}
+
+	return 0;
+}
+
+
+__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	__u32 flag = 0;
+
+	if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx))
+		flag |= CLONE_THREAD;
+	if (ti->cpt_sighand == CPT_NULL ||
+	    lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx))
+		flag |= CLONE_SIGHAND;
+	return flag;
+}
+
+int
+rst_signal_complete(struct cpt_task_image *ti, int * exiting, cpt_context_t *ctx)
+{
+	int err;
+	cpt_object_t *obj;
+
+	if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) {
+		return -EINVAL;
+	}
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx);
+	if (obj) {
+		struct sighand_struct *sig = current->sighand;
+		if (obj->o_obj != sig) {
+			return -EINVAL;
+		}
+	} else {
+		obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx);
+		if (obj == NULL)
+			return -ENOMEM;
+		cpt_obj_setpos(obj, ti->cpt_sighand, ctx);
+		err = restore_one_sighand_struct(ti, ctx);
+		if (err)
+			return err;
+	}
+
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx);
+	if (obj) {
+		struct signal_struct *sig = current->signal;
+		if (obj->o_obj != sig) {
+			return -EINVAL;
+		}
+/*		if (current->signal) {
+			pid_t session;
+
+			session = process_session(current);
+			set_process_vgroup(current, session);
+			set_signal_vsession(current->signal, session);
+		}*/
+	} else {
+		obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx);
+		if (obj == NULL)
+			return -ENOMEM;
+		cpt_obj_setpos(obj, ti->cpt_signal, ctx);
+	}
+
+	return 0;
+}
+
+static int restore_posix_timer_list(struct cpt_object_hdr *tli, loff_t pos,
+				    struct cpt_context *ctx)
+{
+	loff_t offset;
+
+	offset = pos + tli->cpt_hdrlen;
+	while (offset < pos + tli->cpt_next) {
+		struct cpt_posix_timer_image timi;
+		struct timespec dump_time, delta_time;
+		struct itimerspec setting;
+		struct sigevent event;
+		int overrun, overrun_last;
+		int signal_pending;
+		clockid_t which_clock;
+		timer_t timer_id;
+		int err;
+
+		err = rst_get_object(CPT_OBJ_POSIX_TIMER, offset, &timi, ctx);
+		if (err)
+			return err;
+
+		timer_id = timi.cpt_timer_id;
+		which_clock = timi.cpt_timer_clock;
+		event.sigev_value.sival_ptr =
+			cpt_ptr_import(timi.cpt_sigev_value);
+		event.sigev_signo = timi.cpt_sigev_signo;
+		event.sigev_notify = timi.cpt_sigev_notify;
+		event.sigev_notify_thread_id = timi.cpt_sigev_notify_tid;
+
+		err = timer_create_id(which_clock, &event, &timer_id);
+		if (err) {
+			eprintk_ctx("timer_create_id: %d\n", err);
+			return err;
+		}
+
+		overrun = timi.cpt_timer_overrun;
+		overrun_last = timi.cpt_timer_overrun_last;
+		signal_pending = timi.cpt_timer_signal_pending;
+		cpt_timespec_import(&setting.it_interval,
+				    timi.cpt_timer_interval);
+		cpt_timespec_import(&setting.it_value,
+				    timi.cpt_timer_value);
+
+		if (cpt_object_has(&timi, cpt_dump_time))
+			cpt_timespec_import(&dump_time, timi.cpt_dump_time);
+		else
+			dump_time = ctx->start_time;
+
+		do_gettimespec(&delta_time);
+		if (which_clock == CLOCK_REALTIME ||
+		    which_clock == CLOCK_BOOTTIME) {
+			delta_time = timespec_sub(delta_time, dump_time);
+		} else if (which_clock == CLOCK_MONOTONIC) {
+			/* delta_time = now - rst_start_time */
+			delta_time = timespec_sub(delta_time, ctx->start_time);
+			delta_time = timespec_sub(delta_time, ctx->delta_time);
+		} else
+			delta_time.tv_sec = delta_time.tv_nsec = 0;
+
+		if ((setting.it_value.tv_sec || setting.it_value.tv_nsec) &&
+		    (delta_time.tv_sec || delta_time.tv_nsec)) {
+			ktime_t val = timespec_to_ktime(setting.it_value);
+			ktime_t delta = timespec_to_ktime(delta_time);
+			s64 incr = timespec_to_ns(&setting.it_interval);
+
+			val = ktime_sub(val, delta);
+			if (val.tv64 < 0 && incr > 0) {
+				int overrun_extra = 1 - ktime_divns(val, incr);
+				val = ktime_add_ns(val, incr * overrun_extra);
+				overrun += overrun_extra;
+			}
+
+			if (val.tv64 <= 0)
+				val = ktime_set(0, 1);
+
+			setting.it_value = ktime_to_timespec(val);
+		}
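+
+		/*
+		 * A rough example of the catch-up above, assuming
+		 * ktime_divns() truncates toward zero: it_value was 2s at
+		 * dump time, 5s of wall time passed, interval is 1s.  Then
+		 * val = -3s, overrun_extra = 1 - (-3) = 4 and
+		 * val = -3s + 4*1s = 1s: the timer "fired" four times while
+		 * checkpointed and next expires one second from now.
+		 */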
+
+		if (overrun >= 0)
+			signal_pending = 1;
+
+		err = timer_setup(timer_id, &setting,
+				  overrun, overrun_last, signal_pending);
+		if (err) {
+			eprintk_ctx("timer_setup: %d\n", err);
+			return err;
+		}
+
+		offset += timi.cpt_next;
+	}
+	return 0;
+}
+
+int rst_posix_timers(struct cpt_task_image *ti, cpt_context_t *ctx)
+{
+	int err;
+	struct cpt_object_hdr tli;
+
+	if (!cpt_object_has(ti, cpt_posix_timers) ||
+	    ti->cpt_posix_timers == CPT_NULL)
+		return 0;
+
+	err = rst_get_object(CPT_OBJ_POSIX_TIMER_LIST,
+			     ti->cpt_posix_timers, &tli, ctx);
+	if (err)
+		return err;
+
+	err = restore_posix_timer_list(&tli, ti->cpt_posix_timers, ctx);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+#ifdef CONFIG_X86
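+/*
+ * Translate an abstract CPT segment id back into an x86 selector.
+ * A selector is (descriptor index << 3) | TI | RPL: the TLS entries
+ * index the GDT with RPL 3 (the "+ 3" below), while LDT entries set
+ * the table-indicator bit as well as RPL 3 (the "| 7").
+ */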
+static u32 decode_segment(u32 segid)
+{
+	if (segid == CPT_SEG_ZERO)
+		return 0;
+
+	/* TLS descriptors */
+	if (segid <= CPT_SEG_TLS3)
+		return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3;
+
+	/* LDT descriptor: it is just an index into the LDT array */
+	if (segid >= CPT_SEG_LDT)
+		return ((segid - CPT_SEG_LDT) << 3) | 7;
+
+	/* Check for one of the standard descriptors */
+#ifdef CONFIG_X86_64
+	if (segid == CPT_SEG_USER32_DS)
+		return __USER32_DS;
+	if (segid == CPT_SEG_USER32_CS)
+		return __USER32_CS;
+	if (segid == CPT_SEG_USER64_DS)
+		return __USER_DS;
+	if (segid == CPT_SEG_USER64_CS)
+		return __USER_CS;
+#else
+	if (segid == CPT_SEG_USER32_DS)
+		return __USER_DS;
+	if (segid == CPT_SEG_USER32_CS)
+		return __USER_CS;
+#endif
+	wprintk("Invalid segment reg %d\n", segid);
+	return 0;
+}
+#endif
+
+#if defined (CONFIG_IA64)
+void ia64_decrement_ip (struct pt_regs *regs)
+{
+	unsigned long w0, ri = ia64_psr(regs)->ri - 1;
+
+	if (ia64_psr(regs)->ri == 0) {
+		regs->cr_iip -= 16;
+		ri = 2;
+		get_user(w0, (char __user *) regs->cr_iip + 0);
+		if (((w0 >> 1) & 0xf) == 2) {
+			/*
+			 * rfi'ing to slot 2 of an MLX bundle causes
+			 * an illegal operation fault.  We don't want
+			 * that to happen...
+			 */
+			ri = 1;
+		}
+	}
+	ia64_psr(regs)->ri = ri;
+}
+#endif
+
+static void rst_child_tid(unsigned long *child_tids)
+{
+	dprintk("rct: " CPT_FID "\n", CPT_TID(current));
+	current->clear_child_tid = (void*)child_tids[0];
+	current->set_child_tid = (void*)child_tids[1];
+}
+
+static void rst_last_siginfo(void)
+{
+	int signr;
+	siginfo_t *info = current->last_siginfo;
+	struct k_sigaction *ka;
+
+	dprintk("rlsi: " CPT_FID "\n", CPT_TID(current));
+
+	spin_lock_irq(&current->sighand->siglock);
+	current->last_siginfo = NULL;
+	recalc_sigpending();
+
+	signr = current->exit_code;
+	if (signr == 0) {
+		dprintk("rlsi: canceled signal %d\n", info->si_signo);
+		goto out;
+	}
+	current->exit_code = 0;
+
+	if (signr != info->si_signo) {
+		info->si_signo = signr;
+		info->si_errno = 0;
+		info->si_code = SI_USER;
+		info->si_pid = task_pid_vnr(current->parent);
+		info->si_uid = current->parent->cred->uid;
+	}
+
+	/* If the (new) signal is now blocked, requeue it.  */
+	if (sigismember(&current->blocked, signr)) {
+		dprintk("going to requeue signal %d\n", signr);
+		goto out_resend_sig;
+	}
+
+	ka = &current->sighand->action[signr-1];
+	if (ka->sa.sa_handler == SIG_IGN) {
+		dprintk("going to resend signal %d (ignored)\n", signr);
+		goto out;
+	}
+	if (ka->sa.sa_handler != SIG_DFL) {
+		dprintk("going to resend signal %d (not SIG_DFL)\n", signr);
+		goto out_resend_sig;
+	}
+        if (signr == SIGCONT ||
+	    signr == SIGCHLD ||
+	    signr == SIGWINCH ||
+	    signr == SIGURG ||
+	    current->pid == 1)
+		goto out;
+
+	/* All the rest, which we cannot handle, are requeued. */
+	dprintk("going to resend signal %d (sigh)\n", signr);
+out_resend_sig:
+	spin_unlock_irq(&current->sighand->siglock);
+	send_sig_info(signr, info, current);
+	return;
+
+out:
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+static void rst_finish_stop(void)
+{
+	/* ...
+	 * do_signal() ->
+	 *   get_signal_to_deliver() ->
+	 *     do_signal_stop() ->
+	 *       finish_stop()
+	 *
+	 * Normally after SIGCONT it will dequeue the next signal. If no signal
+	 * is found, do_signal() restarts the syscall unconditionally;
+	 * otherwise the signal handler is pushed onto the user stack.
+	 */
+
+	dprintk("rfs: " CPT_FID "\n", CPT_TID(current));
+
+	clear_stop_state(current);
+	current->exit_code = 0;
+}
+
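+/*
+ * Restart semantics here follow the usual signal-exit path:
+ * ERESTARTSYS/ERESTARTNOINTR/ERESTARTNOHAND re-execute the same
+ * syscall, while ERESTART_RESTARTBLOCK must go through
+ * restart_syscall() so that the saved restart_block is used.
+ * pause(), rt_sigtimedwait() and futex() are special-cased first
+ * because their errnos need different treatment.
+ */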
+static void rst_restart_sys(void)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+
+	/* This hook is supposed to be executed when we have
+	 * to complete an interrupted syscall.
+	 */
+	dprintk("rrs: " CPT_FID "\n", CPT_TID(current));
+
+	if (!IN_SYSCALL(regs) || !IN_ERROR(regs))
+		return;
+
+#ifdef __NR_pause
+	if (syscall_is(current,regs,pause)) {
+		if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
+			current->state = TASK_INTERRUPTIBLE;
+			schedule();
+		}
+	} else
+#else
+	/* On this arch pause() is simulated with sigsuspend(). */
+	if (syscall_is(current,regs,rt_sigsuspend)) {
+		if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
+			current->state = TASK_INTERRUPTIBLE;
+			schedule();
+		}
+	} else
+#endif
+	if (syscall_is(current,regs,rt_sigtimedwait)) {
+		if (SYSCALL_ERRNO(regs) == EAGAIN ||
+		    SYSCALL_ERRNO(regs) == EINTR) {
+			SYSCALL_RESTART(regs);
+		}
+	} else if (syscall_is(current,regs,futex)) {
+		if (SYSCALL_ERRNO(regs) == EINTR &&
+		    !signal_pending(current)) {
+			SYSCALL_RESTART(regs);
+		}
+	}
+
+	if (!signal_pending(current)) {
+		if (SYSCALL_ERRNO(regs) == ERESTARTSYS ||
+		    SYSCALL_ERRNO(regs) == ERESTARTNOINTR ||
+		    SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
+			SYSCALL_RESTART(regs);
+		} else if (SYSCALL_ERRNO(regs) == ERESTART_RESTARTBLOCK) {
+			int new = __NR_restart_syscall;
+#ifdef CONFIG_X86_64
+			if (task_thread_info(current)->flags&_TIF_IA32)
+				new = __NR32_restart_syscall;
+#endif
+			SYSCALL_RESTART2(regs, new);
+		}
+	}
+}
+
+#ifdef CONFIG_X86_32
+
+static int restore_registers(struct task_struct *tsk, struct pt_regs *regs,
+			     struct cpt_task_image *ti, struct cpt_x86_regs *b,
+			     struct resume_info **rip, struct cpt_context *ctx)
+{
+	extern char i386_ret_from_resume;
+
+	if (b->cpt_object != CPT_OBJ_X86_REGS)
+		return -EINVAL;
+
+	if (ctx->image_version < CPT_VERSION_32)
+		b->cpt_ugs = b->cpt_gs;
+
+	tsk->thread.sp = (unsigned long) regs;
+	tsk->thread.sp0 = (unsigned long) (regs+1);
+	tsk->thread.ip = (unsigned long) &i386_ret_from_resume;
+
+	tsk->thread.gs = decode_segment(b->cpt_gs);
+	task_user_gs(tsk) = decode_segment(b->cpt_ugs);
+	tsk->thread.debugreg0 = b->cpt_debugreg[0];
+	tsk->thread.debugreg1 = b->cpt_debugreg[1];
+	tsk->thread.debugreg2 = b->cpt_debugreg[2];
+	tsk->thread.debugreg3 = b->cpt_debugreg[3];
+	tsk->thread.debugreg6 = b->cpt_debugreg[6];
+	tsk->thread.debugreg7 = b->cpt_debugreg[7];
+
+	regs->bx = b->cpt_ebx;
+	regs->cx = b->cpt_ecx;
+	regs->dx = b->cpt_edx;
+	regs->si = b->cpt_esi;
+	regs->di = b->cpt_edi;
+	regs->bp = b->cpt_ebp;
+	regs->ax = b->cpt_eax;
+	regs->ds = b->cpt_xds;
+	regs->es = b->cpt_xes;
+	regs->orig_ax = b->cpt_orig_eax;
+	regs->ip = b->cpt_eip;
+	regs->cs = b->cpt_xcs;
+	regs->flags = b->cpt_eflags;
+	regs->sp = b->cpt_esp;
+	regs->ss = b->cpt_xss;
+
+	regs->cs = decode_segment(b->cpt_xcs);
+	regs->ss = decode_segment(b->cpt_xss);
+	regs->ds = decode_segment(b->cpt_xds);
+	regs->es = decode_segment(b->cpt_xes);
+	regs->fs = decode_segment(b->cpt_fs);
+
+	tsk->thread.sp -= HOOK_RESERVE;
+	memset((void*)tsk->thread.sp, 0, HOOK_RESERVE);
+	*rip = (void*)tsk->thread.sp;
+
+	return 0;
+}
+
+#elif defined(CONFIG_X86_64)
+
+static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s)
+{
+	memset(d, 0, sizeof(struct pt_regs));
+	d->bp = s->cpt_ebp;
+	d->bx = s->cpt_ebx;
+	d->ax = (s32)s->cpt_eax;
+	d->cx = s->cpt_ecx;
+	d->dx = s->cpt_edx;
+	d->si = s->cpt_esi;
+	d->di = s->cpt_edi;
+	d->orig_ax = (s32)s->cpt_orig_eax;
+	d->ip = s->cpt_eip;
+	d->cs = s->cpt_xcs;
+	d->flags = s->cpt_eflags;
+	d->sp = s->cpt_esp;
+	d->ss = s->cpt_xss;
+}
+
+static int restore_registers(struct task_struct *tsk, struct pt_regs *regs,
+			     struct cpt_task_image *ti, struct cpt_obj_bits *hdr,
+			     struct resume_info **rip, struct cpt_context *ctx)
+{
+	if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) {
+		struct cpt_x86_64_regs *b = (void*)hdr;
+
+		tsk->thread.sp = (unsigned long) regs;
+		tsk->thread.sp0 = (unsigned long) (regs+1);
+
+		tsk->thread.fs = b->cpt_fsbase;
+		tsk->thread.gs = b->cpt_gsbase;
+		tsk->thread.fsindex = decode_segment(b->cpt_fsindex);
+		tsk->thread.gsindex = decode_segment(b->cpt_gsindex);
+		tsk->thread.ds = decode_segment(b->cpt_ds);
+		tsk->thread.es = decode_segment(b->cpt_es);
+		tsk->thread.debugreg0 = b->cpt_debugreg[0];
+		tsk->thread.debugreg1 = b->cpt_debugreg[1];
+		tsk->thread.debugreg2 = b->cpt_debugreg[2];
+		tsk->thread.debugreg3 = b->cpt_debugreg[3];
+		tsk->thread.debugreg6 = b->cpt_debugreg[6];
+		tsk->thread.debugreg7 = b->cpt_debugreg[7];
+
+		memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs));
+
+		tsk->thread.usersp = regs->sp;
+		regs->cs = decode_segment(b->cpt_cs);
+		regs->ss = decode_segment(b->cpt_ss);
+	} else if (hdr->cpt_object == CPT_OBJ_X86_REGS) {
+		struct cpt_x86_regs *b = (void*)hdr;
+
+		if (ctx->image_version < CPT_VERSION_32)
+			b->cpt_ugs = b->cpt_gs;
+
+		tsk->thread.sp = (unsigned long) regs;
+		tsk->thread.sp0 = (unsigned long) (regs+1);
+
+		tsk->thread.fs = 0;
+		tsk->thread.gs = 0;
+		tsk->thread.fsindex = decode_segment(b->cpt_fs);
+		tsk->thread.gsindex = decode_segment(b->cpt_ugs);
+		tsk->thread.debugreg0 = b->cpt_debugreg[0];
+		tsk->thread.debugreg1 = b->cpt_debugreg[1];
+		tsk->thread.debugreg2 = b->cpt_debugreg[2];
+		tsk->thread.debugreg3 = b->cpt_debugreg[3];
+		tsk->thread.debugreg6 = b->cpt_debugreg[6];
+		tsk->thread.debugreg7 = b->cpt_debugreg[7];
+
+		xlate_ptregs_32_to_64(regs, b);
+
+		tsk->thread.usersp = regs->sp;
+		regs->cs = decode_segment(b->cpt_xcs);
+		regs->ss = decode_segment(b->cpt_xss);
+		tsk->thread.ds = decode_segment(b->cpt_xds);
+		tsk->thread.es = decode_segment(b->cpt_xes);
+	} else {
+		return -EINVAL;
+	}
+
+	tsk->thread.sp -= HOOK_RESERVE;
+	memset((void*)tsk->thread.sp, 0, HOOK_RESERVE);
+	*rip = (void*)tsk->thread.sp;
+
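+	/*
+	 * _TIF_FORK makes the first switch into this task go through the
+	 * fork return path; _TIF_RESUME then diverts it into the resume
+	 * hook planted in the HOOK_RESERVE area above (compare the 32-bit
+	 * variant, which points thread.ip at i386_ret_from_resume instead).
+	 */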
+	task_thread_info(tsk)->flags |= _TIF_FORK | _TIF_RESUME;
+
+	return 0;
+}
+
+#elif defined(CONFIG_IA64)
+
+#define MASK(nbits)	((1UL << (nbits)) - 1)	/* mask with NBITS bits set */
+
+#define PUT_BITS(first, last, nat)					\
+	({								\
+		unsigned long bit = ia64_unat_pos(&pt->r##first);	\
+		unsigned long nbits = (last - first + 1);		\
+		unsigned long mask = MASK(nbits) << first;		\
+		long dist;						\
+		if (bit < first)					\
+			dist = 64 + bit - first;			\
+		else							\
+			dist = bit - first;				\
+		ia64_rotl(nat & mask, dist);				\
+	})
+
+unsigned long
+ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat)
+{
+	unsigned long scratch_unat;
+
+	/*
+	 * Registers that are stored consecutively in struct pt_regs
+	 * can be handled in parallel.  If the register order in
+	 * struct pt_regs changes, this code MUST be updated.
+	 */
+	scratch_unat  = PUT_BITS( 1,  1, nat);
+	scratch_unat |= PUT_BITS( 2,  3, nat);
+	scratch_unat |= PUT_BITS(12, 13, nat);
+	scratch_unat |= PUT_BITS(14, 14, nat);
+	scratch_unat |= PUT_BITS(15, 15, nat);
+	scratch_unat |= PUT_BITS( 8, 11, nat);
+	scratch_unat |= PUT_BITS(16, 31, nat);
+
+	return scratch_unat;
+
+}
+
+static unsigned long
+ia64_put_saved_nat_bits (struct switch_stack *pt, unsigned long nat)
+{
+	unsigned long scratch_unat;
+
+	scratch_unat  = PUT_BITS( 4,  7, nat);
+
+	return scratch_unat;
+
+}
+
+#undef PUT_BITS
+
+
+static int restore_registers(struct task_struct *tsk, struct pt_regs *pt,
+			     struct cpt_task_image *ti,
+			     struct cpt_ia64_regs *r,
+			     struct resume_info **rip,
+			     struct cpt_context *ctx)
+{
+	extern char ia64_ret_from_resume;
+	struct switch_stack *sw;
+	struct resume_info *ri;
+	struct ia64_psr *psr = ia64_psr(pt);
+	void *krbs = (void *)tsk + IA64_RBS_OFFSET;
+	unsigned long reg;
+
+	if (r->cpt_object != CPT_OBJ_IA64_REGS)
+		return -EINVAL;
+
+	if (r->num_regs > 96) {
+		eprintk(CPT_FID " too many RSE regs %lu\n",
+			CPT_TID(tsk), r->num_regs);
+		return -EINVAL;
+	}
+
+	*rip = ri = ((void*)pt) - HOOK_RESERVE;
+	sw = ((struct switch_stack *) ri) - 1;
+
+	memmove(sw, (void*)tsk->thread.ksp + 16, sizeof(struct switch_stack));
+	memset(ri, 0, HOOK_RESERVE);
+
+	/* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */
+	memcpy(&pt->r1,  &r->gr[1],  8*(2-1));
+	memcpy(&pt->r2,  &r->gr[2],  8*(4-2));
+	memcpy(&pt->r8,  &r->gr[8],  8*(12-8));
+	memcpy(&pt->r12, &r->gr[12], 8*(14-12));
+	memcpy(&pt->r14, &r->gr[14], 8*(15-14));
+	memcpy(&pt->r15, &r->gr[15], 8*(16-15));
+	memcpy(&pt->r16, &r->gr[16], 8*(32-16));
+
+	pt->b0 = r->br[0];
+	pt->b6 = r->br[6];
+	pt->b7 = r->br[7];
+
+	pt->ar_bspstore	= r->ar_bspstore;
+	pt->ar_unat	= r->ar_unat;
+	pt->ar_pfs	= r->ar_pfs;
+	pt->ar_ccv	= r->ar_ccv;
+	pt->ar_fpsr	= r->ar_fpsr;
+	pt->ar_csd	= r->ar_csd;
+	pt->ar_ssd	= r->ar_ssd;
+	pt->ar_rsc	= r->ar_rsc;
+
+	pt->cr_iip	= r->cr_iip;
+	pt->cr_ipsr	= r->cr_ipsr;
+
+	pt->pr = r->pr;
+
+	pt->cr_ifs = r->cfm;
+
+	/* fpregs 6..9,10..11 are in pt_regs */
+	memcpy(&pt->f6,  &r->fr[2*6],  16*(10-6));
+	memcpy(&pt->f10, &r->fr[2*10], 16*(12-10));
+	/* fpreg 12..15 are on switch stack */
+	memcpy(&sw->f12, &r->fr[2*12], 16*(16-12));
+	/* fpregs 32...127 */
+	tsk->thread.flags |= IA64_THREAD_FPH_VALID;
+	memcpy(tsk->thread.fph, &r->fr[32*2], 16*(128-32));
+	ia64_drop_fpu(tsk);
+	psr->dfh = 1;
+
+	memcpy(&sw->r4, &r->gr[4], 8*(8-4));
+	memcpy(&sw->b1, &r->br[1], 8*(6-1));
+	sw->ar_lc = r->ar_lc;
+
+	memcpy(&sw->f2, &r->fr[2*2], 16*(6-2));
+	memcpy(&sw->f16, &r->fr[2*16], 16*(32-16));
+
+	sw->caller_unat = 0;
+	sw->ar_fpsr = pt->ar_fpsr;
+	sw->ar_unat = 0;
+	if (r->nat[0] & 0xFFFFFF0FUL)
+		sw->caller_unat = ia64_put_scratch_nat_bits(pt, r->nat[0]);
+	if (r->nat[0] & 0xF0)
+		sw->ar_unat = ia64_put_saved_nat_bits(sw, r->nat[0]);
+
+	sw->ar_bspstore = (unsigned long)ia64_rse_skip_regs(krbs, r->num_regs);
+	memset(krbs, 0, (void*)sw->ar_bspstore - krbs);
+	sw->ar_rnat = 0;
+	sw->ar_pfs = 0;
+
+	/* This is tricky. When we are in a syscall, we have a frame of
+	 * output registers (sometimes plus one input register). It is not
+	 * easy to restore such a frame: the RSE optimizes and does not
+	 * fetch those registers from the backing store. So we restore the
+	 * whole frame as local registers and then repartition it in
+	 * ia64_ret_from_resume().
+	 */
+	if ((long)pt->cr_ifs >= 0) {
+		unsigned long out = (r->cfm&0x7F) - ((r->cfm>>7)&0x7F);
+		sw->ar_pfs = out | (out<<7);
+	}
+	if (r->ar_ec)
+		sw->ar_pfs |= (r->ar_ec & 0x3F) << 52;
+
+	for (reg = 0; reg < r->num_regs; reg++) {
+		unsigned long *ptr = ia64_rse_skip_regs(krbs, reg);
+		unsigned long *rnatp;
+		unsigned long set_rnat = 0;
+
+		*ptr = r->gr[32+reg];
+
+		if (reg < 32)
+			set_rnat = (r->nat[0] & (1UL<<(reg+32)));
+		else
+			set_rnat = (r->nat[1] & (1UL<<(reg-32)));
+
+		if (set_rnat) {
+			rnatp = ia64_rse_rnat_addr(ptr);
+			if ((unsigned long)rnatp >= sw->ar_bspstore)
+				rnatp = &sw->ar_rnat;
+			*rnatp |= (1UL<<ia64_rse_slot_num(ptr));
+		}
+	}
+	
+	sw->b0 = (unsigned long) &ia64_ret_from_resume;
+	tsk->thread.ksp = (unsigned long) sw - 16;
+
+#define PRED_LEAVE_SYSCALL	1 /* TRUE iff leave from syscall */
+#define PRED_KERNEL_STACK	2 /* returning to kernel-stacks? */
+#define PRED_USER_STACK		3 /* returning to user-stacks? */
+#define PRED_SYSCALL		4 /* inside a system call? */
+#define PRED_NON_SYSCALL	5 /* complement of PRED_SYSCALL */
+
+	pt->loadrs = r->loadrs;
+	sw->pr = 0;
+	sw->pr &= ~(1UL << PRED_LEAVE_SYSCALL);
+	sw->pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_NON_SYSCALL));
+	sw->pr &= ~(1UL << PRED_KERNEL_STACK);
+	sw->pr |= (1UL << PRED_USER_STACK);
+	if ((long)pt->cr_ifs < 0) {
+		sw->pr |= (1UL << PRED_NON_SYSCALL);
+	} else {
+		sw->pr |= ((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL));
+	}
+
+	return 0;
+}
+#endif
+
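+/*
+ * Runs on the resumed task's own kernel stack via the hook pointer
+ * planted in the HOOK_RESERVE area by restore_registers().  The
+ * module_put() balances the try_module_get() done when the hook was
+ * armed in rst_restore_process().
+ */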
+asmlinkage void rst_resume_work(struct resume_info *ri)
+{
+	if (ri->hooks & (1<<HOOK_TID))
+		rst_child_tid(ri->tid_ptrs);
+	if (ri->hooks & (1<<HOOK_CONT))
+		rst_finish_stop();
+	if (ri->hooks & (1<<HOOK_LSI))
+		rst_last_siginfo();
+	if (ri->hooks & (1<<HOOK_RESTART))
+		rst_restart_sys();
+	module_put(THIS_MODULE);
+}
+
+static void rst_apply_mxcsr_mask(struct task_struct *tsk)
+{
+#ifdef CONFIG_X86_32
+	unsigned int flags;
+
+	flags = test_cpu_caps_and_features();
+
+	/* If the CPU does not support SSE2, mask bit 6 (the DAZ flag) and
+	   bits 16-31 in MXCSR to avoid a general protection fault */
+	if (!(flags & (1 << CPT_CPU_X86_SSE2)))
+		tsk->thread.xstate->fxsave.mxcsr &= 0x0000ffbf;
+#endif
+}
+
+#ifdef CONFIG_X86
+#include <asm/i387.h>
+#endif
+
+#define RLIM_INFINITY32		0xffffffff
+#define RLIM_INFINITY64		(~0ULL)
+
+#ifdef CONFIG_X86_64
+#define rst_rlim_32_to_64(a, i, t, im)					\
+do {									\
+	if (im->cpt_rlim_##a[i] == RLIM_INFINITY32)			\
+		t->signal->rlim[i].rlim_##a = RLIM_INFINITY64;		\
+	else								\
+		t->signal->rlim[i].rlim_##a = im->cpt_rlim_##a[i];	\
+} while (0)
+#elif defined(CONFIG_X86_32)
+#define rst_rlim_64_to_32(a, i, t, im)					\
+do {									\
+	if (im->cpt_rlim_##a[i] == RLIM_INFINITY64)			\
+		t->signal->rlim[i].rlim_##a = RLIM_INFINITY32;		\
+	else if (im->cpt_rlim_##a[i] > RLIM_INFINITY32) {		\
+		eprintk_ctx("rlimit %Lu is too high for 32-bit task, "	\
+			    "dump file is corrupted\n",			\
+			    im->cpt_rlim_##a[i]);			\
+		return -EINVAL;						\
+	} else								\
+		t->signal->rlim[i].rlim_##a = im->cpt_rlim_##a[i];	\
+} while (0)
+#endif
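+
+/*
+ * Example of the width translation above: finite limits are copied
+ * as-is (say, a cur value of 1024), but RLIM_INFINITY32 (0xffffffff)
+ * must widen to RLIM_INFINITY64 (~0ULL) on a 64-bit kernel, otherwise
+ * "unlimited" would silently become a 4G-1 cap.
+ */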
+
+#ifdef CONFIG_X86
+/* Restore task FPU context if needed */
+static int restore_task_fpu(struct task_struct *tsk,
+				const struct cpt_obj_bits *b,
+				const struct cpt_task_image *ti,
+				struct cpt_context *ctx)
+{
+	size_t size;
+
+	switch(b->cpt_content)
+	{
+	case CPT_CONTENT_X86_XSAVE:
+	case CPT_CONTENT_X86_FPUSTATE:
+		if (!cpu_has_xsave && !cpu_has_fxsr) {
+			eprintk_ctx(KERN_ERR "CPU doesn't support XSAVE/FXSR\n");
+			goto fault;
+		}
+
+		size = min_t(unsigned int, xstate_size, b->cpt_size);
+		break;
+#ifndef CONFIG_X86_64
+	case CPT_CONTENT_X86_FPUSTATE_OLD:
+		if (cpu_has_fxsr) {
+			eprintk_ctx(KERN_ERR "CPU's are incompatible: has FXSR\n");
+			goto fault;
+		}
+
+		size = sizeof(struct i387_fsave_struct);
+		break;
+#endif
+	default:
+		/* Looks like it is not our data */
+		return 0;
+	}
+
+	if (init_fpu(tsk))
+		return -ENOMEM;
+
+	memcpy(tsk->thread.xstate,
+		(void*)b + b->cpt_hdrlen, size);
+
+	if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE)
+		rst_apply_mxcsr_mask(tsk);
+
+	if (ti->cpt_used_math)
+		set_stopped_child_used_math(tsk);
+
+	return 0;
+fault:
+	eprintk_ctx("FPU context can't be restored. "
+			"The processor is incompatible.\n");
+	return -EFAULT;
+}
+#endif
+
+int rst_restore_process(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		struct cpt_task_image *ti = obj->o_image;
+		struct pt_regs * regs;
+		struct cpt_object_hdr *b;
+		struct cpt_siginfo_image *lsi = NULL;
+		struct resume_info *ri = NULL;
+		int i;
+#ifdef CONFIG_BEANCOUNTERS
+		struct task_beancounter *tbc;
+		struct user_beancounter *new_bc, *old_bc;
+#endif
+
+		if (tsk == NULL) {
+			eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm);
+			return -EFAULT;
+		}
+
+		if ((ti->cpt_state & __TASK_TRACED) && 
+				(ctx->image_version < CPT_VERSION_32)) {
+			eprintk_ctx("restoring traced task '%s' is not supported\n", ti->cpt_comm);
+			return -EFAULT;
+		}
+
+		if ((ti->cpt_state == __TASK_STOPPED) &&
+				(ctx->image_version >= CPT_VERSION_18) &&
+				(ctx->image_version < CPT_VERSION_20)) {
+			ti->cpt_state = TASK_STOPPED;
+		}
+
+		wait_task_inactive(tsk, 0);
+#ifdef CONFIG_BEANCOUNTERS
+		tbc = &tsk->task_bc;
+		new_bc = rst_lookup_ubc(ti->cpt_exec_ub, ctx);
+		old_bc = tbc->exec_ub;
+		put_beancounter(new_bc);
+#endif
+		regs = task_pt_regs(tsk);
+
+		if (!tsk->exit_state) {
+			tsk->lock_depth = -1;
+#ifdef CONFIG_PREEMPT_COUNT
+			task_thread_info(tsk)->preempt_count--;
+#endif
+		}
+
+		if (tsk->static_prio != ti->cpt_static_prio)
+			set_user_nice(tsk, PRIO_TO_NICE((s32)ti->cpt_static_prio));
+
+		if (tsk->policy != ti->cpt_policy) {
+			struct sched_param param = { 0 };
+			sched_setscheduler_nocheck(tsk, ti->cpt_policy, &param);
+		}
+
+		cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked);
+		cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked);
+		cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked);
+		cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending);
+
+#ifdef CONFIG_IA64
+		SET_UNALIGN_CTL(tsk, ti->cpt_prctl_uac);
+		SET_FPEMU_CTL(tsk, ti->cpt_prctl_fpemu);
+#endif
+		tsk->did_exec = (ti->cpt_did_exec != 0);
+		tsk->utime = ti->cpt_utime;
+		tsk->stime = ti->cpt_stime;
+		if (ctx->image_version == CPT_VERSION_8)
+			tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC);
+		else
+			cpt_timespec_import(&tsk->start_time, ti->cpt_starttime);
+		tsk->real_start_time = tsk->start_time;
+		_set_normalized_timespec(&tsk->start_time,
+					tsk->start_time.tv_sec +
+					VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_sec,
+					tsk->start_time.tv_nsec +
+					VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_nsec);
+		_set_normalized_timespec(&tsk->real_start_time,
+					tsk->real_start_time.tv_sec +
+					VE_TASK_INFO(tsk)->owner_env->real_start_timespec.tv_sec,
+					tsk->real_start_time.tv_nsec +
+					VE_TASK_INFO(tsk)->owner_env->real_start_timespec.tv_nsec);
+
+		tsk->nvcsw = ti->cpt_nvcsw;
+		tsk->nivcsw = ti->cpt_nivcsw;
+		tsk->min_flt = ti->cpt_min_flt;
+		tsk->maj_flt = ti->cpt_maj_flt;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
+		tsk->cutime = ti->cpt_cutime;
+		tsk->cstime = ti->cpt_cstime;
+		tsk->cnvcsw = ti->cpt_cnvcsw;
+		tsk->cnivcsw = ti->cpt_cnivcsw;
+		tsk->cmin_flt = ti->cpt_cmin_flt;
+		tsk->cmaj_flt = ti->cpt_cmaj_flt;
+
+		BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS);
+
+		for (i=0; i<RLIM_NLIMITS; i++) {
+			tsk->rlim[i].rlim_cur = ti->cpt_rlim_cur[i];
+			tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i];
+		}
+#else
+		if (thread_group_leader(tsk) && tsk->signal) {
+			tsk->signal->utime = ti->cpt_utime;
+			tsk->signal->stime = ti->cpt_stime;
+			tsk->signal->cutime = ti->cpt_cutime;
+			tsk->signal->cstime = ti->cpt_cstime;
+			tsk->signal->nvcsw = ti->cpt_nvcsw;
+			tsk->signal->nivcsw = ti->cpt_nivcsw;
+			tsk->signal->cnvcsw = ti->cpt_cnvcsw;
+			tsk->signal->cnivcsw = ti->cpt_cnivcsw;
+			tsk->signal->min_flt = ti->cpt_min_flt;
+			tsk->signal->maj_flt = ti->cpt_maj_flt;
+			tsk->signal->cmin_flt = ti->cpt_cmin_flt;
+			tsk->signal->cmaj_flt = ti->cpt_cmaj_flt;
+
+			for (i=0; i<RLIM_NLIMITS; i++) {
+#ifdef CONFIG_X86_64
+				if (ctx->image_arch == CPT_OS_ARCH_I386) {
+					rst_rlim_32_to_64(cur, i, tsk, ti);
+					rst_rlim_32_to_64(max, i, tsk, ti);
+				} else 
+#elif defined(CONFIG_X86_32)
+				if (ctx->image_arch == CPT_OS_ARCH_EMT64) {
+					rst_rlim_64_to_32(cur, i, tsk, ti);
+					rst_rlim_64_to_32(max, i, tsk, ti);
+				} else 
+#endif
+				{
+					tsk->signal->rlim[i].rlim_cur =
+						ti->cpt_rlim_cur[i];
+					tsk->signal->rlim[i].rlim_max =
+						ti->cpt_rlim_max[i];
+				}
+			}
+		}
+#endif
+
+	if (thread_group_leader(tsk)) {
+		cputime_t virt_exp, prof_exp;
+
+		tsk->signal->it_real_incr.tv64 = 0;
+		if (ctx->image_version >= CPT_VERSION_9) {
+			tsk->signal->it_real_incr =
+			ktime_add_ns(tsk->signal->it_real_incr, ti->cpt_it_real_incr);
+		} else {
+			tsk->signal->it_real_incr =
+			ktime_add_ns(tsk->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC);
+		}
+		memset(tsk->signal->it, 0, sizeof(tsk->signal->it));
+		tsk->signal->it[CPUCLOCK_PROF].incr = ti->cpt_it_prof_incr;
+		tsk->signal->it[CPUCLOCK_VIRT].incr = ti->cpt_it_virt_incr; 
+		tsk->signal->it[CPUCLOCK_PROF].expires = prof_exp = ti->cpt_it_prof_value;
+		tsk->signal->it[CPUCLOCK_VIRT].expires = virt_exp = ti->cpt_it_virt_value;
+
+		if (!cputime_eq(virt_exp, cputime_zero))
+			set_process_cpu_timer(tsk, CPUCLOCK_VIRT, &virt_exp, NULL);
+
+		if (!cputime_eq(prof_exp, cputime_zero))
+			set_process_cpu_timer(tsk, CPUCLOCK_PROF, &prof_exp, NULL);
+	}
+
+#ifdef CONFIG_X86
+		for (i=0; i<3; i++) {
+			if (i >= GDT_ENTRY_TLS_ENTRIES) {
+				eprintk_ctx("too many tls descs\n");
+			} else {
+				tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF;
+				tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32;
+			}
+		}
+#endif
+
+		clear_stopped_child_used_math(tsk);
+
+		b = (void *)ti + ti->cpt_hdrlen;
+		while ((void*)b < ((void*)ti) + ti->cpt_next) {
+			/* Siginfo objects are at the end of the object array */
+			if (b->cpt_object == CPT_OBJ_SIGINFO) {
+				struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env);
+				restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next);
+				set_exec_env(env);
+				break;
+			}
+
+			switch (b->cpt_object) {
+#ifdef CONFIG_X86
+			case CPT_OBJ_BITS: {
+				int err = restore_task_fpu(tsk, (struct cpt_obj_bits *)b, ti, ctx);
+				if (err)
+					return err;
+				}
+				break;
+#endif
+			case CPT_OBJ_LASTSIGINFO:
+				lsi = (void*)b;
+				break;
+			case CPT_OBJ_X86_REGS:
+			case CPT_OBJ_X86_64_REGS:
+			case CPT_OBJ_IA64_REGS:
+				if (restore_registers(tsk, regs, ti, (void*)b, &ri, ctx)) {
+					eprintk_ctx("cannot restore registers: image is corrupted\n");
+					return -EINVAL;
+				}
+				break;
+			case CPT_OBJ_SIGALTSTACK: {
+				struct cpt_sigaltstack_image *sas;
+				sas = (struct cpt_sigaltstack_image *)b;
+				tsk->sas_ss_sp = sas->cpt_stack;
+				tsk->sas_ss_size = sas->cpt_stacksize;
+				break;
+			    }
+			case CPT_OBJ_TASK_AUX: {
+				struct cpt_task_aux_image *ai;
+				ai = (struct cpt_task_aux_image *)b;
+				tsk->robust_list = cpt_ptr_import(ai->cpt_robust_list);
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_COMPAT
+				if (task_thread_info(tsk)->flags&_TIF_IA32) {
+					tsk->robust_list = (void __user *)NULL;
+					tsk->compat_robust_list = cpt_ptr_import(ai->cpt_robust_list);
+				}
+#endif
+#endif
+				break;
+			    }
+			}
+			b = ((void*)b) + b->cpt_next;
+		}
+
+		if (ri == NULL && !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+			eprintk_ctx("missing register info\n");
+			return -EINVAL;
+		}
+
+		tsk->ptrace = ti->cpt_ptrace;
+
+		if (tsk->ptrace) {
+			struct ve_struct *env = VE_TASK_INFO(tsk)->owner_env;
+			struct task_struct *tracer;
+
+			write_lock_irq(&tasklist_lock);
+			tracer = pid_task(find_pid_ns(ti->cpt_ppid,
+					   env->ve_ns->pid_ns), PIDTYPE_PID);
+			if (tracer) {
+				tsk->parent = tracer;
+				list_add(&tsk->ptrace_entry, &tracer->ptraced);
+			} else {
+				eprintk_ctx("Tracer %d not found for %d(%s)\n",
+					ti->cpt_ppid, ti->cpt_pid, ti->cpt_comm);
+				tsk->ptrace = 0;
+			}
+			write_unlock_irq(&tasklist_lock);
+		}
+
+		tsk->ptrace_message = ti->cpt_ptrace_message;
+		tsk->stopped_state = ti->cpt_stopped_state;
+
+		/*
+		 * The TIF_IA32 thread flag was restored earlier
+		 */
+		task_thread_info(tsk)->flags &= _TIF_IA32 | _TIF_FORK | _TIF_RESUME;
+		task_thread_info(tsk)->flags |= ti->cpt_thrflags;
+
+		/*
+		 * Drop rhel5's _TIF_RESTORE_SIGMASK.
+		 * The int_ret_from_sys_call path gets confused by it.
+		 */
+		task_thread_info(tsk)->flags &= ~(1 << 9);
+
+#ifdef CONFIG_X86_32
+		do {
+			if (regs->orig_ax == __NR__newselect && regs->di) {
+				struct timeval tv;
+				if (access_process_vm(tsk, regs->di, &tv, 
+						sizeof(tv), 0) != sizeof(tv)) {
+					wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n",
+						task_pid_vnr(tsk), tsk->pid, tsk->comm,
+					       regs->di);
+					break;
+				}
+				dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n",
+				       task_pid_vnr(tsk), tsk->pid, tsk->comm,
+				       tv.tv_sec, tv.tv_usec);
+				tv.tv_sec -= ctx->delta_time.tv_sec;
+				if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) {
+					tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000;
+					tv.tv_sec--;
+				} else {
+					tv.tv_usec -= ctx->delta_time.tv_nsec / 1000;
+				}
+				if (tv.tv_sec < 0) {
+					tv.tv_sec = 0;
+					tv.tv_usec = 0;
+				}
+				dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n",
+					task_pid_vnr(tsk), tsk->pid, tsk->comm,
+				       tv.tv_sec, tv.tv_usec);
+				if (access_process_vm(tsk, regs->di, &tv, 
+						sizeof(tv), 1) != sizeof(tv)) {
+					wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n",
+						task_pid_vnr(tsk), tsk->pid, tsk->comm, regs->di);
+				}
+				
+			} else if (regs->orig_ax == __NR_select && regs->di) {
+				struct {
+					unsigned long n;
+					fd_set __user *inp, *outp, *exp;
+					struct timeval __user *tvp;
+				} a;
+				struct timeval tv;
+				if (access_process_vm(tsk, regs->bx, &a, 
+						sizeof(a), 0) != sizeof(a)) {
+					wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid);
+					break;
+				}
+				if (access_process_vm(tsk, (unsigned long)a.tvp,
+						&tv, sizeof(tv), 0) != sizeof(tv)) {
+					wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid);
+					break;
+				}
+				dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n",
+					tsk->pid, tv.tv_sec, tv.tv_usec);
+				tv.tv_sec -= ctx->delta_time.tv_sec;
+				if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) {
+					tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000;
+					tv.tv_sec--;
+				} else {
+					tv.tv_usec -= ctx->delta_time.tv_nsec / 1000;
+				}
+				if (tv.tv_sec < 0) {
+					tv.tv_sec = 0;
+					tv.tv_usec = 0;
+				}
+				dprintk_ctx("task %d: New timeval in select: %ld.%ld\n",
+					tsk->pid, tv.tv_sec, tv.tv_usec);
+				if (access_process_vm(tsk, (unsigned long)a.tvp,
+						&tv, sizeof(tv), 1) != sizeof(tv)) {
+					wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid);
+				}
+			}
+		} while (0);
+#endif
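+
+		/*
+		 * A worked example of the rebasing above: with delta_time =
+		 * 2.5s (tv_nsec = 500000000, i.e. 500000us) and an old
+		 * timeout of 10.200000s, 200000 < 500000 triggers the
+		 * borrow: 10 - 2 - 1 = 7s and 200000 + 1000000 - 500000 =
+		 * 700000us, i.e. 7.700000s = 10.2s - 2.5s as expected.
+		 */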
+
+		if (ri && IN_SYSCALL(regs) && IN_ERROR(regs)) {
+			switch (SYSCALL_ERRNO(regs)) {
+			case ERESTARTSYS:
+			case ERESTARTNOINTR:
+			case ERESTARTNOHAND:
+			case ERESTART_RESTARTBLOCK:
+			case EAGAIN:
+			case EINTR:
+				ri->hooks |= (1<<HOOK_RESTART);
+			}
+		}
+
+		if (ri && lsi) {
+			/* ... -> ptrace_notify()
+			 * or
+			 * ... -> do_signal() -> get_signal_to_deliver() ->
+			 *   ptrace stop
+			 */
+			tsk->last_siginfo = &ri->last_siginfo;
+			ri->hooks |= (1<<HOOK_LSI);
+			decode_siginfo(tsk->last_siginfo, lsi);
+		}
+
+		/* PF_FREEZING is set in hook() to prevent the task from being
+		 * accounted in loadavg; it will be cleared on task resume */
+		tsk->flags = (tsk->flags & (PF_USED_MATH|PF_FREEZING)) |
+			(ti->cpt_flags & CPT_TASK_FLAGS_MASK);
+		clear_tsk_thread_flag(tsk, TIF_FREEZE);
+		tsk->exit_signal = ti->cpt_exit_signal;
+
+		if (ri && tsk->stopped_state) {
+			dprintk_ctx("finish_stop\n");
+			if (ti->cpt_state != TASK_STOPPED)
+				eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state);
+			ri->hooks |= (1<<HOOK_CONT);
+		}
+
+		if (ri && (ti->cpt_set_tid || ti->cpt_clear_tid)) {
+			ri->hooks |= (1<<HOOK_TID);
+			ri->tid_ptrs[0] = ti->cpt_clear_tid;
+			ri->tid_ptrs[1] = ti->cpt_set_tid;
+			dprintk_ctx("settids\n");
+		}
+
+		if (ri && ri->hooks &&
+		    !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+			if (try_module_get(THIS_MODULE))
+				ri->hook = rst_resume_work;
+		}
+
+		if (ti->cpt_state == TASK_TRACED)
+			tsk->state = TASK_TRACED;
+		else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) {
+			tsk->signal->it[CPUCLOCK_VIRT].expires = 0;
+			tsk->signal->it[CPUCLOCK_PROF].expires = 0;
+			if (tsk->state != TASK_DEAD) {
+				eprintk_ctx("OVZ#3085 debug:\n"
+				"oops, schedule() did not make us dead\n"
+				"tsk %p pid %d state 0x%lx exit_state 0x%x "
+				"cpt_state 0x%llx\n",
+				tsk, tsk->pid, tsk->state, tsk->exit_state,
+				ti->cpt_state);
+				sched_show_task(tsk);
+			}
+		}
+
+		if (thread_group_leader(tsk) &&
+		    ti->cpt_it_real_value &&
+		    !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+			ktime_t val;
+			s64 nsec;
+			unsigned long flags;
+
+			nsec = ti->cpt_it_real_value;
+			val.tv64 = 0;
+
+			if (ctx->image_version < CPT_VERSION_9)
+				nsec *= TICK_NSEC;
+
+			val = ktime_add_ns(val, nsec);
+			if (val.tv64 <= 0)
+				val.tv64 = NSEC_PER_USEC;
+			dprintk("rst itimer " CPT_FID " +%Ld %Lu\n", CPT_TID(tsk),
+				(long long)val.tv64,
+				(unsigned long long)ti->cpt_it_real_value);
+
+			if (lock_task_sighand(tsk, &flags)) {
+				if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) {
+					/* FIXME. Check!!!! */
+					hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_MODE_REL);
+				} else {
+					wprintk_ctx("Timer clash. Impossible?\n");
+				}
+				unlock_task_sighand(tsk, &flags);
+			}
+
+			dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk),
+				    (unsigned long long)val.tv64);
+		}
+
+		module_put(THIS_MODULE);
+	}
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_socket.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_socket.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_socket.c	2015-01-21 12:02:48.233093393 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_socket.c	2015-01-21 12:02:50.992020151 +0300
@@ -0,0 +1,1515 @@
+/*
+ *
+ *  kernel/cpt/rst_socket.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/mount.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/scm.h>
+#include <net/af_unix.h>
+
+#include <bc/kmem.h>
+#include <bc/sock_orphan.h>
+#include <bc/net.h>
+#include <bc/tcp.h>
+
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_socket.h"
+#include "cpt_kernel.h"
+#include "cpt_process.h"
+
+#include "cpt_syscalls.h"
+
+
+static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si,
+			     loff_t pos, struct cpt_context *ctx)
+{
+	struct timeval tmptv;
+
+	if (sk->sk_socket) {
+		sk->sk_socket->flags = si->cpt_ssflags;
+		sk->sk_socket->state = si->cpt_sstate;
+	}
+	sk->sk_reuse = si->cpt_reuse;
+	sk->sk_shutdown = si->cpt_shutdown;
+	sk->sk_userlocks = si->cpt_userlocks;
+	sk->sk_no_check = si->cpt_no_check;
+	sock_reset_flag(sk, SOCK_DBG);
+	if (si->cpt_debug)
+		sock_set_flag(sk, SOCK_DBG);
+	sock_reset_flag(sk, SOCK_RCVTSTAMP);
+	if (si->cpt_rcvtstamp)
+		sock_set_flag(sk, SOCK_RCVTSTAMP);
+	sock_reset_flag(sk, SOCK_LOCALROUTE);
+	if (si->cpt_localroute)
+		sock_set_flag(sk, SOCK_LOCALROUTE);
+	sk->sk_protocol = si->cpt_protocol;
+	sk->sk_err = si->cpt_err;
+	sk->sk_err_soft = si->cpt_err_soft;
+	sk->sk_priority = si->cpt_priority;
+	sk->sk_rcvlowat = si->cpt_rcvlowat;
+	sk->sk_rcvtimeo = si->cpt_rcvtimeo;
+	if (si->cpt_rcvtimeo == CPT_NULL)
+		sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+	sk->sk_sndtimeo = si->cpt_sndtimeo;
+	if (si->cpt_sndtimeo == CPT_NULL)
+		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+	sk->sk_rcvbuf = si->cpt_rcvbuf;
+	sk->sk_sndbuf = si->cpt_sndbuf;
+	sk->sk_bound_dev_if = si->cpt_bound_dev_if;
+	sk->sk_flags = si->cpt_flags;
+	sk->sk_lingertime = si->cpt_lingertime;
+	if (si->cpt_lingertime == CPT_NULL)
+		sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+	sk->sk_peercred.pid = si->cpt_peer_pid;
+	sk->sk_peercred.uid = si->cpt_peer_uid;
+	sk->sk_peercred.gid = si->cpt_peer_gid;
+	cpt_timeval_import(&tmptv, si->cpt_stamp);
+	sk->sk_stamp = timeval_to_ktime(tmptv);
+	return 0;
+}
+
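+/*
+ * Wrap a socket in a struct file without leaving a descriptor behind:
+ * sock_map_fd() installs the file into the fd table, we grab our own
+ * reference and immediately close the temporary fd, keeping the file.
+ */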
+static struct file *sock_mapfile(struct socket *sock)
+{
+	int fd = sock_map_fd(sock, 0);
+
+	if (fd >= 0) {
+		struct file *file = sock->file;
+		get_file(file);
+		sc_close(fd);
+		return file;
+	}
+	return ERR_PTR(fd);
+}
+
+/* The assumption is that /tmp exists and is writable.
+ * In previous versions we assumed that listen() would autobind
+ * the socket. It does not do this for AF_UNIX for an evident reason:
+ * a socket in the abstract namespace is accessible, unlike a socket
+ * bound to a deleted FS object.
+ */
+
+static int
+select_deleted_name(char * name, cpt_context_t *ctx)
+{
+	int i;
+
+	for (i=0; i<100; i++) {
+		struct nameidata nd;
+		unsigned int rnd = net_random();
+
+		sprintf(name, "/tmp/SOCK.%08x", rnd);
+
+		if (path_lookup(name, 0, &nd) != 0)
+			return 0;
+
+		path_put(&nd.path);
+	}
+
+	eprintk_ctx("failed to allocate deleted socket inode\n");
+	return -ELOOP;
+}
+
+/*
+ * This function is used for backward compatibility with old image versions.
+ */
+static int unix_bind_to_path(struct socket *sock, char *name,
+				struct sockaddr* addr, int addrlen,
+				struct cpt_sock_image *si, cpt_context_t *ctx)
+{
+	struct sockaddr_un sun;
+	int err;
+	struct nameidata nd;
+
+	nd.path.dentry = NULL;
+
+	if (name[0]) {
+		if (si->cpt_sockflags & CPT_SOCK_DELETED) {
+			addr = (struct sockaddr*)&sun;
+			addr->sa_family = AF_UNIX;
+			name = ((char*)addr) + 2;
+			err = select_deleted_name(name, ctx);
+			if (err) {
+				eprintk_ctx("%s: can't select name\n", __func__);
+				return err;
+			}
+			addrlen = 2 + strlen(name);
+		} else {
+			if (path_lookup(name, 0, &nd))
+				nd.path.dentry = NULL;
+			else {
+				if (!S_ISSOCK(nd.path.dentry->d_inode->i_mode)) {
+					eprintk_ctx("%s: not a socket dentry %s\n",
+							__func__, name);
+					return -EINVAL;
+				}
+				sc_unlink(name);
+			}
+		}
+	}
+
+	err = sock->ops->bind(sock, addr, addrlen);
+	if (!err && name[0]) {
+		if (si->cpt_sockflags & CPT_SOCK_DELETED)
+			sc_unlink(name);
+		else if (nd.path.dentry) {
+			sc_chown(name, nd.path.dentry->d_inode->i_uid,
+				 nd.path.dentry->d_inode->i_gid);
+			sc_chmod(name, nd.path.dentry->d_inode->i_mode);
+		}
+	}
+
+	if (nd.path.dentry)
+		path_put(&nd.path);
+
+	return err;
+}
+
+static int unix_bind_to_mntref(struct sock *sk, char *name,
+				struct sockaddr* addr, int addrlen,
+				struct cpt_sock_image *si, cpt_context_t *ctx)
+{
+	struct unix_bind_info bi;
+	int err;
+	cpt_object_t *mntobj;
+
+	if (!name[0]) {
+		if (addrlen <= sizeof(short)) {
+			eprintk_ctx("%s: unsupported hidden name len: %d\n",
+					__func__, addrlen);
+			return -EINVAL;
+		}
+		return sk->sk_socket->ops->bind(sk->sk_socket,  addr, addrlen);
+	}
+
+	err = unix_attach_addr(sk, (struct sockaddr_un *)addr,
+				addrlen);
+	if (err) {
+		eprintk_ctx("%s: can't attach unix address %d to %s\n",
+						__func__, err, name);
+		return err;
+	}
+
+	mntobj = lookup_cpt_obj_bypos(CPT_OBJ_VFSMOUNT_REF,
+			si->cpt_vfsmount_ref, ctx);
+	if (mntobj == NULL) {
+		eprintk_ctx("%s: can't find vfsmount for unix socket %s\n",
+				__func__, name);
+		return -EINVAL;
+	}
+
+	if (strlen(name) < mntobj->o_lock) {
+		eprintk_ctx("%s: unix socket with too short name (%d %s)\n",
+				__func__, mntobj->o_lock, name);
+		return -EINVAL;
+	}
+
+	bi.sk = sk;
+	strcpy(bi.path, name);
+	bi.path_off = mntobj->o_lock;
+	bi.i_mode = 0;
+	if (cpt_object_has(si, cpt_i_mode))
+		bi.i_mode = si->cpt_i_mode;
+	bi.uid = si->cpt_peer_uid;
+	bi.gid = si->cpt_peer_gid;
+	if (cpt_object_has(si, cpt_i_uid) && cpt_object_has(si, cpt_i_gid)) {
+		bi.uid = si->cpt_i_uid;
+		bi.gid = si->cpt_i_gid;
+	}
+	bi.next = NULL;
+
+	return rebind_unix_socket(mntobj->o_obj, &bi, LOOKUP_DIVE);
+}
+
+static int can_be_rebound_by_mntref(struct socket *sock,
+					struct cpt_sock_image *si,
+					cpt_context_t *ctx)
+{
+	if (ctx->image_version < CPT_VERSION_18_4)
+		return 0;
+
+	if (si->cpt_sockflags & CPT_SOCK_DELETED)
+		return 0;
+
+	if (si->cpt_vfsmount_ref == CPT_NULL)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * We use this special bind function instead of sock->ops->bind because
+ * overmounted sockets cannot be bound the generic way. We also want to
+ * have only one function for rebinding all kinds of sockets.
+ */
+static int bind_unix_socket(struct socket *sock, struct cpt_sock_image *si,
+		 cpt_context_t *ctx)
+{
+	int err;
+	char *name;
+	struct sockaddr* addr;
+	int addrlen;
+
+	if ((addrlen = si->cpt_laddrlen) <= 2)
+		return 0;
+
+	if (si->cpt_sockflags & CPT_SOCK_DELAYED)
+		return rst_delay_unix_bind(sock->sk, si, ctx);
+
+	name = ((char*)si->cpt_laddr) + 2;
+	addr = (struct sockaddr *)si->cpt_laddr;
+
+	if (can_be_rebound_by_mntref(sock, si, ctx))
+		err = unix_bind_to_mntref(sock->sk, name, addr, addrlen, si, ctx);
+	else
+		err = unix_bind_to_path(sock, name, addr, addrlen, si, ctx);
+
+	if (err)
+		eprintk_ctx("%s: can't rebind unix socket %d\n", __func__, err);
+
+	return err;
+}
+
+static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si,
+			      struct cpt_context *ctx)
+{
+	struct sock *sk = sock->sk;
+	cpt_object_t *obj;
+	struct sock *parent;
+
+	if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN)
+		return 0;
+
+	if (si->cpt_parent == -1)
+		return bind_unix_socket(sock, si, ctx);
+
+	obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
+	if (!obj)
+		return 0;
+
+	parent = obj->o_obj;
+	if (unix_sk(parent)->addr) {
+		if (unix_sk(sk)->addr &&
+		    atomic_dec_and_test(&unix_sk(sk)->addr->refcnt))
+			kfree(unix_sk(sk)->addr);
+		atomic_inc(&unix_sk(parent)->addr->refcnt);
+		unix_sk(sk)->addr = unix_sk(parent)->addr;
+	}
+	return 0;
+}
+
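+/*
+ * Refill a socket's receive queue from the image: the records between
+ * cpt_hdrlen and cpt_next are either socket attributes (consumed by
+ * rst_sock_attr()) or skbs; only CPT_SKB_RQ skbs are queued, anything
+ * else is dropped with a warning.
+ */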
+static int generic_restore_queues(struct sock *sk, struct cpt_sock_image *si,
+				  loff_t pos, struct cpt_context *ctx)
+{
+	loff_t endpos;
+
+	endpos = pos + si->cpt_next;
+	pos = pos + si->cpt_hdrlen;
+	while (pos < endpos) {
+		struct sk_buff *skb;
+		__u32 type;
+		int err;
+
+		err = rst_sock_attr(&pos, sk, ctx);
+		if (!err)
+			continue;
+		if (err < 0)
+			return err;
+
+		skb = rst_skb(sk, &pos, NULL, &type, ctx);
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		if (type == CPT_SKB_RQ) {
+			skb_set_owner_r(skb, sk);
+			skb_queue_tail(&sk->sk_receive_queue, skb);
+		} else {
+			wprintk_ctx("strange socket queue type %u\n", type);
+			kfree_skb(skb);
+		}
+	}
+	return 0;
+}
+
+static struct file *open_socket_file(struct socket *sock,
+				     struct cpt_sock_image *si,
+				     struct cpt_context *ctx)
+{
+	struct file *file;
+	cpt_object_t *fobj;
+
+	file = sock_mapfile(sock);
+	if (IS_ERR(file))
+		goto err_file;
+
+	if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL)
+		goto err_fobj;
+
+	cpt_obj_setpos(fobj, si->cpt_file, ctx);
+	cpt_obj_setindex(fobj, si->cpt_index, ctx);
+
+	return file;
+
+err_fobj:
+	fput(file);
+	file = ERR_PTR(-ENOMEM);
+err_file:
+	return file;
+}
+
+static int open_socket_pair(struct socket *peer, struct cpt_sock_image *si,
+			    struct cpt_context *ctx)
+{
+	struct cpt_sock_image *si_peer = NULL;
+	int err;
+	struct socket *sock;
+	cpt_object_t *obj;
+
+	si_peer = kmalloc(sizeof(*si_peer), GFP_KERNEL);
+	if (!si_peer)
+		return -ENOMEM;
+
+	err = sock_create(si->cpt_family, si->cpt_type,
+			       si->cpt_protocol, &sock);
+	if (err)
+		goto err_out;
+
+	err = sock->ops->socketpair(sock, peer);
+	if (err < 0)
+		goto out_release;
+
+	/* Socketpair with a peer outside our environment.
+	 * So we create a real half-open pipe and need not worry
+	 * about the dead end anymore. */
+	if (si->cpt_peer == -1)
+		goto out_release;
+
+	obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx);
+	BUG_ON(!obj);
+	BUG_ON(obj->o_obj);
+
+	err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, si_peer, ctx);
+	if (err)
+		goto out_release;
+
+	if (obj->o_ppos != CPT_NULL) {
+		struct file *file;
+
+		file = open_socket_file(sock, si_peer, ctx);
+		if (IS_ERR(file)) {
+			err = PTR_ERR(file);
+			goto out_release;
+		}
+		obj->o_parent = file;
+	}
+
+	setup_sock_common(sock->sk, si_peer, obj->o_pos, ctx);
+	err = fixup_unix_address(sock, si_peer, ctx);
+	if (err)
+		goto err_out;
+
+	cpt_obj_setobj(obj, sock->sk, ctx);
+	kfree(si_peer);
+
+	return 0;
+
+out_release:
+	sock_release(sock);
+err_out:
+	kfree(si_peer);
+	return err;
+}
+
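+/*
+ * Restore a single non-listening socket: create it, attach a file if the
+ * image references one, recreate its pair and queues, then apply
+ * family-specific fixups (INET bind and TCP state, NETLINK bind/connect,
+ * PACKET bind, AF_UNIX state and addresses).
+ */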
+static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si,
+		       struct cpt_context *ctx)
+{
+	int err;
+	struct socket *sock;
+
+	err = sock_create(si->cpt_family, si->cpt_type, si->cpt_protocol,
+			       &sock);
+	if (err)
+		return err;
+
+	if (si->cpt_file != CPT_NULL) {
+		struct file *file;
+
+		file = open_socket_file(sock, si, ctx);
+		if (IS_ERR(file)) {
+			err = PTR_ERR(file);
+			goto out_release;
+		}
+		obj->o_parent = file;
+	}
+
+	cpt_obj_setobj(obj, sock->sk, ctx);
+
+	if (si->cpt_socketpair) {
+		err = open_socket_pair(sock, si, ctx);
+		if (err)
+			goto err_out;
+	}
+
+	setup_sock_common(sock->sk, si, obj->o_pos, ctx);
+
+	if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) {
+		int saved_reuse = sock->sk->sk_reuse;
+
+		inet_sk(sock->sk)->freebind = 1;
+		sock->sk->sk_reuse = 2;
+		if (si->cpt_laddrlen) {
+			err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
+			if (err) {
+				dprintk_ctx("binding failed: %d, do not worry\n", err);
+			}
+		}
+		sock->sk->sk_reuse = saved_reuse;
+		err = rst_socket_in(si, obj->o_pos, sock->sk, ctx);
+		if (err) {
+			eprintk_ctx("open_socket: Warning! socket restoring "
+					"failed: %d\n", err);
+			/*
+			 * For now we do not want to abort migration
+			 * due to a socket restoring failure.
+			 */
+		}
+	} else if (sock->sk->sk_family == AF_NETLINK) {
+		struct sockaddr_nl *nl = (struct sockaddr_nl *)&si->cpt_laddr;
+		if (nl->nl_pid) {
+			err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
+			if (err) {
+				eprintk_ctx("AF_NETLINK binding failed: %d\n", err);
+			}
+		}
+		if (si->cpt_raddrlen && nl->nl_pid) {
+			err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK);
+			if (err) {
+				eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err);
+			}
+		}
+		generic_restore_queues(sock->sk, si, obj->o_pos, ctx);
+	} else if (sock->sk->sk_family == PF_PACKET) {
+		struct sockaddr_ll *ll = (struct sockaddr_ll *)&si->cpt_laddr;
+		if (ll->sll_protocol || ll->sll_ifindex) {
+			int alen = si->cpt_laddrlen;
+			if (sock->type != SOCK_PACKET &&
+			    alen < sizeof(struct sockaddr_ll))
+				alen = sizeof(struct sockaddr_ll);
+			err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, alen);
+			if (err) {
+				eprintk_ctx("AF_PACKET binding failed: %d\n", err);
+			}
+		}
+		generic_restore_queues(sock->sk, si, obj->o_pos, ctx);
+	} else if (sock->sk->sk_family == AF_UNIX) {
+		/*
+		 * We can have a pipe with pending data whose second end is
+		 * closed. In that case open_socket_pair() is not called above
+		 * and sk_state is not restored.
+		 */
+		if (!si->cpt_socketpair)
+			sock->sk->sk_state = si->cpt_state;
+	}
+
+	err = fixup_unix_address(sock, si, ctx);
+	if (err)
+		goto err_out;
+
+	if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6)
+	    && (int)si->cpt_parent != -1) {
+		cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
+		if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0)
+			sock->sk = NULL;
+	}
+
+	if (si->cpt_file == CPT_NULL && sock->sk &&
+	    sock->sk->sk_family == AF_INET) {
+		struct sock *sk = sock->sk;
+
+		if (sk) {
+			sock->sk = NULL;
+
+			local_bh_disable();
+			bh_lock_sock(sk);
+			if (sock_owned_by_user(sk))
+				eprintk_ctx("oops, sock is locked by user\n");
+
+			sock_hold(sk);
+			sock_orphan(sk);
+			ub_inc_orphan_count(sk);
+			bh_unlock_sock(sk);
+			local_bh_enable();
+			sock_put(sk);
+			dprintk_ctx("orphaning socket %p\n", sk);
+		}
+	}
+
+	if (si->cpt_file == CPT_NULL && sock->sk == NULL)
+		sock_release(sock);
+
+	return 0;
+
+out_release:
+	sock_release(sock);
+err_out:
+	return err;
+}
+
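+/*
+ * Listening sockets are restored on the first pass: created, bound (with
+ * freebind for INET to tolerate not-yet-configured addresses), put into
+ * LISTEN state and registered together with their file.
+ */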
+static int open_listening_socket(loff_t pos, struct cpt_sock_image *si,
+				 struct cpt_context *ctx)
+{
+	int err;
+	struct socket *sock;
+	struct file *file;
+	cpt_object_t *obj, *fobj;
+
+	err = sock_create(si->cpt_family, si->cpt_type, si->cpt_protocol,
+			       &sock);
+	if (err) {
+		eprintk_ctx("open_listening_socket: sock_create: %d (family: %d, type: %d, protocol: %d)\n",
+				err, (int)si->cpt_family, (int)si->cpt_type, (int)si->cpt_protocol);
+		return err;
+	}
+
+	sock->sk->sk_reuse = 2;
+	sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if;
+
+	if (sock->sk->sk_family == AF_UNIX) {
+		err = bind_unix_socket(sock, si, ctx);
+		if (err) {
+			eprintk_ctx("bind unix: %d\n", err);
+			goto err_out;
+		}
+	} else if (si->cpt_laddrlen) {
+		if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6)
+			inet_sk(sock->sk)->freebind = 1;
+
+		err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
+
+		if (err) {
+			eprintk_ctx("open_listening_socket: bind: %d\n", err);
+			goto err_out;
+		}
+	}
+
+	err = sock->ops->listen(sock, si->cpt_max_ack_backlog);
+	if (err) {
+		eprintk_ctx("open_listening_socket: listen: %d, %Ld, %x\n", err, pos, si->cpt_sockflags);
+		goto err_out;
+	}
+
+	/* Now we may access the socket body directly and fix everything up. */
+
+	file = sock_mapfile(sock);
+	err = PTR_ERR(file);
+	if (IS_ERR(file)) {
+		eprintk_ctx("open_listening_socket: map: %d\n", err);
+		goto err_out;
+	}
+
+	err = -ENOMEM;
+	if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL)
+		goto err_out;
+	if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL)
+		goto err_out;
+	cpt_obj_setpos(obj, pos, ctx);
+	cpt_obj_setindex(obj, si->cpt_index, ctx);
+	obj->o_parent = file;
+	cpt_obj_setpos(fobj, si->cpt_file, ctx);
+	cpt_obj_setindex(fobj, si->cpt_index, ctx);
+
+	setup_sock_common(sock->sk, si, pos, ctx);
+
+	if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) {
+		rst_listen_socket_in(sock->sk, si, pos, ctx);
+		rst_restore_synwait_queue(sock->sk, si, pos, ctx);
+	}
+
+	return 0;
+
+err_out:
+	sock_release(sock);
+	return err;
+}
+
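+/*
+ * Restore one multicast group membership recorded as a
+ * CPT_OBJ_SOCK_MCADDR object, dispatching on the address family.
+ */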
+static int
+rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
+{
+	int err;
+	loff_t pos = *pos_p;
+	struct cpt_sockmc_image v;
+
+	err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx);
+	if (err)
+		return err;
+
+	*pos_p += v.cpt_next;
+
+	if (v.cpt_family == AF_INET)
+		return rst_sk_mcfilter_in(sk, &v, pos, ctx);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else if (v.cpt_family == AF_INET6)
+		return rst_sk_mcfilter_in6(sk, &v, pos, ctx); 
+#endif
+	else
+		return -EAFNOSUPPORT;
+}
+
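+/*
+ * Restore an attached socket filter: the classic BPF program is stored as
+ * raw struct sock_filter instructions right after the object header.
+ */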
+static int
+rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
+{
+	int err;
+	struct sk_filter *fp, *old_fp; 
+	loff_t pos = *pos_p;
+	struct cpt_obj_bits v;
+
+	err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx);
+	if (err)
+		return err;
+
+	*pos_p += v.cpt_next;
+
+	if (v.cpt_size % sizeof(struct sock_filter))
+		return -EINVAL;
+
+	fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC);
+	if (fp == NULL)
+		return -ENOMEM;
+	atomic_set(&fp->refcnt, 1);
+	fp->len = v.cpt_size/sizeof(struct sock_filter);
+
+	err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen);
+	if (err) {
+		sk_filter_uncharge(sk, fp);
+		return err;
+	}
+
+	old_fp = sk->sk_filter;
+	sk->sk_filter = fp;
+	if (old_fp)
+		sk_filter_uncharge(sk, old_fp);
+	return 0;
+}
+
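+/*
+ * Restore AF_PACKET specific state: the main attribute object is followed
+ * by one CPT_OBJ_SOCK_PACKET_MC record per multicast membership.
+ */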
+static int
+rst_sock_attr_packet(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
+{
+	int err;
+	loff_t pos, endpos;
+	struct cpt_sock_packet_image v;
+	struct cpt_sock_packet_mc_image mi;
+
+	err = rst_get_object(CPT_OBJ_SOCK_PACKET, *pos_p, &v, ctx);
+	if (err)
+		return err;
+
+	if (sk->sk_family != AF_PACKET)
+		return -EINVAL;
+
+	err = sock_packet_rst_attr(sk, &v);
+	if (err)
+		return err;
+
+	pos = *pos_p + v.cpt_hdrlen;
+	endpos = *pos_p + v.cpt_next;
+	while (pos < endpos) {
+		err = rst_get_object(CPT_OBJ_SOCK_PACKET_MC, pos, &mi, ctx);
+		if (err)
+			return err;
+
+		err = sock_packet_rst_one_mc(sk, &mi);
+		if (err)
+			return err;
+
+		pos += mi.cpt_next;
+	}
+
+	*pos_p = endpos;
+	return 0;
+}
+
+/*
+ * returns:
+ *   0 - success, pos_p updated
+ * > 0 - type of next object
+ * < 0 - error
+ */
+int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
+{
+	int err;
+	loff_t pos = *pos_p;
+	struct cpt_object_hdr hdr;
+
+	err = rst_get_object(0, pos, &hdr, ctx);
+	if (err)
+		return err;
+
+	if (hdr.cpt_object == CPT_OBJ_SKFILTER)
+		err = rst_sock_attr_skfilter(pos_p, sk, ctx);
+	else if (hdr.cpt_object == CPT_OBJ_SOCK_MCADDR)
+		err = rst_sock_attr_mcfilter(pos_p, sk, ctx);
+	else if (hdr.cpt_object == CPT_OBJ_SOCK_PACKET)
+		err = rst_sock_attr_packet(pos_p, sk, ctx);
+	else
+		err = hdr.cpt_object;
+
+	return err;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static void rst_tcp_cb_from_v4(struct cpt_skb_image *v, struct sk_buff *skb)
+{
+	/*
+	 * sizeof(struct inet_skb_parm) == 16
+	 * sizeof(struct tcp_skb_cb) - sizeof(tcp_skb_cb.header) == 20
+	 *   => sizeof(struct tcp_skb_cb) == 36
+	 * sizeof(struct cpt_skb_image.cb) = 40
+	 *   => tcp_skb_cb in IPv4 format fits into cpt_skb_image.cb
+	 */
+	BUILD_BUG_ON(sizeof(skb->cb) - sizeof(struct inet6_skb_parm) <
+		sizeof(struct tcp_skb_cb) - sizeof(struct inet6_skb_parm));
+	memcpy(skb->cb, v->cpt_cb, sizeof(struct inet_skb_parm));
+	memcpy(skb->cb + sizeof(struct inet6_skb_parm),
+		(void *)v->cpt_cb + sizeof(struct inet_skb_parm),
+		sizeof(struct tcp_skb_cb) - sizeof(struct inet6_skb_parm));
+}
+static void rst_tcp_cb_from_v6(struct cpt_skb_image *v, struct sk_buff *skb)
+{
+	memcpy(skb->cb, v->cpt_cb, sizeof(v->cpt_cb));
+}
+#else
+static void rst_tcp_cb_from_v4(struct cpt_skb_image *v, struct sk_buff *skb)
+{
+	memcpy(skb->cb, v->cpt_cb, sizeof(v->cpt_cb));
+}
+static void rst_tcp_cb_from_v6(struct cpt_skb_image *v, struct sk_buff *skb)
+{
+	/*
+	 * sizeof(struct inet6_skb_parm) == 24
+	 * sizeof(struct tcp_skb_cb) - sizeof(tcp_skb_cb.header) == 20
+	 *   => sizeof(struct tcp_skb_cb) == 44
+	 * sizeof(struct cpt_skb_image.cb) = 40
+	 *   => tcp_skb_cb in IPv6 format does not fit into cpt_skb_image.cb,
+	 *      do not write more than sizeof(v->cpt_cb)
+	 */
+	BUILD_BUG_ON(sizeof(skb->cb) - sizeof(struct inet_skb_parm) <
+		sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm));
+	memcpy(skb->cb, v->cpt_cb, sizeof(struct inet_skb_parm));
+	memcpy(skb->cb + sizeof(struct inet_skb_parm),
+		(void *)v->cpt_cb + sizeof(struct inet6_skb_parm),
+		min(sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm),
+			sizeof(v->cpt_cb) - sizeof(struct inet6_skb_parm)));
+}
+#endif
+
+struct tcp_skb_cb_ipv6 {
+	union {
+		struct inet_skb_parm	h4;
+		struct inet6_skb_parm	h6;
+	} header;
+	__u32		seq;
+	__u32		end_seq;
+	__u32		when;
+	__u8		flags;
+	__u8		sacked;
+	__u16		urg_ptr;
+	__u32		ack_seq;
+};
+
+static void generic_rst_skb_cb(struct cpt_skb_image *v, struct sk_buff *skb)
+{
+	memcpy(skb->cb, v->cpt_cb, sizeof(v->cpt_cb));
+}
+
+static void rst_inet_skb_cb(struct cpt_skb_image *v, struct sk_buff *skb,
+			    struct sock *sk, struct cpt_context *ctx)
+{
+	if (sk->sk_protocol == IPPROTO_TCP) {
+		/*
+		 * 1) 2.6.9-x VZ kernels did not have IPv6 support compiled in
+		 *    => if image_version < CPT_VERSION_9*
+		 *    cpt_skb_image.cpt_cb is in IPv4 format.
+		 * 2) 2.6.18-x kernels with image_version >= CPT_VERSION_18_2
+		 *    and 2.6.16-x >= 027stab029 create cpt_skb_image.cpt_cb
+		 *    in IPv6 format regardless of kernel IPv6 support.
+		 * 3) 2.6.18-x kernels with image_version < CPT_VERSION_18_2
+		 *    and 2.6.16-x < 027stab029 create cpt_cb in IPv4 format
+		 *    in case IPv6 support was not compiled in and
+		 *    in IPv6 format otherwise.
+		 *    All PVC 2.6.1[68]-x kernels have IPv6 support => we assume
+		 *    any 2.6.1[68]-x kernel produces cpt_cb in IPv6 format.
+		 *    Those who compile old 2.6.1[68]-x kernels without IPv6
+		 *    support - beware!
+		 */
+		if (ctx->image_version >= CPT_VERSION_16) {
+			/*
+			 * we assume cpt_skb_image.cpt_cb is in IPv6 format
+			 * regardless of kernel IPv6 support
+			 */
+			rst_tcp_cb_from_v6(v, skb);
+		} else {
+			/*
+			 * this case is for 2.6.9-x kernels which produce
+			 * cpt_skb_image.cpt_cb in IPv4 format
+			 */
+			rst_tcp_cb_from_v4(v, skb);
+		}
+	} else
+		generic_rst_skb_cb(v, skb);
+}
+
+static void rst_unix_skb_cb(struct cpt_skb_image *v, struct sk_buff *skb,
+			    struct sock *sk, struct cpt_context *ctx)
+{
+	/*
+	 * In kernels < 042stab085 UNIXCB keeps creds in raw format
+	 * (struct ucred) so there we simply copy it to skb image on
+	 * cpt and back on rst. Here we have to convert.
+	 */
+	struct ucred *ucred = (struct ucred *)v->cpt_cb;
+
+	if (ucred->pid) {
+		struct pid *pid;
+
+		/*
+		 * The process that issued the message might be dead already,
+		 * in which case we need a detached pid.
+		 */
+		pid = alloc_dummy_vpid(ucred->pid);
+		if (!pid)
+			wprintk_ctx("failed to restore unix skb pid\n");
+		UNIXCB(skb).pid = pid;
+	}
+	if (ucred->uid != -1) {
+		struct cred *cred = prepare_creds();
+
+		if (cred) {
+			cred->uid = ucred->uid;
+			cred->gid = ucred->gid;
+		} else
+			wprintk_ctx("failed to restore unix skb cred\n");
+		UNIXCB(skb).cred = cred;
+	}
+}
+
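+/*
+ * Read one CPT_OBJ_SKB image and rebuild the skb: data layout, header
+ * offsets, control block (family-specific), metadata and timestamps.
+ * Trailing CPT_OBJ_BITS objects carry the payload; trailing
+ * CPT_OBJ_FILEDESC records carry SCM-passed files for AF_UNIX queues.
+ */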
+struct sk_buff * rst_skb(struct sock *sk, loff_t *pos_p, __u32 *owner,
+			 __u32 *queue, struct cpt_context *ctx)
+{
+	int err;
+	struct sk_buff *skb;
+	struct cpt_skb_image v;
+	loff_t pos = *pos_p;
+	struct scm_fp_list *fpl = NULL;
+	struct timeval tmptv;
+
+	err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx);
+	if (err)
+		return ERR_PTR(err);
+	*pos_p = pos + v.cpt_next;
+
+	if (owner)
+		*owner = v.cpt_owner;
+	if (queue)
+		*queue = v.cpt_queue;
+
+	skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL);
+	if (skb == NULL)
+		return ERR_PTR(-ENOMEM);
+	skb_reserve(skb, v.cpt_hspace);
+	skb_put(skb, v.cpt_len);
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->transport_header = v.cpt_h;
+	skb->network_header = v.cpt_nh;
+	skb->mac_header = v.cpt_mac;
+#else
+	skb->transport_header = skb->head + v.cpt_h;
+	skb->network_header = skb->head + v.cpt_nh;
+	skb->mac_header = skb->head + v.cpt_mac;
+#endif
+
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v.cpt_cb));
+	switch (sk->sk_family) {
+	case AF_INET:
+		rst_inet_skb_cb(&v, skb, sk, ctx);
+		break;
+	case AF_UNIX:
+		rst_unix_skb_cb(&v, skb, sk, ctx);
+		break;
+	default:
+		generic_rst_skb_cb(&v, skb);
+		break;
+	}
+
+	skb->mac_len = v.cpt_mac_len;
+	skb->csum = v.cpt_csum;
+	skb->local_df = v.cpt_local_df;
+	skb->pkt_type = v.cpt_pkt_type;
+	skb->ip_summed = v.cpt_ip_summed;
+	skb->priority = v.cpt_priority;
+	skb->protocol = v.cpt_protocol;
+	cpt_timeval_import(&tmptv, v.cpt_stamp);
+	skb->tstamp = timeval_to_ktime(tmptv);
+
+	skb_shinfo(skb)->gso_segs = v.cpt_gso_segs;
+	skb_shinfo(skb)->gso_size = v.cpt_gso_size;
+	if (cpt_object_has(&v, cpt_gso_type))
+		skb_shinfo(skb)->gso_type = v.cpt_gso_type;
+	if (ctx->image_version == 0) {
+		skb_shinfo(skb)->gso_segs = 1;
+		skb_shinfo(skb)->gso_size = 0;
+	}
+
+	if (v.cpt_next > v.cpt_hdrlen) {
+		pos = pos + v.cpt_hdrlen;
+		while (pos < *pos_p) {
+			union {
+				struct cpt_obj_bits b;
+				struct cpt_fd_image f;
+			} u;
+
+			err = rst_get_object(-1, pos, &u, ctx);
+			if (err) {
+				kfree_skb(skb);
+				return ERR_PTR(err);
+			}
+			if (u.b.cpt_object == CPT_OBJ_BITS) {
+				if (u.b.cpt_size != v.cpt_hspace + skb->len) {
+					eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len);
+					kfree_skb(skb);
+					return ERR_PTR(-EINVAL);
+				}
+
+				err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen);
+				if (err) {
+					kfree_skb(skb);
+					return ERR_PTR(err);
+				}
+			} else if (u.f.cpt_object == CPT_OBJ_FILEDESC) {
+				if (!fpl) {
+					fpl = kmalloc(sizeof(struct scm_fp_list),
+							GFP_KERNEL_UBC);
+					if (!fpl) {
+						kfree_skb(skb);
+						return ERR_PTR(-ENOMEM);
+					}
+					fpl->count = 0;
+					UNIXCB(skb).fp = fpl;
+				}
+				fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx);
+				if (!IS_ERR(fpl->fp[fpl->count]))
+					fpl->count++;
+			}
+			pos += u.b.cpt_next;
+		}
+	}
+
+	return skb;
+}
+
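+/*
+ * Refill an AF_UNIX receive queue. Each skb is owned by its sender, so we
+ * look the owner up by socket index; skbs sitting on a listening socket
+ * belong to not-yet-accepted children, whose temporary struct socket is
+ * released here.
+ */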
+static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si,
+			       loff_t pos, struct cpt_context *ctx)
+{
+	loff_t endpos;
+
+	endpos = pos + si->cpt_next;
+	pos = pos + si->cpt_hdrlen;
+	while (pos < endpos) {
+		struct sk_buff *skb;
+		struct sock *owner_sk;
+		__u32 owner;
+		int err;
+
+		err = rst_sock_attr(&pos, sk, ctx);
+		if (!err)
+			continue;
+		if (err < 0)
+			return err;
+
+		skb = rst_skb(sk, &pos, &owner, NULL, ctx);
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		owner_sk = unix_peer(sk);
+		if (owner != -1) {
+			cpt_object_t *pobj;
+			pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx);
+			if (pobj == NULL) {
+				eprintk_ctx("orphan af_unix skb?\n");
+				kfree_skb(skb);
+				continue;
+			}
+			owner_sk = pobj->o_obj;
+		}
+		if (owner_sk == NULL) {
+			/*
+			 * Pipe with closed second end? Pass sk as an owner
+			 * to allow userspace to receive pending data.
+			 */
+			dprintk_ctx("orphan af_unix skb 2?\n");
+			owner_sk = sk;
+		}
+		skb_set_owner_w(skb, owner_sk);
+		skb->destructor = unix_destruct_scm;
+		skb_queue_tail(&sk->sk_receive_queue, skb);
+		if (sk->sk_state == TCP_LISTEN) {
+			struct socket *sock = skb->sk->sk_socket;
+			BUG_ON(sock == NULL);
+			BUG_ON(sock->file);
+			skb->sk->sk_socket = NULL;
+			skb->sk->sk_sleep = NULL;
+			sock->sk = NULL;
+			sock_release(sock);
+		}
+	}
+	return 0;
+}
+
+/* All the sockets are created before we start to open files */
+
+int rst_sockets(struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_SOCKET];
+	loff_t endsec;
+	cpt_object_t *obj;
+	struct cpt_section_hdr h;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err) {
+		eprintk_ctx("rst_sockets: ctx->pread: %d\n", err);
+		return err;
+	}
+	if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) {
+		eprintk_ctx("rst_sockets: hdr err\n");
+		return -EINVAL;
+	}
+
+	/* The first pass: we create socket index and open listening sockets. */
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
+		err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
+		if (err) {
+			eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
+			cpt_release_buf(ctx);
+			return err;
+		}
+		if (sbuf->cpt_state == TCP_LISTEN) {
+			err = open_listening_socket(sec, sbuf, ctx); 
+			cpt_release_buf(ctx);
+			if (err) {
+				eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err);
+				return err;
+			}
+		} else {
+			cpt_release_buf(ctx);
+			obj = alloc_cpt_object(GFP_KERNEL, ctx);
+			if (obj == NULL)
+				return -ENOMEM;
+			cpt_obj_setindex(obj, sbuf->cpt_index, ctx);
+			cpt_obj_setpos(obj, sec, ctx);
+			obj->o_ppos  = sbuf->cpt_file;
+			intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx);
+		}
+		sec += sbuf->cpt_next;
+	}
+
+	/* Pass 2: really restore sockets */
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct cpt_sock_image *sbuf;
+		if (obj->o_obj != NULL)
+			continue;
+		sbuf = cpt_get_buf(ctx);
+		err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
+		if (err) {
+			eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
+			cpt_release_buf(ctx);
+			return err;
+		}
+		BUG_ON(sbuf->cpt_state == TCP_LISTEN);
+		err = open_socket(obj, sbuf, ctx);
+		cpt_release_buf(ctx);
+		if (err) {
+			eprintk_ctx("rst_sockets: open_socket(fm=%u, prot=%u): "
+					"%d\n", sbuf->cpt_family,
+					sbuf->cpt_protocol, err);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
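+/*
+ * Restore sockets from the CPT_SECT_ORPHANS section: sockets without an
+ * attached file (e.g. TCP connections still lingering after close).
+ * open_socket() detaches such sockets and marks them orphaned.
+ */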
+int rst_orphans(struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_ORPHANS];
+	loff_t endsec;
+	cpt_object_t *obj;
+	struct cpt_section_hdr h;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
+		err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+		obj = alloc_cpt_object(GFP_KERNEL, ctx);
+		if (obj == NULL) {
+			cpt_release_buf(ctx);
+			return -ENOMEM;
+		}
+		obj->o_pos = sec;
+		obj->o_ppos  = sbuf->cpt_file;
+		err = open_socket(obj, sbuf, ctx);
+		dprintk_ctx("Restoring orphan: %d\n", err);
+		free_cpt_object(obj, ctx);
+		cpt_release_buf(ctx);
+		if (err)
+			return err;
+		sec += sbuf->cpt_next;
+	}
+
+	return 0;
+}
+
+/* Here we release sockets that have no file links. Had nothing failed,
+ * these sockets would have been linked with their skbs in
+ * rst_sockets_complete() -> restore_unix_rqueue().
+ */
+void rst_rollback_sockets(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+
+		if (sk == NULL) continue;
+
+		if (sk->sk_family != AF_UNIX)
+			continue;
+
+		if (sk->sk_socket->file == NULL)
+			sock_release(sk->sk_socket);
+	}
+}
+
+/* Pass 3: I understand this is not funny anymore :-),
+ * but we have to do another pass to establish links between
+ * unpaired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX
+ * skb queues with proper skb->sk links.
+ *
+ * This could be done at the end of rst_sockets(), but we defer
+ * restoring af_unix queues until the end of restoring files to
+ * make restoring passed FDs cleaner.
+ */
+
+int rst_sockets_complete(struct cpt_context *ctx)
+{
+	int err;
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct cpt_sock_image *sbuf;
+		struct sock *sk = obj->o_obj;
+		struct sock *peer;
+
+		BUG_ON(!sk);
+
+		if (sk->sk_family != AF_UNIX)
+			continue;
+
+		/*
+		 * Since 042stab085 SO_PEERCRED obtains peer credentials from
+		 * sk_peer_pid and sk_peer_cred, so we have to initialize them.
+		 */
+		if (sk->sk_peercred.pid) {
+			struct pid *pid;
+
+			pid = alloc_dummy_vpid(sk->sk_peercred.pid);
+			if (pid) {
+				put_pid(sk_extended(sk)->sk_peer_pid);
+				sk_extended(sk)->sk_peer_pid = pid;
+			} else
+				wprintk_ctx("failed to restore unix socket peer pid\n");
+		}
+		if (sk->sk_peercred.uid != -1) {
+			struct cred *cred;
+
+			cred = prepare_creds();
+			if (cred) {
+				cred->euid = sk->sk_peercred.uid;
+				cred->egid = sk->sk_peercred.gid;
+				if (sk_extended(sk)->sk_peer_cred)
+					put_cred(sk_extended(sk)->sk_peer_cred);
+				sk_extended(sk)->sk_peer_cred = cred;
+			} else
+				wprintk_ctx("failed to restore unix socket peer cred\n");
+		}
+
+		sbuf = cpt_get_buf(ctx);
+		err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+
+		if (sbuf->cpt_next > sbuf->cpt_hdrlen)
+			restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx);
+
+		cpt_release_buf(ctx);
+
+		if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) {
+			cpt_object_t *pobj;
+
+			sbuf = cpt_get_buf(ctx);
+			err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
+			if (err) {
+				cpt_release_buf(ctx);
+				return err;
+			}
+
+			if (sbuf->cpt_peer != -1) {
+				pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx);
+				if (pobj) {
+					peer = pobj->o_obj;
+					sock_hold(peer);
+					unix_peer(sk) = peer;
+				}
+			}
+			cpt_release_buf(ctx);
+		}
+	}
+
+	rst_orphans(ctx);
+
+	return 0;
+}
+
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_socket_in.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_socket_in.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_socket_in.c	2015-01-21 12:02:48.233093393 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_socket_in.c	2015-01-21 12:02:50.915022195 +0300
@@ -0,0 +1,599 @@
+/*
+ *
+ *  kernel/cpt/rst_socket_in.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <linux/jhash.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/ipv6.h>
+#include <linux/igmp.h>
+#include <net/addrconf.h>
+#include <net/inet6_connection_sock.h>
+#include <linux/nsproxy.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_socket.h"
+#include "cpt_kernel.h"
+
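+/*
+ * Timer values are evidently stored in the image as deltas relative to
+ * jiffies at checkpoint time; these helpers convert them back to absolute
+ * jiffies (sign-extending in jiffies_import(), since a timeout may lie in
+ * the past).
+ */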
+static inline unsigned long jiffies_import(__u32 tmo)
+{
+	__s32 delta = tmo;
+	return jiffies + (long)delta;
+}
+
+static inline __u32 tcp_jiffies_import(__u32 tmo)
+{
+	return ((__u32)jiffies) + tmo;
+}
+
+
+static int restore_queues(struct sock *sk, struct cpt_sock_image *si,
+			  loff_t pos, struct cpt_context *ctx)
+{
+	loff_t endpos;
+
+	endpos = pos + si->cpt_next;
+	pos = pos + si->cpt_hdrlen;
+	while (pos < endpos) {
+		struct sk_buff *skb;
+		__u32 type;
+		int err;
+
+		err = rst_sock_attr(&pos, sk, ctx);
+		if (!err)
+			continue;
+		if (err < 0)
+			return err;
+
+		skb = rst_skb(sk, &pos, NULL, &type, ctx);
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		if (sk->sk_type == SOCK_STREAM) {
+			if (type == CPT_SKB_RQ) {
+				skb_set_owner_r(skb, sk);
+				ub_tcprcvbuf_charge_forced(sk, skb);
+				skb_queue_tail(&sk->sk_receive_queue, skb);
+			} else if (type == CPT_SKB_OFOQ) {
+				struct tcp_sock *tp = tcp_sk(sk);
+				skb_set_owner_r(skb, sk);
+				ub_tcprcvbuf_charge_forced(sk, skb);
+				skb_queue_tail(&tp->out_of_order_queue, skb);
+			} else if (type == CPT_SKB_WQ) {
+				sk->sk_wmem_queued += skb->truesize;
+				sk->sk_forward_alloc -= skb->truesize;
+				ub_tcpsndbuf_charge_forced(sk, skb);
+				skb_queue_tail(&sk->sk_write_queue, skb);
+			} else {
+				wprintk_ctx("strange stream queue type %u\n", type);
+				kfree_skb(skb);
+			}
+		} else {
+			if (type == CPT_SKB_RQ) {
+				skb_set_owner_r(skb, sk);
+				skb_queue_tail(&sk->sk_receive_queue, skb);
+			} else if (type == CPT_SKB_WQ) {
+				struct inet_sock *inet = inet_sk(sk);
+				if (inet->cork.fragsize) {
+					skb_set_owner_w(skb, sk);
+					skb_queue_tail(&sk->sk_write_queue, skb);
+				} else {
+					eprintk_ctx("cork skb is dropped\n");
+					kfree_skb(skb);
+				}
+			} else {
+				wprintk_ctx("strange dgram queue type %u\n", type);
+				kfree_skb(skb);
+			}
+		}
+	}
+	return 0;
+}
+
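+/* Find a restored listening socket bound to the given source port. */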
+static struct sock *find_parent(__u16 sport, cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+		if (sk &&
+		    sk->sk_state == TCP_LISTEN &&
+		    (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) &&
+		    inet_sk(sk)->sport == sport)
+			return sk;
+	}
+	return NULL;
+}
+
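+/*
+ * Restore the full TCP state of an established socket: sequence numbers,
+ * congestion and RTT state, window and timestamp options, SACK blocks and
+ * pending timers. sk_send_head is recomputed from snd_nxt, and the local
+ * port is re-inherited from the listening parent when inet->num is unset.
+ */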
+static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk,
+			  struct cpt_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	tp->pred_flags = si->cpt_pred_flags;
+	tp->rcv_nxt = si->cpt_rcv_nxt;
+	tp->snd_nxt = si->cpt_snd_nxt;
+	tp->snd_una = si->cpt_snd_una;
+	tp->snd_sml = si->cpt_snd_sml;
+	tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp);
+	tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime);
+	tp->tcp_header_len = si->cpt_tcp_header_len;
+	inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending;
+	inet_csk(sk)->icsk_ack.quick = si->cpt_quick;
+	inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong;
+	inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked;
+	inet_csk(sk)->icsk_ack.ato = si->cpt_ato;
+	inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout);
+	inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime);
+	inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size;
+	inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss;
+	tp->snd_wl1 = si->cpt_snd_wl1;
+	tp->snd_wnd = si->cpt_snd_wnd;
+	tp->max_window = si->cpt_max_window;
+	inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie;
+	tp->mss_cache = si->cpt_mss_cache;
+	tp->rx_opt.mss_clamp = si->cpt_mss_clamp;
+	inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len;
+	inet_csk(sk)->icsk_ca_state = si->cpt_ca_state;
+	inet_csk(sk)->icsk_retransmits = si->cpt_retransmits;
+	tp->reordering = si->cpt_reordering;
+	tp->frto_counter = si->cpt_frto_counter;
+	tp->frto_highmark = si->cpt_frto_highmark;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+	/* tp->adv_cong = si->cpt_adv_cong; */
+#endif
+	inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept;
+	inet_csk(sk)->icsk_backoff = si->cpt_backoff;
+	tp->srtt = si->cpt_srtt;
+	tp->mdev = si->cpt_mdev;
+	tp->mdev_max = si->cpt_mdev_max;
+	tp->rttvar = si->cpt_rttvar;
+	tp->rtt_seq = si->cpt_rtt_seq;
+	inet_csk(sk)->icsk_rto = si->cpt_rto;
+	tp->packets_out = si->cpt_packets_out;
+	tp->retrans_out = si->cpt_retrans_out;
+	tp->lost_out = si->cpt_lost_out;
+	tp->sacked_out = si->cpt_sacked_out;
+	tp->fackets_out = si->cpt_fackets_out;
+	tp->snd_ssthresh = si->cpt_snd_ssthresh;
+	tp->snd_cwnd = si->cpt_snd_cwnd;
+	tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt;
+	tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp;
+	tp->snd_cwnd_used = si->cpt_snd_cwnd_used;
+	tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp);
+	inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout);
+	tp->rcv_wnd = si->cpt_rcv_wnd;
+	tp->rcv_wup = si->cpt_rcv_wup;
+	tp->write_seq = si->cpt_write_seq;
+	tp->pushed_seq = si->cpt_pushed_seq;
+	tp->copied_seq = si->cpt_copied_seq;
+	tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok;
+	tp->rx_opt.wscale_ok = si->cpt_wscale_ok;
+	tp->rx_opt.sack_ok = si->cpt_sack_ok;
+	tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp;
+	tp->rx_opt.snd_wscale = si->cpt_snd_wscale;
+	tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale;
+	tp->nonagle = si->cpt_nonagle;
+	tp->keepalive_probes = si->cpt_keepalive_probes;
+	tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval;
+	tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr;
+	tp->rx_opt.ts_recent = si->cpt_ts_recent;
+	tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp;
+	tp->rx_opt.user_mss = si->cpt_user_mss;
+	tp->rx_opt.dsack = si->cpt_dsack;
+	tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0];
+	tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1];
+	tp->selective_acks[0].start_seq = si->cpt_sack_array[2];
+	tp->selective_acks[0].end_seq = si->cpt_sack_array[3];
+	tp->selective_acks[1].start_seq = si->cpt_sack_array[4];
+	tp->selective_acks[1].end_seq = si->cpt_sack_array[5];
+	tp->selective_acks[2].start_seq = si->cpt_sack_array[6];
+	tp->selective_acks[2].end_seq = si->cpt_sack_array[7];
+	tp->selective_acks[3].start_seq = si->cpt_sack_array[8];
+	tp->selective_acks[3].end_seq = si->cpt_sack_array[9];
+
+	tp->window_clamp = si->cpt_window_clamp;
+	tp->rcv_ssthresh = si->cpt_rcv_ssthresh;
+	inet_csk(sk)->icsk_probes_out = si->cpt_probes_out;
+	tp->rx_opt.num_sacks = si->cpt_num_sacks;
+	tp->advmss = si->cpt_advmss;
+	inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries;
+	tp->ecn_flags = si->cpt_ecn_flags;
+	tp->prior_ssthresh = si->cpt_prior_ssthresh;
+	tp->high_seq = si->cpt_high_seq;
+	tp->retrans_stamp = si->cpt_retrans_stamp;
+	tp->undo_marker = si->cpt_undo_marker;
+	tp->undo_retrans = si->cpt_undo_retrans;
+	tp->urg_seq = si->cpt_urg_seq;
+	tp->urg_data = si->cpt_urg_data;
+	inet_csk(sk)->icsk_pending = si->cpt_pending;
+	tp->snd_up = si->cpt_snd_up;
+	tp->keepalive_time = si->cpt_keepalive_time;
+	tp->keepalive_intvl = si->cpt_keepalive_intvl;
+	tp->linger2 = si->cpt_linger2;
+
+	sk->sk_send_head = NULL;
+	for (skb = skb_peek(&sk->sk_write_queue);
+	     skb && skb != (struct sk_buff*)&sk->sk_write_queue;
+	     skb = skb->next) {
+		if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) {
+			sk->sk_send_head = skb;
+			break;
+		}
+	}
+
+	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) {
+		struct inet_sock *inet = inet_sk(sk);
+		if (inet->num == 0) {
+			cpt_object_t *lobj = NULL;
+
+			if ((int)si->cpt_parent != -1)
+				lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
+
+			if (lobj && lobj->o_obj) {
+				inet->num = ntohs(inet->sport);
+				local_bh_disable();
+				__inet_inherit_port(lobj->o_obj, sk);
+				local_bh_enable();
+				dprintk_ctx("port inherited from parent\n");
+			} else {
+				struct sock *lsk = find_parent(inet->sport, ctx);
+				if (lsk) {
+					inet->num = ntohs(inet->sport);
+					local_bh_disable();
+					__inet_inherit_port(lsk, sk);
+					local_bh_enable();
+					dprintk_ctx("port inherited\n");
+				} else {
+					eprintk_ctx("we are kinda lost...\n");
+				}
+			}
+		}
+
+		sk->sk_prot->hash(sk);
+
+		if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER)
+			sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout);
+		if (inet_csk(sk)->icsk_pending && !skb_queue_empty(&sk->sk_write_queue))
+			sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer,
+				       inet_csk(sk)->icsk_timeout);
+		if (sock_flag(sk, SOCK_KEEPOPEN)) {
+			unsigned long expires = jiffies_import(si->cpt_ka_timeout);
+			if (time_after(jiffies, expires))
+				expires = jiffies + HZ;
+			sk_reset_timer(sk, &sk->sk_timer, expires);
+		}
+	}
+
+	if (sk->sk_family == AF_INET6)
+		sk->sk_gso_type = SKB_GSO_TCPV6;
+	else
+		sk->sk_gso_type = SKB_GSO_TCPV4;
+
+	return 0;
+}
+
+static void rst_listen_socket_tcp(struct cpt_sock_image *si, struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp);
+	tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime);
+	tp->tcp_header_len = si->cpt_tcp_header_len;
+	inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept;
+
+	/* Next options are inherited by children */
+	tp->mss_cache = si->cpt_mss_cache;
+	inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len;
+	tp->reordering = si->cpt_reordering;
+	tp->nonagle = si->cpt_nonagle;
+	tp->keepalive_probes = si->cpt_keepalive_probes;
+	tp->rx_opt.user_mss = si->cpt_user_mss;
+	inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries;
+	tp->keepalive_time = si->cpt_keepalive_time;
+	tp->keepalive_intvl = si->cpt_keepalive_intvl;
+	tp->linger2 = si->cpt_linger2;
+}
+
+int rst_listen_socket_in(struct sock *sk, struct cpt_sock_image *si,
+			  loff_t pos, struct cpt_context *ctx)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	lock_sock(sk);
+
+	inet->uc_ttl = si->cpt_uc_ttl;
+	inet->tos = si->cpt_tos;
+	inet->cmsg_flags = si->cpt_cmsg_flags;
+	inet->pmtudisc = si->cpt_pmtudisc;
+	inet->recverr = si->cpt_recverr;
+	inet->freebind = si->cpt_freebind;
+	inet->id = si->cpt_idcounter;
+
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		np->frag_size = si->cpt_frag_size6;
+		np->hop_limit = si->cpt_hop_limit6;
+
+		np->rxopt.all = si->cpt_rxopt6;
+		np->mc_loop = si->cpt_mc_loop6;
+		np->recverr = si->cpt_recverr6;
+		np->pmtudisc = si->cpt_pmtudisc6;
+		np->ipv6only = si->cpt_ipv6only6;
+	}
+
+	if (sk->sk_protocol == IPPROTO_TCP)
+		rst_listen_socket_tcp(si, sk);
+
+	release_sock(sk);
+	return 0;
+}
+
+int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk,
+		  struct cpt_context *ctx)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = get_exec_env()->ve_ns->net_ns;
+	int err, ret_err = 0;
+
+	lock_sock(sk);
+
+	sk->sk_state = si->cpt_state;
+
+	inet->daddr = si->cpt_daddr;
+	inet->dport = si->cpt_dport;
+	inet->saddr = si->cpt_saddr;
+	inet->rcv_saddr = si->cpt_rcv_saddr;
+	inet->sport = si->cpt_sport;
+	inet->uc_ttl = si->cpt_uc_ttl;
+	inet->tos = si->cpt_tos;
+	inet->cmsg_flags = si->cpt_cmsg_flags;
+	inet->mc_index = si->cpt_mc_index;
+	inet->mc_addr = si->cpt_mc_addr;
+	inet->hdrincl = si->cpt_hdrincl;
+	inet->mc_ttl = si->cpt_mc_ttl;
+	inet->mc_loop = si->cpt_mc_loop;
+	inet->pmtudisc = si->cpt_pmtudisc;
+	inet->recverr = si->cpt_recverr;
+	inet->freebind = si->cpt_freebind;
+	inet->id = si->cpt_idcounter;
+
+	inet->cork.flags = si->cpt_cork_flags;
+	inet->cork.fragsize = si->cpt_cork_fragsize;
+	inet->cork.length = si->cpt_cork_length;
+	inet->cork.addr = si->cpt_cork_addr;
+	inet->cork.fl.fl4_src = si->cpt_cork_saddr;
+	inet->cork.fl.fl4_dst = si->cpt_cork_daddr;
+	inet->cork.fl.oif = si->cpt_cork_oif;
+	if (inet->cork.fragsize) {
+		if (ip_route_output_key(net, (struct rtable **)&inet->cork.dst, &inet->cork.fl)) {
+			eprintk_ctx("failed to restore cork route\n");
+			inet->cork.fragsize = 0;
+		}
+	}
+
+	if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) {
+		struct udp_sock *up = udp_sk(sk);
+		up->pending = si->cpt_udp_pending;
+		up->corkflag = si->cpt_udp_corkflag;
+		up->encap_type = si->cpt_udp_encap;
+		up->len = si->cpt_udp_len;
+	}
+
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		memcpy(&np->saddr, si->cpt_saddr6, 16);
+		memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16);
+		memcpy(&np->daddr, si->cpt_daddr6, 16);
+		np->flow_label = si->cpt_flow_label6;
+		np->frag_size = si->cpt_frag_size6;
+		np->hop_limit = si->cpt_hop_limit6;
+		np->mcast_hops = si->cpt_mcast_hops6;
+		np->mcast_oif = si->cpt_mcast_oif6;
+		np->rxopt.all = si->cpt_rxopt6;
+		np->mc_loop = si->cpt_mc_loop6;
+		np->recverr = si->cpt_recverr6;
+		np->sndflow = si->cpt_sndflow6;
+		np->pmtudisc = si->cpt_pmtudisc6;
+		np->ipv6only = si->cpt_ipv6only6;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		if (si->cpt_mapped) {
+			extern struct inet_connection_sock_af_ops ipv6_mapped;
+			if (sk->sk_type == SOCK_STREAM &&
+			    sk->sk_protocol == IPPROTO_TCP) {
+				inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
+				sk->sk_backlog_rcv = tcp_v4_do_rcv;
+			}
+		}
+#endif
+	}
+
+	err = restore_queues(sk, si, pos, ctx);
+
+	if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) {
+		ret_err = err;
+		rst_socket_tcp(si, pos, sk, ctx);
+	}
+
+	release_sock(sk);
+	return ret_err;
+}
+
+static struct request_sock *rst_reqsk_alloc(unsigned short family)
+{
+	struct request_sock *req;
+
+	if (family == AF_INET)
+		req = inet_reqsk_alloc(&tcp_request_sock_ops);
+	else
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
+#else
+		return ERR_PTR(-EINVAL);
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+	if (req) {
+		if (family == AF_INET)
+			tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
+		else
+			tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
+	}
+#endif
+	return req;
+}
+
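+/*
+ * Put a restored, not-yet-accepted child socket back onto its listener's
+ * accept queue via a freshly allocated dummy request_sock.
+ */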
+int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx)
+{
+	struct request_sock *req;
+
+	if (lsk->sk_state != TCP_LISTEN)
+		return -EINVAL;
+	req = rst_reqsk_alloc(sk->sk_family);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	if (!req)
+		return -ENOMEM;
+
+	sk->sk_socket = NULL;
+	sk->sk_sleep = NULL;
+	inet_csk_reqsk_queue_add(lsk, req, sk);
+	return 0;
+}
+
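+/*
+ * Re-create the listener's SYN queue from CPT_OBJ_OPENREQ records:
+ * allocate a request_sock per entry, fill in the negotiated options and
+ * hash it back with the initial TCP_TIMEOUT_INIT expiry.
+ */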
+int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si,
+			      loff_t pos, struct cpt_context *ctx)
+{
+	int err;
+	loff_t end = pos + si->cpt_next;
+
+	pos += si->cpt_hdrlen;
+
+	lock_sock(sk);
+	while (pos < end) {
+		struct cpt_openreq_image oi;
+
+		err = rst_sock_attr(&pos, sk, ctx);
+		if (!err)
+			continue;
+		if (err < 0)
+			goto out;
+
+		err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx);
+		if (err)
+			goto out;
+
+		if (oi.cpt_object == CPT_OBJ_OPENREQ) {
+			struct request_sock *req;
+
+			if (oi.cpt_family == AF_INET6 &&
+			    sk->sk_family != AF_INET6)
+				/* related to the non-initialized cpt_family bug */
+				goto next;
+			req = rst_reqsk_alloc(oi.cpt_family);
+			if (IS_ERR(req)) {
+				release_sock(sk);
+				return PTR_ERR(req);
+			}
+
+			if (req == NULL) {
+				release_sock(sk);
+				return -ENOMEM;
+			}
+
+			tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn;
+			tcp_rsk(req)->snt_isn = oi.cpt_snt_isn;
+			inet_rsk(req)->rmt_port = oi.cpt_rmt_port;
+			req->mss = oi.cpt_mss;
+			req->retrans = oi.cpt_retrans;
+			inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale;
+			inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale;
+			inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok;
+			inet_rsk(req)->sack_ok = oi.cpt_sack_ok;
+			inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok;
+			inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok;
+			inet_rsk(req)->acked = oi.cpt_acked;
+			inet_rsk(req)->opt = NULL;
+			req->window_clamp = oi.cpt_window_clamp;
+			req->rcv_wnd = oi.cpt_rcv_wnd;
+			req->ts_recent = oi.cpt_ts_recent;
+			req->expires = jiffies_import(oi.cpt_expires);
+			req->sk = NULL;
+			req->secid = 0;
+			req->peer_secid = 0;
+
+			if (oi.cpt_family == AF_INET6) {
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+				inet6_rsk(req)->pktopts = NULL;
+				memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16);
+				memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16);
+				inet6_rsk(req)->iif = oi.cpt_iif;
+				inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+#endif
+			} else {
+				memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4);
+				memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4);
+				inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+			}
+		}
+next:
+		pos += oi.cpt_next;
+	}
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+
+int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v,
+		       loff_t pos, cpt_context_t *ctx)
+{
+	struct ip_mreqn imr;
+
+	if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) {
+		eprintk_ctx("IGMPv3 is still not supported\n");
+		return -EINVAL;
+	}
+
+	memset(&imr, 0, sizeof(imr));
+	imr.imr_ifindex = v->cpt_ifindex;
+	imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0];
+	return ip_mc_join_group(sk, &imr);
+}
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v,
+			loff_t pos, cpt_context_t *ctx)
+{
+
+	if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) {
+		eprintk_ctx("IGMPv3 is still not supported\n");
+		return -EINVAL;
+	}
+
+	return ipv6_sock_mc_join(sk, v->cpt_ifindex,
+				 (struct in6_addr*)v->cpt_mcaddr);
+}
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_sysvipc.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_sysvipc.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_sysvipc.c	2015-01-21 12:02:48.233093393 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_sysvipc.c	2015-01-21 12:02:49.997046565 +0300
@@ -0,0 +1,699 @@
+/*
+ *
+ *  kernel/cpt/rst_sysvipc.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/shm.h>
+#include <linux/msg.h>
+#include <linux/mount.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <bc/kmem.h>
+#include <linux/cpt_image.h>
+#include <linux/init_task.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_kernel.h"
+#include "cpt_mm.h"
+
+struct _warg {
+		struct file		*file;
+		struct cpt_sysvshm_image	*v;
+};
+
+static int fixup_one_shm(struct shmid_kernel *shp, void *arg)
+{
+	struct _warg *warg = arg;
+
+	if (shp->shm_file != warg->file)
+		return 0;
+	if (shp->shm_nattch)
+		return -EEXIST;
+
+	shp->shm_perm.uid = warg->v->cpt_uid;
+	shp->shm_perm.gid = warg->v->cpt_gid;
+	shp->shm_perm.cuid = warg->v->cpt_cuid;
+	shp->shm_perm.cgid = warg->v->cpt_cgid;
+	shp->shm_perm.mode = warg->v->cpt_mode;
+
+	shp->shm_atim = warg->v->cpt_atime;
+	shp->shm_dtim = warg->v->cpt_dtime;
+	shp->shm_ctim = warg->v->cpt_ctime;
+	shp->shm_cprid = warg->v->cpt_creator;
+	shp->shm_lprid = warg->v->cpt_last;
+
+	/* TODO: fix shp->mlock_user? */
+	return 1;
+}
+
+static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v)
+{
+	struct _warg warg;
+
+	warg.file = file;
+	warg.v = v;
+
+	return sysvipc_walk_shm(fixup_one_shm, &warg);
+}
+
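+/*
+ * Write one CPT_OBJ_PAGES block of SysV shm content back into the tmpfs
+ * file, copying through ctx->tmpbuf in at most PAGE_SIZE chunks under
+ * set_fs(KERNEL_DS), since ->write() expects a user pointer.
+ */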
+static int restore_shm_chunk(struct file *file, loff_t pos,
+		struct cpt_page_block * pgb, cpt_context_t *ctx)
+{
+	int err;
+	loff_t opos;
+	loff_t ipos;
+	int count;
+
+	ipos = pos + pgb->cpt_hdrlen;
+	opos = pgb->cpt_start;
+	count = pgb->cpt_end-pgb->cpt_start;
+	while (count > 0) {
+		mm_segment_t oldfs;
+		int copy = count;
+
+		if (copy > PAGE_SIZE)
+			copy = PAGE_SIZE;
+		(void)cpt_get_buf(ctx);
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos);
+		set_fs(oldfs);
+		if (err) {
+			__cpt_release_buf(ctx);
+			return err;
+		}
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		ipos += copy;
+		err = file->f_dentry->d_inode->i_fop->write(file, ctx->tmpbuf,
+								copy, &opos);
+		set_fs(oldfs);
+		__cpt_release_buf(ctx);
+		if (err != copy) {
+			eprintk_ctx("%s: write() failure - copy: %d, opos: %Ld\n",
+					__func__, copy, opos);
+			if (err >= 0)
+				err = -EIO;
+			return err;
+		}
+		count -= copy;
+	}
+	return 0;
+}
+
+static int fixup_shm_data(struct file *file, loff_t pos, loff_t end,
+			  struct cpt_context *ctx)
+{
+	struct cpt_page_block pgb;
+
+	if (file->f_dentry->d_inode->i_fop->write == NULL) {
+		eprintk_ctx("No TMPFS? Cannot restore content of SYSV SHM\n");
+		return -EINVAL;
+	}
+
+	while (pos < end) {
+		int err;
+
+		err = rst_get_object(-1, pos, &pgb, ctx);
+		if (err)
+			return err;
+		dprintk_ctx("restoring SHM block: %08x-%08x\n",
+		       (__u32)pgb.cpt_start, (__u32)pgb.cpt_end);
+
+		switch (pgb.cpt_object) {
+			case CPT_OBJ_PAGES:
+				err = restore_shm_chunk(file, pos, &pgb, ctx);
+				if (err) {
+					eprintk_ctx("%s: restore_shm_chunk failed\n", __func__);
+					return err;
+				}
+				break;
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+			case CPT_OBJ_ITERPAGES:
+			case CPT_OBJ_ITERYOUNGPAGES:
+				err = rst_iter_chunk(file, pos, &pgb, ctx);
+				if (err)
+					return err;
+				break;
+#endif
+			default:
+				eprintk_ctx("unsupported page type: %d.\n",
+							pgb.cpt_object);
+				return -EINVAL;
+		}
+
+		pos += pgb.cpt_next;
+	}
+	return 0;
+}
+
+struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx)
+{
+	struct file *file;
+	int err;
+	loff_t dpos, epos;
+	union {
+		struct cpt_file_image		fi;
+		struct cpt_sysvshm_image	shmi;
+		struct cpt_inode_image 		ii;
+	} u;
+	const struct cred *curr_cred;
+
+	err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx);
+	if (err < 0)
+		goto err_out;
+	pos = u.fi.cpt_inode;
+	err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx);
+	if (err < 0)
+		goto err_out;
+	dpos = pos + u.ii.cpt_hdrlen;
+	epos = pos + u.ii.cpt_next;
+	err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx);
+	if (err < 0)
+		goto err_out;
+	dpos += u.shmi.cpt_next;
+
+	curr_cred = override_creds(get_exec_env()->init_cred);
+	file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id,
+				 u.shmi.cpt_segsz, u.shmi.cpt_mode);
+	revert_creds(curr_cred);
+	if (!IS_ERR(file)) {
+		err = fixup_shm(file, &u.shmi);
+		if (err != -EEXIST && dpos < epos) {
+			err = fixup_shm_data(file, dpos, epos, ctx);
+			if (err) {
+				eprintk_ctx("%s: fixup_shm_data failed: %d\n",
+						__func__, err);
+				goto err_put;
+			}
+		}
+	} else if (IS_ERR(file) && PTR_ERR(file) == -EEXIST) {
+		struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+		struct shmid_kernel *shp;
+
+		shp = shm_lock(ipc_ns, u.shmi.cpt_id);
+		BUG_ON(IS_ERR(shp));
+		get_file(shp->shm_file);
+		file = shp->shm_file;
+		shm_unlock(shp);
+	} else
+		eprintk_ctx("%s: sysvipc setup failed: %ld (key: %Ld)\n",
+				__func__, PTR_ERR(file), u.shmi.cpt_key);
+	return file;
+
+err_put:
+	fput(file);
+err_out:
+	return ERR_PTR(err);
+}
+
+struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx)
+{
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	struct file *file;
+	union {
+		struct cpt_file_image		fi;
+		struct cpt_inode_image		ii;
+		struct cpt_sysvshm_image	shmi;
+	} u;
+	struct shmid_kernel *shp;
+	struct shm_file_data *sfd;
+	struct path path;
+	mode_t f_mode;
+	loff_t pos;
+	int err;
+
+	pos = vmai->cpt_file;
+	file = rst_sysv_shm_itself(pos, ctx);
+	if (IS_ERR(file) && PTR_ERR(file) != -EEXIST) {
+		eprintk_ctx("%s: rst_sysv_shm_itself failed: %ld\n",
+				__func__, PTR_ERR(file));
+		return file;
+	}
+	fput(file);
+
+	err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx);
+	if (err < 0)
+		goto err_out;
+	pos = u.fi.cpt_inode;
+	err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx);
+	if (err < 0)
+		goto err_out;
+	err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx);
+	if (err < 0)
+		goto err_out;
+
+	shp = shm_lock(ipc_ns, u.shmi.cpt_id);
+	BUG_ON(IS_ERR(shp));
+	path = shp->shm_file->f_path;
+	path_get(&shp->shm_file->f_path);
+	shm_unlock(shp);
+
+	err = -ENOMEM;
+	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
+	if (!sfd)
+		goto out_put_dentry;
+
+	f_mode = 0;
+	if (vmai->cpt_flags & VM_READ)
+		f_mode |= FMODE_READ;
+	if (vmai->cpt_flags & VM_WRITE)
+		f_mode |= FMODE_WRITE;
+	if (vmai->cpt_flags & VM_EXEC)
+		f_mode |= FMODE_EXEC;
+
+	err = -ENOMEM;
+	file = alloc_file(&path, f_mode, &shm_file_operations);
+	if (!file)
+		goto out_free;
+
+	file->private_data = sfd;
+	file->f_mapping = shp->shm_file->f_mapping;
+	sfd->id = shp->shm_perm.id;
+	sfd->ns = get_ipc_ns(ipc_ns);
+	sfd->file = shp->shm_file;
+	sfd->vm_ops = NULL;
+
+	return file;
+
+out_free:
+	kfree(sfd);
+out_put_dentry:
+	path_put(&path);
+err_out:
+	return ERR_PTR(err);
+}
+
+static int attach_one_undo(int semid, struct sem_array *sma, void *arg)
+{
+	struct sem_undo *su = arg;
+	struct sem_undo_list *undo_list = current->sysvsem.undo_list;
+
+	if (semid != su->semid)
+		return 0;
+
+	spin_lock(&undo_list->lock);
+	su->ulp = undo_list;
+	list_add(&su->list_proc, &undo_list->list_proc);
+	list_add(&su->list_id, &sma->list_id);
+	spin_unlock(&undo_list->lock);
+
+	return 1;
+}
+
+static int attach_undo(struct sem_undo *su)
+{
+	return sysvipc_walk_sem(attach_one_undo, su);
+}
+
+static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx)
+{
+	int err;
+	struct sem_undo_list *undo_list;
+
+	if (current->sysvsem.undo_list) {
+		eprintk_ctx("Funny undo_list\n");
+		return 0;
+	}
+
+	undo_list = kzalloc(sizeof(struct sem_undo_list), GFP_KERNEL_UBC);
+	if (undo_list == NULL)
+		return -ENOMEM;
+
+	atomic_set(&undo_list->refcnt, 1);
+	spin_lock_init(&undo_list->lock);
+	INIT_LIST_HEAD(&undo_list->list_proc);
+	current->sysvsem.undo_list = undo_list;
+
+	if (sui->cpt_next > sui->cpt_hdrlen) {
+		loff_t offset = pos + sui->cpt_hdrlen;
+		do {
+			struct sem_undo *new;
+			struct cpt_sysvsem_undo_image spi;
+			err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx);
+			if (err)
+				goto out;
+			new = kzalloc(sizeof(struct sem_undo) +
+					sizeof(short)*spi.cpt_nsem,
+					GFP_KERNEL_UBC);
+			if (!new) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			new->semadj = (short *) &new[1];
+			new->semid = spi.cpt_id;
+			err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen);
+			if (err) {
+				kfree(new);
+				goto out;
+			}
+			err = attach_undo(new);
+			if (err <= 0) {
+				if (err == 0)
+					err = -ENOENT;
+				kfree(new);
+				goto out;
+			}
+			offset += spi.cpt_next;
+		} while (offset < pos + sui->cpt_next);
+	}
+	err = 0;
+
+out:
+	return err;
+}
+
+__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	__u32 flag = 0;
+
+#if 0
+	if (ti->cpt_sysvsem_undo == CPT_NULL ||
+	    lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo))
+		flag |= CLONE_SYSVSEM;
+#endif
+	return flag;
+}
+
+int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	int err;
+	struct sem_undo_list *f = current->sysvsem.undo_list;
+	cpt_object_t *obj;
+	struct cpt_object_hdr sui;
+
+	if (ti->cpt_sysvsem_undo == CPT_NULL) {
+		exit_sem(current);
+		return 0;
+	}
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx);
+	if (obj) {
+		if (obj->o_obj != f) {
+			exit_sem(current);
+			f = obj->o_obj;
+			atomic_inc(&f->refcnt);
+			current->sysvsem.undo_list = f;
+		}
+		return 0;
+	}
+
+	if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0)
+		goto out;
+
+	if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0)
+		goto out;
+
+	err = -ENOMEM;
+	obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx);
+	if (obj) {
+		err = 0;
+		cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx);
+	}
+
+	return err;
+
+out:
+	return err;
+}
+
+struct _sarg {
+	int semid;
+	struct cpt_sysvsem_image	*v;
+	__u32				*arr;
+};
+
+static int fixup_one_sem(int semid, struct sem_array *sma, void *arg)
+{
+	struct _sarg *warg = arg;
+
+	if (semid != warg->semid)
+		return 0;
+
+	sma->sem_perm.uid = warg->v->cpt_uid;
+	sma->sem_perm.gid = warg->v->cpt_gid;
+	sma->sem_perm.cuid = warg->v->cpt_cuid;
+	sma->sem_perm.cgid = warg->v->cpt_cgid;
+	sma->sem_perm.mode = warg->v->cpt_mode;
+	sma->sem_perm.seq = warg->v->cpt_seq;
+
+	sma->sem_ctime = warg->v->cpt_ctime;
+	sma->sem_otime = warg->v->cpt_otime;
+	{
+		int i;
+		struct {
+			__u32 semval;
+			__u32 sempid;
+		} *s = (void*)warg->arr;
+
+		for (i=0; i < sma->sem_nsems; i++) {
+			sma->sem_base[i].semval = s[i].semval;
+			sma->sem_base[i].sempid = s[i].sempid;
+		}
+	}
+	return 1;
+}
+
+static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr)
+{
+	struct _sarg warg;
+
+	warg.semid = semid;
+	warg.v = v;
+	warg.arr = arr;
+
+	return sysvipc_walk_sem(fixup_one_sem, &warg);
+}
+
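+/*
+ * A CPT_OBJ_SYSV_SEM record is followed by one {semval, sempid} pair
+ * (two __u32s, 8 bytes) per semaphore.  Recreate the set with
+ * sysvipc_setup_sem() and then push the saved values and permissions
+ * into it through the walk callback.
+ */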
+static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si,
+		       struct cpt_context *ctx)
+{
+	int err;
+	__u32 *arr;
+	int nsems = (si->cpt_next - si->cpt_hdrlen)/8;
+
+	arr = kmalloc(nsems*8, GFP_KERNEL);
+	if (!arr)
+		return -ENOMEM;
+
+	err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen);
+	if (err)
+		goto out;
+	err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode);
+	if (err < 0) {
+		eprintk_ctx("SEM 3\n");
+		goto out;
+	}
+	err = fixup_sem(si->cpt_id, si, arr);
+	if (err == 0)
+		err = -ESRCH;
+	if (err > 0)
+		err = 0;
+out:
+	kfree(arr);
+	return err;
+}
+
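+/*
+ * Walk the CPT_SECT_SYSV_SEM section: a section header followed by
+ * packed semaphore images, each advanced by its cpt_next field.
+ */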
+static int rst_sysv_sem(struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_sysvsem_image sbuf;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		int err;
+		err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx);
+		if (err)
+			return err;
+		err = restore_sem(sec, &sbuf, ctx);
+		if (err)
+			return err;
+		sec += sbuf.cpt_next;
+	}
+	return 0;
+}
+
+struct _marg {
+	int				msqid;
+	struct cpt_sysvmsg_image	*v;
+	struct msg_queue		*m;
+};
+
+static int fixup_one_msg(int msqid, struct msg_queue *msq, void *arg)
+{
+	struct _marg *warg = arg;
+
+	if (msqid != warg->msqid)
+		return 0;
+
+	msq->q_perm.uid = warg->v->cpt_uid;
+	msq->q_perm.gid = warg->v->cpt_gid;
+	msq->q_perm.cuid = warg->v->cpt_cuid;
+	msq->q_perm.cgid = warg->v->cpt_cgid;
+	msq->q_perm.mode = warg->v->cpt_mode;
+	msq->q_perm.seq = warg->v->cpt_seq;
+
+	msq->q_stime = warg->v->cpt_stime;
+	msq->q_rtime = warg->v->cpt_rtime;
+	msq->q_ctime = warg->v->cpt_ctime;
+	msq->q_lspid = warg->v->cpt_last_sender;
+	msq->q_lrpid = warg->v->cpt_last_receiver;
+	msq->q_qbytes = warg->v->cpt_qbytes;
+
+	warg->m = msq;
+	return 1;
+}
+
+struct _larg
+{
+	cpt_context_t * ctx;
+	loff_t		pos;
+};
+
+static int do_load_msg(void * dst, int len, int offset, void * data)
+{
+	struct _larg * arg = data;
+	return arg->ctx->pread(dst, len, arg->ctx, arg->pos + offset);
+}
+
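+/*
+ * Refill a restored message queue: for every CPT_OBJ_SYSVMSG_MSG record
+ * the payload is streamed from the dump file through sysv_msg_load()
+ * with do_load_msg() as the read callback, and the per-queue and
+ * per-namespace accounting is replayed by hand.
+ */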
+static int fixup_msg(int msqid, struct cpt_sysvmsg_image *v, loff_t pos,
+		     cpt_context_t * ctx)
+{
+	int err;
+	struct _marg warg;
+	loff_t endpos = pos + v->cpt_next;
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+
+	pos += v->cpt_hdrlen;
+
+	warg.msqid = msqid;
+	warg.v = v;
+
+	err = sysvipc_walk_msg(fixup_one_msg, &warg);
+	if (err <= 0)
+		return err;
+
+	while (pos < endpos) {
+		struct cpt_sysvmsg_msg_image mi;
+		struct msg_msg *m;
+		struct _larg data = {
+			.ctx = ctx
+		};
+
+		err = rst_get_object(CPT_OBJ_SYSVMSG_MSG, pos, &mi, ctx);
+		if (err)
+			return err;
+		data.pos = pos + mi.cpt_hdrlen;
+		m = sysv_msg_load(do_load_msg, mi.cpt_size, &data);
+		if (IS_ERR(m))
+			return PTR_ERR(m);
+		m->m_type = mi.cpt_type;
+		m->m_ts = mi.cpt_size;
+		list_add_tail(&m->m_list, &warg.m->q_messages);
+		warg.m->q_cbytes += m->m_ts;
+		warg.m->q_qnum++;
+		atomic_add(m->m_ts, &ns->msg_bytes);
+		atomic_inc(&ns->msg_hdrs);
+
+		pos += mi.cpt_next;
+	}
+	return 1;
+}
+
+static int restore_msg(loff_t pos, struct cpt_sysvmsg_image *si,
+		       struct cpt_context *ctx)
+{
+	int err;
+
+	err = sysvipc_setup_msg(si->cpt_key, si->cpt_id, si->cpt_mode);
+	if (err < 0) {
+		eprintk_ctx("MSG 3\n");
+		goto out;
+	}
+	err = fixup_msg(si->cpt_id, si, pos, ctx);
+	if (err == 0)
+		err = -ESRCH;
+	if (err > 0)
+		err = 0;
+out:
+	return err;
+}
+
+static int rst_sysv_msg(struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_SYSV_MSG];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_sysvmsg_image sbuf;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_SYSV_MSG || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		int err;
+		err = rst_get_object(CPT_OBJ_SYSVMSG, sec, &sbuf, ctx);
+		if (err)
+			return err;
+		err = restore_msg(sec, &sbuf, ctx);
+		if (err)
+			return err;
+		sec += sbuf.cpt_next;
+	}
+	return 0;
+}
+
+
+int rst_sysv_ipc(struct cpt_context *ctx)
+{
+	int err;
+
+	err = rst_sysv_sem(ctx);
+	if (!err)
+		err = rst_sysv_msg(ctx);
+
+	return err;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_tty.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_tty.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_tty.c	2015-01-21 12:02:48.234093367 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_tty.c	2015-01-21 12:02:51.088017603 +0300
@@ -0,0 +1,410 @@
+/*
+ *
+ *  kernel/cpt/rst_tty.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mount.h>
+#include <linux/tty.h>
+#include <linux/vmalloc.h>
+#include <linux/nsproxy.h>
+#include <asm/unistd.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_image.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_mm.h"
+#include "cpt_process.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+
+static int pty_setup(struct tty_struct *stty, loff_t pos,
+		     struct cpt_tty_image *pi, struct cpt_context *ctx)
+{
+	stty->pgrp = NULL;
+	stty->session = NULL;
+	stty->packet = pi->cpt_packet;
+	stty->stopped = pi->cpt_stopped;
+	stty->hw_stopped = pi->cpt_hw_stopped;
+	stty->flow_stopped = pi->cpt_flow_stopped;
+#define TTY_BEHAVIOR_FLAGS ((1<<TTY_EXCLUSIVE)|(1<<TTY_HW_COOK_OUT)| \
+				(1<<TTY_HW_COOK_IN)|(1<<TTY_PTY_LOCK))
+	stty->flags &= ~TTY_BEHAVIOR_FLAGS;
+	stty->flags |= pi->cpt_flags & TTY_BEHAVIOR_FLAGS;
+	stty->ctrl_status = pi->cpt_ctrl_status;
+	stty->winsize.ws_row = pi->cpt_ws_row;
+	stty->winsize.ws_col = pi->cpt_ws_col;
+	stty->winsize.ws_ypixel = pi->cpt_ws_prow;
+	stty->winsize.ws_xpixel = pi->cpt_ws_pcol;
+	stty->canon_column = pi->cpt_canon_column;
+	stty->column = pi->cpt_column;
+	stty->raw = pi->cpt_raw;
+	stty->real_raw = pi->cpt_real_raw;
+	stty->erasing = pi->cpt_erasing;
+	stty->lnext = pi->cpt_lnext;
+	stty->icanon = pi->cpt_icanon;
+	stty->closing = pi->cpt_closing;
+	stty->minimum_to_wake = pi->cpt_minimum_to_wake;
+
+	stty->termios->c_iflag = pi->cpt_c_iflag;
+	stty->termios->c_oflag = pi->cpt_c_oflag;
+	stty->termios->c_lflag = pi->cpt_c_lflag;
+	stty->termios->c_cflag = pi->cpt_c_cflag;
+	memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS);
+	memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags));
+
+	if (pi->cpt_next > pi->cpt_hdrlen) {
+		int err;
+		struct cpt_obj_bits b;
+		err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx);
+		if (err)
+			return err;
+		if (b.cpt_size == 0)
+			return 0;
+		err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen);
+		if (err)
+			return err;
+
+		spin_lock_irq(&stty->read_lock);
+		stty->read_tail = 0;
+		stty->read_cnt = b.cpt_size;
+		stty->read_head = b.cpt_size;
+		stty->canon_head = stty->read_tail + pi->cpt_canon_head;
+		stty->canon_data = pi->cpt_canon_data;
+		spin_unlock_irq(&stty->read_lock);
+	}
+
+	return 0;
+}
+
+/* Find the slave/master tty in the image when we already know the
+ * master/slave.  This could be optimized, of course. */
+static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_TTY];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_tty_image *pibuf;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return CPT_NULL;
+	if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h))
+		return CPT_NULL;
+	pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL);
+	if (pibuf == NULL) {
+		eprintk_ctx("cannot allocate buffer\n");
+		return CPT_NULL;
+	}
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) {
+			kfree(pibuf);
+			return CPT_NULL;
+		}
+		if (pibuf->cpt_index == pi->cpt_index &&
+		    !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) &&
+		    pos != sec &&
+		    ((pi->cpt_drv_flags & TTY_DRIVER_DEVPTS_MEM) == 0 ||
+		     ((pi->cpt_name[0] == 'v') == (pibuf->cpt_name[0] == 'v')))) {
+			pty_setup(stty, sec, pibuf, ctx);
+			kfree(pibuf);
+			return sec;
+		}
+		sec += pibuf->cpt_next;
+	}
+	kfree(pibuf);
+	return CPT_NULL;
+}
+
+static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master,
+			   struct cpt_context *ctx)
+{
+	int err;
+	struct iattr newattrs;
+	struct dentry *d = master->f_dentry;
+
+	newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE;
+	newattrs.ia_uid = ii->cpt_uid;
+	newattrs.ia_gid = ii->cpt_gid;
+	newattrs.ia_mode = ii->cpt_mode;
+
+	mutex_lock(&d->d_inode->i_mutex);
+	err = notify_change(d, &newattrs);
+	mutex_unlock(&d->d_inode->i_mutex);
+
+	return err;
+}
+
+/* NOTE: a "portable" but ugly trick.  To allocate /dev/pts/N, we keep
+ * opening /dev/ptmx until we get a pty with the desired index.
+ */
+
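+/*
+ * Masters that come back with the wrong index are parked on a
+ * page-sized array and released only after the search finishes, so the
+ * same index is never handed out twice; this bounds the search to
+ * PAGE_SIZE / sizeof(struct file *) attempts.
+ */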
+struct file *ptmx_open(int index, unsigned int flags)
+{
+	struct file *file;
+	struct file **stack = NULL;
+	int depth = 0;
+
+	for (;;) {
+		struct tty_struct *tty;
+
+		file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
+		if (IS_ERR(file))
+			break;
+		tty = file_tty(file);
+		if (tty->index == index)
+			break;
+
+		if (depth == PAGE_SIZE/sizeof(struct file *)) {
+			fput(file);
+			file = ERR_PTR(-EBUSY);
+			break;
+		}
+		if (stack == NULL) {
+			stack = (struct file **)__get_free_page(GFP_KERNEL);
+			if (!stack) {
+				fput(file);
+				file = ERR_PTR(-ENOMEM);
+				break;
+			}
+		}
+		stack[depth] = file;
+		depth++;
+	}
+	while (depth > 0) {
+		depth--;
+		fput(stack[depth]);
+	}
+	if (stack)
+		free_page((unsigned long)stack);
+	return file;
+}
+
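+/*
+ * Restore a tty file.  Three cases: the pair tty was already restored,
+ * so reopen it through the existing dentry; a regular terminal
+ * (/dev/ttyX, vtty, console), which is simply reopened without
+ * restoring state; or a pty pair, where master and slave are opened
+ * together and the saved termios/read-buffer state is applied to the
+ * side recorded in the image.
+ */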
+struct file * rst_open_tty(cpt_object_t *mntobj, char *name,
+			   struct cpt_file_image *fi, struct cpt_inode_image *ii,
+			   unsigned flags, struct cpt_context *ctx)
+{
+	int err;
+	cpt_object_t *obj;
+	struct file *master, *slave;
+	struct tty_struct *stty;
+	struct cpt_tty_image *pi;
+	static char *a = "pqrstuvwxyzabcde";
+	static char *b = "0123456789abcdef";
+	char pairname[16];
+	unsigned master_flags, slave_flags;
+
+	if (fi->cpt_priv == CPT_NULL)
+		return ERR_PTR(-EINVAL);
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx);
+	if (obj && obj->o_parent) {
+		dprintk_ctx("obtained pty as pair to existing\n");
+		master = obj->o_parent;
+		stty = file_tty(master);
+
+		if (stty->driver->subtype == PTY_TYPE_MASTER &&
+		    (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) {
+			wprintk_ctx("cloning ptmx\n");
+			get_file(master);
+			return master;
+		}
+
+		master = dentry_open(dget(master->f_dentry),
+				     mntget(master->f_vfsmnt), flags,
+				     current_cred());
+		if (!IS_ERR(master)) {
+			stty = file_tty(master);
+			if (stty->driver->subtype != PTY_TYPE_MASTER)
+				fixup_tty_attrs(ii, master, ctx);
+		}
+		return master;
+	}
+
+	pi = cpt_get_buf(ctx);
+	err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx);
+	if (err) {
+		cpt_release_buf(ctx);
+		return ERR_PTR(err);
+	}
+
+	if (MAJOR(ii->cpt_rdev) == TTY_MAJOR ||
+	    ii->cpt_rdev == MKDEV(TTYAUX_MAJOR, 1) ||
+	    (ii->cpt_rdev == MKDEV(TTYAUX_MAJOR, 0) &&
+	     !strncmp(pi->cpt_name, "vtty", 4))) {
+		if (mntobj && (mntobj->o_flags & CPT_VFSMOUNT_DELAYFS)) {
+			cpt_release_buf(ctx);
+			return ERR_PTR(-ENOTSUPP);
+		}
+		master = rst_open_file(mntobj, name, fi,
+				flags|O_NONBLOCK|O_NOCTTY, ctx);
+		if (IS_ERR(master)) {
+			eprintk_ctx("rst_open_tty: %s %Ld %ld\n",
+					name, (long long)fi->cpt_priv,
+					PTR_ERR(master));
+			cpt_release_buf(ctx);
+			return master;
+		}
+
+		stty = file_tty(master);
+		obj = cpt_object_add(CPT_OBJ_TTY, stty, ctx);
+		obj->o_parent = master;
+		cpt_obj_setpos(obj, fi->cpt_priv, ctx);
+
+		obj = cpt_object_add(CPT_OBJ_FILE, master, ctx);
+		cpt_obj_setpos(obj, CPT_NULL, ctx);
+		get_file(master);
+
+		/* Do not restore /dev/ttyX state */
+		cpt_release_buf(ctx);
+		return master;
+	}
+
+	master_flags = slave_flags = 0;
+	if (pi->cpt_drv_subtype == PTY_TYPE_MASTER)
+		master_flags = flags;
+	else
+		slave_flags = flags;
+
+	/*
+	 * Open pair master/slave.
+	 */
+	if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) {
+		master = ptmx_open(pi->cpt_index, master_flags);
+	} else {
+		sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]);
+		master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
+	}
+	if (IS_ERR(master)) {
+		eprintk_ctx("filp_open master: %Ld %ld\n", (long long)fi->cpt_priv, PTR_ERR(master));
+		cpt_release_buf(ctx);
+		return master;
+	}
+	stty = file_tty(master);
+	clear_bit(TTY_PTY_LOCK, &stty->flags);
+	if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM)
+		sprintf(pairname, "/dev/pts/%d", stty->index);
+	else
+		sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]);
+	slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
+	if (IS_ERR(slave)) {
+		eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave));
+		fput(master);
+		cpt_release_buf(ctx);
+		return slave;
+	}
+
+	if (pi->cpt_drv_subtype != PTY_TYPE_MASTER)
+		fixup_tty_attrs(ii, slave, ctx);
+
+	cpt_object_add(CPT_OBJ_TTY, file_tty(master), ctx);
+	cpt_object_add(CPT_OBJ_TTY, file_tty(slave), ctx);
+	cpt_object_add(CPT_OBJ_FILE, master, ctx);
+	cpt_object_add(CPT_OBJ_FILE, slave, ctx);
+
+	if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) {
+		loff_t pos;
+		obj = lookup_cpt_object(CPT_OBJ_TTY, file_tty(master), ctx);
+		obj->o_parent = master;
+		cpt_obj_setpos(obj, fi->cpt_priv, ctx);
+		pty_setup(stty, fi->cpt_priv, pi, ctx);
+
+		obj = lookup_cpt_object(CPT_OBJ_TTY, file_tty(slave), ctx);
+		obj->o_parent = slave;
+		pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx);
+		cpt_obj_setpos(obj, pos, ctx);
+
+		obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx);
+		cpt_obj_setpos(obj, CPT_NULL, ctx);
+		get_file(master);
+		cpt_release_buf(ctx);
+		return master;
+	} else {
+		loff_t pos;
+		obj = lookup_cpt_object(CPT_OBJ_TTY, file_tty(slave), ctx);
+		obj->o_parent = slave;
+		cpt_obj_setpos(obj, fi->cpt_priv, ctx);
+		pty_setup(stty->link, fi->cpt_priv, pi, ctx);
+
+		obj = lookup_cpt_object(CPT_OBJ_TTY, file_tty(master), ctx);
+		obj->o_parent = master;
+		pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx);
+		cpt_obj_setpos(obj, pos, ctx);
+
+		obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx);
+		cpt_obj_setpos(obj, CPT_NULL, ctx);
+		get_file(slave);
+		cpt_release_buf(ctx);
+		return slave;
+	}
+}
+
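+/*
+ * Second pass, run after all ttys are restored: reattach the pgrp and
+ * session pids recorded for each tty in the image.
+ */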
+int rst_tty_jobcontrol(struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_TTY];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		cpt_object_t *obj;
+		struct cpt_tty_image *pibuf = cpt_get_buf(ctx);
+
+		if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) {
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+
+		obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx);
+		if (obj) {
+			struct tty_struct *stty = obj->o_obj;
+			if ((int)pibuf->cpt_pgrp > 0) {
+				stty->pgrp = alloc_vpid_safe(pibuf->cpt_pgrp);
+				if (!stty->pgrp)
+					dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp);
+			} else if (pibuf->cpt_pgrp) {
+				stty->pgrp = alloc_pid(current->nsproxy->pid_ns,
+							0);
+				if (!stty->pgrp) {
+					eprintk_ctx("cannot allocate stray tty->pgr\n");
+					cpt_release_buf(ctx);
+					return -EINVAL;
+				}
+			}
+			if ((int)pibuf->cpt_session > 0) {
+				stty->session = alloc_vpid_safe(pibuf->cpt_session);
+				if (!stty->session)
+					dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session);
+			}
+		}
+		sec += pibuf->cpt_next;
+		cpt_release_buf(ctx);
+	}
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_ubc.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_ubc.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_ubc.c	2015-01-21 12:02:48.234093367 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_ubc.c	2015-01-21 12:02:50.398035919 +0300
@@ -0,0 +1,135 @@
+/*
+ *
+ *  kernel/cpt/rst_ubc.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <bc/beancounter.h>
+#include <asm/signal.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_ubc.h"
+
+struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx);
+	if (obj == NULL) {
+		eprintk("RST: unknown ub @%Ld\n", (long long)pos);
+		return get_beancounter(get_exec_ub());
+	}
+	return get_beancounter(obj->o_obj);
+}
+
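+/*
+ * CPT_NULL in a dumped barrier/limit means "unlimited" and maps back
+ * to UB_MAXVALUE; failcnt is merged with max_t() so that failures
+ * counted during the restore itself are not lost.
+ */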
+static void restore_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm,
+		int held)
+{
+	prm->barrier = (dmp->barrier == CPT_NULL ? UB_MAXVALUE : dmp->barrier);
+	prm->limit = (dmp->limit == CPT_NULL ? UB_MAXVALUE : dmp->limit);
+	if (held)
+		prm->held = dmp->held;
+	prm->maxheld = dmp->maxheld;
+	prm->minheld = dmp->minheld;
+	prm->failcnt = max_t(long, prm->failcnt, dmp->failcnt);
+}
+
+static int restore_one_bc(struct cpt_beancounter_image *v,
+		cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct user_beancounter *bc;
+	int resources, i;
+
+	if (v->cpt_parent != CPT_NULL) {
+		/*
+		 * No subbeancounters supported anymore. So just exit.
+		 * Sub-beancounters are not supported anymore, so just exit.
+		return 0;
+	} else {
+		bc = get_exec_ub();
+		get_beancounter(bc);
+	}
+	if (bc == NULL)
+		return -ENOMEM;
+	obj->o_obj = bc;
+
+	if (ctx->image_version < CPT_VERSION_18 &&
+			CPT_VERSION_MINOR(ctx->image_version) < 1)
+		return 0;
+
+	if (v->cpt_content == CPT_CONTENT_ARRAY)
+		resources = v->cpt_ub_resources;
+	else
+		resources = UB_RESOURCES_COMPAT;
+
+	if (resources > UB_RESOURCES)
+		return -EINVAL;
+
+	if (!(v->cpt_ub_flags & CPT_UB_NOSTORE)) {
+		int res;
+
+		res = ubstat_alloc_store(bc);
+		if (res)
+			return res;
+	}
+
+	for (i = 0; i < resources; i++) {
+		restore_one_bc_parm(v->cpt_parms + i * 2, ctx->saved_ubc + i, 0);
+		if (!(v->cpt_ub_flags & CPT_UB_NOSTORE))
+			restore_one_bc_parm(v->cpt_parms + i * 2 + 1,
+						bc->ub_store + i, 1);
+	}
+
+	return 0;
+}
+
+int rst_undump_ubc(struct cpt_context *ctx)
+{
+	loff_t start, end;
+	struct cpt_beancounter_image *v;
+	cpt_object_t *obj;
+	int err;
+
+	err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end);
+	if (err)
+		return err;
+
+	while (start < end) {
+		v = cpt_get_buf(ctx);
+		err = rst_get_object(CPT_OBJ_UBC, start, v, ctx);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+
+		obj = alloc_cpt_object(GFP_KERNEL, ctx);
+		cpt_obj_setpos(obj, start, ctx);
+		intern_cpt_object(CPT_OBJ_UBC, obj, ctx);
+
+		err = restore_one_bc(v, obj, ctx);
+
+		cpt_release_buf(ctx);
+		if (err)
+			return err;
+
+		start += v->cpt_next;
+	}
+
+	return 0;
+}
+
+void rst_finish_ubc(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_UBC)
+		put_beancounter(obj->o_obj);
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_undump.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_undump.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpt/rst_undump.c	2015-01-21 12:02:48.234093367 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpt/rst_undump.c	2015-01-21 12:02:54.169935791 +0300
@@ -0,0 +1,1162 @@
+/*
+ *
+ *  kernel/cpt/rst_undump.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fs_struct.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/mnt_namespace.h>
+#include <linux/posix-timers.h>
+#include <linux/personality.h>
+#include <linux/smp_lock.h>
+#include <linux/ve_proto.h>
+#include <linux/compat.h>
+#include <linux/vzcalluser.h>
+#include <linux/securebits.h>
+#include <bc/beancounter.h>
+#ifdef CONFIG_X86
+#include <asm/desc.h>
+#endif
+#include <asm/unistd.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+#include <linux/utsname.h>
+#include <linux/futex.h>
+#include <linux/shm.h>
+#include <linux/freezer.h>
+
+#include <linux/cpt_obj.h>
+#include <linux/cpt_context.h>
+#include "cpt_files.h"
+#include "cpt_mm.h"
+#include "cpt_process.h"
+#include "cpt_socket.h"
+#include "cpt_net.h"
+#include "cpt_ubc.h"
+#include "cpt_kernel.h"
+#include "cpt_syscalls.h"
+
+static int rst_utsname(cpt_context_t *ctx);
+
+
+struct thr_context {
+	struct completion init_complete;
+	struct completion task_done;
+	int error;
+	struct cpt_context *ctx;
+	cpt_object_t	*tobj;
+};
+
+static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx);
+
+static int vps_rst_veinfo(struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_veinfo_image *i;
+	struct ve_struct *ve;
+	struct timespec delta;
+	loff_t start, end;
+	struct ipc_namespace *ns;
+
+	err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end);
+	if (err)
+		goto out;
+
+	i = cpt_get_buf(ctx);
+	memset(i, 0, sizeof(*i));
+	err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx);
+	if (err)
+		goto out_rel;
+
+	ve = get_exec_env();
+	ns = ve->ve_ns->ipc_ns;
+
+	/* Damn. Fatal mistake, these two values are size_t! */
+	ns->shm_ctlall = i->shm_ctl_all ? : 0xFFFFFFFFU;
+	ns->shm_ctlmax = i->shm_ctl_max ? : 0xFFFFFFFFU;
+	ns->shm_ctlmni = i->shm_ctl_mni;
+
+	ns->msg_ctlmax = i->msg_ctl_max;
+	ns->msg_ctlmni = i->msg_ctl_mni;
+	ns->msg_ctlmnb = i->msg_ctl_mnb;
+
+	BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr));
+	ns->sem_ctls[0] = i->sem_ctl_arr[0];
+	ns->sem_ctls[1] = i->sem_ctl_arr[1];
+	ns->sem_ctls[2] = i->sem_ctl_arr[2];
+	ns->sem_ctls[3] = i->sem_ctl_arr[3];
+
+	cpt_timespec_import(&delta, i->start_timespec_delta);
+	_set_normalized_timespec(&ve->start_timespec,
+			ve->start_timespec.tv_sec - delta.tv_sec,
+			ve->start_timespec.tv_nsec - delta.tv_nsec);
+	ve->start_jiffies -= i->start_jiffies_delta;
+	// // FIXME: what???
+	// // ve->start_cycles -= (s64)i->start_jiffies_delta * cycles_per_jiffy;
+
+	if (i->real_start_timespec_delta)
+		cpt_timespec_import(&delta, i->real_start_timespec_delta);
+	_set_normalized_timespec(&ve->real_start_timespec,
+			ve->real_start_timespec.tv_sec - delta.tv_sec - ctx->delta_time.tv_sec,
+			ve->real_start_timespec.tv_nsec - delta.tv_nsec - ctx->delta_time.tv_nsec);
+
+#ifdef CONFIG_VZ_FAIRSCHED
+	sched_group_set_start_time(current, &ve->real_start_timespec);
+#endif
+
+	ctx->last_vpid = i->last_pid;
+	if (i->rnd_va_space)
+		ve->_randomize_va_space = i->rnd_va_space - 1;
+	if (i->vpid_max && i->vpid_max < PID_MAX_LIMIT)
+		ve->ve_ns->pid_ns->pid_max = i->vpid_max;
+
+	if (cpt_object_has(i, aio_max_nr))
+		ve->aio_max_nr = i->aio_max_nr;
+
+	err = 0;
+out_rel:
+	cpt_release_buf(ctx);
+out:
+	return err;
+}
+
+static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	int err;
+	struct env_create_param3 param;
+
+	do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time);
+	do_gettimespec(&ctx->delta_time);
+
+	_set_normalized_timespec(&ctx->delta_time,
+				 ctx->delta_time.tv_sec - ctx->start_time.tv_sec,
+				 ctx->delta_time.tv_nsec - ctx->start_time.tv_nsec);
+	ctx->delta_nsec = (s64)ctx->delta_time.tv_sec*NSEC_PER_SEC + ctx->delta_time.tv_nsec;
+	if (ctx->delta_nsec < 0) {
+		wprintk_ctx("Wall time is behind source by %Ld ns, "
+			    "time sensitive applications can misbehave\n", (long long)-ctx->delta_nsec);
+	}
+
+	_set_normalized_timespec(&ctx->cpt_monotonic_time,
+				 ctx->cpt_monotonic_time.tv_sec - ctx->delta_time.tv_sec,
+				 ctx->cpt_monotonic_time.tv_nsec - ctx->delta_time.tv_nsec);
+
+	memset(&param, 0, sizeof(param));
+	param.iptables_mask = ctx->iptables_mask;
+	param.feature_mask = ctx->features;
+
+	/* feature_mask is set as required - pretend we know everything */
+	param.known_features = (ctx->image_version < CPT_VERSION_18) ?
+		VE_FEATURES_OLD : ~(__u64)0;
+
+	err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK|VE_EXCLUSIVE, 2,
+			&param, sizeof(param));
+	if (err < 0)
+		eprintk_ctx("real_env_create: %d\n", err);
+
+	get_exec_env()->jiffies_fixup =
+		(ctx->delta_time.tv_sec < 0 ?
+		 0 : timespec_to_jiffies(&ctx->delta_time)) -
+		(unsigned long)(get_jiffies_64() - ctx->virt_jiffies64);
+	dprintk_ctx("JFixup %ld %Ld\n", get_exec_env()->jiffies_fixup,
+		    (long long)ctx->delta_nsec);
+	return err < 0 ? err : 0;
+}
+
+int set_mlock_creds(int cap)
+{
+	struct cred *cred;
+
+	cred = prepare_creds();
+	if (cred == NULL)
+		goto err_cred;
+
+	if (cap)
+		cap_raise(cred->cap_effective, CAP_IPC_LOCK);
+	else
+		cap_lower(cred->cap_effective, CAP_IPC_LOCK);
+
+	commit_creds(cred);
+	return 0;
+
+err_cred:
+	return -ENOMEM;
+}
+
+static int rst_creds(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	struct cred *cred;
+	struct user_struct *user;
+	struct group_info *gids;
+	int i;
+
+	cred = prepare_creds();
+	if (cred == NULL)
+		goto err_cred;
+
+	user = alloc_uid(get_exec_env()->user_ns, ti->cpt_user);
+	if (user == NULL)
+		goto err_uid;
+
+	gids = groups_alloc(ti->cpt_ngids);
+	if (gids == NULL)
+		goto err_gids;
+
+	free_uid(cred->user);
+	cred->user = user;
+
+	for (i=0; i<32; i++)
+		gids->small_block[i] = ti->cpt_gids[i];
+
+	put_group_info(cred->group_info);
+	cred->group_info = gids;
+
+	cred->uid = ti->cpt_uid;
+	cred->euid = ti->cpt_euid;
+	cred->suid = ti->cpt_suid;
+	cred->fsuid = ti->cpt_fsuid;
+	cred->gid = ti->cpt_gid;
+	cred->egid = ti->cpt_egid;
+	cred->sgid = ti->cpt_sgid;
+	cred->fsgid = ti->cpt_fsgid;
+
+	memcpy(&cred->cap_effective, &ti->cpt_ecap,
+			sizeof(cred->cap_effective));
+	memcpy(&cred->cap_inheritable, &ti->cpt_icap,
+			sizeof(cred->cap_inheritable));
+	memcpy(&cred->cap_permitted, &ti->cpt_pcap,
+			sizeof(cred->cap_permitted));
+	if (cpt_object_has(ti, cpt_bcap))
+		memcpy(&cred->cap_bset, &ti->cpt_bcap,
+				sizeof(cred->cap_bset));
+
+	if (ctx->image_version < CPT_VERSION_26)
+		cred->securebits = (ti->cpt_keepcap != 0) ?
+			issecure_mask(SECURE_KEEP_CAPS) : 0;
+	else
+		cred->securebits = ti->cpt_keepcap;
+
+	commit_creds(cred);
+	return 0;
+
+err_gids:
+	free_uid(user);
+err_uid:
+	abort_creds(cred);
+err_cred:
+	return -ENOMEM;
+}
+
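+/*
+ * Body of every restored task.  The freshly cloned thread parks itself
+ * in TASK_UNINTERRUPTIBLE until the parent has registered it, then
+ * restores its own state from the image.  The container init
+ * (cpt_pid == 1) additionally creates the VE and restores all
+ * container-wide state before the per-task work starts.
+ */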
+static int hook(void *arg)
+{
+	struct thr_context *thr_ctx = arg;
+	struct cpt_context *ctx;
+	cpt_object_t *tobj;
+	struct cpt_task_image *ti;
+	int err = 0;
+	int exiting = 0;
+
+	current->state = TASK_UNINTERRUPTIBLE;
+	complete(&thr_ctx->init_complete);
+	schedule();
+
+	ctx = thr_ctx->ctx;
+	tobj = thr_ctx->tobj;
+	ti = tobj->o_image;
+
+	current->fs->umask = 0;
+
+	if (ti->cpt_pid == 1) {
+		if ((err = rst_creds(ti, ctx)) != 0) {
+			eprintk_ctx("rst_creds: %d\n", err);
+			goto out;
+		}
+
+		err = vps_rst_reparent_root(tobj, ctx);
+
+		if (err) {
+			rst_report_error(err, ctx);
+			goto out;
+		}
+
+		set_bit(VE_RESTORE, &get_exec_env()->flags);
+
+		memcpy(&get_exec_env()->ve_cap_bset, &ti->cpt_ecap, sizeof(kernel_cap_t));
+
+		if (ctx->statusfile) {
+			fput(ctx->statusfile);
+			ctx->statusfile = NULL;
+		}
+
+		if (ctx->lockfile) {
+			char b;
+			mm_segment_t oldfs;
+			err = -EINVAL;
+
+			oldfs = get_fs(); set_fs(KERNEL_DS);
+			if (ctx->lockfile->f_op && ctx->lockfile->f_op->read)
+				err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos);
+
+			if (err > 0) /* bytes appeared */
+				err = (ctx->lockfile_new ? 0 : -ECANCELED);
+			else if (err == 0) /* pipe was closed */
+				err = (ctx->lockfile_new ? -ECANCELED : 0);
+
+			set_fs(oldfs);
+			fput(ctx->lockfile);
+			ctx->lockfile = NULL;
+		}
+
+		if (err) {
+			eprintk_ctx("CPT: lock fd is closed incorrectly: %d\n", err);
+			goto out;
+		}
+		err = vps_rst_veinfo(ctx);
+		if (err) {
+			eprintk_ctx("rst_veinfo: %d\n", err);
+			goto out;
+		}
+
+		err = rst_utsname(ctx);
+		if (err) {
+			eprintk_ctx("rst_utsname: %d\n", err);
+			goto out;
+		}
+
+		err = rst_cgroups(ctx);
+		if (err) {
+			eprintk_ctx("rst_cgroups: %d\n", err);
+			goto out;
+		}
+
+		err = rst_root_namespace(ctx);
+		if (err) {
+			eprintk_ctx("rst_namespace: %d\n", err);
+			goto out;
+		}
+
+		if ((err = rst_restore_net(ctx)) != 0) {
+			eprintk_ctx("rst_restore_net: %d\n", err);
+			goto out;
+		}
+
+		err = rst_sockets(ctx);
+		if (err) {
+			eprintk_ctx("rst_sockets: %d\n", err);
+			goto out;
+		}
+		err = rst_sysv_ipc(ctx);
+		if (err) {
+			eprintk_ctx("rst_sysv_ipc: %d\n", err);
+			goto out;
+		}
+
+		/*
+		 * Raise the RLIMIT_NOFILE maximum to the OS maximum for the undump stage
+		 */
+		current->signal->rlim[RLIMIT_NOFILE].rlim_max = sysctl_nr_open;
+	}
+
+	if ((err = rst_creds(ti, ctx)) != 0) {
+		eprintk_ctx("rst_creds: %d\n", err);
+		goto out;
+	}
+
+	if ((err = rst_mm_complete(ti, ctx)) != 0) {
+		eprintk_ctx("rst_mm: %d\n", err);
+		goto out;
+	}
+
+	if ((err = rst_task_namespace(ti, ctx)) != 0) {
+		eprintk_ctx("rst_task_namespace: %d\n", err);
+		goto out;
+	}
+
+	if ((err = rst_fs_complete(ti, ctx)) != 0) {
+		eprintk_ctx("rst_fs: %d\n", err);
+		goto out;
+	}
+
+	if ((err = rst_semundo_complete(ti, ctx)) != 0) {
+		eprintk_ctx("rst_semundo: %d\n", err);
+		goto out;
+	}
+
+	if ((err = rst_signal_complete(ti, &exiting, ctx)) != 0) {
+		eprintk_ctx("rst_signal: %d\n", err);
+		goto out;
+	}
+
+#ifdef CONFIG_X86_64
+	if (ctx->image_arch == CPT_OS_ARCH_I386)
+		ti->cpt_personality |= PER_LINUX32;
+#endif
+	if (ti->cpt_personality != current->personality)
+		__set_personality(ti->cpt_personality);
+
+	current->set_child_tid = NULL;
+	current->clear_child_tid = NULL;
+	current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV);
+	current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV);
+	current->exit_code = ti->cpt_exit_code;
+	current->pdeath_signal = ti->cpt_pdeath_signal;
+
+	if (ti->cpt_restart.fn != CPT_RBL_0) {
+		if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP
+		    || ti->cpt_restart.fn == CPT_RBL_COMPAT_NANOSLEEP
+		    ) {
+			struct restart_block *rb;
+			ktime_t e;
+
+			e.tv64 = 0;
+
+			rb = &task_thread_info(current)->restart_block;
+			rb->fn = hrtimer_nanosleep_restart;
+#ifdef CONFIG_COMPAT
+			if (ti->cpt_restart.fn == CPT_RBL_COMPAT_NANOSLEEP)
+				rb->fn = compat_nanosleep_restart;
+			rb->nanosleep.compat_rmtp = NULL;
+#endif
+			if (ctx->image_version >= CPT_VERSION_32) {
+				rb->nanosleep.index = ti->cpt_restart.arg0;
+				rb->nanosleep.rmtp = (void *)ti->cpt_restart.arg1;
+#ifdef CONFIG_COMPAT
+				rb->nanosleep.compat_rmtp = (void *)ti->cpt_restart.arg2;
+#endif
+				e = ktime_add_ns(e, ti->cpt_restart.arg3);
+			} else if (ctx->image_version >= CPT_VERSION_18_3) {
+				rb->nanosleep.index = ti->cpt_restart.arg3;
+				rb->nanosleep.rmtp = (void *)ti->cpt_restart.arg2;
+				e = ktime_add_ns(e, ti->cpt_restart.arg0);
+			}
+			if (e.tv64 < 0)
+				e.tv64 = TICK_NSEC;
+			e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+			if (e.tv64 < 0)
+				e.tv64 = 0;
+			rb->nanosleep.expires = e.tv64;
+		} else if (ti->cpt_restart.fn == CPT_RBL_POLL) {
+			struct restart_block *rb;
+			ktime_t e;
+			struct timespec ts;
+
+			e.tv64 = ti->cpt_restart.arg2;
+			e = ktime_sub(e, timespec_to_ktime(ctx->delta_time));
+			ts = ns_to_timespec(ktime_to_ns(e));
+
+			rb = &task_thread_info(current)->restart_block;
+			rb->fn = do_restart_poll;
+
+			rb->poll.ufds = (void *)ti->cpt_restart.arg0;
+			rb->poll.nfds = ti->cpt_restart.arg1 & 0xFFFFFFFF;
+			rb->poll.has_timeout = ti->cpt_restart.arg1 >> 32;
+			rb->poll.tv_sec = ts.tv_sec;
+			rb->poll.tv_nsec = ts.tv_nsec;
+		} else if (ti->cpt_restart.fn == CPT_RBL_FUTEX_WAIT) {
+			struct restart_block *rb;
+			ktime_t e;
+
+			e.tv64 = 0;
+			e = ktime_add_ns(e, ti->cpt_restart.arg2);
+			e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+			if (e.tv64 < 0)
+				e.tv64 = 0;
+
+			rb = &task_thread_info(current)->restart_block;
+			rb->fn = futex_wait_restart;
+			rb->futex.uaddr = (void *)(unsigned long)ti->cpt_restart.arg0;
+			rb->futex.val   = ti->cpt_restart.arg1;
+			rb->futex.time  = e.tv64;
+			rb->futex.flags = ti->cpt_restart.arg3;
+		} else if (ti->cpt_restart.fn == CPT_RBL_POSIX_CPU_NSLEEP) {
+			struct restart_block *rb;
+
+			rb = &task_thread_info(current)->restart_block;
+			rb->fn = posix_cpu_nsleep_restart;
+			rb->arg0 = ti->cpt_restart.arg0;
+			rb->arg1 = ti->cpt_restart.arg1;
+			rb->arg2 = ti->cpt_restart.arg2;
+			rb->arg3 = ti->cpt_restart.arg3;
+		} else
+			eprintk_ctx("unknown restart block (%d)\n", (int)ti->cpt_restart.fn);
+	}
+
+	err = rst_clone_children(tobj, ctx);
+	if (err) {
+		eprintk_ctx("rst_clone_children\n");
+		goto out;
+	}
+
+	if ((err = rst_files(ti, ctx)) != 0) {
+		eprintk_ctx("rst_files: %d\n", err);
+		if (err == -EMFILE) {
+			eprintk(KERN_ERR "rst_files: too many open files. "
+					"Try to increase the global open "
+					"files limit: "
+					"/proc/sys/fs/nr_open\n");
+		}
+		goto out;
+	}
+
+	if ((err = restore_signal_struct(ti, &exiting, ctx)) != 0) {
+		eprintk_ctx("rst_signal: %d\n", err);
+		goto out;
+	}
+
+	if ((err = rst_posix_timers(ti, ctx)) != 0) {
+		eprintk_ctx("rst_posix_timers: %d\n", err);
+		goto out;
+	}
+
+	if (exiting)
+		current->signal->flags |= SIGNAL_GROUP_EXIT;
+
+	if (ti->cpt_pid == 1) {
+		if ((err = rst_process_linkage(ctx)) != 0) {
+			eprintk_ctx("rst_process_linkage: %d\n", err);
+			goto out;
+		}
+		if ((err = rst_do_filejobs(ctx)) != 0) {
+			eprintk_ctx("rst_do_filejobs: %d\n", err);
+			goto out;
+		}
+		if ((err = rst_eventpoll(ctx)) != 0) {
+			eprintk_ctx("rst_eventpoll: %d\n", err);
+			goto out;
+		}
+#ifdef CONFIG_INOTIFY_USER
+		if ((err = rst_inotify(ctx)) != 0) {
+			eprintk_ctx("rst_inotify: %d\n", err);
+			goto out;
+		}
+#endif
+		if ((err = rst_sockets_complete(ctx)) != 0) {
+			eprintk_ctx("rst_sockets_complete: %d\n", err);
+			goto out_sock;
+		}
+		if ((err = rst_stray_files(ctx)) != 0) {
+			eprintk_ctx("rst_stray_files: %d\n", err);
+			goto out_sock;
+		}
+		if ((err = rst_posix_locks(ctx)) != 0) {
+			eprintk_ctx("rst_posix_locks: %d\n", err);
+			goto out_sock;
+		}
+		if ((err = rst_tty_jobcontrol(ctx)) != 0) {
+			eprintk_ctx("rst_tty_jobcontrol: %d\n", err);
+			goto out_sock;
+		}
+		if ((err = rst_restore_fs(ctx)) != 0) {
+			eprintk_ctx("rst_restore_fs: %d\n", err);
+			goto out_sock;
+		}
+		if ((err = rst_init_delayfs_daemon(ctx)) != 0) {
+			eprintk_ctx("rst_init_delayfs_daemon: %d\n", err);
+			goto out_sock;
+		}
+		if ((err = rst_cgroup_task(ctx)) != 0) {
+			eprintk_ctx("rst_cgroup_task: %d\n", err);
+			goto out_sock;
+		}
+		if (ctx->last_vpid)
+			get_exec_env()->ve_ns->pid_ns->last_pid =
+				ctx->last_vpid;
+	}
+
+out:
+	if (err && ti->cpt_pid == 1)
+		rst_rollback_sockets(ctx);
+out_sock:
+	if (err && ti->cpt_pid == 1)
+		rst_put_delayed_sockets(ctx);
+
+	thr_ctx->error = err;
+	complete(&thr_ctx->task_done);
+
+	if (ti->cpt_pid == 1)
+		rst_cgroup_close(ctx);
+
+	if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+		current->flags |= PF_EXIT_RESTART;
+		do_exit(ti->cpt_exit_code);
+	} else if (!err) {
+		/* prevent accounting of that task to load */
+		current->flags |= PF_FREEZING;
+
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule();
+
+		/* Remove the accounting blocker */
+		current->flags &= ~PF_FREEZING;
+	}
+
+	dprintk_ctx("leaked through %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
+
+	module_put(THIS_MODULE);
+	complete_and_exit(NULL, 0);
+	return 0;
+}
+
+#if 0
+static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	struct task_beancounter *tbc;
+
+	tbc = task_bc(current);
+
+	if (ti->cpt_mm_ub != CPT_NULL) {
+		put_beancounter(tbc->exec_ub);
+		tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx);
+	}
+}
+#endif
+
+static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx,
+		struct thr_context *thr_ctx)
+{
+	struct task_struct *tsk;
+	int pid;
+
+	thr_ctx->ctx = ctx;
+	thr_ctx->error = 0;
+	init_completion(&thr_ctx->init_complete);
+	init_completion(&thr_ctx->task_done);
+#if 0
+	set_task_ubs(obj->o_image, ctx);
+#endif
+
+	pid = local_kernel_thread(hook, thr_ctx, 0, 0);
+	if (pid < 0)
+		return pid;
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_vpid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+	if (tsk == NULL)
+		return -ESRCH;
+	cpt_obj_setobj(obj, tsk, ctx);
+	thr_ctx->tobj = obj;
+	return 0;
+}
+
+static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct task_struct *tsk = obj->o_obj;
+	struct cpt_task_image *ti = obj->o_image;
+
+	memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm));
+	rst_mm_basic(obj, ti, ctx);
+	return 0;
+}
+
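+/*
+ * Clone one child (or thread) of the current task with clone flags
+ * recovered from the image, hand it to hook(), and wait until it has
+ * finished restoring itself before moving on.
+ */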
+static int make_baby(cpt_object_t *cobj,
+		     struct cpt_task_image *pi,
+		     struct cpt_context *ctx)
+{
+	unsigned long flags;
+	struct cpt_task_image *ci = cobj->o_image;
+	struct thr_context thr_ctx;
+	struct task_struct *tsk;
+	pid_t pid;
+	struct fs_struct *tfs = NULL;
+
+	flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx)
+		| rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx);
+	if (ci->cpt_rppid != pi->cpt_pid) {
+		flags |= CLONE_THREAD|CLONE_PARENT;
+		if (ci->cpt_signal != pi->cpt_signal ||
+		    !(flags&CLONE_SIGHAND) ||
+		    (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) {
+			eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n",
+			       (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid,
+			       (long long)ci->cpt_signal, (long long)pi->cpt_signal, flags
+			       );
+			return -EINVAL;
+		}
+	}
+
+	thr_ctx.ctx = ctx;
+	thr_ctx.error = 0;
+	init_completion(&thr_ctx.init_complete);
+	init_completion(&thr_ctx.task_done);
+	thr_ctx.tobj = cobj;
+
+#if 0
+	set_task_ubs(ci, ctx);
+#endif
+
+	if (current->fs == NULL) {
+		tfs = get_exec_env_init()->fs;
+		if (tfs == NULL)
+			return -EINVAL;
+		spin_lock(&tfs->lock);
+		tfs->users++;
+		spin_unlock(&tfs->lock);
+		current->fs = tfs;
+	}
+	pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid);
+	if (tfs) {
+		current->fs = NULL;
+		spin_lock(&tfs->lock);
+		tfs->users--;
+		WARN_ON(tfs->users == 0);
+		spin_unlock(&tfs->lock);
+	}
+	if (pid < 0)
+		return pid;
+
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_vpid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+	if (tsk == NULL)
+		return -ESRCH;
+	cpt_obj_setobj(cobj, tsk, ctx);
+	thr_ctx.tobj = cobj;
+	wait_for_completion(&thr_ctx.init_complete);
+	wait_task_inactive(cobj->o_obj, 0);
+	rst_basic_init_task(cobj, ctx);
+
+	/* clone() increases group_stop_count if it was not zero and
+	 * CLONE_THREAD was requested. Undo that here.
+	 */
+	if (current->signal->group_stop_count && (flags & CLONE_THREAD)) {
+		BUG_ON(tsk->signal != current->signal);
+		current->signal->group_stop_count--;
+	}
+
+	wake_up_process(tsk);
+	wait_for_completion(&thr_ctx.task_done);
+	wait_task_inactive(tsk, 0);
+
+	if (thr_ctx.error) {
+		put_task_struct(tsk);
+		cpt_obj_setobj(cobj, NULL, ctx);
+	}
+
+	return thr_ctx.error;
+}
+
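+/*
+ * Recreate the process tree below @obj: direct children are tasks whose
+ * recorded parent pid matches and which lead their own thread group,
+ * while extra threads of a leader are matched through cpt_leader.
+ */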
+static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	int err = 0;
+	struct cpt_task_image *ti = obj->o_image;
+	cpt_object_t *cobj;
+
+	for_each_object(cobj, CPT_OBJ_TASK) {
+		struct cpt_task_image *ci = cobj->o_image;
+		if (cobj == obj)
+			continue;
+		if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) ||
+		    (ci->cpt_leader == ti->cpt_pid &&
+		     ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) {
+			err = make_baby(cobj, ti, ctx);
+			if (err) {
+				eprintk_ctx("make_baby: %d\n", err);
+				return err;
+			}
+		}
+	}
+	return 0;
+}
+
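+/*
+ * Load every CPT_OBJ_TASK image from the tasks section up front; each
+ * object keeps the full image (header plus payload) in obj->o_image
+ * for the later restore stages.
+ */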
+static int read_task_images(struct cpt_context *ctx)
+{
+	int err;
+	loff_t start, end;
+
+	err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end);
+	if (err)
+		return err;
+
+	while (start < end) {
+		cpt_object_t *obj;
+		struct cpt_task_image *ti = cpt_get_buf(ctx);
+
+		err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+#if 0
+		if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) {
+			eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid);
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+#endif
+		obj = alloc_cpt_object(GFP_KERNEL, ctx);
+		cpt_obj_setpos(obj, start, ctx);
+		intern_cpt_object(CPT_OBJ_TASK, obj, ctx);
+		obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL);
+		if (obj->o_image == NULL) {
+			cpt_release_buf(ctx);
+			return -ENOMEM;
+		}
+		memcpy(obj->o_image, ti, ti->cpt_hdrlen);
+		err = ctx->pread(obj->o_image + ti->cpt_hdrlen,
+				 ti->cpt_next - ti->cpt_hdrlen, ctx,
+				 start + ti->cpt_hdrlen);
+		cpt_release_buf(ctx);
+		if (err)
+			return err;
+		start += ti->cpt_next;
+	}
+	return 0;
+}
+
+
+static int vps_rst_restore_tree(struct cpt_context *ctx)
+{
+	int err;
+	cpt_object_t *obj;
+	struct thr_context thr_ctx_root;
+
+	err = read_task_images(ctx);
+	if (err)
+		return err;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		err = create_root_task(obj, ctx, &thr_ctx_root);
+		if (err)
+					eprintk_ctx("cannot allocate stray tty->pgrp\n");
+
+		wait_for_completion(&thr_ctx_root.init_complete);
+		wait_task_inactive(obj->o_obj, 0);
+		rst_basic_init_task(obj, ctx);
+
+		wake_up_process(obj->o_obj);
+		wait_for_completion(&thr_ctx_root.task_done);
+		wait_task_inactive(obj->o_obj, 0);
+		err = thr_ctx_root.error;
+		break;
+	}
+
+	return err;
+}
+
+#ifndef CONFIG_IA64
+int rst_read_vdso(struct cpt_context *ctx)
+{
+	int err;
+	loff_t start, end;
+	struct cpt_page_block *pgb;
+
+	ctx->vdso = NULL;
+	err = rst_get_section(CPT_SECT_VSYSCALL, ctx, &start, &end);
+	if (err)
+		return err;
+	if (start == CPT_NULL)
+		return 0;
+	if (end < start + sizeof(*pgb) + PAGE_SIZE)
+		return -EINVAL;
+
+	pgb = cpt_get_buf(ctx);
+	err = rst_get_object(CPT_OBJ_VSYSCALL, start, pgb, ctx);
+	if (err) {
+		goto err_buf;
+	}
+	ctx->vdso = (char*)__get_free_page(GFP_KERNEL);
+	if (ctx->vdso == NULL) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+	err = ctx->pread(ctx->vdso, PAGE_SIZE, ctx, start + sizeof(*pgb));
+	if (err)
+		goto err_page;
+	if (!memcmp(ctx->vdso, vsyscall_addr, PAGE_SIZE)) {
+		free_page((unsigned long)ctx->vdso);
+		ctx->vdso = NULL;
+	}
+
+	cpt_release_buf(ctx);
+	return 0;
+err_page:
+	free_page((unsigned long)ctx->vdso);
+	ctx->vdso = NULL;
+err_buf:
+	cpt_release_buf(ctx);
+	return err;
+}
+#endif
+
+int vps_rst_undump(struct cpt_context *ctx)
+{
+	int err;
+	unsigned long umask;
+
+	set_ubc_unlimited(ctx, get_exec_ub());
+
+	err = rst_open_dumpfile(ctx);
+	if (err)
+		return err;
+
+	if (ctx->tasks64) {
+#if defined(CONFIG_IA64)
+		if (ctx->image_arch != CPT_OS_ARCH_IA64)
+#elif defined(CONFIG_X86_64)
+		if (ctx->image_arch != CPT_OS_ARCH_EMT64)
+#else
+		if (1)
+#endif
+		{
+			eprintk_ctx("Cannot restore a 64-bit container on this architecture\n");
+			return -EINVAL;
+		}
+	}
+
+	umask = current->fs->umask;
+	current->fs->umask = 0;
+
+#ifndef CONFIG_IA64
+	err = rst_read_vdso(ctx);
+#endif
+
+	if (err == 0)
+		err = rst_open_pram(ctx);
+
+	if (err == 0)
+		err = rst_undump_ubc(ctx);
+
+	if (err == 0)
+		err = vps_rst_restore_tree(ctx);
+
+	if (err == 0)
+		err = rst_restore_process(ctx);
+
+	current->fs->umask = umask;
+
+	return err;
+}
+
+static int rst_unlock_ve(struct cpt_context *ctx)
+{
+	struct ve_struct *env;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (!env)
+		return -ESRCH;
+	down_write(&env->op_sem);
+	env->is_locked = 0;
+	up_write(&env->op_sem);
+	clear_bit(VE_RESTORE, &env->flags);
+	put_ve(env);
+	return 0;
+}
+
+int recalc_sigpending_tsk(struct task_struct *t);
+
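+/*
+ * Final stage of a successful restore: drop the extra file references
+ * held during undump, restore beancounter limits, and wake every task
+ * up in the state recorded in the image.
+ */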
+int rst_resume(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	int err = 0;
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *bc;
+#endif
+
+	rst_freeze_delayfs(ctx);
+
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+
+		fput(file);
+	}
+
+#ifdef CONFIG_BEANCOUNTERS
+	bc = get_beancounter_byuid(ctx->ve_id, 0);
+	BUG_ON(!bc);
+	restore_ubc_limits(ctx, bc);
+	put_beancounter_longterm(bc);
+#endif
+
+	rst_resume_network(ctx);
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		struct cpt_task_image *ti = obj->o_image;
+
+		if (!tsk)
+			continue;
+
+		if (ti->cpt_state == TASK_UNINTERRUPTIBLE) {
+			unsigned long flags;
+
+			dprintk_ctx("task %d/%d(%s) is started\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
+
+			/* Weird... If a signal is sent to a stopped task,
+			 * nobody calls recalc_sigpending(). We have to do
+			 * this by hand after wake_up_process().
+			 * If we did this before, a signal could arrive before
+			 * wake_up_process() and stall.
+			 */
+			if (lock_task_sighand(tsk, &flags)) {
+				if (!signal_pending(tsk))
+					recalc_sigpending_tsk(tsk);
+				unlock_task_sighand(tsk, &flags);
+			}
+
+			wake_up_process(tsk);
+		} else {
+			if (ti->cpt_state == TASK_STOPPED ||
+			    ti->cpt_state == TASK_TRACED) {
+				set_task_state(tsk, ti->cpt_state);
+			}
+		}
+		put_task_struct(tsk);
+	}
+
+	rst_unlock_ve(ctx);
+
+	if (ctx->dctx && ctx->dctx->dfs_daemon)
+		wake_up_process(ctx->dctx->dfs_daemon);
+
+	rst_finish_ubc(ctx);
+	rst_finish_vfsmount_ref(ctx);
+	cpt_object_destroy(ctx);
+
+	return err;
+}
+
+int rst_kill(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	struct ve_struct *env;
+	pid_t init_pid = 0;
+	int err = 0;
+
+	if (!ctx->ve_id)
+		return -EINVAL;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (!env)
+		return -ESRCH;
+
+	if (current->ve_task_info.owner_env == env) {
+		wprintk_ctx("attempt to kill ve from inside, escaping...\n");
+		err = -EPERM;
+		goto out;
+	}
+
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+
+		fput(file);
+	}
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		struct cpt_task_image *ti = obj->o_image;
+
+		if (tsk == NULL)
+			continue;
+
+		if (ti->cpt_pid == 1)
+			init_pid = task_pgrp_vnr(tsk);
+
+		if (tsk->exit_state == 0) {
+			send_sig(SIGKILL, tsk, 1);
+			if (!thaw_process(tsk))
+				wake_up_process(tsk);
+		}
+
+		put_task_struct(tsk);
+	}
+
+	if (ctx->dctx && ctx->dctx->dfs_daemon) {
+		send_sig(SIGKILL, ctx->dctx->dfs_daemon, 1);
+		wake_up_process(ctx->dctx->dfs_daemon);
+	}
+
+	rst_finish_ubc(ctx);
+	rst_finish_vfsmount_ref(ctx);
+	cpt_object_destroy(ctx);
+
+	if (ctx->ctx_state == CPT_CTX_UNDUMPING && init_pid) {
+		int ret;
+
+		ret = sc_waitx(init_pid, __WALL, NULL);
+		if (ret < 0)
+			eprintk_ctx("wait init (%d) failed: %d\n", init_pid, ret);
+	}
+
+	wait_event_interruptible(env->ve_list_wait, list_empty(&env->ve_list));
+
+out:
+	put_ve(env);
+	return err;
+}
+
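+/*
+ * The UTS section stores up to three CPT_OBJ_NAME strings, in order:
+ * nodename, domainname and (optionally) release.
+ */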
+static int rst_utsname(cpt_context_t *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_UTSNAME];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_object_hdr o;
+	struct ve_struct *ve;
+	struct uts_namespace *ns;
+	int i;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	ve = get_exec_env();
+	ns = ve->ve_ns->uts_ns;
+
+	i = 0;
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		int len;
+		char *ptr;
+		err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx);
+		if (err)
+			return err;
+		len = o.cpt_next - o.cpt_hdrlen;
+		if (len > __NEW_UTS_LEN + 1)
+			return -ENAMETOOLONG;
+		switch (i) {
+		case 0:
+			ptr = ns->name.nodename; break;
+		case 1:
+			ptr = ns->name.domainname; break;
+		case 2:
+			ptr = ns->name.release; break;
+		default:
+			return -EINVAL;
+		}
+		err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen);
+		if (err)
+			return err;
+		i++;
+		sec += o.cpt_next;
+	}
+
+	return 0;
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpu.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpu.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpu.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpu.c	2015-01-21 12:02:43.793211267 +0300
@@ -175,7 +175,7 @@ static inline void check_for_tasks(int c
 	struct task_struct *p;
 
 	write_lock_irq(&tasklist_lock);
-	for_each_process(p) {
+	for_each_process_all(p) {
 		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
 		    (!cputime_eq(p->utime, cputime_zero) ||
 		     !cputime_eq(p->stime, cputime_zero)))
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cpuset.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpuset.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cpuset.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cpuset.c	2015-01-21 12:02:54.615923952 +0300
@@ -53,6 +53,7 @@
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
+#include <linux/mmgang.h>
 
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
@@ -60,6 +61,15 @@
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 
+#include <bc/beancounter.h>
+
+/*
+ * If unset, the memory allocator will use cpuset constraints only as a
+ * hint, permitting allocation anywhere if there is no free memory in
+ * the allowed nodes.
+ */
+int sysctl_strict_mem_cpuset = 0;
+
 /*
  * Tracks how many cpusets are currently defined in system.
  * When there is only one cpuset (the root cpuset) we can
@@ -87,6 +97,9 @@ struct cpuset {
 	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
 	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
 
+	cpumask_var_t cpuset_cpus_allowed;
+	nodemask_t cpuset_mems_allowed;
+
 	struct cpuset *parent;		/* my parent */
 
 	struct fmeter fmeter;		/* memory_pressure filter */
@@ -115,6 +128,22 @@ static inline struct cpuset *task_cs(str
 			    struct cpuset, css);
 }
 
+static struct user_beancounter *get_cpuset_beancounter(struct cpuset *cs)
+{
+	struct user_beancounter *ub = NULL;
+	struct dentry *dentry;
+	unsigned long id;
+	char *endp;
+
+	dentry = cs->css.cgroup->dentry;
+	if (dentry) {
+		id = simple_strtoul(dentry->d_name.name, &endp, 10);
+		if (!*endp && id <= INT_MAX)
+			ub = get_beancounter_byuid(id, 0);
+	}
+	return ub;
+}
+
 /* bits in struct cpuset flags field */
 typedef enum {
 	CS_CPU_EXCLUSIVE,
@@ -218,6 +247,9 @@ static char cpuset_name[CPUSET_NAME_LEN]
 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
 static DEFINE_SPINLOCK(cpuset_buffer_lock);
 
+/* Protected by cgroup_lock */
+static cpumask_var_t cpus_attach;
+
 /*
  * This is ugly, but preserves the userspace API for existing cpuset
  * users. If someone tries to mount the "cpuset" filesystem, we
@@ -349,6 +381,13 @@ static struct cpuset *alloc_trial_cpuset
 	}
 	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 
+	if (!alloc_cpumask_var(&trial->cpuset_cpus_allowed, GFP_KERNEL)) {
+		free_cpumask_var(trial->cpus_allowed);
+		kfree(trial);
+		return NULL;
+	}
+	cpumask_copy(trial->cpuset_cpus_allowed, cs->cpuset_cpus_allowed);
+
 	return trial;
 }
 
@@ -358,6 +397,7 @@ static struct cpuset *alloc_trial_cpuset
  */
 static void free_trial_cpuset(struct cpuset *trial)
 {
+	free_cpumask_var(trial->cpuset_cpus_allowed);
 	free_cpumask_var(trial->cpus_allowed);
 	kfree(trial);
 }
@@ -419,14 +459,6 @@ static int validate_change(const struct 
 			return -EINVAL;
 	}
 
-	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
-	if (cgroup_task_count(cur->css.cgroup)) {
-		if (cpumask_empty(trial->cpus_allowed) ||
-		    nodes_empty(trial->mems_allowed)) {
-			return -ENOSPC;
-		}
-	}
-
 	return 0;
 }
 
@@ -791,8 +823,7 @@ void rebuild_sched_domains(void)
 static int cpuset_test_cpumask(struct task_struct *tsk,
 			       struct cgroup_scanner *scan)
 {
-	return !cpumask_equal(&tsk->cpus_allowed,
-			(cgroup_cs(scan->cg))->cpus_allowed);
+	return !cpumask_equal(&tsk->cpus_allowed, cpus_attach);
 }
 
 /**
@@ -809,7 +840,7 @@ static int cpuset_test_cpumask(struct ta
 static void cpuset_change_cpumask(struct task_struct *tsk,
 				  struct cgroup_scanner *scan)
 {
-	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
+	set_cpus_allowed_ptr(tsk, cpus_attach);
 }
 
 /**
@@ -829,6 +860,7 @@ static void update_tasks_cpumask(struct 
 {
 	struct cgroup_scanner scan;
 
+	guarantee_online_cpus(cs, cpus_attach);
 	scan.cg = cs->css.cgroup;
 	scan.test_task = cpuset_test_cpumask;
 	scan.process_task = cpuset_change_cpumask;
@@ -839,11 +871,13 @@ static void update_tasks_cpumask(struct 
 /**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
- * @buf: buffer of cpu numbers written to this cpuset
+ * @cpus_allowed: new cpu mask
  */
-static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
-			  const char *buf)
+static int update_cpumask(struct cpuset *cs,
+		const struct cpumask *cpus_allowed, int update_allowed)
 {
+	const struct cpumask *cpuset_cpus_allowed = cpu_active_mask;
+	struct cpuset *trialcs;
 	struct ptr_heap heap;
 	int retval;
 	int is_load_balanced;
@@ -852,38 +886,36 @@ static int update_cpumask(struct cpuset 
 	if (cs == &top_cpuset)
 		return -EACCES;
 
-	/*
-	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
-	 * Since cpulist_parse() fails on an empty mask, we special case
-	 * that parsing.  The validate_change() call ensures that cpusets
-	 * with tasks have cpus.
-	 */
-	if (!*buf) {
-		cpumask_clear(trialcs->cpus_allowed);
-	} else {
-		retval = cpulist_parse(buf, trialcs->cpus_allowed);
-		if (retval < 0)
-			return retval;
+	if (!cpumask_empty(cs->cpuset_cpus_allowed) && !update_allowed)
+		cpuset_cpus_allowed = cs->cpuset_cpus_allowed;
+
+	if (!cpumask_subset(cpus_allowed, cpuset_cpus_allowed))
+		return -EINVAL;
+
+	trialcs = alloc_trial_cpuset(cs);
+	if (!trialcs)
+		return -ENOMEM;
+
+	cpumask_copy(trialcs->cpus_allowed, cpus_allowed);
 
-		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
-			return -EINVAL;
-	}
 	retval = validate_change(cs, trialcs);
 	if (retval < 0)
-		return retval;
+		goto done;
 
 	/* Nothing to do if the cpus didn't change */
 	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
-		return 0;
+		goto done;
 
 	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
 	if (retval)
-		return retval;
+		goto done;
 
 	is_load_balanced = is_sched_load_balance(trialcs);
 
 	mutex_lock(&callback_mutex);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+	if (update_allowed)
+		cpumask_copy(cs->cpuset_cpus_allowed, cs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
 	/*
@@ -896,7 +928,10 @@ static int update_cpumask(struct cpuset 
 
 	if (is_load_balanced)
 		async_rebuild_sched_domains();
-	return 0;
+
+done:
+	free_trial_cpuset(trialcs);
+	return retval;
 }
 
 /*
@@ -1022,9 +1057,9 @@ static void cpuset_change_nodemask(struc
 
 	migrate = is_memory_migrate(cs);
 
-	mpol_rebind_mm(mm, &cs->mems_allowed);
+	mpol_rebind_mm(mm, &newmems);
 	if (migrate)
-		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+		cpuset_migrate_mm(mm, oldmem, &newmems);
 	mmput(mm);
 }
 
@@ -1082,12 +1117,15 @@ static void update_tasks_nodemask(struct
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
  */
-static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
-			   const char *buf)
+static int update_nodemask(struct cpuset *cs,
+		const nodemask_t *mems_allowed, int update_allowed)
 {
+	const nodemask_t *cpuset_mems_allowed = &node_states[N_HIGH_MEMORY];
+	struct cpuset *trialcs;
 	nodemask_t oldmem;
 	int retval;
 	struct ptr_heap heap;
+	struct user_beancounter *ub;
 
 	/*
 	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1096,24 +1134,19 @@ static int update_nodemask(struct cpuset
 	if (cs == &top_cpuset)
 		return -EACCES;
 
-	/*
-	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
-	 * Since nodelist_parse() fails on an empty mask, we special case
-	 * that parsing.  The validate_change() call ensures that cpusets
-	 * with tasks have memory.
-	 */
-	if (!*buf) {
-		nodes_clear(trialcs->mems_allowed);
-	} else {
-		retval = nodelist_parse(buf, trialcs->mems_allowed);
-		if (retval < 0)
-			goto done;
+	if (!nodes_empty(cs->cpuset_mems_allowed) && !update_allowed)
+		cpuset_mems_allowed = &cs->cpuset_mems_allowed;
 
-		if (!nodes_subset(trialcs->mems_allowed,
-				node_states[N_HIGH_MEMORY]))
-			return -EINVAL;
-	}
-	oldmem = cs->mems_allowed;
+	if (!nodes_subset(*mems_allowed, *cpuset_mems_allowed))
+		return -EINVAL;
+
+	trialcs = alloc_trial_cpuset(cs);
+	if (!trialcs)
+		return -ENOMEM;
+
+	trialcs->mems_allowed = *mems_allowed;
+
+	guarantee_online_mems(cs, &oldmem);
 	if (nodes_equal(oldmem, trialcs->mems_allowed)) {
 		retval = 0;		/* Too easy - nothing to do */
 		goto done;
@@ -1128,12 +1161,22 @@ static int update_nodemask(struct cpuset
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs->mems_allowed;
+	if (update_allowed)
+		cs->cpuset_mems_allowed = cs->mems_allowed;
 	mutex_unlock(&callback_mutex);
 
 	update_tasks_nodemask(cs, &oldmem, &heap);
 
 	heap_free(&heap);
+
+	ub = get_cpuset_beancounter(cs);
+	if (ub) {
+		guarantee_online_mems(cs, &trialcs->mems_allowed);
+		set_gang_limits(get_ub_gs(ub), NULL, &trialcs->mems_allowed);
+		put_beancounter_longterm(ub);
+	}
 done:
+	free_trial_cpuset(trialcs);
 	return retval;
 }
 
@@ -1357,11 +1400,6 @@ static int fmeter_getrate(struct fmeter 
 static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 			     struct task_struct *tsk)
 {
-	struct cpuset *cs = cgroup_cs(cont);
-
-	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
-		return -ENOSPC;
-
 	/*
 	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
 	 * cannot change their cpu affinity and isolating such threads by their
@@ -1443,12 +1481,31 @@ static void cpuset_attach(struct cgroup_
 	}
 }
 
+/*
+ * cgroup_set_[cpumask|nodemask] - set a cgroup's cpu/mem affinity mask.
+ *
+ * Call with cgroup_mutex held.
+ */
+
+int cgroup_set_cpumask(struct cgroup *cgrp, const struct cpumask *cpus_allowed)
+{
+	return update_cpumask(cgroup_cs(cgrp), cpus_allowed, 1);
+}
+
+int cgroup_set_nodemask(struct cgroup *cgrp, const nodemask_t *nodes_allowed)
+{
+	return update_nodemask(cgroup_cs(cgrp), nodes_allowed, 1);
+}
+
 /* The various types of files and directories in a cpuset file system */
 
 typedef enum {
 	FILE_MEMORY_MIGRATE,
 	FILE_CPULIST,
 	FILE_MEMLIST,
+	FILE_MEM_MIGRATION_PENDING,
+	FILE_CPUS_ALLOWED,
+	FILE_MEMS_ALLOWED,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_MEM_HARDWALL,
@@ -1526,42 +1583,112 @@ static int cpuset_write_s64(struct cgrou
 	return retval;
 }
 
-/*
- * Common handling for a write to a "cpus" or "mems" file.
- */
-static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
+static int cpuset_write_cpumask(struct cgroup *cgrp, struct cftype *cft,
 				const char *buf)
 {
-	int retval = 0;
-	struct cpuset *cs = cgroup_cs(cgrp);
-	struct cpuset *trialcs;
+	cpuset_filetype_t type = cft->private;
+	int retval;
+	cpumask_var_t cpus_allowed;
 
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+		return -ENOMEM;
 
-	trialcs = alloc_trial_cpuset(cs);
-	if (!trialcs) {
-		retval = -ENOMEM;
-		goto out;
+	/*
+	 * Since cpulist_parse() fails on an empty mask, we special case
+	 * that parsing: an empty buffer yields an empty cpus_allowed.
+	 */
+	if (!*buf) {
+		cpumask_clear(cpus_allowed);
+	} else {
+		retval = cpulist_parse(buf, cpus_allowed);
+		if (retval < 0)
+			goto done;
 	}
 
-	switch (cft->private) {
-	case FILE_CPULIST:
-		retval = update_cpumask(cs, trialcs, buf);
-		break;
-	case FILE_MEMLIST:
-		retval = update_nodemask(cs, trialcs, buf);
-		break;
-	default:
-		retval = -EINVAL;
-		break;
+	if (cgroup_lock_live_group(cgrp)) {
+		retval = update_cpumask(cgroup_cs(cgrp), cpus_allowed,
+					type == FILE_CPUS_ALLOWED);
+		cgroup_unlock();
+	} else {
+		retval = -ENODEV;
 	}
 
-	free_trial_cpuset(trialcs);
-out:
-	cgroup_unlock();
+done:
+	free_cpumask_var(cpus_allowed);
+	return retval;
+}
+
+static int cpuset_write_nodemask(struct cgroup *cgrp, struct cftype *cft,
+				 const char *buf)
+{
+	cpuset_filetype_t type = cft->private;
+	int retval;
+	nodemask_t mems_allowed;
+
+	/*
+	 * Since nodelist_parse() fails on an empty mask, we special case
+	 * that parsing: an empty buffer yields an empty mems_allowed.
+	 */
+	if (!*buf) {
+		nodes_clear(mems_allowed);
+	} else {
+		retval = nodelist_parse(buf, mems_allowed);
+		if (retval < 0)
+			goto done;
+	}
+
+	if (cgroup_lock_live_group(cgrp)) {
+		retval = update_nodemask(cgroup_cs(cgrp), &mems_allowed,
+					 type == FILE_MEMS_ALLOWED);
+		cgroup_unlock();
+	} else {
+		retval = -ENODEV;
+	}
+
+done:
+	return retval;
+}
+
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+static int cpuset_write_mem_migration_pending(struct cgroup *cgrp,
+		struct cftype *cft, const char *buf)
+{
+	int retval;
+	nodemask_t pending;
+	struct cpuset *cs;
+	struct user_beancounter *ub;
+
+	if (!*buf) {
+		nodes_clear(pending);
+	} else {
+		retval = nodelist_parse(buf, pending);
+		if (retval < 0)
+			goto done;
+	}
+
+	if (cgroup_lock_live_group(cgrp)) {
+		cs = cgroup_cs(cgrp);
+		ub = get_cpuset_beancounter(cs);
+		if (ub) {
+			cancel_gangs_migration(get_ub_gs(ub));
+			schedule_gangs_migration(get_ub_gs(ub),
+					&pending, &cs->mems_allowed);
+			put_beancounter_longterm(ub);
+		}
+		cgroup_unlock();
+		retval = 0;
+	} else {
+		retval = -ENODEV;
+	}
+
+done:
 	return retval;
 }
+#endif
 
 /*
  * These ascii lists should be read in a single call, by using a user
@@ -1597,6 +1724,22 @@ static int cpuset_sprintf_memlist(char *
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
 
+static int cpuset_sprintf_mem_migration_pending(char *page, struct cpuset *cs)
+{
+	nodemask_t mask;
+	struct user_beancounter *ub;
+
+	ub = get_cpuset_beancounter(cs);
+	if (ub) {
+		gangs_migration_pending(get_ub_gs(ub), &mask);
+		put_beancounter_longterm(ub);
+	} else {
+		nodes_clear(mask);
+	}
+
+	return nodelist_scnprintf(page, PAGE_SIZE, mask);
+}
+
 static ssize_t cpuset_common_file_read(struct cgroup *cont,
 				       struct cftype *cft,
 				       struct file *file,
@@ -1621,6 +1764,15 @@ static ssize_t cpuset_common_file_read(s
 	case FILE_MEMLIST:
 		s += cpuset_sprintf_memlist(s, cs);
 		break;
+	case FILE_MEM_MIGRATION_PENDING:
+		s += cpuset_sprintf_mem_migration_pending(s, cs);
+		break;
+	case FILE_CPUS_ALLOWED:
+		s += cpulist_scnprintf(s, PAGE_SIZE, cs->cpuset_cpus_allowed);
+		break;
+	case FILE_MEMS_ALLOWED:
+		s += nodelist_scnprintf(s, PAGE_SIZE, cs->cpuset_mems_allowed);
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1688,7 +1840,7 @@ static struct cftype files[] = {
 	{
 		.name = "cpus",
 		.read = cpuset_common_file_read,
-		.write_string = cpuset_write_resmask,
+		.write_string = cpuset_write_cpumask,
 		.max_write_len = (100U + 6 * NR_CPUS),
 		.private = FILE_CPULIST,
 	},
@@ -1696,11 +1848,37 @@ static struct cftype files[] = {
 	{
 		.name = "mems",
 		.read = cpuset_common_file_read,
-		.write_string = cpuset_write_resmask,
+		.write_string = cpuset_write_nodemask,
 		.max_write_len = (100U + 6 * MAX_NUMNODES),
 		.private = FILE_MEMLIST,
 	},
 
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+	{
+		.name = "mem_migration_pending",
+		.read = cpuset_common_file_read,
+		.write_string = cpuset_write_mem_migration_pending,
+		.max_write_len = (100U + 6 * MAX_NUMNODES),
+		.private = FILE_MEM_MIGRATION_PENDING,
+	},
+#endif
+
+	{
+		.name = "cpus_allowed",
+		.read = cpuset_common_file_read,
+		.write_string = cpuset_write_cpumask,
+		.max_write_len = (100U + 6 * NR_CPUS),
+		.private = FILE_CPUS_ALLOWED,
+	},
+
+	{
+		.name = "mems_allowed",
+		.read = cpuset_common_file_read,
+		.write_string = cpuset_write_nodemask,
+		.max_write_len = (100U + 6 * MAX_NUMNODES),
+		.private = FILE_MEMS_ALLOWED,
+	},
+
 	{
 		.name = "cpu_exclusive",
 		.read_u64 = cpuset_read_u64,
@@ -1848,6 +2026,11 @@ static struct cgroup_subsys_state *cpuse
 		kfree(cs);
 		return ERR_PTR(-ENOMEM);
 	}
+	if (!alloc_cpumask_var(&cs->cpuset_cpus_allowed, GFP_KERNEL)) {
+		free_cpumask_var(cs->cpus_allowed);
+		kfree(cs);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	cs->flags = 0;
 	if (is_spread_page(parent))
@@ -1857,6 +2040,8 @@ static struct cgroup_subsys_state *cpuse
 	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
+	cpumask_clear(cs->cpuset_cpus_allowed);
+	nodes_clear(cs->cpuset_mems_allowed);
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
@@ -1879,6 +2064,7 @@ static void cpuset_destroy(struct cgroup
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
 	number_of_cpusets--;
+	free_cpumask_var(cs->cpuset_cpus_allowed);
 	free_cpumask_var(cs->cpus_allowed);
 	kfree(cs);
 }
@@ -1910,9 +2096,13 @@ int __init cpuset_init(void)
 
 	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
 		BUG();
+	if (!alloc_cpumask_var(&top_cpuset.cpuset_cpus_allowed, GFP_KERNEL))
+		BUG();
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
+	cpumask_clear(top_cpuset.cpuset_cpus_allowed);
+	nodes_clear(top_cpuset.cpuset_mems_allowed);
 
 	fmeter_init(&top_cpuset.fmeter);
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
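
A note on the cpuset changes above: "cpus" and "mems" keep their old meaning,
while the new "cpus_allowed"/"mems_allowed" files store an outer limit.
update_cpumask()/update_nodemask() with update_allowed == 0 reject any mask
that is not a subset of the stored limit, and a write to one of the *_allowed
files updates both the limit and the working mask. A minimal host-side sketch
follows; the cgroup mount point, the "cpuset." file-name prefix, and the
container id 101 are assumptions, not part of the patch:

	#include <stdio.h>

	static int cpuset_write(const char *cg, const char *file, const char *val)
	{
		char path[256];
		FILE *f;

		snprintf(path, sizeof(path), "/cgroup/cpuset/%s/%s", cg, file);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%s\n", val);
		return fclose(f);
	}

	int main(void)
	{
		/* host side: set the outer limit to CPUs 0-3 (updates "cpus" too) */
		cpuset_write("101", "cpuset.cpus_allowed", "0-3");
		/* later writes to "cpus" must stay a subset of that limit */
		cpuset_write("101", "cpuset.cpus", "0-1");
		return 0;
	}
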
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/cred.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cred.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/cred.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/cred.c	2015-01-21 12:02:48.883076139 +0300
@@ -59,6 +59,7 @@ struct cred init_cred = {
 	.tgcred			= &init_tgcred,
 #endif
 };
+EXPORT_SYMBOL_GPL(init_cred);
 
 static inline void set_cred_subscribers(struct cred *cred, int n)
 {
@@ -368,66 +369,6 @@ struct cred *prepare_exec_creds(void)
 }
 
 /*
- * prepare new credentials for the usermode helper dispatcher
- */
-struct cred *prepare_usermodehelper_creds(void)
-{
-#ifdef CONFIG_KEYS
-	struct thread_group_cred *tgcred = NULL;
-#endif
-	struct cred *new;
-
-#ifdef CONFIG_KEYS
-	tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
-	if (!tgcred)
-		return NULL;
-#endif
-
-	new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
-	if (!new)
-		goto free_tgcred;
-
-	kdebug("prepare_usermodehelper_creds() alloc %p", new);
-
-	memcpy(new, &init_cred, sizeof(struct cred));
-
-	atomic_set(&new->usage, 1);
-	set_cred_subscribers(new, 0);
-	get_group_info(new->group_info);
-	get_uid(new->user);
-
-#ifdef CONFIG_KEYS
-	new->thread_keyring = NULL;
-	new->request_key_auth = NULL;
-	new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
-
-	atomic_set(&tgcred->usage, 1);
-	spin_lock_init(&tgcred->lock);
-	new->tgcred = tgcred;
-#endif
-
-#ifdef CONFIG_SECURITY
-	new->security = NULL;
-#endif
-	if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
-		goto error;
-	validate_creds(new);
-
-	BUG_ON(atomic_read(&new->usage) != 1);
-	return new;
-
-error:
-	put_cred(new);
-	return NULL;
-
-free_tgcred:
-#ifdef CONFIG_KEYS
-	kfree(tgcred);
-#endif
-	return NULL;
-}
-
-/*
  * Copy credentials for the new process created by fork()
  *
  * We share if we can, but under some circumstances we have to generate a new
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/events/core.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/events/core.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/events/core.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/events/core.c	2015-01-21 12:02:58.430822691 +0300
@@ -148,7 +148,7 @@ static struct srcu_struct pmus_srcu;
 
 /*
  * perf event paranoia level:
- *  -1 - not paranoid at all
+ *  -1 - not paranoid at all, including containers
  *   0 - disallow raw tracepoint access for unpriv
  *   1 - disallow cpu events for unpriv
  *   2 - disallow kernel profiling for unpriv
@@ -6469,6 +6469,9 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EACCES;
 	}
 
+	if (perf_paranoid_container() && !ve_is_super(get_exec_env()))
+		return -EACCES;
+
 	if (attr.freq) {
 		if (attr.sample_freq > sysctl_perf_event_sample_rate)
 			return -EINVAL;
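
The hunk above gates perf_event_open() inside containers through
perf_paranoid_container(), whose definition is not part of this excerpt.
Judging by the updated comment (only level -1 opens perf up to containers),
a plausible sketch of the helper is:

	/* assumed shape; the real helper lives in a header elsewhere in the patch */
	static inline int perf_paranoid_container(void)
	{
		return sysctl_perf_event_paranoid > -1;
	}
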
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/exit.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/exit.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/exit.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/exit.c	2015-01-21 12:02:57.971834874 +0300
@@ -48,16 +48,20 @@
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
+#include <linux/ve_proto.h>
 #include <trace/events/sched.h>
 #include <linux/oom.h>
 
+#include <bc/misc.h>
+#include <bc/oom_kill.h>
+
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 #include "cred-internals.h"
 
-static void exit_mm(struct task_struct * tsk);
+void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p)
 {
@@ -68,6 +72,10 @@ static void __unhash_process(struct task
 		detach_pid(p, PIDTYPE_SID);
 
 		list_del_rcu(&p->tasks);
+#ifdef CONFIG_VE
+		list_del_rcu(&p->ve_task_info.vetask_list);
+		list_del(&p->ve_task_info.aux_list);
+#endif
 		__get_cpu_var(process_counts)--;
 	}
 	list_del_rcu(&p->thread_group);
@@ -172,7 +180,7 @@ static void delayed_put_task_struct(stru
 void release_task(struct task_struct * p)
 {
 	struct task_struct *leader;
-	int zap_leader;
+	int zap_leader, zap_ve;
 repeat:
 	tracehook_prepare_release_task(p);
 	/* don't need to get the RCU readlock here - the process is dead and
@@ -184,6 +192,8 @@ repeat:
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);
+	nr_zombie--;
+	atomic_inc(&nr_dead);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -191,6 +201,7 @@ repeat:
 	 * group leader's parent process. (if it wants notification.)
 	 */
 	zap_leader = 0;
+	zap_ve = 0;
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
 		BUG_ON(task_detached(leader));
@@ -213,8 +224,13 @@ repeat:
 			leader->exit_state = EXIT_DEAD;
 	}
 
+	if (--p->ve_task_info.owner_env->pcounter == 0)
+		zap_ve = 1;
 	write_unlock_irq(&tasklist_lock);
 	release_thread(p);
+	ub_task_uncharge(p->task_bc.task_ub);
+	if (zap_ve)
+		ve_cleanup_schedule(p->ve_task_info.owner_env);
 	call_rcu(&p->rcu, delayed_put_task_struct);
 
 	p = leader;
@@ -339,11 +355,18 @@ kill_orphaned_pgrp(struct task_struct *t
  */
 static void reparent_to_kthreadd(void)
 {
+	struct task_struct *parent;
 	write_lock_irq(&tasklist_lock);
 
 	ptrace_unlink(current);
 	/* Reparent to init */
-	current->real_parent = current->parent = kthreadd_task;
+	if (kthreadd_task == NULL)
+		/* reparent CT's kthreadd */
+		parent = get_ve0()->_kthreadd_task;
+	else
+		parent = kthreadd_task;
+
+	current->real_parent = current->parent = parent;
 	list_move_tail(&current->sibling, &current->real_parent->children);
 
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -357,8 +380,8 @@ static void reparent_to_kthreadd(void)
 	memcpy(current->signal->rlim, init_task.signal->rlim,
 	       sizeof(current->signal->rlim));
 
-	atomic_inc(&init_cred.usage);
-	commit_creds(&init_cred);
+	atomic_inc(&get_exec_env()->init_cred->usage);
+	commit_creds(get_exec_env()->init_cred);
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -428,7 +451,11 @@ void daemonize(const char *name, ...)
 {
 	va_list args;
 	sigset_t blocked;
-
+#ifdef CONFIG_VE
+	struct nsproxy *root_nsproxy = get_exec_env()->ve_ns;
+#else
+	struct nsproxy *root_nsproxy = &init_nsproxy;
+#endif
 	va_start(args, name);
 	vsnprintf(current->comm, sizeof(current->comm), name, args);
 	va_end(args);
@@ -445,9 +472,9 @@ void daemonize(const char *name, ...)
 	 */
 	current->flags |= (PF_NOFREEZE | PF_KTHREAD);
 
-	if (current->nsproxy != &init_nsproxy) {
-		get_nsproxy(&init_nsproxy);
-		switch_task_namespaces(current, &init_nsproxy);
+	if (current->nsproxy != root_nsproxy) {
+		get_nsproxy(root_nsproxy);
+		switch_task_namespaces(current, root_nsproxy);
 	}
 	set_special_pids(&init_struct_pid);
 	proc_clear_tty(current);
@@ -514,6 +541,7 @@ struct files_struct *get_files_struct(st
 
 	return files;
 }
+EXPORT_SYMBOL_GPL(get_files_struct);
 
 void put_files_struct(struct files_struct *files)
 {
@@ -533,6 +561,7 @@ void put_files_struct(struct files_struc
 		free_fdtable(fdt);
 	}
 }
+EXPORT_SYMBOL_GPL(put_files_struct);
 
 void reset_files_struct(struct files_struct *files)
 {
@@ -575,6 +604,7 @@ mm_need_new_owner(struct mm_struct *mm, 
 		return 0;
 	return 1;
 }
+EXPORT_SYMBOL_GPL(put_fs_struct);
 
 void mm_update_next_owner(struct mm_struct *mm)
 {
@@ -605,10 +635,10 @@ retry:
 	 * Search through everything else. We should not get
 	 * here often
 	 */
-	do_each_thread(g, c) {
+	do_each_thread_all(g, c) {
 		if (c->mm == mm)
 			goto assign_new_owner;
-	} while_each_thread(g, c);
+	} while_each_thread_all(g, c);
 
 	read_unlock(&tasklist_lock);
 	/*
@@ -647,7 +677,7 @@ assign_new_owner:
  * Turn us into a lazy TLB process if we
  * aren't already..
  */
-static void exit_mm(struct task_struct * tsk)
+void exit_mm(struct task_struct * tsk)
 {
 	struct mm_struct *mm = tsk->mm;
 	struct core_state *core_state;
@@ -655,6 +685,7 @@ static void exit_mm(struct task_struct *
 	mm_release(tsk, mm);
 	if (!mm)
 		return;
+
 	/*
 	 * Serialize with any possible pending coredump.
 	 * We must hold mmap_sem around checking core_state
@@ -695,12 +726,11 @@ static void exit_mm(struct task_struct *
 	enter_lazy_tlb(mm, current);
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
-	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-		atomic_dec(&mm->oom_disable_count);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
 }
+EXPORT_SYMBOL(exit_mm);
 
 /*
  * When we die, we re-parent all our children.
@@ -715,7 +745,7 @@ static struct task_struct *find_new_reap
 	struct task_struct *thread;
 
 	thread = father;
-	while_each_thread(father, thread) {
+	while_each_thread_ve(father, thread) {
 		if (thread->flags & PF_EXITING)
 			continue;
 		if (unlikely(pid_ns->child_reaper == father))
@@ -848,11 +878,16 @@ static void exit_notify(struct task_stru
 	     tsk->self_exec_id != tsk->parent_exec_id))
 		tsk->exit_signal = SIGCHLD;
 
+	if (tsk->exit_signal != -1 && tsk == init_pid_ns.child_reaper)
+		/* We don't want people slaying init. */
+		tsk->exit_signal = SIGCHLD;
+
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
 	if (signal >= 0)
 		signal = do_notify_parent(tsk, signal);
 
 	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+	nr_zombie++;
 
 	/* mt-exec, de_thread() is waiting for us */
 	if (thread_group_leader(tsk) &&
@@ -894,6 +929,32 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif
 
+#ifdef CONFIG_VE
+static void do_initproc_exit(struct task_struct *tsk)
+{
+	struct ve_struct *env;
+
+	env = get_exec_env();
+	if (tsk == get_env_init(env)) {
+		/*
+		 * Here the VE changes its state into "not running".
+		 * op_sem taken for write is a barrier to all VE manipulations from
+		 * ioctl: it waits for operations currently in progress and blocks all
+		 * subsequent operations until is_running is set to 0 and op_sem is
+		 * released.
+		 */
+
+		down_write(&env->op_sem);
+		env->is_running = 0;
+		up_write(&env->op_sem);
+
+		ve_hook_iterate_fini(VE_INIT_EXIT_CHAIN, env);
+	}
+}
+#else
+#define do_initproc_exit(tsk)	do { } while (0)
+#endif
+
 NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
@@ -917,8 +978,9 @@ NORET_TYPE void do_exit(long code)
 	 */
 	set_fs(USER_DS);
 
-	tracehook_report_exit(&code);
+	do_initproc_exit(tsk);
 
+	tracehook_report_exit(&code);
 	validate_creds_for_do_exit(tsk);
 
 	/*
@@ -1002,7 +1064,15 @@ NORET_TYPE void do_exit(long code)
 	 */
 	perf_event_exit_task(tsk);
 
-	exit_notify(tsk, group_dead);
+	if (!(tsk->flags & PF_EXIT_RESTART) || atomic_read(&get_exec_env()->suspend))
+		exit_notify(tsk, group_dead);
+	else {
+		write_lock_irq(&tasklist_lock);
+		tsk->exit_state = EXIT_ZOMBIE;
+		nr_zombie++;
+		write_unlock_irq(&tasklist_lock);
+		exit_task_namespaces(tsk);
+	}
 #ifdef CONFIG_NUMA
 	task_lock(tsk);
 	mpol_put(tsk->mempolicy);
@@ -1053,7 +1123,6 @@ NORET_TYPE void complete_and_exit(struct
 
 	do_exit(code);
 }
-
 EXPORT_SYMBOL(complete_and_exit);
 
 SYSCALL_DEFINE1(exit, int, error_code)
@@ -1125,12 +1194,40 @@ struct pid *task_pid_type(struct task_st
 	return task->pids[type].pid;
 }
 
-static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
+static int __eligible_pid(struct wait_opts *wo, struct task_struct *p)
 {
 	return	wo->wo_type == PIDTYPE_MAX ||
 		task_pid_type(p, wo->wo_type) == wo->wo_pid;
 }
 
+static int __entered_pid(struct wait_opts *wo, struct task_struct *p)
+{
+	struct pid *pid, *wo_pid;
+
+	wo_pid = wo->wo_pid;
+	if ((wo_pid == NULL) || (wo_pid->level != 0))
+		return 0;
+
+	pid = task_pid_type(p, wo->wo_type);
+	if (pid->level != 1)
+		return 0;
+
+	if (wo_pid->numbers[0].nr != pid->numbers[0].nr)
+		return 0;
+
+	wo->wo_pid = get_pid(pid);
+	put_pid(wo_pid);
+	return 1;
+}
+
+static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
+{
+	if (__eligible_pid(wo, p))
+		return 1;
+	else
+		return __entered_pid(wo, p);
+}
+
 static int eligible_child(struct wait_opts *wo, struct task_struct *p)
 {
 	if (!eligible_pid(wo, p))
@@ -1652,7 +1749,7 @@ repeat:
 
 		if (wo->wo_flags & __WNOTHREAD)
 			break;
-	} while_each_thread(current, tsk);
+	} while_each_thread_ve(current, tsk);
 	read_unlock(&tasklist_lock);
 
 notask:
@@ -1734,7 +1831,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_
 			ret = put_user(0, &infop->si_status);
 	}
 
-	put_pid(pid);
+	put_pid(wo.wo_pid);
 
 	/* avoid REGPARM breakage on x86: */
 	asmlinkage_protect(5, ret, which, upid, infop, options, ru);
@@ -1773,12 +1870,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int 
 	wo.wo_stat	= stat_addr;
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
-	put_pid(pid);
+	put_pid(wo.wo_pid);
 
 	/* avoid REGPARM breakage on x86: */
 	asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
 	return ret;
 }
+EXPORT_SYMBOL(sys_wait4);
 
 #ifdef __ARCH_WANT_SYS_WAITPID
 
@@ -1792,3 +1890,22 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int
 }
 
 #endif
+
+/**
+ * reap_zombie - reap a non-child zombie task
+ * @p: the task to reap
+ *
+ * Called with read_lock(&tasklist_lock) held on entry.  If we return zero,
+ * we still hold the lock and this task is uninteresting.
+ */
+int reap_zombie(struct task_struct *p)
+{
+	struct wait_opts wo = {
+		.wo_flags = WEXITED,
+	};
+	int ret = 0;
+
+	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) {
+		p->exit_signal = -1;
+		ret = wait_task_zombie(&wo, p);
+	}
+
+	return ret;
+}
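
The __entered_pid() logic above lets a host-level waiter match a task whose
pid has since moved one namespace deeper, swapping wo->wo_pid for the deeper
reference; that swap is also why waitid() and wait4() now put wo.wo_pid
instead of the pid they originally looked up. A standalone toy model of the
matching rule (the structs are simplified stand-ins, not the kernel's
struct pid):

	#include <stdio.h>

	struct toy_upid { int nr; };
	struct toy_pid  { int level; struct toy_upid numbers[2]; };

	static int entered_pid_matches(const struct toy_pid *wo_pid,
				       const struct toy_pid *task_pid)
	{
		/* the waiter must hold a host-level (level 0) pid reference */
		if (wo_pid->level != 0)
			return 0;
		/* the task's pid must sit exactly one namespace deeper */
		if (task_pid->level != 1)
			return 0;
		/* and both must resolve to the same host-level number */
		return wo_pid->numbers[0].nr == task_pid->numbers[0].nr;
	}

	int main(void)
	{
		struct toy_pid waiter = { .level = 0, .numbers = { { 1234 } } };
		struct toy_pid task   = { .level = 1, .numbers = { { 1234 }, { 7 } } };

		printf("entered-pid match: %d\n",
		       entered_pid_matches(&waiter, &task));
		return 0;
	}
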
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/fairsched.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/fairsched.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/fairsched.c	2015-01-21 12:02:53.655949435 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/fairsched.c	2015-01-21 12:02:54.617923900 +0300
@@ -0,0 +1,783 @@
+/*
+ * Fair Scheduler
+ *
+ * Copyright (C) 2000-2008  SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/mount.h>
+#include <linux/cgroup.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/pid_namespace.h>
+#include <linux/syscalls.h>
+#include <linux/fairsched.h>
+#include <linux/uaccess.h>
+
+static struct cgroup *fairsched_root, *fairsched_host;
+
+/* fairsched uses node id INT_MAX for ve0 tasks */
+#define FAIRSCHED_HOST_NODE 2147483647
+
+static void fairsched_name(char *buf, int len, int id)
+{
+	if (id == FAIRSCHED_HOST_NODE)
+		id = 0;
+
+	snprintf(buf, len, "%d", id);
+}
+
+static struct cgroup *fairsched_open(unsigned int id)
+{
+	struct cgroup *cgrp;
+	char name[16];
+
+	fairsched_name(name, sizeof(name), id);
+	cgrp = cgroup_kernel_open(fairsched_root, 0, name);
+	if (cgrp == NULL)
+		return ERR_PTR(-ENOENT);
+	return cgrp;
+}
+
+static int fairsched_node_id(const char *name)
+{
+	unsigned long id;
+	char *endp;
+
+	id = simple_strtoul(name, &endp, 10);
+	if (*endp || id > INT_MAX)
+		return -1;
+
+	return id ?: FAIRSCHED_HOST_NODE;
+}
+
+SYSCALL_DEFINE3(fairsched_mknod, unsigned int, parent, unsigned int, weight,
+				 unsigned int, newid)
+{
+	struct cgroup *node;
+	int retval;
+	char name[16];
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	retval = -EINVAL;
+	if (weight < 1 || weight > FSCHWEIGHT_MAX)
+		goto out;
+	if (newid > INT_MAX)
+		goto out;
+
+	fairsched_name(name, sizeof(name), newid);
+	node = cgroup_kernel_open(fairsched_root, CGRP_CREAT|CGRP_EXCL, name);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+	cgroup_kernel_close(node);
+	retval = newid;
+out:
+	return retval;
+}
+
+SYSCALL_DEFINE1(fairsched_rmnod, unsigned int, id)
+{
+	char name[16];
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	fairsched_name(name, sizeof(name), id);
+	return cgroup_kernel_remove(fairsched_root, name);
+}
+
+SYSCALL_DEFINE2(fairsched_chwt, unsigned int, id, unsigned, weight)
+{
+	struct cgroup *cgrp;
+	int retval;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	if (id == 0)
+		return -EINVAL;
+	if (weight < 1 || weight > FSCHWEIGHT_MAX)
+		return -EINVAL;
+
+	cgrp = fairsched_open(id);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	retval = sched_cgroup_set_shares(cgrp, FSCHWEIGHT_BASE / weight);
+	cgroup_kernel_close(cgrp);
+
+	return retval;
+}
+
+SYSCALL_DEFINE2(fairsched_vcpus, unsigned int, id, unsigned int, vcpus)
+{
+	struct cgroup *cgrp;
+	int retval = 0;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	if (id == 0)
+		return -EINVAL;
+
+	cgrp = fairsched_open(id);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	retval = sched_cgroup_set_nr_cpus(cgrp, vcpus);
+	cgroup_kernel_close(cgrp);
+
+	return retval;
+}
+
+SYSCALL_DEFINE3(fairsched_rate, unsigned int, id, int, op, unsigned, rate)
+{
+	struct cgroup *cgrp;
+	long ret;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	if (id == 0)
+		return -EINVAL;
+	if (op == FAIRSCHED_SET_RATE && (rate < 1 || rate >= (1UL << 31)))
+		return -EINVAL;
+
+	cgrp = fairsched_open(id);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	switch (op) {
+		case FAIRSCHED_SET_RATE:
+			ret = sched_cgroup_set_rate(cgrp, rate);
+			if (!ret)
+				ret = sched_cgroup_get_rate(cgrp);
+			break;
+		case FAIRSCHED_DROP_RATE:
+			ret = sched_cgroup_set_rate(cgrp, 0);
+			break;
+		case FAIRSCHED_GET_RATE:
+			ret = sched_cgroup_get_rate(cgrp);
+			if (!ret)
+				ret = -ENODATA;
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+	}
+	cgroup_kernel_close(cgrp);
+
+	return ret;
+}
+
+SYSCALL_DEFINE2(fairsched_mvpr, pid_t, pid, unsigned int, id)
+{
+	struct cgroup *cgrp;
+	struct task_struct *tsk;
+	int retval;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	cgrp = fairsched_open(id);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	rcu_read_lock();
+	tsk = current;
+	if (pid != task_pid_vnr(tsk))
+		tsk = find_task_by_vpid(pid);
+	if (tsk == NULL) {
+		rcu_read_unlock();
+		cgroup_kernel_close(cgrp);
+		return -ESRCH;
+	}
+	get_task_struct(tsk);
+	rcu_read_unlock();
+
+	retval = cgroup_kernel_attach(cgrp, tsk);
+
+	cgroup_kernel_close(cgrp);
+	put_task_struct(tsk);
+
+	return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			     struct cpumask *new_mask)
+{
+	if (len < cpumask_size())
+		cpumask_clear(new_mask);
+	else if (len > cpumask_size())
+		len = cpumask_size();
+
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+SYSCALL_DEFINE3(fairsched_cpumask, unsigned int, id, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	struct cgroup *cgrp;
+	int retval;
+	cpumask_var_t new_mask, in_mask;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	if (id == 0)
+		return -EINVAL;
+
+	cgrp = fairsched_open(id);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	if (!alloc_cpumask_var(&in_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out;
+	}
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_free_in_mask;
+	}
+
+	retval = get_user_cpu_mask(user_mask_ptr, len, in_mask);
+	if (retval == 0) {
+		cpumask_and(new_mask, in_mask, cpu_active_mask);
+		cgroup_lock();
+		retval = cgroup_set_cpumask(cgrp, new_mask);
+		cgroup_unlock();
+	}
+
+	free_cpumask_var(new_mask);
+
+out_free_in_mask:
+	free_cpumask_var(in_mask);
+out:
+	cgroup_kernel_close(cgrp);
+	return retval;
+}
+
+static int get_user_node_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			      nodemask_t *new_mask)
+{
+	if (len < sizeof(nodemask_t))
+		nodes_clear(*new_mask);
+	else if (len > sizeof(nodemask_t))
+		len = sizeof(nodemask_t);
+
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+SYSCALL_DEFINE3(fairsched_nodemask, unsigned int, id, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	struct cgroup *cgrp;
+	char name[16];
+	int retval;
+	nodemask_t new_mask, in_mask;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	if (id == 0)
+		return -EINVAL;
+
+	fairsched_name(name, sizeof(name), id);
+	cgrp = cgroup_kernel_open(fairsched_root, 0, name);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+	if (cgrp == NULL)
+		return -ENOENT;
+
+	retval = get_user_node_mask(user_mask_ptr, len, &in_mask);
+	if (retval == 0) {
+		nodes_and(new_mask, in_mask, node_states[N_HIGH_MEMORY]);
+		cgroup_lock();
+		retval = cgroup_set_nodemask(cgrp, &new_mask);
+		cgroup_unlock();
+	}
+
+	cgroup_kernel_close(cgrp);
+	return retval;
+}
+
+int fairsched_new_node(int id, unsigned int vcpus)
+{
+	struct cgroup *cgrp;
+	int err, err2;
+	char name[16];
+
+	fairsched_name(name, sizeof(name), id);
+	cgrp = cgroup_kernel_open(fairsched_root, CGRP_CREAT, name);
+	err = PTR_ERR(cgrp);
+	if (IS_ERR(cgrp)) {
+		printk(KERN_ERR "Can't create fairsched node %d err=%d\n", id, err);
+		goto out;
+	}
+
+	err = sched_cgroup_set_nr_cpus(cgrp, vcpus);
+	if (err) {
+		printk(KERN_ERR "Can't set sched vcpus on node %d err=%d\n", id, err);
+		goto cleanup;
+	}
+
+	err = cgroup_kernel_attach(cgrp, current);
+	if (err) {
+		printk(KERN_ERR "Can't switch to fairsched node %d err=%d\n", id, err);
+		goto cleanup;
+	}
+
+	cgroup_kernel_close(cgrp);
+	return 0;
+
+cleanup:
+	cgroup_kernel_close(cgrp);
+	err2 = cgroup_kernel_remove(fairsched_root, name);
+	if (err2)
+		printk(KERN_ERR "Can't clean fairsched node %d err=%d\n", id, err2);
+out:
+	return err;
+}
+EXPORT_SYMBOL(fairsched_new_node);
+
+void fairsched_drop_node(int id, int leave)
+{
+	char name[16];
+	int err;
+
+	if (leave) {
+		err = cgroup_kernel_attach(fairsched_host, current);
+		if (err)
+			printk(KERN_ERR "Can't leave fairsched node %d "
+					"err=%d\n", id, err);
+	}
+
+	fairsched_name(name, sizeof(name), id);
+	err = cgroup_kernel_remove(fairsched_root, name);
+	if (err)
+		printk(KERN_ERR "Can't remove fairsched node %d err=%d\n", id, err);
+}
+EXPORT_SYMBOL(fairsched_drop_node);
+
+int fairsched_move_task(int id, struct task_struct *tsk)
+{
+	struct cgroup *cgrp;
+	int err;
+
+	cgrp = fairsched_open(id);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	err = cgroup_kernel_attach(cgrp, tsk);
+	cgroup_kernel_close(cgrp);
+
+	return err;
+}
+EXPORT_SYMBOL(fairsched_move_task);
+
+#ifdef CONFIG_PROC_FS
+
+/*********************************************************************/
+/*
+ * proc interface
+ */
+/*********************************************************************/
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+
+struct fairsched_node_dump {
+	int id;
+	unsigned weight;
+	unsigned rate;
+	int nr_pcpu;
+	int nr_tasks, nr_runtasks;
+};
+
+struct fairsched_dump {
+	int len;
+	struct fairsched_node_dump nodes[0];
+};
+
+static struct fairsched_dump *fairsched_do_dump(int compat)
+{
+	struct fairsched_dump *dump;
+	struct fairsched_node_dump *p;
+	int nr_nodes;
+	struct dentry *root, *dentry;
+	struct cgroup *cgrp;
+	int id;
+
+	root = fairsched_root->dentry;
+	mutex_lock(&root->d_inode->i_mutex);
+
+	spin_lock(&dcache_lock);
+	nr_nodes = 0;
+	list_for_each_entry(dentry, &root->d_subdirs, d_u.d_child) {
+		if (d_unhashed(dentry) || !dentry->d_inode ||
+				!S_ISDIR(dentry->d_inode->i_mode))
+			continue;
+		nr_nodes++;
+	}
+	spin_unlock(&dcache_lock);
+
+	nr_nodes = ve_is_super(get_exec_env()) ? nr_nodes + 16 : 1;
+
+	dump = ub_vmalloc(sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]));
+	if (dump == NULL)
+		goto out;
+
+	spin_lock(&dcache_lock);
+
+	p = dump->nodes;
+	list_for_each_entry_reverse(dentry, &root->d_subdirs, d_u.d_child) {
+		if (d_unhashed(dentry) || !dentry->d_inode ||
+				!S_ISDIR(dentry->d_inode->i_mode))
+			continue;
+		id = fairsched_node_id(dentry->d_name.name);
+		if (id < 0)
+			continue;
+		if (!ve_accessible_veid(id, get_exec_env()->veid))
+			continue;
+		cgrp = dentry->d_fsdata; /* __d_cgrp */
+		p->id = id;
+		p->nr_tasks = cgroup_task_count(cgrp);
+		p->nr_runtasks = sched_cgroup_get_nr_running(cgrp);
+		p->weight = FSCHWEIGHT_BASE / sched_cgroup_get_shares(cgrp);
+		p->nr_pcpu = num_online_cpus();
+		p->rate = sched_cgroup_get_rate(cgrp);
+		p++;
+		if (!--nr_nodes)
+			break;
+	}
+	dump->len = p - dump->nodes;
+
+	spin_unlock(&dcache_lock);
+out:
+	mutex_unlock(&root->d_inode->i_mutex);
+	return dump;
+}
+
+#define FAIRSCHED_PROC_HEADLINES 2
+
+#define FAIRSCHED_DEBUG         " debug"
+
+#ifdef CONFIG_VE
+/*
+ * The file format is kept as-is for compatibility reasons.
+ */
+static int fairsched_seq_show(struct seq_file *m, void *v)
+{
+	struct fairsched_dump *dump;
+	struct fairsched_node_dump *p;
+	unsigned vid, nid, pid, r;
+
+	dump = m->private;
+	p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL);
+	if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) {
+		if (p == dump->nodes)
+			seq_printf(m, "Version: 2.6 debug\n");
+		else if (p == dump->nodes + 1)
+			seq_printf(m,
+				       "      veid "
+				       "        id "
+				       "    parent "
+				       "weight "
+				       " rate "
+				       "tasks "
+				       "  run "
+				       "cpus"
+				       " "
+				       "flg "
+				       "ready "
+				       "           start_tag "
+				       "               value "
+				       "               delay"
+				       "\n");
+	} else {
+		p -= FAIRSCHED_PROC_HEADLINES;
+		vid = nid = pid = 0;
+		r = (unsigned long)v & 3;
+		if (p == dump->nodes) {
+			if (r == 2)
+				nid = p->id;
+		} else {
+			if (!r)
+				nid = p->id;
+			else if (r == 1)
+				vid = pid = p->id;
+			else
+				vid = p->id, nid = 1;
+		}
+		seq_printf(m,
+			       "%10u "
+			       "%10u %10u %6u %5u %5u %5u %4u"
+			       " "
+			       " %c%c %5u %20Lu %20Lu %20Lu"
+			       "\n",
+			       vid,
+			       nid,
+			       pid,
+			       p->weight,
+			       p->rate,
+			       p->nr_tasks,
+			       p->nr_runtasks,
+			       p->nr_pcpu,
+			       p->rate ? 'L' : '.',
+			       '.',
+			       p->nr_runtasks,
+			       0ll, 0ll, 0ll);
+	}
+
+	return 0;
+}
+
+static void *fairsched_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct fairsched_dump *dump;
+	unsigned long l;
+
+	dump = m->private;
+	if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES)
+		return NULL;
+	if (*pos < FAIRSCHED_PROC_HEADLINES)
+		return dump->nodes + *pos;
+	/* each node yields three output lines: stash the sub-line index (0-2) in the two low bits of the node pointer */
+	l = (unsigned long)(dump->nodes +
+		((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3);
+	l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3;
+	return (void *)l;
+}
+static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return fairsched_seq_start(m, pos);
+}
+#endif /* CONFIG_VE */
+
+static int fairsched2_seq_show(struct seq_file *m, void *v)
+{
+	struct fairsched_dump *dump;
+	struct fairsched_node_dump *p;
+
+	dump = m->private;
+	p = v;
+	if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) {
+		if (p == dump->nodes)
+			seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n");
+		else if (p == dump->nodes + 1)
+			seq_printf(m,
+				       "        id "
+				       "weight "
+				       " rate "
+				       "  run "
+				       "cpus"
+#ifdef FAIRSCHED_DEBUG
+				       " "
+				       "flg "
+				       "ready "
+				       "           start_tag "
+				       "               value "
+				       "               delay"
+#endif
+				       "\n");
+	} else {
+		p -= FAIRSCHED_PROC_HEADLINES;
+		seq_printf(m,
+			       "%10u %6u %5u %5u %4u"
+#ifdef FAIRSCHED_DEBUG
+			       " "
+			       " %c%c %5u %20Lu %20Lu %20Lu"
+#endif
+			       "\n",
+			       p->id,
+			       p->weight,
+			       p->rate,
+			       p->nr_runtasks,
+			       p->nr_pcpu
+#ifdef FAIRSCHED_DEBUG
+			       ,
+			       p->rate ? 'L' : '.',
+			       '.',
+			       p->nr_runtasks,
+			       0ll, 0ll, 0ll
+#endif
+			       );
+	}
+
+	return 0;
+}
+
+static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct fairsched_dump *dump;
+
+	dump = m->private;
+	if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES)
+		return NULL;
+	return dump->nodes + *pos;
+}
+static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return fairsched2_seq_start(m, pos);
+}
+static void fairsched2_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+#ifdef CONFIG_VE
+static struct seq_operations fairsched_seq_op = {
+	.start		= fairsched_seq_start,
+	.next		= fairsched_seq_next,
+	.stop		= fairsched2_seq_stop,
+	.show		= fairsched_seq_show
+};
+#endif
+static struct seq_operations fairsched2_seq_op = {
+	.start		= fairsched2_seq_start,
+	.next		= fairsched2_seq_next,
+	.stop		= fairsched2_seq_stop,
+	.show		= fairsched2_seq_show
+};
+static int fairsched_seq_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct seq_file *m;
+	int compat;
+
+#ifdef CONFIG_VE
+	compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1);
+	ret = seq_open(file, compat ? &fairsched_seq_op : &fairsched2_seq_op);
+#else
+	compat = 0;
+	ret = seq_open(file, &fairsched2_seq_op);
+#endif
+	if (ret)
+		return ret;
+	m = file->private_data;
+	m->private = fairsched_do_dump(compat);
+	if (m->private == NULL) {
+		seq_release(inode, file);
+		ret = -ENOMEM;
+	}
+	return ret;
+}
+static int fairsched_seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m;
+	struct fairsched_dump *dump;
+
+	m = file->private_data;
+	dump = m->private;
+	m->private = NULL;
+	vfree(dump);
+	seq_release(inode, file);
+	return 0;
+}
+static struct file_operations proc_fairsched_operations = {
+	.open		= fairsched_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= fairsched_seq_release
+};
+
+int fairsched_show_stat(struct seq_file *p, int id)
+{
+	struct cgroup *cgrp;
+	int err;
+
+	cgrp = fairsched_open(id);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	err = cpu_cgroup_proc_stat(cgrp, NULL, p);
+	cgroup_kernel_close(cgrp);
+
+	return err;
+}
+
+int fairsched_get_cpu_avenrun(int id, unsigned long *avenrun)
+{
+	struct cgroup *cgrp;
+	int err;
+
+	cgrp = fairsched_open(id);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	err = cpu_cgroup_get_avenrun(cgrp, avenrun);
+	cgroup_kernel_close(cgrp);
+
+	return err;
+}
+EXPORT_SYMBOL(fairsched_get_cpu_avenrun);
+
+int fairsched_get_cpu_stat(int id, struct kernel_cpustat *kstat)
+{
+	struct cgroup *cgrp;
+
+	cgrp = fairsched_open(id);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	cpu_cgroup_get_stat(cgrp, kstat);
+	cgroup_kernel_close(cgrp);
+
+	return 0;
+}
+EXPORT_SYMBOL(fairsched_get_cpu_stat);
+
+#endif /* CONFIG_PROC_FS */
+
+int __init fairsched_init(void)
+{
+	struct vfsmount *mnt;
+	int ret;
+	struct cgroup_sb_opts opts = {
+		.name		= "fairsched",
+		.subsys_bits	=
+			(1ul << cpu_cgroup_subsys_id) |
+			(1ul << cpuacct_subsys_id) |
+			(1ul << cpuset_subsys_id),
+	};
+
+	mnt = cgroup_kernel_mount(&opts);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+	fairsched_root = cgroup_get_root(mnt);
+
+	fairsched_host = cgroup_kernel_open(fairsched_root, CGRP_CREAT, "0");
+	if (IS_ERR(fairsched_host))
+		return PTR_ERR(fairsched_host);
+
+	ret = sched_cgroup_set_rt_runtime(fairsched_host,
+					  3 * sysctl_sched_rt_runtime / 4);
+	if (ret)
+		printk(KERN_WARNING
+		       "Can't set rt runtime for fairsched host: %d\n", ret);
+
+	ret = cgroup_kernel_attach(fairsched_host, init_pid_ns.child_reaper);
+	if (ret)
+		return ret;
+
+#ifdef CONFIG_PROC_FS
+	proc_create("fairsched", S_IRUGO, &glob_proc_root,
+			&proc_fairsched_operations);
+	proc_create("fairsched2", S_IRUGO, &glob_proc_root,
+			&proc_fairsched_operations);
+	proc_create("fairsched", S_IFDIR|S_IRUSR|S_IXUSR, proc_vz_dir, NULL);
+#endif /* CONFIG_PROC_FS */
+	return 0;
+}
+late_initcall(fairsched_init);
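
For reference, the syscalls above are meant to be driven from userspace
roughly as follows. The syscall numbers below are placeholders: the real
__NR_fairsched_* values are arch-specific and defined elsewhere in the patch.

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define __NR_fairsched_mknod	500	/* placeholder */
	#define __NR_fairsched_chwt	502	/* placeholder */
	#define __NR_fairsched_mvpr	504	/* placeholder */

	int main(void)
	{
		unsigned int id = 101;

		/* create node 101; the syscall returns the new id on success */
		if (syscall(__NR_fairsched_mknod, 0, 500, id) < 0)
			perror("fairsched_mknod");
		/* shares = FSCHWEIGHT_BASE / weight, so raising the weight
		 * shrinks the node's CPU share */
		if (syscall(__NR_fairsched_chwt, id, 1000) < 0)
			perror("fairsched_chwt");
		/* move the calling process into the node */
		if (syscall(__NR_fairsched_mvpr, (long)getpid(), id) < 0)
			perror("fairsched_mvpr");
		return 0;
	}
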
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/fence-watchdog.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/fence-watchdog.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/fence-watchdog.c	2015-01-21 12:02:58.505820701 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/fence-watchdog.c	2015-01-21 12:02:58.548819559 +0300
@@ -0,0 +1,206 @@
+/*
+ * Provide userspace with an interface that prevents the kernel from
+ * running without a userspace daemon.
+ *
+ * The daemon should write the number of seconds left before fencing to
+ * /sys/kernel/watchdog_timer, and must keep renewing it before the
+ * time elapses.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/jiffies.h>
+#include <linux/reboot.h>
+#include <linux/fence-watchdog.h>
+#include <linux/device.h>
+#include <linux/kmsg_dump.h>
+
+#define MAX_U64			(~(u64)0)
+#define MAX_JIFFIES_DELTA	(10 * 365UL * 24UL * 3600UL * HZ)
+#define __section_fence_wdog	__attribute__ ((unused, \
+			__section__ (".fence_wdog_jiffies64"), aligned(16)))
+#define ACTION_NAME_LEN		16
+
+enum {
+	FENCE_WDOG_CRASH = 0,
+	FENCE_WDOG_REBOOT = 1,
+	FENCE_WDOG_POWEROFF = 2,
+	FENCE_WDOG_NETFILTER = 3,
+};
+
+const char *action_names[] = {"crash", "reboot", "halt", "netfilter", NULL};
+
+unsigned long volatile __fence_wdog_jiffies64 __section_fence_wdog = MAX_U64;
+extern unsigned long volatile fence_wdog_jiffies64;
+static int fence_wdog_action = FENCE_WDOG_CRASH;
+static atomic_t not_fenced = ATOMIC_INIT(-1);
+
+static void do_halt(struct work_struct *dummy)
+{
+	printk(KERN_EMERG"fence-watchdog: %s\n",
+	       action_names[fence_wdog_action]);
+	kernel_halt();
+}
+
+static DECLARE_WORK(halt_work, do_halt);
+
+void fence_wdog_do_fence(void)
+{
+	char *killer = NULL;
+
+	if (fence_wdog_action != FENCE_WDOG_POWEROFF &&
+			fence_wdog_action != FENCE_WDOG_NETFILTER) {
+		bust_spinlocks(1);
+		printk(KERN_EMERG"fence-watchdog: %s\n",
+			action_names[fence_wdog_action]);
+		bust_spinlocks(0);
+	}
+
+	switch (fence_wdog_action) {
+	case FENCE_WDOG_CRASH:
+		panic_on_oops = 1;
+		wmb();
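+		/* deliberate NULL dereference: oops, and panic_on_oops turns it into a crash */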
+		*killer = 1;
+		break;
+	case FENCE_WDOG_REBOOT:
+		lockdep_off();
+		local_irq_enable();
+		emergency_restart();
+		break;
+	case FENCE_WDOG_POWEROFF:
+		schedule_work(&halt_work);
+		break;
+	}
+}
+
+inline int fence_wdog_check_timer(void)
+{
+	if (unlikely(get_jiffies_64() > fence_wdog_jiffies64 &&
+			fence_wdog_action != FENCE_WDOG_NETFILTER)) {
+		if (atomic_inc_not_zero(&not_fenced))
+			fence_wdog_do_fence();
+		return 1;
+	}
+
+	return 0;
+}
+
+bool fence_wdog_tmo_match(void)
+{
+	return get_jiffies_64() > fence_wdog_jiffies64;
+}
+EXPORT_SYMBOL(fence_wdog_tmo_match);
+
+static ssize_t fence_wdog_timer_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	ssize_t ret;
+	u64 jiffies_delta = fence_wdog_jiffies64 - get_jiffies_64();
+	struct timespec t;
+
+	if (jiffies_delta > MAX_JIFFIES_DELTA) {
+		ret =  sprintf(buf, "inf\n");
+	} else {
+		jiffies_to_timespec(jiffies_delta, &t);
+		ret =  sprintf(buf, "%ld\n", t.tv_sec);
+	}
+
+	return ret;
+}
+
+static ssize_t fence_wdog_timer_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long long val;
+	unsigned long jiffies_delta;
+	struct timespec t;
+
+	if (strict_strtoull(buf, 10, &val))
+		return -EINVAL;
+
+	if (val == 0) {
+		fence_wdog_jiffies64 = MAX_U64;
+		return count;
+	}
+
+	t.tv_sec = val;
+	t.tv_nsec = 0;
+
+	jiffies_delta = timespec_to_jiffies(&t);
+	if (jiffies_delta > MAX_JIFFIES_DELTA)
+		return -EINVAL;
+
+	fence_wdog_jiffies64 = get_jiffies_64() + jiffies_delta;
+
+	return count;
+}
+
+static ssize_t fence_wdog_action_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%s\n", action_names[fence_wdog_action]);
+}
+
+static ssize_t fence_wdog_action_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char str_action[ACTION_NAME_LEN];
+	int i = 0;
+
+	if (sscanf(buf, "%15s", str_action) != 1)
+		return -EINVAL;
+
+	for (i = 0; action_names[i]; i++) {
+		if ((!strnicmp(str_action, action_names[i], ACTION_NAME_LEN))) {
+			fence_wdog_action = i;
+			return count;
+		}
+	}
+
+	return -EINVAL;
+}
+
+static ssize_t fence_wdog_available_actions_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	int i, ret = 0;
+
+	for (i = 0; action_names[i] != NULL; i++)
+		ret += sprintf(&buf[ret], "%s ", action_names[i]);
+
+	ret += sprintf(&buf[ret], "\n");
+	return ret;
+}
+
+static struct kobj_attribute fence_wdog_timer_attr =
+	__ATTR(watchdog_timer, 0644,
+		fence_wdog_timer_show, fence_wdog_timer_store);
+
+static struct kobj_attribute fence_wdog_action_attr =
+	__ATTR(watchdog_action, 0644,
+		fence_wdog_action_show, fence_wdog_action_store);
+
+static struct kobj_attribute fence_wdog_available_actions_attr =
+	__ATTR(watchdog_available_actions, 0644,
+		fence_wdog_available_actions_show, NULL);
+
+static struct attribute *fence_wdog_attrs[] = {
+	&fence_wdog_timer_attr.attr,
+	&fence_wdog_action_attr.attr,
+	&fence_wdog_available_actions_attr.attr,
+	NULL,
+};
+
+static struct attribute_group fence_wdog_attr_group = {
+	.attrs = fence_wdog_attrs,
+};
+
+static int __init fence_wdog_init(void)
+{
+	sysfs_update_group(kernel_kobj, &fence_wdog_attr_group);
+	return 0;
+}
+
+module_init(fence_wdog_init)
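
The daemon protocol from the header comment boils down to re-arming
/sys/kernel/watchdog_timer before it runs out. A minimal sketch of such a
daemon (the 60s timeout and 30s renewal interval are arbitrary choices):

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		for (;;) {
			FILE *f = fopen("/sys/kernel/watchdog_timer", "w");

			if (f) {
				/* fence in 60 seconds unless renewed; "0" disarms */
				fprintf(f, "60\n");
				fclose(f);
			}
			sleep(30);	/* renew well before the timeout elapses */
		}
		return 0;
	}
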
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/fork.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/fork.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/fork.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/fork.c	2015-01-21 12:02:58.011833811 +0300
@@ -77,6 +77,11 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <bc/vmpages.h>
+#include <bc/misc.h>
+#include <bc/kmem.h>
+#include <bc/oom_kill.h>
+
 #include <trace/events/sched.h>
 
 /*
@@ -84,12 +89,14 @@
  */
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
 int nr_threads; 		/* The idle threads do not count.. */
+EXPORT_SYMBOL(nr_threads);
 
 int max_threads;		/* tunable limit on nr_threads */
 
 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
+EXPORT_SYMBOL(tasklist_lock);
 
 int nr_processes(void)
 {
@@ -143,7 +150,7 @@ struct kmem_cache *files_cachep;
 struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-struct kmem_cache *vm_area_cachep;
+struct kmem_cache *__vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
@@ -173,9 +180,12 @@ void __put_task_struct(struct task_struc
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	ub_task_put(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 
+	put_ve(VE_TASK_INFO(tsk)->owner_env);
+	atomic_dec(&nr_dead);
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
 }
@@ -315,6 +325,9 @@ static int dup_mmap(struct mm_struct *mm
 	rb_link = &mm->mm_rb.rb_node;
 	rb_parent = NULL;
 	pprev = &mm->mmap;
+	retval = ub_page_table_precharge(mm, oldmm->nr_ptes + oldmm->nr_ptds);
+	if (retval)
+		goto out;
 	retval = ksm_fork(mm, oldmm);
 	if (retval)
 		goto out;
@@ -334,6 +347,10 @@ static int dup_mmap(struct mm_struct *mm
 			continue;
 		}
 		charge = 0;
+		if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start,
+					mpnt->vm_flags & ~VM_LOCKED,
+					mpnt->vm_file, UB_HARD))
+			goto fail_noch;
 		if (mpnt->vm_flags & VM_ACCOUNT) {
 			unsigned long len;
 			len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
@@ -341,7 +358,7 @@ static int dup_mmap(struct mm_struct *mm
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		tmp = allocate_vma(mm, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -396,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm
 		rb_parent = &tmp->vm_rb;
 
 		mm->map_count++;
-		retval = copy_page_range(mm, oldmm, mpnt);
+		retval = copy_page_range(mm, oldmm, tmp, mpnt);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
@@ -408,6 +425,7 @@ static int dup_mmap(struct mm_struct *mm
 	arch_dup_mmap(oldmm, mm);
 	retval = 0;
 out:
+	ub_page_table_commit(mm);
 	up_write(&mm->mmap_sem);
 	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
@@ -415,8 +433,11 @@ out:
 fail_nomem_anon_vma_fork:
 	mpol_put(pol);
 fail_nomem_policy:
-	kmem_cache_free(vm_area_cachep, tmp);
+	free_vma(mm, tmp);
 fail_nomem:
+	ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start,
+			mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file);
+fail_noch:
 	retval = -ENOMEM;
 	vm_unacct_memory(charge);
 	goto out;
@@ -441,8 +462,37 @@ static inline void mm_free_pgd(struct mm
 #endif /* CONFIG_MMU */
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
+EXPORT_SYMBOL(mmlist_lock);
+
+#ifdef CONFIG_BEANCOUNTERS
+
+static inline struct mm_struct *allocate_mm(struct user_beancounter *ub)
+{
+	return ub_kmem_alloc(ub, mm_cachep, GFP_KERNEL);
+}
+
+static inline void set_mm_ub(struct mm_struct *mm, struct user_beancounter *ub)
+{
+	mm->mm_ub = get_beancounter_longterm(ub);
+}
+
+static inline void put_mm_ub(struct mm_struct *mm)
+{
+	VM_BUG_ON(mm->page_table_precharge);
+	ub_kmem_uncharge(mm->mm_ub,
+			mm_cachep->objuse + (mm->nr_ptds << PAGE_SHIFT));
+	put_beancounter_longterm(mm->mm_ub);
+	mm->mm_ub = NULL;
+}
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define allocate_mm(ub)  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define set_mm_ub(mm, ub)
+#define put_mm_ub(mm)
+
+#endif /* CONFIG_BEANCOUNTERS */
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
@@ -477,6 +527,8 @@ static struct mm_struct * mm_init(struct
 		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 	mm->core_state = NULL;
 	mm->nr_ptes = 0;
+	mm->nr_ptds = 0;
+	mm->page_table_precharge = 0;
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
 	set_mm_counter(mm, swap_usage, 0);
@@ -485,7 +537,6 @@ static struct mm_struct * mm_init(struct
 	mm->cached_hole_size = ~0UL;
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
-	atomic_set(&mm->oom_disable_count, 0);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
@@ -493,6 +544,7 @@ static struct mm_struct * mm_init(struct
 		return mm;
 	}
 
+	put_mm_ub(mm);
 	free_mm(mm);
 	return NULL;
 }
@@ -504,13 +556,15 @@ struct mm_struct * mm_alloc(void)
 {
 	struct mm_struct * mm;
 
-	mm = allocate_mm();
+	mm = allocate_mm(get_exec_ub());
 	if (mm) {
 		memset(mm, 0, sizeof(*mm));
+		set_mm_ub(mm, get_exec_ub());
 		mm = mm_init(mm, current);
 	}
 	return mm;
 }
+EXPORT_SYMBOL(mm_alloc);
 
 /*
  * Called when the last reference to the mm
@@ -520,6 +574,8 @@ struct mm_struct * mm_alloc(void)
 void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
+	if (unlikely(atomic_read(&mm->mm_users)))
+		put_mm_ub(mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
@@ -551,6 +607,9 @@ void mmput(struct mm_struct *mm)
 		put_swap_token(mm);
 		if (mm->binfmt)
 			module_put(mm->binfmt->module);
+		if (mm->global_oom || mm->ub_oom)
+			ub_oom_mm_dead(mm);
+		put_mm_ub(mm);
 		mmdrop(mm);
 	}
 }
@@ -677,7 +736,7 @@ struct mm_struct *dup_mm(struct task_str
 	if (!oldmm)
 		return NULL;
 
-	mm = allocate_mm();
+	mm = allocate_mm(tsk->task_bc.task_ub);
 	if (!mm)
 		goto fail_nomem;
 
@@ -687,10 +746,14 @@ struct mm_struct *dup_mm(struct task_str
 	mm->token_priority = 0;
 	mm->last_interval = 0;
 
+	mm->global_oom = 0;
+	mm->ub_oom = 0;
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	mm->pmd_huge_pte = NULL;
 #endif
 
+	set_mm_ub(mm, tsk->task_bc.task_ub);
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
@@ -724,6 +787,7 @@ fail_nocontext:
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
 	 */
+	put_mm_ub(mm);
 	mm_free_pgd(mm);
 	free_mm(mm);
 	return NULL;
@@ -767,9 +831,6 @@ good_mm:
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
 	mm->last_interval = 0;
-	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-		atomic_inc(&mm->oom_disable_count);
-
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
@@ -783,13 +844,13 @@ static int copy_fs(unsigned long clone_f
 	struct fs_struct *fs = current->fs;
 	if (clone_flags & CLONE_FS) {
 		/* tsk->fs is already what we want */
-		write_lock(&fs->lock);
+		spin_lock(&fs->lock);
 		if (fs->in_exec) {
-			write_unlock(&fs->lock);
+			spin_unlock(&fs->lock);
 			return -EAGAIN;
 		}
 		fs->users++;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 		return 0;
 	}
 	tsk->fs = copy_fs_struct(fs);
@@ -1039,6 +1100,7 @@ static struct task_struct *copy_process(
 					unsigned long stack_size,
 					int __user *child_tidptr,
 					struct pid *pid,
+					pid_t vpid,
 					int trace)
 {
 	int retval;
@@ -1086,9 +1148,14 @@ static struct task_struct *copy_process(
 		goto fork_out;
 
 	retval = -ENOMEM;
+	if (ub_task_charge(get_exec_ub()))
+		goto fork_out;
+
 	p = dup_task_struct(current);
 	if (!p)
-		goto fork_out;
+		goto bad_fork_uncharge;
+
+	ub_task_get(get_exec_ub(), p);
 
 	tracehook_init_task(p);
 
@@ -1223,7 +1290,7 @@ static struct task_struct *copy_process(
 		goto bad_fork_cleanup_sighand;
 	if ((retval = copy_mm(clone_flags, p)))
 		goto bad_fork_cleanup_signal;
-	if ((retval = copy_namespaces(clone_flags, p)))
+	if ((retval = copy_namespaces(clone_flags, p, 0)))
 		goto bad_fork_cleanup_mm;
 	if ((retval = copy_io(clone_flags, p)))
 		goto bad_fork_cleanup_namespaces;
@@ -1233,9 +1300,13 @@ static struct task_struct *copy_process(
 
 	if (pid != &init_struct_pid) {
 		retval = -ENOMEM;
-		pid = alloc_pid(p->nsproxy->pid_ns);
+		pid = alloc_pid(p->nsproxy->pid_ns, vpid);
 		if (!pid)
 			goto bad_fork_cleanup_io;
+
+		if ((clone_flags & CLONE_NEWPID) &&
+		    (task_active_pid_ns(current)->flags & PID_NS_HIDE_CHILD))
+			task_active_pid_ns(p)->flags |= PID_NS_HIDDEN;
 	}
 
 	p->pid = pid_nr(pid);
@@ -1320,7 +1391,7 @@ static struct task_struct *copy_process(
 	 * thread can't slip out of an OOM kill (or normal SIGKILL).
  	 */
 	recalc_sigpending();
-	if (signal_pending(current)) {
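+	/*
+	 * A non-zero vpid means the task is being recreated with a fixed
+	 * pid (e.g. restored from a checkpoint), so a pending signal must
+	 * not abort the fork (a hedged reading of the vpid parameter).
+	 */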
+	if (signal_pending(current) && !vpid) {
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
@@ -1350,12 +1421,23 @@ static struct task_struct *copy_process(
 			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
 			attach_pid(p, PIDTYPE_SID, task_session(current));
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
+#ifdef CONFIG_VE
+			list_add_tail_rcu(&p->ve_task_info.vetask_list,
+					&p->ve_task_info.owner_env->vetask_lh);
+			list_add_tail(&p->ve_task_info.aux_list,
+					&p->ve_task_info.owner_env->vetask_auxlist);
+#endif
 			__get_cpu_var(process_counts)++;
 		}
 		attach_pid(p, PIDTYPE_PID, pid);
 		nr_threads++;
 	}
+	p->ve_task_info.owner_env->pcounter++;
+	(void)get_ve(p->ve_task_info.owner_env);
 
+#ifdef CONFIG_VE
+	seqcount_init(&p->ve_task_info.wakeup_lock);
+#endif
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
@@ -1375,13 +1457,8 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-	if (p->mm) {
-		task_lock(p);
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-			atomic_dec(&p->mm->oom_disable_count);
-		task_unlock(p);
+	if (p->mm)
 		mmput(p->mm);
-	}
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
 		__cleanup_signal(p->signal);
@@ -1410,7 +1487,10 @@ bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
 	exit_creds(p);
 bad_fork_free:
+	ub_task_put(p);
 	free_task(p);
+bad_fork_uncharge:
+	ub_task_uncharge(get_exec_ub());
 fork_out:
 	return ERR_PTR(retval);
 }
@@ -1427,7 +1507,7 @@ struct task_struct * __cpuinit fork_idle
 	struct pt_regs regs;
 
 	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
-			    &init_struct_pid, 0);
+			    &init_struct_pid, 0, 0);
 	if (!IS_ERR(task))
 		init_idle(task, cpu);
 
@@ -1440,12 +1520,13 @@ struct task_struct * __cpuinit fork_idle
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-long do_fork(unsigned long clone_flags,
+long do_fork_pid(unsigned long clone_flags,
 	      unsigned long stack_start,
 	      struct pt_regs *regs,
 	      unsigned long stack_size,
 	      int __user *parent_tidptr,
-	      int __user *child_tidptr)
+	      int __user *child_tidptr,
+	      long vpid)
 {
 	struct task_struct *p;
 	int trace = 0;
@@ -1492,7 +1573,7 @@ long do_fork(unsigned long clone_flags,
 		trace = tracehook_prepare_clone(clone_flags);
 
 	p = copy_process(clone_flags, stack_start, regs, stack_size,
-			 child_tidptr, NULL, trace);
+			 child_tidptr, NULL, vpid, trace);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
@@ -1561,25 +1642,38 @@ static void sighand_ctor(void *data)
 	init_waitqueue_head(&sighand->signalfd_wqh);
 }
 
+EXPORT_SYMBOL(do_fork_pid);
+
+long do_fork(unsigned long clone_flags,
+		unsigned long stack_start,
+		struct pt_regs *regs,
+		unsigned long stack_size,
+		int __user *parent_tidptr,
+		int __user *child_tidptr)
+{
+	return do_fork_pid(clone_flags, stack_start, regs, stack_size,
+			parent_tidptr, child_tidptr, 0);
+}
+
 void __init proc_caches_init(void)
 {
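+	/* SLAB_UBC: allocations from these caches are charged to the
+	 * allocating task's user beancounter. */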
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
-			SLAB_NOTRACK, sighand_ctor);
+			SLAB_NOTRACK|SLAB_UBC, sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL);
 	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL);
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL);
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
-	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+	__vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
 	mmap_init();
 	nsproxy_cache_init();
 }
@@ -1719,13 +1813,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, 
 
 		if (new_fs) {
 			fs = current->fs;
-			write_lock(&fs->lock);
+			spin_lock(&fs->lock);
 			current->fs = new_fs;
 			if (--fs->users)
 				new_fs = NULL;
 			else
 				new_fs = fs;
-			write_unlock(&fs->lock);
+			spin_unlock(&fs->lock);
 		}
 
 		if (new_fd) {
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/freezer.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/freezer.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/freezer.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/freezer.c	2015-01-21 12:02:50.448034593 +0300
@@ -114,6 +114,7 @@ bool freeze_task(struct task_struct *p, 
 
 	return true;
 }
+EXPORT_SYMBOL(freeze_task);
 
 void cancel_freezing(struct task_struct *p)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/futex.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/futex.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/futex.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/futex.c	2015-01-21 12:02:57.972834848 +0300
@@ -678,7 +678,8 @@ void exit_pi_state_list(struct task_stru
  */
 static int
 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
-		union futex_key *key, struct futex_pi_state **ps)
+		union futex_key *key, struct futex_pi_state **ps,
+		struct task_struct *task)
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
@@ -776,6 +777,11 @@ lookup_pi_state(u32 uval, struct futex_h
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 
+	if (!p->mm) {
+		put_task_struct(p);
+		return -EPERM;
+	}
+
 	/*
 	 * We need to look at the task state flags to figure out,
 	 * whether the task is exiting. To protect against the do_exit
@@ -925,7 +931,7 @@ retry:
 	 * We dont have the lock. Look up the PI state (or create it if
 	 * we are the first waiter):
 	 */
-	ret = lookup_pi_state(uval, hb, key, ps);
+	ret = lookup_pi_state(uval, hb, key, ps, task);
 
 	if (unlikely(ret)) {
 		switch (ret) {
@@ -982,6 +988,9 @@ static void wake_futex(struct futex_q *q
 {
 	struct task_struct *p = q->task;
 
+	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+		return;
+
 	/*
 	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
 	 * a non futex wake up happens on another CPU then the task
@@ -1222,6 +1231,10 @@ retry_private:
 
 	plist_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key1)) {
+			if (this->pi_state || this->rt_waiter) {
+				ret = -EINVAL;
+				goto out_unlock;
+			}
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
@@ -1234,6 +1247,10 @@ retry_private:
 		op_ret = 0;
 		plist_for_each_entry_safe(this, next, head, list) {
 			if (match_futex (&this->key, &key2)) {
+				if (this->pi_state || this->rt_waiter) {
+					ret = -EINVAL;
+					goto out_unlock;
+				}
 				wake_futex(this);
 				if (++op_ret >= nr_wake2)
 					break;
@@ -1242,6 +1259,7 @@ retry_private:
 		ret += op_ret;
 	}
 
+out_unlock:
 	double_unlock_hb(hb1, hb2);
 out_put_keys:
 	put_futex_key(fshared, &key2);
@@ -1331,8 +1349,8 @@ void requeue_pi_wake_futex(struct futex_
  * hb1 and hb2 must be held by the caller.
  *
  * Returns:
- *  0 - failed to acquire the lock atomicly
- *  1 - acquired the lock
+ *  0 - failed to acquire the lock atomically
+ * >0 - acquired the lock, return value is vpid of the top_waiter
  * <0 - error
  */
 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1343,7 +1361,7 @@ static int futex_proxy_trylock_atomic(u3
 {
 	struct futex_q *top_waiter = NULL;
 	u32 curval;
-	int ret;
+	int ret, vpid;
 
 	if (get_futex_value_locked(&curval, pifutex))
 		return -EFAULT;
@@ -1371,11 +1389,13 @@ static int futex_proxy_trylock_atomic(u3
 	 * the contended case or if set_waiters is 1.  The pi_state is returned
 	 * in ps in contended cases.
 	 */
+	vpid = task_pid_vnr(top_waiter->task);
 	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
 				   set_waiters);
-	if (ret == 1)
+	if (ret == 1) {
 		requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+		return vpid;
+	}
 	return ret;
 }
 
@@ -1405,7 +1425,6 @@ static int futex_requeue(u32 __user *uad
 	struct futex_hash_bucket *hb1, *hb2;
 	struct plist_head *head1;
 	struct futex_q *this, *next;
-	u32 curval2;
 
 	if (requeue_pi) {
 		/*
@@ -1509,16 +1528,25 @@ retry_private:
 		 * At this point the top_waiter has either taken uaddr2 or is
 		 * waiting on it.  If the former, then the pi_state will not
 		 * exist yet, look it up one more time to ensure we have a
-		 * reference to it.
+		 * reference to it. If the lock was taken, ret contains the
+		 * vpid of the top waiter task.
 		 */
-		if (ret == 1) {
+		if (ret > 0) {
 			WARN_ON(pi_state);
 			drop_count++;
 			task_count++;
-			ret = get_futex_value_locked(&curval2, uaddr2);
-			if (!ret)
-				ret = lookup_pi_state(curval2, hb2, &key2,
-						      &pi_state);
+			/*
+			 * If we acquired the lock, then the user-space
+			 * value of uaddr2 should be vpid. The top
+			 * waiter cannot change it, since it blocks on
+			 * the hb2 lock if it tries to. If something
+			 * else fiddled with it behind our back, the pi
+			 * state lookup might unearth that, so we use
+			 * the known value rather than rereading uaddr2
+			 * and handing potential garbage to
+			 * lookup_pi_state.
+			 */
+			ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL);
 		}
 
 		switch (ret) {
@@ -1557,9 +1585,13 @@ retry_private:
 		/*
 		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
 		 * be paired with each other and no other futex ops.
+		 *
+		 * We should never be requeueing a futex_q with a pi_state,
+		 * which is awaiting a futex_unlock_pi().
 		 */
 		if ((requeue_pi && !this->rt_waiter) ||
-		    (!requeue_pi && this->rt_waiter)) {
+		    (!requeue_pi && this->rt_waiter) ||
+		    this->pi_state) {
 			ret = -EINVAL;
 			break;
 		}
@@ -1870,7 +1902,7 @@ handle_fault:
 #define FLAGS_CLOCKRT		0x02
 #define FLAGS_HAS_TIMEOUT	0x04
 
-static long futex_wait_restart(struct restart_block *restart);
+long futex_wait_restart(struct restart_block *restart);
 
 /**
  * fixup_owner() - Post lock pi_state and corner case management
@@ -2145,7 +2177,7 @@ out:
 }
 
 
-static long futex_wait_restart(struct restart_block *restart)
+long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
 	int fshared = 0;
@@ -2162,7 +2194,7 @@ static long futex_wait_restart(struct re
 				restart->futex.bitset,
 				restart->futex.flags & FLAGS_CLOCKRT);
 }
-
+EXPORT_SYMBOL(futex_wait_restart);
 
 /*
  * Userspace tried a 0 -> TID atomic transition of the futex value
@@ -2839,12 +2871,19 @@ long do_futex(u32 __user *uaddr, int op,
 	int clockrt, ret = -ENOSYS;
 	int cmd = op & FUTEX_CMD_MASK;
 	int fshared = 0;
+	ktime_t abs_time;
 
 	if (!(op & FUTEX_PRIVATE_FLAG))
 		fshared = 1;
 
 	clockrt = op & FUTEX_CLOCK_REALTIME;
-	if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
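+	/*
+	 * These commands take absolute timeouts. A container's
+	 * CLOCK_MONOTONIC runs from the VE start, so shift the value by
+	 * the VE start time to match the host clock (assumption:
+	 * start_timespec is zero for ve0, making this a no-op there).
+	 */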
+	if (cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI) {
+		if (timeout && !clockrt) {
+			abs_time = ktime_add(*timeout, timespec_to_ktime(
+						get_exec_env()->start_timespec));
+			timeout = &abs_time;
+		}
+	} else if (clockrt)
 		return -ENOSYS;
 
 	switch (cmd) {
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/hrtimer.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/hrtimer.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/hrtimer.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/hrtimer.c	2015-01-21 12:02:57.972834848 +0300
@@ -74,25 +74,43 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, 
 			.get_time = &ktime_get,
 			.resolution = KTIME_LOW_RES,
 		},
+		{
+			.index = CLOCK_BOOTTIME,
+			.get_time = &ktime_get_boottime,
+			.resolution = KTIME_LOW_RES,
+		},
 	}
 };
 
+static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+	[CLOCK_REALTIME]	= HRTIMER_BASE_REALTIME,
+	[CLOCK_MONOTONIC]	= HRTIMER_BASE_MONOTONIC,
+	[CLOCK_BOOTTIME]	= HRTIMER_BASE_BOOTTIME,
+};
+
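+/*
+ * clockid values no longer index clock_base[] directly; translate them
+ * through the table above to the HRTIMER_BASE_* enumeration.
+ */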
+static inline int hrtimer_clockid_to_base(clockid_t clock_id)
+{
+	BUG_ON(clock_id >= MAX_CLOCKS);
+	return hrtimer_clock_to_base_table[clock_id];
+}
+
 /*
  * Get the coarse grained time at the softirq based on xtime and
  * wall_to_monotonic.
  */
 static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 {
-	ktime_t xtim, tomono;
+	ktime_t xtim, mono, boot;
 	struct timespec xts, tom, slp;
 
 	get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
 
 	xtim = timespec_to_ktime(xts);
-	tomono = timespec_to_ktime(tom);
-	base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
-	base->clock_base[CLOCK_MONOTONIC].softirq_time =
-		ktime_add(xtim, tomono);
+	mono = ktime_add(xtim, timespec_to_ktime(tom));
+	boot = ktime_add(mono, timespec_to_ktime(slp));
+	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
+	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
+	base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
 }
 
 /*
@@ -179,10 +197,11 @@ switch_hrtimer_base(struct hrtimer *time
 	struct hrtimer_cpu_base *new_cpu_base;
 	int this_cpu = smp_processor_id();
 	int cpu = hrtimer_get_target(this_cpu, pinned);
+	int basenum = hrtimer_clockid_to_base(base->index);
 
 again:
 	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
-	new_base = &new_cpu_base->clock_base[base->index];
+	new_base = &new_cpu_base->clock_base[basenum];
 
 	if (base != new_base) {
 		/*
@@ -210,6 +229,11 @@ again:
 			goto again;
 		}
 		timer->base = new_base;
+	} else {
+		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+			cpu = this_cpu;
+			goto again;
+		}
 	}
 	return new_base;
 }
@@ -304,6 +328,7 @@ u64 ktime_divns(const ktime_t kt, s64 di
 
 	return dclc;
 }
+EXPORT_SYMBOL(ktime_divns);
 #endif /* BITS_PER_LONG >= 64 */
 
 /*
@@ -659,7 +684,7 @@ static void retrigger_next_event(void *a
  */
 static int hrtimer_switch_to_hres(void)
 {
-	int cpu = smp_processor_id();
+	int i, cpu = smp_processor_id();
 	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
 	unsigned long flags;
 
@@ -675,8 +700,8 @@ static int hrtimer_switch_to_hres(void)
 		return 0;
 	}
 	base->hres_active = 1;
-	base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
-	base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
+	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+		base->clock_base[i].resolution = KTIME_HIGH_RES;
 
 	tick_setup_sched_timer();
 	/* "Retrigger" the interrupt to get things going */
@@ -687,9 +712,10 @@ static int hrtimer_switch_to_hres(void)
 
 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
 {
-	ktime_t *offs_real = &base->clock_base[CLOCK_REALTIME].offset;
+	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
 
-	return ktime_get_update_offsets(offs_real);
+	return ktime_get_update_offsets(offs_real, offs_boot);
 }
 
 static void clock_was_set_work(struct work_struct *work)
@@ -943,6 +969,7 @@ static inline int
 remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 {
 	if (hrtimer_is_queued(timer)) {
+		unsigned long state;
 		int reprogram;
 
 		/*
@@ -956,8 +983,13 @@ remove_hrtimer(struct hrtimer *timer, st
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
-		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
-				 reprogram);
+		/*
+		 * We must preserve the CALLBACK state flag here,
+		 * otherwise we could move the timer base in
+		 * switch_hrtimer_base.
+		 */
+		state = timer->state & HRTIMER_STATE_CALLBACK;
+		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
 	return 0;
@@ -976,11 +1008,8 @@ int __hrtimer_start_range_ns(struct hrti
 	/* Remove an active timer from the queue: */
 	ret = remove_hrtimer(timer, base);
 
-	/* Switch the timer base, if necessary: */
-	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
-
 	if (mode & HRTIMER_MODE_REL) {
-		tim = ktime_add_safe(tim, new_base->get_time());
+		tim = ktime_add_safe(tim, base->get_time());
 		/*
 		 * CONFIG_TIME_LOW_RES is a temporary way for architectures
 		 * to signal that they simply return xtime in
@@ -995,6 +1024,9 @@ int __hrtimer_start_range_ns(struct hrti
 
 	hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
+	/* Switch the timer base, if necessary: */
+	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+
 	timer_stats_hrtimer_set_start_info(timer);
 
 	leftmost = enqueue_hrtimer(timer, new_base);
@@ -1159,6 +1191,7 @@ static void __hrtimer_init(struct hrtime
 			   enum hrtimer_mode mode)
 {
 	struct hrtimer_cpu_base *cpu_base;
+	int base;
 
 	memset(timer, 0, sizeof(struct hrtimer));
 
@@ -1167,7 +1200,8 @@ static void __hrtimer_init(struct hrtime
 	if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
 		clock_id = CLOCK_MONOTONIC;
 
-	timer->base = &cpu_base->clock_base[clock_id];
+	base = hrtimer_clockid_to_base(clock_id);
+	timer->base = &cpu_base->clock_base[base];
 	hrtimer_init_timer_hres(timer);
 
 #ifdef CONFIG_TIMER_STATS
@@ -1202,9 +1236,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
 int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 {
 	struct hrtimer_cpu_base *cpu_base;
+	int base = hrtimer_clockid_to_base(which_clock);
 
 	cpu_base = &__raw_get_cpu_var(hrtimer_bases);
-	*tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
+	*tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
 
 	return 0;
 }
@@ -1244,6 +1279,9 @@ static void __run_hrtimer(struct hrtimer
 		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
 		enqueue_hrtimer(timer, base);
 	}
+
+	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 
@@ -1574,6 +1612,7 @@ out:
 	destroy_hrtimer_on_stack(&t.timer);
 	return ret;
 }
+EXPORT_SYMBOL(hrtimer_nanosleep_restart);
 
 long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/hung_task.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/hung_task.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/hung_task.c	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/hung_task.c	2015-01-21 12:02:43.794211240 +0300
@@ -38,6 +38,8 @@ unsigned long __read_mostly sysctl_hung_
 
 unsigned long __read_mostly sysctl_hung_task_warnings = 10;
 
+int __read_mostly sysctl_hung_task_verbosity = 0;
+
 static int __read_mostly did_panic;
 
 static struct task_struct *watchdog_task;
@@ -90,6 +92,15 @@ static void check_hung_task(struct task_
 		return;
 	sysctl_hung_task_warnings--;
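+	/* Verbosity is a bitmask: 1 = CPU registers, 2 = task state dump,
+	 * 4 = memory info, 8 = scheduler debug output. */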
 
+	if (sysctl_hung_task_verbosity & 1)
+		nmi_show_regs(NULL, 0);
+	if (sysctl_hung_task_verbosity & 2)
+		show_state_filter(0);
+	if (sysctl_hung_task_verbosity & 4)
+		show_mem(0);
+	if (sysctl_hung_task_verbosity & 8)
+		show_sched_debug();
+
 	/*
 	 * Ok, the task did not get scheduled for more than 2 minutes,
 	 * complain:
@@ -148,7 +159,7 @@ static void check_hung_uninterruptible_t
 		return;
 
 	rcu_read_lock();
-	do_each_thread(g, t) {
+	do_each_thread_all(g, t) {
 		if (!--max_count)
 			goto unlock;
 		if (!--batch_count) {
@@ -161,7 +172,7 @@ static void check_hung_uninterruptible_t
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, timeout);
-	} while_each_thread(g, t);
+	} while_each_thread_all(g, t);
  unlock:
 	rcu_read_unlock();
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/kexec.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/kexec.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/kexec.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/kexec.c	2015-01-21 12:02:53.081964672 +0300
@@ -33,6 +33,7 @@
 #include <linux/vmalloc.h>
 #include <linux/swap.h>
 #include <linux/kmsg_dump.h>
+#include <linux/pram.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -118,26 +119,18 @@ static struct page *kimage_alloc_page(st
 				       gfp_t gfp_mask,
 				       unsigned long dest);
 
-static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
-	                    unsigned long nr_segments,
-                            struct kexec_segment __user *segments)
+static struct kimage *alloc_kimage(void)
 {
-	size_t segment_bytes;
 	struct kimage *image;
-	unsigned long i;
-	int result;
 
-	/* Allocate a controlling structure */
-	result = -ENOMEM;
 	image = kzalloc(sizeof(*image), GFP_KERNEL);
 	if (!image)
-		goto out;
+		return NULL;
 
 	image->head = 0;
 	image->entry = &image->head;
 	image->last_entry = &image->head;
 	image->control_page = ~0; /* By default this does not apply */
-	image->start = entry;
 	image->type = KEXEC_TYPE_DEFAULT;
 
 	/* Initialize the list of control pages */
@@ -149,6 +142,27 @@ static int do_kimage_alloc(struct kimage
 	/* Initialize the list of unuseable pages */
 	INIT_LIST_HEAD(&image->unuseable_pages);
 
+	return image;
+}
+
+static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
+	                    unsigned long nr_segments,
+                            struct kexec_segment __user *segments)
+{
+	size_t segment_bytes;
+	struct kimage *image;
+	unsigned long i;
+	int result;
+
+	/* Allocate a controlling structure */
+	result = -ENOMEM;
+	image = alloc_kimage();
+	if (!image)
+		goto out;
+
+	/* Set the entry point */
+	image->start = entry;
+
 	/* Read in the segments */
 	image->nr_segments = nr_segments;
 	segment_bytes = nr_segments * sizeof(*segments);
@@ -1245,7 +1259,6 @@ static int __init crash_notes_memory_ini
 	}
 	return 0;
 }
-module_init(crash_notes_memory_init)
 
 
 /*
@@ -1265,7 +1278,8 @@ module_init(crash_notes_memory_init)
 static int __init parse_crashkernel_mem(char 			*cmdline,
 					unsigned long long	system_ram,
 					unsigned long long	*crash_size,
-					unsigned long long	*crash_base)
+					unsigned long long	*crash_base,
+					int			*strict)
 {
 	char *cur = cmdline, *tmp;
 
@@ -1336,6 +1350,8 @@ static int __init parse_crashkernel_mem(
 						"after '@'\n");
 				return -EINVAL;
 			}
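+			/* An explicit '@base' without a trailing '+' pins
+			 * the region; report that through *strict (hedged:
+			 * '+' marks a relocatable base in this syntax). */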
+			if (strict && *crash_base > 0 && *tmp != '+')
+				*strict = 1;
 		}
 	}
 
@@ -1351,7 +1367,8 @@ static int __init parse_crashkernel_mem(
  */
 static int __init parse_crashkernel_simple(char 		*cmdline,
 					   unsigned long long 	*crash_size,
-					   unsigned long long 	*crash_base)
+					   unsigned long long 	*crash_base,
+					   int			*strict)
 {
 	char *cur = cmdline;
 
@@ -1361,8 +1378,11 @@ static int __init parse_crashkernel_simp
 		return -EINVAL;
 	}
 
-	if (*cur == '@')
+	if (*cur == '@') {
 		*crash_base = memparse(cur+1, &cur);
+		if (strict && *crash_base > 0 && *cur != '+')
+			*strict = 1;
+	}
 
 	return 0;
 }
@@ -1411,14 +1431,18 @@ unsigned long long __init arch_default_c
 int __init parse_crashkernel(char 		 *cmdline,
 			     unsigned long long system_ram,
 			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
+			     unsigned long long *crash_base,
+			     int		*strict)
 {
 	char 	*p = cmdline, *ck_cmdline = NULL;
 	char	*first_colon, *first_space;
+	int	ret = 0;
 
 	BUG_ON(!crash_size || !crash_base);
 	*crash_size = 0;
 	*crash_base = 0;
+	if (strict)
+		*strict = 0;
 
 	/* find crashkernel and use the last one if there are more */
 	p = strstr(p, "crashkernel=");
@@ -1452,7 +1476,6 @@ int __init parse_crashkernel(char 		 *cm
 					strlen(cmdline) - (ck_cmdline + 4 - cmdline) + 1);
 				memcpy(ck_cmdline, tmp, len);
 			}
-			return 0;
 		} else {
 			/*
 			 * We can't reserve memory automatically,
@@ -1462,8 +1485,9 @@ int __init parse_crashkernel(char 		 *cm
 			memmove(ck_cmdline - 16, ck_cmdline,
 				strlen(cmdline) - (ck_cmdline - cmdline) + 1);
 			pr_warning("crashkernel=auto resulted in zero bytes of reserved memory.\n");
-			return -ENOMEM;
+			ret = -ENOMEM;
 		}
+		goto out;
 	}
 #endif
 	/*
@@ -1473,13 +1497,18 @@ int __init parse_crashkernel(char 		 *cm
 	first_colon = strchr(ck_cmdline, ':');
 	first_space = strchr(ck_cmdline, ' ');
 	if (first_colon && (!first_space || first_colon < first_space))
-		return parse_crashkernel_mem(ck_cmdline, system_ram,
-				crash_size, crash_base);
+		ret = parse_crashkernel_mem(ck_cmdline, system_ram,
+				crash_size, crash_base, strict);
 	else
-		return parse_crashkernel_simple(ck_cmdline, crash_size,
-				crash_base);
-
-	return 0;
+		ret = parse_crashkernel_simple(ck_cmdline, crash_size,
+				crash_base, strict);
+out:
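+	/* Never place the crash kernel below pram_low, on top of persistent
+	 * RAM preserved from the previous kernel; a relocated base is no
+	 * longer considered pinned. */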
+	if (ret == 0 && *crash_base < pram_low) {
+		*crash_base = pram_low;
+		if (strict)
+			*strict = 0;
+	}
+	return ret;
 }
 
 
@@ -1496,7 +1525,17 @@ static void update_vmcoreinfo_note(void)
 
 void crash_save_vmcoreinfo(void)
 {
-	vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
+	unsigned long time;
+
+	/*
+	 * If we panic early, timekeeping might not have been initialized
+	 * yet, resulting in get_seconds() returning 0. Userspace utilities
+	 * do not like a zero timestamp, so skip it in that case.
+	 */
+	time = get_seconds();
+	if (time > 0)
+		vmcoreinfo_append_str("CRASHTIME=%ld", time);
+
 	update_vmcoreinfo_note();
 }
 
@@ -1595,7 +1634,170 @@ static int __init crash_save_vmcoreinfo_
 	return 0;
 }
 
-module_init(crash_save_vmcoreinfo_init)
+static int __init crash_init(void)
+{
+	crash_notes_memory_init();
+	crash_save_vmcoreinfo_init();
+	return 0;
+}
+
+#ifdef CONFIG_KEXEC_REUSE_CRASH
+int kexec_reuse_crash = 1;
+
+struct kexec_pram_segment {
+	__u64	offset;
+	__u64	size;
+};
+
+struct kexec_pram_image {
+	__u64	start;
+	__u64	end;
+	__u64	entry_offset;
+	__u64	nr_segments;
+};
+
+#define KEXEC_CRASH_PRAM	"crash"
+
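+/*
+ * The loaded crash image is described by a header plus one record per
+ * segment and saved into a pram stream named "crash", so the next
+ * kernel can re-arm the same image in kexec_crash_image_reuse()
+ * without a reload from userspace.
+ */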
+static void kexec_crash_image_save(void)
+{
+	struct pram_stream stream;
+	struct kexec_pram_image pimage;
+	struct kexec_pram_segment psegment;
+	struct kimage *image;
+	unsigned long i;
+	int result;
+
+	if (!kexec_reuse_crash || !kexec_crash_image)
+		return;
+
+	image = kexec_crash_image;
+
+	result = pram_open(KEXEC_CRASH_PRAM, PRAM_WRITE, &stream);
+	if (result)
+		goto out;
+
+	pimage.start = __pa(crashk_res.start);
+	pimage.end = __pa(crashk_res.end);
+	pimage.entry_offset = image->start - crashk_res.start;
+	pimage.nr_segments = image->nr_segments;
+
+	result = -EIO;
+	if (pram_write(&stream, &pimage, sizeof(pimage)) != sizeof(pimage))
+		goto out_close_stream;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		psegment.offset = image->segment[i].mem - crashk_res.start;
+		psegment.size = image->segment[i].memsz;
+		if (pram_write(&stream, &psegment, sizeof(psegment)) !=
+				sizeof(psegment))
+			goto out_close_stream;
+	}
+
+	printk(KERN_INFO "Crash image saved\n");
+	result = 0;
+
+out_close_stream:
+	pram_close(&stream, result);
+out:
+	if (result)
+		printk(KERN_ERR "Could not save crash image: %d\n", result);
+}
+
+static int __init __kexec_crash_image_reuse(struct kimage *image)
+{
+	int result;
+
+	image->control_page = crashk_res.start;
+	image->type = KEXEC_TYPE_CRASH;
+
+	result = -ENOMEM;
+	image->control_code_page = kimage_alloc_control_pages(image,
+					get_order(KEXEC_CONTROL_PAGE_SIZE));
+	if (!image->control_code_page)
+		goto out;
+
+	result = machine_kexec_prepare(image);
+	if (result)
+		goto out;
+
+	kimage_terminate(image);
+
+	kexec_crash_image = image;
+	result = 0;
+out:
+	return result;
+}
+
+static void __init kexec_crash_image_reuse(void)
+{
+	struct pram_stream stream;
+	struct kexec_pram_image pimage;
+	struct kexec_pram_segment psegment;
+	struct kimage *image;
+	unsigned long i;
+	int result;
+
+	if (WARN_ON(kexec_crash_image))
+		return;
+
+	result = pram_open(KEXEC_CRASH_PRAM, PRAM_READ, &stream);
+	if (result)
+		goto out;
+
+	result = -EIO;
+	if (pram_read(&stream, &pimage, sizeof(pimage)) != sizeof(pimage))
+		goto out_close_stream;
+
+	result = -EINVAL;
+	if (pimage.start != __pa(crashk_res.start) ||
+	    pimage.end != __pa(crashk_res.end) ||
+	    pimage.nr_segments > KEXEC_SEGMENT_MAX)
+		goto out_close_stream;
+
+	result = -ENOMEM;
+	image = alloc_kimage();
+	if (!image)
+		goto out_close_stream;
+
+	image->start = crashk_res.start + pimage.entry_offset;
+	image->nr_segments = pimage.nr_segments;
+
+	result = -EIO;
+	for (i = 0; i < pimage.nr_segments; i++) {
+		if (pram_read(&stream, &psegment, sizeof(psegment)) !=
+				sizeof(psegment))
+			goto out_free_image;
+		image->segment[i].mem = crashk_res.start + psegment.offset;
+		image->segment[i].memsz = psegment.size;
+	}
+
+	result = __kexec_crash_image_reuse(image);
+	if (result == 0) {
+		printk(KERN_INFO "Using crash image from previous kernel\n");
+		goto out_close_stream;
+	}
+
+out_free_image:
+	kfree(image);
+out_close_stream:
+	pram_close(&stream, 0);
+out:
+	if (result && result != -ENOENT)
+		printk(KERN_ERR "Could not load crash image: %d\n", result);
+}
+
+void __init kexec_crash_init(void)
+{
+	crash_init();
+	kexec_crash_image_reuse();
+}
+#else
+module_init(crash_init)
+
+static inline void kexec_crash_image_save(void)
+{
+}
+#endif /* CONFIG_KEXEC_REUSE_CRASH */
 
 /*
  * Move into place and start executing a preloaded standalone
@@ -1646,6 +1848,7 @@ int kernel_kexec(void)
 	} else
 #endif
 	{
+		kexec_crash_image_save();
 		kernel_restart_prepare(NULL);
 		printk(KERN_EMERG "Starting new kernel\n");
 		machine_shutdown();
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/kgdb.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/kgdb.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/kgdb.c	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/kgdb.c	2015-01-21 12:02:43.794211240 +0300
@@ -1019,7 +1019,7 @@ static void gdb_cmd_query(struct kgdb_st
 			}
 		}
 
-		do_each_thread(g, p) {
+		do_each_thread_all(g, p) {
 			if (i >= ks->thr_query && !finished) {
 				int_to_threadref(thref, p->pid);
 				pack_threadid(ptr, thref);
@@ -1030,7 +1030,7 @@ static void gdb_cmd_query(struct kgdb_st
 					finished = 1;
 			}
 			i++;
-		} while_each_thread(g, p);
+		} while_each_thread_all(g, p);
 
 		*(--ptr) = '\0';
 		break;
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/kmod.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/kmod.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/kmod.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/kmod.c	2015-01-21 12:02:45.864156285 +0300
@@ -36,13 +36,15 @@
 #include <linux/resource.h>
 #include <linux/notifier.h>
 #include <linux/suspend.h>
+#include <linux/netfilter.h>
+#include <net/net_namespace.h>
 #include <asm/uaccess.h>
 
 #include <trace/events/module.h>
 
 extern int max_threads;
 
-static struct workqueue_struct *khelper_wq;
+struct workqueue_struct *khelper_wq;
 
 #define CAP_BSET	(void *)1
 #define CAP_PI		(void *)2
@@ -60,11 +62,11 @@ char modprobe_path[KMOD_PATH_LEN] = "/sb
 
 static void free_modprobe_argv(struct subprocess_info *info)
 {
-	kfree(info->argv[3]); /* check call_modprobe() */
+	kfree(info->argv[4]); /* check call_modprobe() */
 	kfree(info->argv);
 }
 
-static int call_modprobe(char *module_name, int wait)
+static int call_modprobe(char *module_name, int wait, int blacklist)
 {
 	static char *envp[] = {
 		"HOME=/",
@@ -73,7 +75,7 @@ static int call_modprobe(char *module_na
 		NULL
 	};
 
-	char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
+	char **argv = kmalloc(sizeof(char *[6]), GFP_KERNEL);
 	if (!argv)
 		goto out;
 
@@ -83,9 +85,13 @@ static int call_modprobe(char *module_na
 
 	argv[0] = modprobe_path;
 	argv[1] = "-q";
-	argv[2] = "--";
-	argv[3] = module_name;	/* check free_modprobe_argv() */
-	argv[4] = NULL;
+	if (blacklist)
+		argv[2] = "-b";
+	else
+		argv[2] = "-q"; /* just repeat argv[1] */
+	argv[3] = "--";
+	argv[4] = module_name;	/* check free_modprobe_argv() */
+	argv[5] = NULL;
 
 	return call_usermodehelper_fns(modprobe_path, argv, envp,
 		wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
@@ -96,10 +102,10 @@ out:
 }
 
 /**
- * __request_module - try to load a kernel module
+ * ___request_module - try to load a kernel module
  * @wait: wait (or not) for the operation to complete
- * @fmt: printf style format string for the name of the module
- * @...: arguments as specified in the format string
+ * @blacklist: tell modprobe (via -b) to honour module blacklist entries
+ * @module_name: name of the requested module
  *
  * Load a module using the user mode module loader. The function returns
  * zero on success or a negative errno code on failure. Note that a
@@ -110,21 +116,17 @@ out:
  * If module auto-loading support is disabled then this function
  * becomes a no-operation.
  */
-int __request_module(bool wait, const char *fmt, ...)
+static int ___request_module(bool wait, bool blacklist, char *module_name)
 {
-	va_list args;
-	char module_name[MODULE_NAME_LEN];
 	unsigned int max_modprobes;
 	int ret;
 	static atomic_t kmod_concurrent = ATOMIC_INIT(0);
 #define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */
 	static int kmod_loop_msg;
 
-	va_start(args, fmt);
-	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
-	va_end(args);
-	if (ret >= MODULE_NAME_LEN)
-		return -ENAMETOOLONG;
+	/* Don't allow request_module() from inside a VE. */
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
 
 	ret = security_kernel_module_request(module_name);
 	if (ret)
@@ -156,12 +158,192 @@ int __request_module(bool wait, const ch
 
 	trace_module_request(module_name, wait, _RET_IP_);
 
-	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
+	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, blacklist);
 
 	atomic_dec(&kmod_concurrent);
 	return ret;
 }
+
+int __request_module(bool wait, const char *fmt, ...)
+{
+	char module_name[MODULE_NAME_LEN];
+	va_list args;
+	int ret;
+
+	va_start(args, fmt);
+	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+	va_end(args);
+
+	if (ret >= MODULE_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	return ___request_module(wait, false, module_name);
+}
 EXPORT_SYMBOL(__request_module);
+
+#ifdef CONFIG_VE_IPTABLES
+
+/*
+ * Modules whose functionality may be used inside a container, together
+ * with the iptables permission bits the VE must hold for each of them.
+ */
+static struct {
+	const char *name;
+	u64 perm;
+} ve0_am[] = {
+	{ "ip_tables",		VE_IP_IPTABLES	},
+	{ "ip6_tables",		VE_IP_IPTABLES6	},
+	{ "iptable_filter",	VE_IP_FILTER	},
+	{ "iptable_raw",	VE_IP_IPTABLES	},
+	{ "iptable_nat",	VE_IP_NAT	},
+	{ "iptable_mangle",	VE_IP_MANGLE	},
+	{ "ip6table_filter",	VE_IP_FILTER6	},
+	{ "ip6table_mangle",	VE_IP_MANGLE6	},
+
+	{ "xt_CONNMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_CONNSECMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_NOTRACK",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_cluster",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_connbytes",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_connlimit",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_connmark",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_helper",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_state",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_socket",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+
+	{ "ipt_CLUSTERIP",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_CONNMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_CONNSECMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_NOTRACK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_cluster",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_connbytes",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_connlimit",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_connmark",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_helper",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_state",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_socket",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "ipt_MASQUERADE",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "ipt_NETMAP",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "ipt_REDIRECT",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+
+	{ "ip6t_CONNMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_CONNSECMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_NOTRACK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_cluster",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_connbytes",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_connlimit",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_connmark",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_helper",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_state",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_socket",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "nf-nat-ipv4",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "nf-nat",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "nf_conntrack-2",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "nf_conntrack_ipv4",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "nf_conntrack-10",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "nf_conntrack_ipv6",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+};
+
+/*
+ * module_payload_allowed - check if module functionality is allowed
+ * 			    to be used inside the current virtual environment.
+ *
+ * Returns true if it is allowed or we're in ve0, false otherwise.
+ */
+bool module_payload_allowed(const char *module)
+{
+	u64 permitted = get_exec_env()->ipt_mask;
+	int i;
+
+	if (ve_is_super(get_exec_env()))
+		return true;
+
+	/* Look for full module name in ve0_am table */
+	for (i = 0; i < ARRAY_SIZE(ve0_am); i++) {
+		if (!strcmp(ve0_am[i].name, module))
+			return mask_ipt_allow(permitted, ve0_am[i].perm);
+	}
+
+	/* ts_* algorithms are required for xt_string module */
+	if (!strcmp("ts_bm", module) || !strcmp("ts_fsm", module) ||
+	    !strcmp("ts_kmp", module))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES) ||
+		       mask_ipt_allow(permitted, VE_IP_IPTABLES6);
+
+	/* The rest of the xt_* modules are allowed in both ipv4 and ipv6 modes */
+	if (!strncmp("xt_", module, 3))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES) ||
+		       mask_ipt_allow(permitted, VE_IP_IPTABLES6);
+
+	/* The rest of ipt_* modules */
+	if (!strncmp("ipt_", module, 4))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES);
+
+	/* The rest of ip6t_* modules */
+	if (!strncmp("ip6t_", module, 5))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES6);
+
+	/* The rest of arpt_* modules */
+	if (!strncmp("arpt_", module, 5))
+		return true;
+
+	/* The rest of ebt_* modules */
+	if (!strncmp("ebt_", module, 4))
+		return true;
+
+	return false;
+}
+#endif /* CONFIG_VE_IPTABLES */
+
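+/*
+ * Request a module on behalf of the calling container: check that
+ * autoloading and the requested functionality are permitted, then run
+ * modprobe in the host (ve0) context.
+ */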
+int ve0_request_module(const char *name, ...)
+{
+	char module_name[MODULE_NAME_LEN];
+	struct ve_struct *old;
+	int blacklist, ret;
+	va_list args;
+
+	va_start(args, name);
+	ret = vsnprintf(module_name, MODULE_NAME_LEN, name, args);
+	va_end(args);
+
+	if (ret >= MODULE_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	/* Check that autoloading is not prohibited via the /proc interface */
+	if (!ve_is_super(get_exec_env()) &&
+	    !ve_allow_module_load)
+		return -EPERM;
+
+	/* Check that module functionality is permitted */
+	if (!module_payload_allowed(module_name))
+		return -EPERM;
+
+	old = set_exec_env(get_ve0());
+
+	/*
+	 * This function may also be called from ve0, where the standard
+	 * behaviour is not to use the blacklist, so we ask modprobe to
+	 * honour it only when called from inside a container.
+	 */
+	blacklist = (old != get_ve0());
+
+	ret = ___request_module(true, blacklist, module_name);
+
+	set_exec_env(old);
+
+	return ret;
+}
+EXPORT_SYMBOL(ve0_request_module);
+
 #endif /* CONFIG_MODULES */
 
 /*
@@ -170,11 +352,9 @@ EXPORT_SYMBOL(__request_module);
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
-	struct cred *cred = sub_info->cred;
+	struct cred *new;
 	int retval;
 
-	BUG_ON(atomic_read(&cred->usage) != 1);
-
 	/* Unblock all signals */
 	spin_lock_irq(&current->sighand->siglock);
 	flush_signal_handlers(current, 1);
@@ -182,31 +362,36 @@ static int ____call_usermodehelper(void 
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
-	spin_lock(&umh_sysctl_lock);
-	cred->cap_bset = cap_intersect(usermodehelper_bset, cred->cap_bset);
-	cred->cap_inheritable = cap_intersect(usermodehelper_inheritable,
-					      cred->cap_inheritable);
-	spin_unlock(&umh_sysctl_lock);
-
-	/* Install the credentials */
-	commit_creds(sub_info->cred);
-	sub_info->cred = NULL;
-
 	/* We can run anywhere, unlike our parent keventd(). */
 	set_cpus_allowed_ptr(current, cpu_all_mask);
 
-	if (sub_info->init) {
-		retval = sub_info->init(sub_info);
-		if (retval)
-			goto fail;
-	}
-
 	/*
 	 * Our parent is keventd, which runs with elevated scheduling priority.
 	 * Avoid propagating that into the userspace child.
 	 */
 	set_user_nice(current, 0);
 
+	retval = -ENOMEM;
+	new = prepare_kernel_cred(current);
+	if (!new)
+		goto fail;
+
+	spin_lock(&umh_sysctl_lock);
+	new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+	new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+					     new->cap_inheritable);
+	spin_unlock(&umh_sysctl_lock);
+
+	if (sub_info->init) {
+		retval = sub_info->init(sub_info, new);
+		if (retval) {
+			abort_creds(new);
+			goto fail;
+		}
+	}
+
+	commit_creds(new);
+
 	retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
 
 	/* Exec failed? */
@@ -219,8 +404,6 @@ void call_usermodehelper_freeinfo(struct
 {
 	if (info->cleanup)
 		(*info->cleanup)(info);
-	if (info->cred)
-		put_cred(info->cred);
 	kfree(info);
 }
 EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -286,8 +469,6 @@ static void __call_usermodehelper(struct
 	enum umh_wait wait = sub_info->wait;
 	pid_t pid;
 
-	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
-
 	if (wait != UMH_NO_WAIT)
 		wait &= ~UMH_KILLABLE;
 
@@ -414,36 +595,12 @@ struct subprocess_info *call_usermodehel
 	sub_info->path = path;
 	sub_info->argv = argv;
 	sub_info->envp = envp;
-	sub_info->cred = prepare_usermodehelper_creds();
-	if (!sub_info->cred) {
-		kfree(sub_info);
-		return NULL;
-	}
-
   out:
 	return sub_info;
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
 /**
- * call_usermodehelper_setkeys - set the session keys for usermode helper
- * @info: a subprocess_info returned by call_usermodehelper_setup
- * @session_keyring: the session keyring for the process
- */
-void call_usermodehelper_setkeys(struct subprocess_info *info,
-				 struct key *session_keyring)
-{
-#ifdef CONFIG_KEYS
-	struct thread_group_cred *tgcred = info->cred->tgcred;
-	key_put(tgcred->session_keyring);
-	tgcred->session_keyring = key_get(session_keyring);
-#else
-	BUG();
-#endif
-}
-EXPORT_SYMBOL(call_usermodehelper_setkeys);
-
-/**
  * call_usermodehelper_setfns - set a cleanup/init function
  * @info: a subprocess_info returned by call_usermodehelper_setup
  * @cleanup: a cleanup function
@@ -460,7 +617,7 @@ EXPORT_SYMBOL(call_usermodehelper_setkey
  * context in which call_usermodehelper_exec is called.
  */
 void call_usermodehelper_setfns(struct subprocess_info *info,
-		    int (*init)(struct subprocess_info *info),
+		    int (*init)(struct subprocess_info *info, struct cred *new),
 		    void (*cleanup)(struct subprocess_info *info),
 		    void *data)
 {
@@ -482,14 +639,16 @@ EXPORT_SYMBOL(call_usermodehelper_setfns
  * asynchronously if wait is not set, and runs as a child of keventd.
  * (ie. it runs with full root capabilities).
  */
-int call_usermodehelper_exec(struct subprocess_info *sub_info,
-			     enum umh_wait wait)
+int call_usermodehelper_exec_wq(struct subprocess_info *sub_info,
+				enum umh_wait wait,
+				struct workqueue_struct *khelper_wq)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
 
-	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
-	validate_creds(sub_info->cred);
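+	/* A container may only run helpers on its own khelper workqueue;
+	 * using the host queue from inside a VE is refused. */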
+	if (!ve_is_super(get_exec_env()) &&
+	    khelper_wq != get_exec_env()->khelper_wq)
+		return -EPERM;
 
 	helper_lock();
 	if (sub_info->path[0] == '\0')
@@ -527,8 +686,43 @@ unlock:
 	helper_unlock();
 	return retval;
 }
+EXPORT_SYMBOL(call_usermodehelper_exec_wq);
+
+int call_usermodehelper_exec(struct subprocess_info *sub_info,
+			     enum umh_wait wait)
+{
+	return call_usermodehelper_exec_wq(sub_info, wait, khelper_wq);
+}
 EXPORT_SYMBOL(call_usermodehelper_exec);
 
+int
+call_usermodehelper_fns_wq(char *path, char **argv, char **envp,
+			enum umh_wait wait,
+			int (*init)(struct subprocess_info *info, struct cred *),
+			void (*cleanup)(struct subprocess_info *), void *data,
+			struct workqueue_struct *khelper_wq)
+{
+	struct subprocess_info *info;
+	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+
+	info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
+	if (info == NULL)
+		return -ENOMEM;
+	call_usermodehelper_setfns(info, init, cleanup, data);
+	return call_usermodehelper_exec_wq(info, wait, khelper_wq);
+}
+EXPORT_SYMBOL(call_usermodehelper_fns_wq);
+
+int
+call_usermodehelper_fns(char *path, char **argv, char **envp,
+			enum umh_wait wait,
+			int (*init)(struct subprocess_info *info, struct cred *),
+			void (*cleanup)(struct subprocess_info *), void *data)
+{
+	return call_usermodehelper_fns_wq(path, argv, envp, wait, init,
+					cleanup, data, khelper_wq);
+}
+EXPORT_SYMBOL(call_usermodehelper_fns);
 
 static int proc_cap_handler(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -538,7 +732,8 @@ static int proc_cap_handler(struct ctl_t
 	kernel_cap_t new_cap;
 	int err, i;
 
-	if (write && !capable(CAP_SYS_MODULE))
+	if (write && (!capable(CAP_SETPCAP) ||
+		      !capable(CAP_SYS_MODULE)))
 		return -EPERM;
 
 	/*
@@ -610,5 +805,6 @@ struct ctl_table usermodehelper_table[] 
 void __init usermodehelper_init(void)
 {
 	khelper_wq = create_singlethread_workqueue("khelper");
+	ve0.khelper_wq = khelper_wq;
 	BUG_ON(!khelper_wq);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ksysfs.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ksysfs.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ksysfs.c	2014-12-12 23:28:56.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ksysfs.c	2015-01-21 12:02:44.818184053 +0300
@@ -16,6 +16,7 @@
 #include <linux/kexec.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
+#include <linux/capability.h>
 
 #define KERNEL_ATTR_RO(_name) \
 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -29,7 +30,7 @@ static struct kobj_attribute _name##_att
 static ssize_t uevent_seqnum_show(struct kobject *kobj,
 				  struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+	return sprintf(buf, "%llu\n", (unsigned long long)ve_uevent_seqnum);
 }
 KERNEL_ATTR_RO(uevent_seqnum);
 
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
 
 #endif /* CONFIG_KEXEC */
 
+/* whether file capabilities are enabled */
+static ssize_t fscaps_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", file_caps_enabled);
+}
+KERNEL_ATTR_RO(fscaps);
+
 /*
  * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
  */
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
 EXPORT_SYMBOL_GPL(kernel_kobj);
 
 static struct attribute * kernel_attrs[] = {
+	&fscaps_attr.attr,
 #if defined(CONFIG_HOTPLUG)
 	&uevent_seqnum_attr.attr,
 	&uevent_helper_attr.attr,
@@ -174,22 +184,61 @@ static struct attribute * kernel_attrs[]
 	NULL
 };
 
+static struct attribute * kernel_ve_attrs[] = {
+	&fscaps_attr.attr,
+#if defined(CONFIG_HOTPLUG)
+	&uevent_seqnum_attr.attr,
+#endif
+	NULL
+};
+
 static struct attribute_group kernel_attr_group = {
 	.attrs = kernel_attrs,
 };
 
-static int __init ksysfs_init(void)
+static struct attribute_group kernel_ve_attr_group = {
+	.attrs = kernel_ve_attrs,
+};
+
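+/*
+ * Create /sys/kernel for a VE: the host (ve0) gets the full attribute
+ * set, while containers only see the reduced kernel_ve_attrs group.
+ */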
+int ksysfs_init_ve(struct ve_struct *ve, struct kobject **kernel_obj)
 {
-	int error;
+	struct attribute_group *k_grp;
+	int err;
 
-	kernel_kobj = kobject_create_and_add("kernel", NULL);
-	if (!kernel_kobj) {
-		error = -ENOMEM;
-		goto exit;
+	if (!ve || ve_is_super(ve))
+		k_grp = &kernel_attr_group;
+	else
+		k_grp = &kernel_ve_attr_group;
+
+	*kernel_obj = kobject_create_and_add("kernel", NULL);
+	if (!*kernel_obj)
+		return -ENOMEM;
+
+	err = sysfs_create_group(*kernel_obj, k_grp);
+
+	if (err) {
+		kobject_put(*kernel_obj);
+		*kernel_obj = NULL;
 	}
-	error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(ksysfs_init_ve);
+
+void ksysfs_fini_ve(struct ve_struct *ve, struct kobject **kernel_obj)
+{
+	sysfs_remove_group(*kernel_obj, &kernel_ve_attr_group);
+	kobject_put(*kernel_obj);
+	*kernel_obj = NULL;
+}
+EXPORT_SYMBOL_GPL(ksysfs_fini_ve);
+
+static int __init ksysfs_init(void)
+{
+	int error = ksysfs_init_ve(NULL, &kernel_kobj);
+
 	if (error)
-		goto kset_exit;
+		return error;
 
 	if (notes_size > 0) {
 		notes_attr.size = notes_size;
@@ -209,10 +258,7 @@ notes_exit:
 	if (notes_size > 0)
 		sysfs_remove_bin_file(kernel_kobj, &notes_attr);
 group_exit:
-	sysfs_remove_group(kernel_kobj, &kernel_attr_group);
-kset_exit:
-	kobject_put(kernel_kobj);
-exit:
+	ksysfs_fini_ve(NULL, &kernel_kobj);
 	return error;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/kthread.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/kthread.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/kthread.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/kthread.c	2015-01-21 12:02:45.880155859 +0300
@@ -16,11 +16,18 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/freezer.h>
+#include <linux/nsproxy.h>
+#include <linux/syscalls.h>
 #include <trace/events/sched.h>
+#include <asm/uaccess.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
+#ifdef CONFIG_VE
+#define kthread_create_list get_exec_env()->_kthread_create_list
+#else
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
+#endif
 
 struct kthread_create_info
 {
@@ -36,6 +43,12 @@ struct kthread_create_info
 	struct list_head list;
 };
 
+struct kthreadd_create_info
+{
+	struct completion done;
+	struct task_struct *result;
+};
+
 struct kthread {
 	int should_stop;
 	struct completion exited;
@@ -77,6 +90,7 @@ static int kthread(void *_create)
 		kfree(create);
 		do_exit(-EINTR);
 	}
+
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	create->result = current;
@@ -86,7 +100,6 @@ static int kthread(void *_create)
 	ret = -EINTR;
 	if (!self.should_stop)
 		ret = threadfn(data);
-
 	/* we can't just return, we must preserve "self" on stack */
 	do_exit(ret);
 }
@@ -146,15 +159,19 @@ static void create_kthread(struct kthrea
  *
  * Returns a task_struct or ERR_PTR(-ENOMEM).
  */
-struct task_struct *va_kthread_create_on_node(int (*threadfn)(void *data),
+struct task_struct *va_kthread_create_on_node(struct ve_struct *ve,
+					      int (*threadfn)(void *data),
 					      void *data, int node,
 					      const char *fmt, va_list args)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct task_struct *task;
-	struct kthread_create_info *create = kmalloc(sizeof(*create),
-						     GFP_KERNEL);
+	struct kthread_create_info *create;
+	struct ve_struct *old_ve;
+
+	old_ve = set_exec_env(ve);
 
+	create = kmalloc(sizeof(*create), GFP_KERNEL);
-	if (!create)
-		return ERR_PTR(-ENOMEM);
+	if (!create) {
+		set_exec_env(old_ve);
+		return ERR_PTR(-ENOMEM);
+	}
 	create->threadfn = threadfn;
@@ -199,6 +216,8 @@ struct task_struct *va_kthread_create_on
 		set_cpus_allowed_ptr(task, cpu_all_mask);
 	}
 	kfree(create);
+	set_exec_env(old_ve);
+
 	return task;
 }
 
@@ -234,7 +253,7 @@ struct task_struct *kthread_create_on_no
 	struct task_struct *result;
 
 	va_start(args, namefmt);
-	result = va_kthread_create_on_node(threadfn, data,
+	result = va_kthread_create_on_node(get_ve0(), threadfn, data,
 					   node, namefmt, args);
 	va_end(args);
 	return result;
@@ -242,7 +261,7 @@ struct task_struct *kthread_create_on_no
 EXPORT_SYMBOL(kthread_create_on_node);
 
 /**
- * kthread_create - create a kthread.
+ * kthread_create_ve - create a kthread.
  * @threadfn: the function to run until signal_pending(current).
  * @data: data ptr for @threadfn.
  * @namefmt: printf-style name for the thread.
@@ -260,7 +279,8 @@ EXPORT_SYMBOL(kthread_create_on_node);
  *
  * Returns a task_struct or ERR_PTR(-ENOMEM).
  */
-struct task_struct *kthread_create(int (*threadfn)(void *data),
+struct task_struct *kthread_create_ve(struct ve_struct *ve,
+				   int (*threadfn)(void *data),
 				   void *data,
 				   const char namefmt[],
 				   ...)
@@ -269,11 +289,11 @@ struct task_struct *kthread_create(int (
 	struct task_struct *result;
 
 	va_start(args, namefmt);
-	result = va_kthread_create_on_node(threadfn, data, -1, namefmt, args);
+	result = va_kthread_create_on_node(ve, threadfn, data, -1, namefmt, args);
 	va_end(args);
 	return result;
 }
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(kthread_create_ve);
 
 /**
  * kthread_stop - stop a thread created by kthread_create().
@@ -314,22 +334,44 @@ int kthread_stop(struct task_struct *k)
 }
 EXPORT_SYMBOL(kthread_stop);
 
-int kthreadd(void *unused)
+int kthreadd(void *data)
 {
 	struct task_struct *tsk = current;
+	struct kthreadd_create_info *kcreate;
+	struct kthread self;
+	int rc;
+
+	self.should_stop = 0;
+
+	kcreate = (struct kthreadd_create_info *) data;
+
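+	/* A non-NULL kcreate means this is a per-VE kthreadd spawned via
+	 * kthreadd_create(); daemonize and report back through the
+	 * completion. NULL means the boot-time host kthreadd. */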
+	if (kcreate) {
+		daemonize("kthreadd/%d", get_exec_env()->veid);
+		kcreate->result = current;
+		set_fs(KERNEL_DS);
+		init_completion(&self.exited);
+		current->vfork_done = &self.exited;
+	} else
+		set_task_comm(tsk, "kthreadd");
 
 	/* Setup a clean context for our children to inherit. */
-	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
 	set_cpus_allowed_ptr(tsk, cpu_all_mask);
 	set_mems_allowed(node_states[N_HIGH_MEMORY]);
 
 	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
+	if (kcreate)
+		complete(&kcreate->done);
+
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (list_empty(&kthread_create_list))
-			schedule();
+		if (list_empty(&kthread_create_list)) {
+			if (self.should_stop)
+				break;
+			else
+				schedule();
+		}
 		__set_current_state(TASK_RUNNING);
 
 		spin_lock(&kthread_create_lock);
@@ -348,8 +390,59 @@ int kthreadd(void *unused)
 		spin_unlock(&kthread_create_lock);
 	}
 
+	do {
+		clear_thread_flag(TIF_SIGPENDING);
+		rc = sys_wait4(-1, NULL, __WALL, NULL);
+	} while (rc != -ECHILD);
+
+	do_exit(0);
+}
+
+int kthreadd_create(void)
+{
+	struct kthreadd_create_info create;
+	int ret;
+	struct ve_struct *ve = get_exec_env();
+
+	BUG_ON(ve->_kthreadd_task);
+
+	INIT_LIST_HEAD(&ve->_kthread_create_list);
+	init_completion(&create.done);
+	ret = kernel_thread(kthreadd, (void *) &create, CLONE_FS);
+	if (ret < 0)
+		return ret;
+	wait_for_completion(&create.done);
+	ve->_kthreadd_task = create.result;
 	return 0;
 }
+EXPORT_SYMBOL(kthreadd_create);
+
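+/*
+ * Stop a container's kthreadd. Mirrors kthread_stop(): the per-VE
+ * kthreadd parked a struct kthread completion in ->vfork_done, so we
+ * can set should_stop and wait for it to exit.
+ */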
+void kthreadd_stop(struct ve_struct *ve)
+{
+	struct kthread *kthread;
+	int ret;
+	struct task_struct *k;
+
+	if (!ve->_kthreadd_task)
+		return;
+
+	k = ve->_kthreadd_task;
+	trace_sched_kthread_stop(k);
+	get_task_struct(k);
+
+	BUG_ON(!k->vfork_done);
+
+	kthread = container_of(k->vfork_done, struct kthread, exited);
+	kthread->should_stop = 1;
+	wake_up_process(k);
+	wait_for_completion(&kthread->exited);
+	ret = k->exit_code;
+
+	put_task_struct(k);
+	trace_sched_kthread_stop_ret(ret);
+}
+EXPORT_SYMBOL(kthreadd_stop);
 
 void __init_kthread_worker(struct kthread_worker *worker,
 				const char *name,
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/lockdep.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/lockdep.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/lockdep.c	2014-12-12 23:29:03.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/lockdep.c	2015-01-21 12:02:58.310825876 +0300
@@ -2705,7 +2705,7 @@ void lockdep_init_map(struct lockdep_map
 	if (subclass)
 		register_lock_class(lock, subclass, 1);
 }
-EXPORT_SYMBOL_GPL(lockdep_init_map);
+EXPORT_SYMBOL(lockdep_init_map);
 
 /*
  * This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3216,7 +3216,7 @@ void lock_acquire(struct lockdep_map *lo
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(lock_acquire);
+EXPORT_SYMBOL(lock_acquire);
 
 void lock_release(struct lockdep_map *lock, int nested,
 			  unsigned long ip)
@@ -3235,7 +3235,7 @@ void lock_release(struct lockdep_map *lo
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(lock_release);
+EXPORT_SYMBOL(lock_release);
 
 int lock_is_held(struct lockdep_map *lock)
 {
@@ -3742,7 +3742,7 @@ retry:
 			printk(KERN_CONT " locked it.\n");
 	}
 
-	do_each_thread(g, p) {
+	do_each_thread_all(g, p) {
 		/*
 		 * It's not reliable to print a task's held locks
 		 * if it's not sleeping (or if it's not the current
@@ -3755,7 +3755,7 @@ retry:
 		if (!unlock)
 			if (read_trylock(&tasklist_lock))
 				unlock = 1;
-	} while_each_thread(g, p);
+	} while_each_thread_all(g, p);
 
 	printk("\n");
 	printk("=============================================\n\n");
@@ -3799,4 +3799,14 @@ void lockdep_sys_exit(void)
 				curr->comm, curr->pid);
 		lockdep_print_held_locks(curr);
 	}
+	if (unlikely(curr->transaction_info || curr->trans_count)) {
+		printk("\n================================================\n");
+		printk(  "[ BUG: transaction held when returning to user space! ]\n");
+		printk(  "------------------------------------------------\n");
+	printk("%s/%d is leaving the kernel with a transaction still held!\n",
+				curr->comm, curr->pid);
+	printk("trans_count is %u, transaction_info is %p\n",
+	       curr->trans_count, curr->transaction_info);
+	}
+
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/module-verify-sig.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/module-verify-sig.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/module-verify-sig.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/module-verify-sig.c	2015-01-21 12:02:41.358275915 +0300
@@ -88,6 +88,15 @@ static int __init sign_setup(char *str)
 }
 __setup("enforcemodulesig", sign_setup);
 
+static int badsigok;
+
+static int __init setup_badsigok(char *str)
+{
+	badsigok = 1;
+	return 1;
+}
+__setup("badsigok", setup_badsigok);
+
 static const char modsign_note_name[] = ELFNOTE_NAME(MODSIGN_NOTE_NAME);
 static const char modsign_note_section[] = ELFNOTE_SECTION(MODSIGN_NOTE_NAME);
 
@@ -285,6 +294,11 @@ int module_verify_signature(struct modul
 		break;
 	}
 
+	if (ret && badsigok) {
+		printk(KERN_ERR "Bad signature ignored by cmdline\n");
+		ret = 0;
+	}
+
 	return ret;
 
 format_error:
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/module.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/module.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/module.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/module.c	2015-01-21 12:02:58.443822344 +0300
@@ -631,8 +631,6 @@ static void module_unload_init(struct mo
 		local_set(__module_ref_addr(mod, cpu), 0);
 	/* Hold reference count during initialization. */
 	local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
-	/* Backwards compatibility macros put refcount during init. */
-	mod->waiter = current;
 }
 
 /* modules using other modules */
@@ -755,16 +753,9 @@ static int __try_stop_module(void *_sref
 
 static int try_stop_module(struct module *mod, int flags, int *forced)
 {
-	if (flags & O_NONBLOCK) {
-		struct stopref sref = { mod, flags, forced };
+	struct stopref sref = { mod, flags, forced };
 
-		return stop_machine(__try_stop_module, &sref, NULL);
-	} else {
-		/* We don't need to stop the machine for this. */
-		mod->state = MODULE_STATE_GOING;
-		synchronize_sched();
-		return 0;
-	}
+	return stop_machine(__try_stop_module, &sref, NULL);
 }
 
 unsigned int module_refcount(struct module *mod)
@@ -781,21 +772,6 @@ EXPORT_SYMBOL(module_refcount);
 /* This exists whether we can unload or not */
 static void free_module(struct module *mod);
 
-static void wait_for_zero_refcount(struct module *mod)
-{
-	/* Since we might sleep for some time, release the mutex first */
-	mutex_unlock(&module_mutex);
-	for (;;) {
-		DEBUGP("Looking at refcount...\n");
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (module_refcount(mod) == 0)
-			break;
-		schedule();
-	}
-	current->state = TASK_RUNNING;
-	mutex_lock(&module_mutex);
-}
-
 SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		unsigned int, flags)
 {
@@ -810,6 +786,11 @@ SYSCALL_DEFINE2(delete_module, const cha
 		return -EFAULT;
 	name[MODULE_NAME_LEN-1] = '\0';
 
+	if (!(flags & O_NONBLOCK)) {
+		printk(KERN_WARNING
+		       "waiting module removal not supported: please upgrade\n");
+	}
+
 	if (mutex_lock_interruptible(&module_mutex) != 0)
 		return -EINTR;
 
@@ -827,8 +808,7 @@ SYSCALL_DEFINE2(delete_module, const cha
 
 	/* Doing init or already dying? */
 	if (mod->state != MODULE_STATE_LIVE) {
-		/* FIXME: if (force), slam module count and wake up
-                   waiter --RR */
+		/* FIXME: if (force), slam module count damn the torpedoes */
 		DEBUGP("%s already dying\n", mod->name);
 		ret = -EBUSY;
 		goto out;
@@ -844,18 +824,11 @@ SYSCALL_DEFINE2(delete_module, const cha
 		}
 	}
 
-	/* Set this up before setting mod->state */
-	mod->waiter = current;
-
 	/* Stop the machine so refcounts can't move and disable module. */
 	ret = try_stop_module(mod, flags, &forced);
 	if (ret != 0)
 		goto out;
 
-	/* Never wait if forced. */
-	if (!forced && module_refcount(mod) != 0)
-		wait_for_zero_refcount(mod);
-
 	mutex_unlock(&module_mutex);
 	/* Final destruction now no one is using it. */
 	if (mod->exit != NULL)
@@ -943,9 +916,6 @@ void module_put(struct module *module)
 		local_dec(__module_ref_addr(module, cpu));
 		trace_module_put(module, _RET_IP_,
 				 local_read(__module_ref_addr(module, cpu)));
-		/* Maybe they're waiting for us to drop reference? */
-		if (unlikely(!module_is_live(module)))
-			wake_up_process(module->waiter);
 		put_cpu();
 	}
 }
@@ -2068,9 +2038,7 @@ static void kmemleak_load_module(struct 
 	unsigned int i;
 
 	/* only scan the sections containing data */
-	kmemleak_scan_area(mod->module_core, (unsigned long)mod -
-			   (unsigned long)mod->module_core,
-			   sizeof(struct module), GFP_KERNEL);
+	kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
 
 	for (i = 1; i < hdr->e_shnum; i++) {
 		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -2079,8 +2047,7 @@ static void kmemleak_load_module(struct 
 		    && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
 			continue;
 
-		kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
-				   (unsigned long)mod->module_core,
+		kmemleak_scan_area((void *)sechdrs[i].sh_addr,
 				   sechdrs[i].sh_size, GFP_KERNEL);
 	}
 }
@@ -2415,6 +2382,12 @@ static noinline struct module *load_modu
 					 "_ftrace_events",
 					 sizeof(*mod->trace_events),
 					 &mod->num_trace_events);
+	/*
+	 * This section contains pointers to allocated objects in the trace
+	 * code and not scanning it leads to false positives.
+	 */
+	kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
+			   mod->num_trace_events, GFP_KERNEL);
 #endif
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 	/* sechdrs[0].sh_size is always zero */
@@ -2928,6 +2901,8 @@ static char *module_flags(struct module 
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	mutex_lock(&module_mutex);
+	if (!ve_is_super(get_exec_env()))
+		return NULL;
 	return seq_list_start(&modules, *pos);
 }
 
@@ -2992,7 +2967,7 @@ static const struct file_operations proc
 
 static int __init proc_modules_init(void)
 {
-	proc_create("modules", 0, NULL, &proc_modules_operations);
+	proc_create("modules", 0, &glob_proc_root, &proc_modules_operations);
 	return 0;
 }
 module_init(proc_modules_init);
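
The delete_module() change above drops the blocking wait_for_zero_refcount() path, so O_NONBLOCK removal is the only supported mode. From user space the call is a raw syscall (glibc has no wrapper); a minimal sketch, where the module name is a placeholder and CAP_SYS_MODULE is required:

#include <fcntl.h>		/* O_NONBLOCK */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *name = argc > 1 ? argv[1] : "dummy";	/* placeholder */

	/* Non-blocking removal: fails with EWOULDBLOCK/EBUSY instead of
	 * waiting for the refcount to drop, matching try_stop_module(). */
	if (syscall(SYS_delete_module, name, O_NONBLOCK) != 0) {
		perror("delete_module");
		return 1;
	}
	printf("module %s removed\n", name);
	return 0;
}
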
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/mutex.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/mutex.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/mutex.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/mutex.c	2015-01-21 12:02:58.116831024 +0300
@@ -730,7 +730,7 @@ mutex_lock_nested(struct mutex *lock, un
 			    subclass, NULL, _RET_IP_, NULL);
 }
 
-EXPORT_SYMBOL_GPL(mutex_lock_nested);
+EXPORT_SYMBOL(mutex_lock_nested);
 
 void __sched
 _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
@@ -749,7 +749,7 @@ mutex_lock_killable_nested(struct mutex 
 	return __mutex_lock_common(lock, TASK_KILLABLE,
 				   subclass, NULL, _RET_IP_, NULL);
 }
-EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+EXPORT_SYMBOL(mutex_lock_killable_nested);
 
 int __sched
 mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/nsproxy.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/nsproxy.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/nsproxy.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/nsproxy.c	2015-01-21 12:02:49.867050016 +0300
@@ -29,6 +29,14 @@ static struct kmem_cache *nsproxy_cachep
 
 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
 
+void get_task_namespaces(struct task_struct *tsk)
+{
+	struct nsproxy *ns = tsk->nsproxy;
+
+	if (ns)
+		get_nsproxy(ns);
+}
+
 static inline struct nsproxy *create_nsproxy(void)
 {
 	struct nsproxy *nsproxy;
@@ -39,6 +47,22 @@ static inline struct nsproxy *create_nsp
 	return nsproxy;
 }
 
+struct nsproxy *duplicate_nsproxy(struct nsproxy *nsproxy)
+{
+	struct nsproxy *ns = create_nsproxy();
+	if (ns) {
+		*ns = *nsproxy;
+		atomic_set(&ns->count, 1);
+		get_uts_ns(ns->uts_ns);
+		get_ipc_ns(ns->ipc_ns);
+		get_mnt_ns(ns->mnt_ns);
+		get_pid_ns(ns->pid_ns);
+		get_net(ns->net_ns);
+	}
+	return ns;
+}
+EXPORT_SYMBOL_GPL(duplicate_nsproxy);
+
 /*
 * Create a new nsproxy and all of its associated namespaces.
  * Return the newly created nsproxy.  Do not attach this to the task,
@@ -107,7 +131,8 @@ out_ns:
  * called from clone.  This now handles copy for nsproxy and all
  * namespaces therein.
  */
-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+int copy_namespaces(unsigned long flags, struct task_struct *tsk,
+		int force_admin)
 {
 	struct nsproxy *old_ns = tsk->nsproxy;
 	struct nsproxy *new_ns;
@@ -122,9 +147,26 @@ int copy_namespaces(unsigned long flags,
 				CLONE_NEWPID | CLONE_NEWNET)))
 		return 0;
 
-	if (!capable(CAP_SYS_ADMIN)) {
-		err = -EPERM;
-		goto out;
+	if (!force_admin) {
+		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) {
+			err = -EPERM;
+			goto out;
+		}
+
+		if (!capable(CAP_SYS_ADMIN) &&
+		    (flags & (CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWPID))) {
+			err = -EPERM;
+			goto out;
+		}
+
+		/*
+		 * The netns-vs-sysfs interaction is badly broken, so a new
+		 * net namespace (even in ve0) can bring the node down.
+		 */
+		if (flags & CLONE_NEWNET) {
+			err = -EINVAL;
+			goto out;
+		}
 	}
 
 	/*
@@ -151,6 +193,7 @@ out:
 	put_nsproxy(old_ns);
 	return err;
 }
+EXPORT_SYMBOL(copy_namespaces);
 
 void free_nsproxy(struct nsproxy *ns)
 {
@@ -165,6 +208,22 @@ void free_nsproxy(struct nsproxy *ns)
 	put_net(ns->net_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
 }
+EXPORT_SYMBOL(free_nsproxy);
+
+struct mnt_namespace *get_task_mnt_ns(struct task_struct *tsk)
+{
+	struct mnt_namespace *mnt_ns = NULL;
+
+	task_lock(tsk);
+	if (tsk->nsproxy)
+		mnt_ns = tsk->nsproxy->mnt_ns;
+	if (mnt_ns)
+		get_mnt_ns(mnt_ns);
+	task_unlock(tsk);
+
+	return mnt_ns;
+}
+EXPORT_SYMBOL(get_task_mnt_ns);
 
 /*
  * Called from unshare. Unshare all the namespaces part of nsproxy.
@@ -179,7 +238,12 @@ int unshare_nsproxy_namespaces(unsigned 
 			       CLONE_NEWNET | CLONE_NEWPID)))
 		return 0;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
+		return -EPERM;
+
+	if (!capable(CAP_SYS_ADMIN) &&
+	    (unshare_flags & (CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET |
+			      CLONE_NEWPID)))
 		return -EPERM;
 
 	*new_nsp = create_new_namespaces(unshare_flags, current,
@@ -218,6 +282,7 @@ void switch_task_namespaces(struct task_
 		free_nsproxy(ns);
 	}
 }
+EXPORT_SYMBOL_GPL(switch_task_namespaces);
 
 void exit_task_namespaces(struct task_struct *p)
 {
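
Under the checks added above, an unprivileged task gets EPERM from clone()/unshare() with any namespace flag, CAP_VE_SYS_ADMIN alone permits only a new mount namespace, and CLONE_NEWNET is refused outright. A short probe of the user-visible behaviour using plain unshare(2) (nothing below is specific to this patch beyond the expected errno):

#define _GNU_SOURCE
#include <sched.h>		/* unshare(), CLONE_NEWUTS */
#include <stdio.h>
#include <string.h>
#include <unistd.h>		/* sethostname() */

int main(void)
{
	/* Per unshare_nsproxy_namespaces(): EPERM without CAP_SYS_ADMIN;
	 * the UTS/IPC/NET/PID flags need full CAP_SYS_ADMIN even when
	 * CAP_VE_SYS_ADMIN is held. */
	if (unshare(CLONE_NEWUTS) != 0) {
		perror("unshare(CLONE_NEWUTS)");
		return 1;
	}
	/* This hostname change is private to the new UTS namespace. */
	if (sethostname("sandbox", strlen("sandbox")) != 0)
		perror("sethostname");
	return 0;
}
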
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/panic.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/panic.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/panic.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/panic.c	2015-01-21 12:02:57.843838269 +0300
@@ -280,6 +280,11 @@ void add_taint(unsigned flag)
 	    __debug_locks_off())
 		printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
 
+	/* Do not confuse people with call traces on proprietary modules */
+	if (flag != TAINT_PROPRIETARY_MODULE) {
+		printk(KERN_WARNING "Tainting kernel with flag 0x%x\n", flag);
+		dump_stack();
+	}
 	set_bit(flag, &tainted_mask);
 }
 EXPORT_SYMBOL(add_taint);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/pid.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/pid.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/pid.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/pid.c	2015-01-21 12:02:57.998834158 +0300
@@ -33,6 +33,7 @@
 #include <linux/rculist.h>
 #include <linux/bootmem.h>
 #include <linux/hash.h>
+#include <bc/kmem.h>
 #include <linux/pid_namespace.h>
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
@@ -44,8 +45,6 @@ static struct hlist_head *pid_hash;
 static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
-int pid_max = PID_MAX_DEFAULT;
-
 #define RESERVED_PIDS		300
 
 int pid_max_min = RESERVED_PIDS + 1;
@@ -112,9 +111,9 @@ EXPORT_SYMBOL(is_container_init);
  * For now it is easier to be safe than to prove it can't happen.
  */
 
-static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-static void free_pidmap(struct upid *upid)
+void free_pidmap(struct upid *upid)
 {
 	int nr = upid->nr;
 	struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
@@ -161,17 +160,17 @@ static void set_last_pid(struct pid_name
 	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
 }
 
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+int alloc_pidmap(struct pid_namespace *pid_ns)
 {
 	int i, offset, max_scan, pid, last = pid_ns->last_pid;
 	struct pidmap *map;
 
 	pid = last + 1;
-	if (pid >= pid_max)
+	if (pid >= pid_ns->pid_max)
 		pid = RESERVED_PIDS;
 	offset = pid & BITS_PER_PAGE_MASK;
 	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
-	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
+	max_scan = (pid_ns->pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
 	for (i = 0; i <= max_scan; ++i) {
 		if (unlikely(!map->page)) {
 			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
@@ -203,11 +202,11 @@ static int alloc_pidmap(struct pid_names
 			 * bitmap block and the final block was the same
 			 * as the starting point, pid is before last_pid.
 			 */
-			} while (offset < BITS_PER_PAGE && pid < pid_max &&
+			} while (offset < BITS_PER_PAGE && pid < pid_ns->pid_max &&
 					(i != max_scan || pid < last ||
 					    !((last+1) & BITS_PER_PAGE_MASK)));
 		}
-		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+		if (map < &pid_ns->pidmap[(pid_ns->pid_max-1)/BITS_PER_PAGE]) {
 			++map;
 			offset = 0;
 		} else {
@@ -221,6 +220,36 @@ static int alloc_pidmap(struct pid_names
 	return -1;
 }
 
+int set_pidmap(struct pid_namespace *pid_ns, pid_t pid)
+{
+	int offset;
+	struct pidmap *map;
+
+	offset = pid & BITS_PER_PAGE_MASK;
+	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
+	if (unlikely(!map->page)) {
+		void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+		/*
+		 * Free the page if someone raced with us
+		 * installing it:
+		 */
+		spin_lock_irq(&pidmap_lock);
+		if (map->page)
+			kfree(page);
+		else
+			map->page = page;
+		spin_unlock_irq(&pidmap_lock);
+		if (unlikely(!map->page))
+			return -ENOMEM;
+	}
+
+	if (test_and_set_bit(offset, map->page))
+		return -EBUSY;
+
+	atomic_dec(&map->nr_free);
+	return pid;
+}
+
 int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
 {
 	int offset;
@@ -256,7 +285,7 @@ void put_pid(struct pid *pid)
 		put_pid_ns(ns);
 	}
 }
-EXPORT_SYMBOL_GPL(put_pid);
+EXPORT_SYMBOL(put_pid);
 
 static void delayed_put_pid(struct rcu_head *rhp)
 {
@@ -273,25 +302,32 @@ void free_pid(struct pid *pid)
 	spin_lock_irqsave(&pidmap_lock, flags);
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
-		hlist_del_rcu(&upid->pid_chain);
-		if (--upid->ns->nr_hashed == 0)
-			schedule_work(&upid->ns->proc_work);
+		if (!hlist_unhashed(&upid->pid_chain)) {
+			hlist_del_rcu(&upid->pid_chain);
+			if (--upid->ns->nr_hashed == 0)
+				schedule_work(&upid->ns->proc_work);
+		}
 	}
 	spin_unlock_irqrestore(&pidmap_lock, flags);
+	ub_kmem_uncharge(pid->ub,
+		kmem_cache_objuse(pid->numbers[pid->level].ns->pid_cachep));
 
 	for (i = 0; i <= pid->level; i++)
 		free_pidmap(pid->numbers + i);
 
+	put_beancounter(pid->ub);
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
+EXPORT_SYMBOL(free_pid);
 
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid)
 {
 	struct pid *pid;
 	enum pid_type type;
 	int i, nr;
 	struct pid_namespace *tmp;
 	struct upid *upid;
+	struct user_beancounter *ub;
 
 	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
 	if (!pid)
@@ -300,7 +336,10 @@ struct pid *alloc_pid(struct pid_namespa
 	tmp = ns;
 	pid->level = ns->level;
 	for (i = ns->level; i >= 0; i--) {
-		nr = alloc_pidmap(tmp);
+		if (vpid != 0 && i == ns->level)
+			nr = set_pidmap(tmp, vpid);
+		else
+			nr = alloc_pidmap(tmp);
 		if (nr < 0)
 			goto out_free;
 
@@ -314,6 +353,13 @@ struct pid *alloc_pid(struct pid_namespa
 			goto out_free;
 	}
 
+#ifdef CONFIG_BEANCOUNTERS
+	ub = get_exec_ub();
+	if (ub_kmem_charge(ub, kmem_cache_objuse(ns->pid_cachep), GFP_KERNEL))
+		goto out_free;
+	pid->ub = get_beancounter(ub);
+#endif
+
 	get_pid_ns(ns);
 	atomic_set(&pid->count, 1);
 	for (type = 0; type < PIDTYPE_MAX; ++type)
@@ -324,6 +370,9 @@ struct pid *alloc_pid(struct pid_namespa
 	for ( ; upid >= pid->numbers; --upid) {
 		hlist_add_head_rcu(&upid->pid_chain,
 				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+		if (upid->ns->flags & PID_NS_HIDDEN)
+			while (upid-- > pid->numbers)
+				INIT_HLIST_NODE(&upid->pid_chain);
 		upid->ns->nr_hashed++;
 	}
 	spin_unlock_irq(&pidmap_lock);
@@ -339,6 +388,7 @@ out_free:
 	pid = NULL;
 	goto out;
 }
+EXPORT_SYMBOL_GPL(alloc_pid);
 
 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
 {
@@ -361,6 +411,49 @@ struct pid *find_vpid(int nr)
 }
 EXPORT_SYMBOL_GPL(find_vpid);
 
+void reattach_pid(struct task_struct *tsk, struct pid *pid)
+{
+	int i;
+	struct pid *old_pid;
+	struct pid_link *link;
+	struct upid *upid;
+
+	link = &tsk->pids[PIDTYPE_PID];
+	old_pid = link->pid;
+
+	hlist_del_rcu(&link->node);
+	link->pid = pid;
+	hlist_add_head_rcu(&link->node, &pid->tasks[PIDTYPE_PID]);
+
+	for (i = PIDTYPE_MAX; --i >= 0; )
+		if (!hlist_empty(&old_pid->tasks[i]))
+			BUG();
+
+	for (i = 0; i < pid->level; i++)
+		hlist_replace_rcu(&old_pid->numbers[i].pid_chain,
+				&pid->numbers[i].pid_chain);
+
+	if (old_pid->level > 0) {
+		upid = &old_pid->numbers[old_pid->level];
+		hlist_del_rcu(&upid->pid_chain);
+	}
+
+	upid = &pid->numbers[pid->level];
+	hlist_add_head_rcu(&upid->pid_chain,
+			&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+	upid->ns->nr_hashed++; /* see alloc_pid() */
+
+	spin_unlock(&pidmap_lock);
+	write_unlock_irq(&tasklist_lock);
+
+	if (old_pid->level > 0) {
+		upid = &old_pid->numbers[old_pid->level];
+		pid_ns_release_proc(upid->ns);
+	}
+
+	call_rcu(&old_pid->rcu, delayed_put_pid);
+}
+
 /*
  * attach_pid() must be called with the tasklist_lock write-held.
  */
@@ -373,6 +466,7 @@ void attach_pid(struct task_struct *task
 	link->pid = pid;
 	hlist_add_head_rcu(&link->node, &pid->tasks[type]);
 }
+EXPORT_SYMBOL(attach_pid);
 
 static void __change_pid(struct task_struct *task, enum pid_type type,
 			struct pid *new)
@@ -398,6 +492,7 @@ void detach_pid(struct task_struct *task
 {
 	__change_pid(task, type, NULL);
 }
+EXPORT_SYMBOL(detach_pid);
 
 void change_pid(struct task_struct *task, enum pid_type type,
 		struct pid *pid)
@@ -434,11 +529,13 @@ struct task_struct *find_task_by_pid_ns(
 {
 	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
+EXPORT_SYMBOL(find_task_by_pid_ns);
 
 struct task_struct *find_task_by_vpid(pid_t vnr)
 {
 	return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
 }
+EXPORT_SYMBOL(find_task_by_vpid);
 
 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 {
@@ -474,7 +571,35 @@ struct pid *find_get_pid(pid_t nr)
 
 	return pid;
 }
-EXPORT_SYMBOL_GPL(find_get_pid);
+EXPORT_SYMBOL(find_get_pid);
+
+pid_t pid_to_vpid(pid_t nr)
+{
+	struct pid *pid;
+
+	pid = find_pid_ns(nr, &init_pid_ns);
+	if (pid)
+		return pid->numbers[pid->level].nr;
+	return -1;
+}
+EXPORT_SYMBOL_GPL(pid_to_vpid);
+
+pid_t vpid_to_pid_ve(pid_t vnr, struct ve_struct *env)
+{
+	struct pid *pid;
+	pid_t nr = -1;
+
+	if (unlikely(ve_is_super(env)))
+		return -1;
+
+	rcu_read_lock();
+	pid = find_pid_ns(vnr, env->ve_ns->pid_ns);
+	if (pid != NULL)
+		nr = pid->numbers[0].nr;
+	rcu_read_unlock();
+
+	return nr;
+}
 
 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 {
@@ -488,6 +613,7 @@ pid_t pid_nr_ns(struct pid *pid, struct 
 	}
 	return nr;
 }
+EXPORT_SYMBOL_GPL(pid_nr_ns);
 
 pid_t pid_vnr(struct pid *pid)
 {
@@ -514,6 +640,16 @@ pid_t __task_pid_nr_ns(struct task_struc
 }
 EXPORT_SYMBOL(__task_pid_nr_ns);
 
+pid_t ve_task_ppid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	pid_t ppid;
+	ppid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
+	/* It's a dirty hack: some old utilities don't work if ppid is zero. */
+	if (ppid == 0 && ns->child_reaper != tsk)
+		ppid = 1;
+	return ppid;
+}
+
 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 {
 	return pid_nr_ns(task_tgid(tsk), ns);
@@ -566,11 +702,11 @@ void __init pidhash_init(void)
 void __init pidmap_init(void)
 {
 	/* bump default and minimum pid_max based on number of cpus */
-	pid_max = min(pid_max_max, max_t(int, pid_max,
+	init_pid_ns.pid_max = min(pid_max_max, max_t(int, PID_MAX_DEFAULT,
 				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
 	pid_max_min = max_t(int, pid_max_min,
 				PIDS_PER_CPU_MIN * num_possible_cpus());
-	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+	pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
 
 	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 	/* Reserve PID 0. We never call free_pidmap(0) */
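
Since pid_max is now a field of struct pid_namespace rather than a single global, the /proc/sys/kernel/pid_max sysctl reports (and limits) the reader's own pid namespace. A trivial reader for illustration:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/pid_max", "r");
	int pid_max;

	if (!f) {
		perror("/proc/sys/kernel/pid_max");
		return 1;
	}
	if (fscanf(f, "%d", &pid_max) == 1)
		/* With the patch above this is the limit of the reader's
		 * pid namespace, not one host-wide value. */
		printf("pid_max here: %d\n", pid_max);
	fclose(f);
	return 0;
}
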
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/pid_namespace.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/pid_namespace.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/pid_namespace.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/pid_namespace.c	2015-01-21 12:02:47.197120897 +0300
@@ -15,6 +15,11 @@
 #include <linux/acct.h>
 #include <linux/proc_fs.h>
 #include <linux/reboot.h>
+#include <linux/module.h>
+#include <linux/ve_proto.h>
+#include <linux/kthread.h>
+
+#include <bc/kmem.h>
 
 #define BITS_PER_PAGE		(PAGE_SIZE*8)
 
@@ -100,6 +105,7 @@ static struct pid_namespace *create_pid_
 	kref_init(&ns->kref);
 	ns->level = level;
 	ns->parent = get_pid_ns(parent_pid_ns);
+	ns->pid_max = PID_MAX_NS_DEFAULT;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
 	set_bit(0, ns->pidmap[0].page);
@@ -125,6 +131,10 @@ static void destroy_pid_namespace(struct
 	proc_free_inum(ns->proc_inum);
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
+
+#ifdef CONFIG_BSD_PROCESS_ACCT
+	kfree(ns->bacct);
+#endif
 	kmem_cache_free(pid_ns_cachep, ns);
 }
 
@@ -150,11 +160,165 @@ void free_pid_ns(struct kref *kref)
 		put_pid_ns(parent);
 }
 
+/*
+ * This is a dirty, ugly hack.
+ */
+
+static int __pid_ns_attach_task(struct pid_namespace *ns,
+		struct task_struct *tsk, pid_t nr)
+{
+	struct pid *pid, *old_pid;
+	enum pid_type type;
+	unsigned long old_size, new_size;
+
+	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
+	if (!pid)
+		goto out;
+
+	if (nr == 0)
+		nr = alloc_pidmap(ns);
+	else
+		nr = set_pidmap(ns, nr);
+
+	if (nr < 0)
+		goto out_free;
+
+	old_pid = task_pid(tsk);
+	memcpy(pid, old_pid,
+		sizeof(struct pid) + (ns->level - 1) * sizeof(struct upid));
+
+	pid->level = ns->level;
+	pid->numbers[pid->level].nr = nr;
+	pid->numbers[pid->level].ns = get_pid_ns(ns);
+	atomic_set(&pid->count, 1);
+	for (type = 0; type < PIDTYPE_MAX; ++type)
+		INIT_HLIST_HEAD(&pid->tasks[type]);
+
+	old_size = kmem_cache_objuse(old_pid->numbers[old_pid->level].ns->pid_cachep);
+	new_size = kmem_cache_objuse(pid->numbers[pid->level].ns->pid_cachep);
+	/*
+	 * Depending on sizeof(struct foo), cache flags (redzoning, etc)
+	 * and actual CPU (cacheline_size() jump from 64 to 128 bytes after
+	 * CPU detection) new size can very well be smaller than old size.
+	 */
+	if (new_size > old_size) {
+		if (ub_kmem_charge(pid->ub, new_size - old_size, UB_HARD) < 0)
+			goto out_enable;
+	} else if (new_size < old_size)
+		ub_kmem_uncharge(pid->ub, old_size - new_size);
+
+	write_lock_irq(&tasklist_lock);
+
+	change_pid(tsk, PIDTYPE_SID, pid);
+	change_pid(tsk, PIDTYPE_PGID, pid);
+
+	spin_lock(&pidmap_lock);
+	tsk->signal->leader_pid = pid;
+	put_pid(current->signal->tty_old_pgrp);
+	current->signal->tty_old_pgrp = NULL;
+
+	reattach_pid(tsk, pid);
+
+	return 0;
+
+out_enable:
+	local_irq_enable();
+	free_pidmap(pid->numbers + pid->level);
+	put_pid_ns(ns);
+out_free:
+	kmem_cache_free(ns->pid_cachep, pid);
+out:
+	return -ENOMEM;
+}
+
+int pid_ns_attach_task(struct pid_namespace *ns, struct task_struct *tsk)
+{
+	return __pid_ns_attach_task(ns, tsk, 0);
+}
+EXPORT_SYMBOL_GPL(pid_ns_attach_task);
+
+int pid_ns_attach_init(struct pid_namespace *ns, struct task_struct *tsk)
+{
+	int err;
+
+	err = __pid_ns_attach_task(ns, tsk, 1);
+	if (err < 0)
+		return err;
+
+	ns->child_reaper = tsk;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pid_ns_attach_init);
+
+#ifdef CONFIG_VE
+static noinline void show_lost_task(struct task_struct *p)
+{
+	printk("Lost task: %d/%s/%p blocked: %lx pending: %lx\n",
+			p->pid, p->comm, p,
+			p->blocked.sig[0],
+			p->pending.signal.sig[0]);
+}
+
+static void zap_ve_processes(struct ve_struct *env)
+{
+	int kthreads = 0;
+	/* wait for all init childs exit */
+	while (env->pcounter > 1 + kthreads) {
+		struct task_struct *g, *p;
+		long delay = 1;
+
+		if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0)
+			continue;
+		/* it was -ECHILD or no more children somehow */
+		if (env->pcounter == 1)
+			break;
+
+		/* clear all signals to avoid wakeups */
+		if (signal_pending(current))
+			flush_signals(current);
+		/* there is still a child that was not signalled */
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(delay);
+		delay = (delay < HZ) ? (delay << 1) : HZ;
+again:
+		read_lock(&tasklist_lock);
+		kthreads = 0;
+		do_each_thread_ve(g, p) {
+			if (p->flags & PF_KTHREAD) {
+				kthreads++;
+				continue;
+			}
+			if (p != current) {
+				/*
+				 * By that time no processes other than those
+				 * that entered may exist in the VE. If some
+				 * were missed by zap_pid_ns_processes(),
+				 * that is a BUG.
+				 */
+				if (!p->did_ve_enter)
+					show_lost_task(p);
+
+				force_sig_specific(SIGKILL, p);
+
+				if (reap_zombie(p))
+					goto again;
+			}
+		} while_each_thread_ve(g, p);
+		read_unlock(&tasklist_lock);
+	}
+
+	ve_hook_iterate_fini(VE_SS_CHAIN, get_exec_env());
+
+	destroy_workqueue(env->khelper_wq);
+	kthreadd_stop(env);
+}
+#endif
+
 void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 {
 	int nr;
 	int rc;
 	struct task_struct *task;
+	struct ve_struct *env = get_exec_env();
 
 	/*
 	 * The last thread in the cgroup-init thread group is terminating.
@@ -180,8 +344,12 @@ void zap_pid_ns_processes(struct pid_nam
 		 * signal
 		 */
 		task = pid_task(find_vpid(nr), PIDTYPE_PID);
-		if (task)
-			force_sig(SIGKILL, task);
+		if (task) {
+			if ((task->flags & PF_KTHREAD))
+				send_sig(SIGKILL, task, 1);
+			else
+				force_sig(SIGKILL, task);
+		}
 
 		rcu_read_unlock();
 
@@ -198,6 +366,11 @@ void zap_pid_ns_processes(struct pid_nam
 		current->signal->group_exit_code = pid_ns->reboot;
 
 	acct_exit_ns(pid_ns);
+
+#ifdef CONFIG_VE
+	if (pid_ns == env->ve_ns->pid_ns)
+		zap_ve_processes(env);
+#endif
 	return;
 }
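
Both kthreadd() earlier in this patch and zap_ve_processes() above finish with a reap-until-ECHILD loop over sys_wait4(). The same idiom in user space, as a runnable sketch:

#include <errno.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static void reap_all_children(void)
{
	for (;;) {
		pid_t pid = waitpid(-1, NULL, 0);

		if (pid > 0)
			continue;	/* reaped one child, go again */
		if (errno == EINTR)
			continue;	/* interrupted; retry, like the
					 * loop above after clearing
					 * TIF_SIGPENDING */
		break;			/* ECHILD: no children remain */
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++)
		if (fork() == 0)
			_exit(0);	/* short-lived child */
	reap_all_children();
	puts("all children reaped");
	return 0;
}
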
 
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/posix-cpu-timers.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/posix-cpu-timers.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/posix-cpu-timers.c	2014-12-12 23:29:20.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/posix-cpu-timers.c	2015-01-21 12:02:57.973834821 +0300
@@ -9,6 +9,7 @@
 #include <asm/uaccess.h>
 #include <linux/kernel_stat.h>
 #include <trace/events/timer.h>
+#include <linux/module.h>
 
 /*
  * Called after updating RLIMIT_CPU to set timer expiration if necessary.
@@ -1387,6 +1388,7 @@ static inline int fastpath_timer_check(s
 
 	return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;
 }
+EXPORT_SYMBOL(set_process_cpu_timer);
 
 /*
  * This is called from the timer interrupt handler.  The irq handler has
@@ -1539,8 +1541,10 @@ static int do_cpu_nanosleep(const clocki
 		while (!signal_pending(current)) {
 			if (timer.it.cpu.expires.sched == 0) {
 				/*
-				 * Our timer fired and was reset.
+				 * Our timer fired and was reset; the
+				 * deletion below cannot fail.
 				 */
+				posix_cpu_timer_del(&timer);
 				spin_unlock_irq(&timer.it_lock);
 				return 0;
 			}
@@ -1558,9 +1562,26 @@ static int do_cpu_nanosleep(const clocki
 		 * We were interrupted by a signal.
 		 */
 		sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
-		posix_cpu_timer_set(&timer, 0, &zero_it, it);
+		error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+		if (!error) {
+			/*
+			 * Timer is now unarmed; deletion cannot fail.
+			 */
+			posix_cpu_timer_del(&timer);
+		}
 		spin_unlock_irq(&timer.it_lock);
 
+		while (error == TIMER_RETRY) {
+			/*
+			 * We need to handle the case when the timer was or is
+			 * in the middle of firing. In all other cases the
+			 * resources have already been freed.
+			 */
+			spin_lock_irq(&timer.it_lock);
+			error = posix_cpu_timer_del(&timer);
+			spin_unlock_irq(&timer.it_lock);
+		}
+
 		if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
 			/*
 			 * It actually did fire already.
@@ -1574,7 +1595,7 @@ static int do_cpu_nanosleep(const clocki
 	return error;
 }
 
-static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
+long posix_cpu_nsleep_restart(struct restart_block *restart_block);
 
 static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 			    struct timespec *rqtp, struct timespec __user *rmtp)
@@ -1612,7 +1633,7 @@ static int posix_cpu_nsleep(const clocki
 	return error;
 }
 
-static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
+long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->nanosleep.index;
 	struct timespec t;
@@ -1636,6 +1657,7 @@ static long posix_cpu_nsleep_restart(str
 	return error;
 
 }
+EXPORT_SYMBOL_GPL(posix_cpu_nsleep_restart);
 
 #define PROCESS_CLOCK	MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
 #define THREAD_CLOCK	MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
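
The do_cpu_nanosleep() fixes above are reached from user space through clock_nanosleep() on a CPU-time clock. A small demonstration; note that a second thread must burn CPU, since a process CPU-time clock does not advance while the sleeper is blocked (link with -lpthread, plus -lrt on old glibc):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static void *spinner(void *arg)
{
	for (;;)
		;	/* burn CPU so the process CPU clock advances */
	return NULL;
}

int main(void)
{
	struct timespec t = { 0, 100 * 1000 * 1000 };	/* 100ms of CPU time */
	pthread_t tid;
	int err;

	pthread_create(&tid, NULL, spinner, NULL);
	err = clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0, &t, NULL);
	if (err)	/* clock_nanosleep() returns the error number */
		fprintf(stderr, "clock_nanosleep: %d\n", err);
	else
		puts("slept 100ms of process CPU time");
	return 0;	/* exiting main also kills the spinner */
}
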
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/posix-timers.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/posix-timers.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/posix-timers.c	2014-12-12 23:29:20.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/posix-timers.c	2015-01-21 12:02:50.331037698 +0300
@@ -48,6 +48,8 @@
 #include <linux/workqueue.h>
 #include <linux/module.h>
 
+#include <bc/beancounter.h>
+
 /*
  * Management arrays for POSIX timers.	 Timers are kept in slab memory
  * Timer ids are allocated by an external routine that keeps track of the
@@ -70,8 +72,14 @@
  * Lets keep our timers in a slab cache :-)
  */
 static struct kmem_cache *posix_timers_cache;
+
+#ifdef CONFIG_VE
+#define posix_timers_id		(get_exec_env()->_posix_timers_id)
+#define idr_lock		(get_exec_env()->posix_timers_lock)
+#else
 static struct idr posix_timers_id;
-static DEFINE_SPINLOCK(idr_lock);
+static spinlock_t idr_lock;
+#endif
 
 /*
  * we assume that the new SIGEV_THREAD_ID shares no bits with the other
@@ -151,6 +159,36 @@ static inline void unlock_timer(struct k
 	spin_unlock_irqrestore(&timr->it_lock, flags);
 }
 
+#define clock_is_monotonic(which_clock) \
+	((which_clock) == CLOCK_MONOTONIC || \
+	 (which_clock) == CLOCK_MONOTONIC_RAW || \
+	 (which_clock) == CLOCK_MONOTONIC_COARSE)
+
+#ifdef CONFIG_VE
+void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	if (clock_is_monotonic(which_clock))
+		set_normalized_timespec(tp,
+				tp->tv_sec - ve->start_timespec.tv_sec,
+				tp->tv_nsec - ve->start_timespec.tv_nsec);
+}
+
+void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	if (clock_is_monotonic(which_clock))
+		set_normalized_timespec(tp,
+				tp->tv_sec + ve->start_timespec.tv_sec,
+				tp->tv_nsec + ve->start_timespec.tv_nsec);
+}
+#else
+void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp) { }
+void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp) { }
+#endif
+
 /* Get clock_realtime */
 static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
 {
@@ -208,6 +246,13 @@ int posix_get_coarse_res(const clockid_t
 	*tp = ktime_to_timespec(KTIME_LOW_RES);
 	return 0;
 }
+
+static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
+{
+	get_monotonic_boottime(tp);
+	return 0;
+}
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -247,17 +292,29 @@ static __init int init_posix_timers(void
 		.clock_getres	= posix_get_coarse_res,
 		.clock_get	= posix_get_monotonic_coarse,
 	};
+	struct k_clock clock_boottime = {
+		.clock_getres	= hrtimer_get_res,
+		.clock_get	= posix_get_boottime,
+		.nsleep		= common_nsleep,
+		.nsleep_restart	= hrtimer_nanosleep_restart,
+		.timer_create	= common_timer_create,
+		.timer_set	= common_timer_set,
+		.timer_get	= common_timer_get,
+		.timer_del	= common_timer_del,
+	};
 
 	posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
 	posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
 	posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
 	posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
 	posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
+	posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
-					sizeof (struct k_itimer), 0, SLAB_PANIC,
-					NULL);
+					sizeof (struct k_itimer), 0,
+					SLAB_PANIC|SLAB_UBC, NULL);
 	idr_init(&posix_timers_id);
+	spin_lock_init(&idr_lock);
 	return 0;
 }
 
@@ -331,8 +388,17 @@ int posix_timer_event(struct k_itimer *t
 	rcu_read_lock();
 	task = pid_task(timr->it_pid, PIDTYPE_PID);
 	if (task) {
+		struct ve_struct *ve;
+		struct user_beancounter *ub;
+
+		ve = set_exec_env(task->ve_task_info.owner_env);
+		ub = set_exec_ub(task->task_bc.task_ub);
+
 		shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
 		ret = send_sigqueue(timr->sigq, task, shared);
+
+		(void)set_exec_ub(ub);
+		(void)set_exec_env(ve);
 	}
 	rcu_read_unlock();
 	/* If we failed to send the signal the timer stops. */
@@ -505,11 +571,14 @@ static int common_timer_create(struct k_
 	return 0;
 }
 
-/* Create a POSIX.1b interval timer. */
-
-SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
-		struct sigevent __user *, timer_event_spec,
-		timer_t __user *, created_timer_id)
+/*
+ * If timer_id >= 0, the function will create a timer with the id
+ * specified in timer_id or return -EEXIST if a timer with such
+ * an id already exists. Otherwise, the value of timer_id is ignored.
+ */
+static int __timer_create_id(const clockid_t which_clock,
+			     struct sigevent __user *timer_event_spec,
+			     timer_t timer_id, timer_t __user *created_timer_id)
 {
 	struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct k_itimer *new_timer;
@@ -533,7 +602,15 @@ SYSCALL_DEFINE3(timer_create, const cloc
 		goto out;
 	}
 	spin_lock_irq(&idr_lock);
-	error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
+	/* Ugly, but otherwise we would have to extend the idr API */
+	error = idr_get_new_above(&posix_timers_id, new_timer,
+			timer_id >= 0 ? timer_id : 0, &new_timer_id);
+	if (timer_id >= 0 &&
+	    (error == -ENOSPC || timer_id != new_timer_id)) {
+		if (!error)
+			idr_remove(&posix_timers_id, new_timer_id);
+		error = -EEXIST;
+	}
 	spin_unlock_irq(&idr_lock);
 	if (error) {
 		if (error == -EAGAIN)
@@ -542,7 +619,8 @@ SYSCALL_DEFINE3(timer_create, const cloc
 		 * Weird looking, but we return EAGAIN if the IDR is
 		 * full (proper POSIX return value for this)
 		 */
-		error = -EAGAIN;
+		if (error != -EEXIST)
+			error = -EAGAIN;
 		goto out;
 	}
 
@@ -603,6 +681,31 @@ out:
 	return error;
 }
 
+int timer_create_id(const clockid_t which_clock,
+		    struct sigevent *timer_event_spec, timer_t *timer_id)
+{
+	int err;
+	mm_segment_t oldfs;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = __timer_create_id(which_clock, timer_event_spec,
+				*timer_id >= 0 ? *timer_id : -1, timer_id);
+	set_fs(oldfs);
+	return err;
+}
+EXPORT_SYMBOL(timer_create_id);
+
+/* Create a POSIX.1b interval timer. */
+
+SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
+		struct sigevent __user *, timer_event_spec,
+		timer_t __user *, created_timer_id)
+{
+	return __timer_create_id(which_clock, timer_event_spec,
+				 -1, created_timer_id);
+}
+
 /*
  * Locking issues: We need to protect the result of the id look up until
  * we get the timer locked down so it is not deleted under us.  The
@@ -714,6 +817,25 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, 
 	return ret;
 }
 
+void get_timer_setting(struct k_itimer *timr, struct itimerspec *setting,
+		int *overrun, int *overrun_last, int *signal_pending)
+{
+	unsigned long flags;
+	struct k_clock *kc;
+
+	spin_lock_irqsave(&timr->it_lock, flags);
+	kc = clockid_to_kclock(timr->it_clock);
+
+	if (WARN_ON_ONCE(!kc || !kc->timer_get)) {
+		/* do not leak it_lock on the error path */
+		spin_unlock_irqrestore(&timr->it_lock, flags);
+		return;
+	}
+	kc->timer_get(timr, setting);
+	*overrun = timr->it_overrun;
+	*overrun_last = timr->it_overrun_last;
+	*signal_pending = !list_empty(&timr->sigq->list);
+	spin_unlock_irqrestore(&timr->it_lock, flags);
+}
+EXPORT_SYMBOL(get_timer_setting);
+
 /*
  * Get the number of overruns of a POSIX.1b interval timer.  This is to
  * be the overrun of the timer last delivered.  At the same time we are
@@ -816,6 +938,9 @@ retry:
 	if (!timr)
 		return -EINVAL;
 
+	if ((flags & TIMER_ABSTIME) &&
+	    (new_spec.it_value.tv_sec || new_spec.it_value.tv_nsec))
+		monotonic_ve_to_abs(timr->it_clock, &new_spec.it_value);
 	kc = clockid_to_kclock(timr->it_clock);
 	if (WARN_ON_ONCE(!kc || !kc->timer_set))
 		error = -EINVAL;
@@ -835,6 +960,51 @@ retry:
 	return error;
 }
 
+int timer_setup(timer_t timer_id, struct itimerspec *setting,
+		int overrun, int overrun_last, int signal_pending)
+{
+	struct k_itimer *timr;
+	unsigned long flags;
+	int err;
+	struct k_clock *kc;
+
+	if (!timespec_valid(&setting->it_interval) ||
+	    !timespec_valid(&setting->it_value))
+		return -EINVAL;
+
+	if (overrun >= 0 &&
+	    ((!setting->it_value.tv_sec && !setting->it_value.tv_nsec) ||
+	     (!setting->it_interval.tv_sec && !setting->it_interval.tv_nsec)))
+		return -EINVAL;
+
+retry:
+	timr = lock_timer(timer_id, &flags);
+	if (!timr)
+		return -EINVAL;
+
+	kc = clockid_to_kclock(timr->it_clock);
+	if (WARN_ON_ONCE(!kc || !kc->timer_set))
+		err = -EINVAL;
+	else
+		err = kc->timer_set(timr, 0, setting, NULL);
+
+	if (!err) {
+		if (overrun >= 0)
+			timr->it_overrun = overrun;
+		if (overrun_last >= 0)
+			timr->it_overrun_last = overrun_last;
+		if (signal_pending)
+			posix_timer_event(timr, timr->it_requeue_pending);
+	}
+
+	unlock_timer(timr, flags);
+	if (err == TIMER_RETRY)
+		goto retry;
+
+	return err;
+}
+EXPORT_SYMBOL(timer_setup);
+
 static int common_timer_del(struct k_itimer *timer)
 {
 	timer->it.real.interval.tv64 = 0;
@@ -949,6 +1119,7 @@ SYSCALL_DEFINE2(clock_gettime, const clo
 
 	error = kc->clock_get(which_clock, &kernel_tp);
 
+	monotonic_abs_to_ve(which_clock, &kernel_tp);
 	if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
 		error = -EFAULT;
 
@@ -1025,6 +1196,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const c
 	if (!timespec_valid(&t))
 		return -EINVAL;
 
+	if (flags & TIMER_ABSTIME)
+		monotonic_ve_to_abs(which_clock, &t);
+
 	return kc->nsleep(which_clock, flags, &t, rmtp);
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/power/process.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/power/process.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/power/process.c	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/power/process.c	2015-01-21 12:02:48.067097799 +0300
@@ -24,7 +24,9 @@ static inline int freezeable(struct task
 {
 	if ((p == current) ||
 	    (p->flags & PF_NOFREEZE) ||
-	    (p->exit_state != 0))
+	    (p->exit_state != 0) ||
+	    (p->state == TASK_STOPPED) ||
+	    (p->state == TASK_TRACED))
 		return 0;
 	return 1;
 }
@@ -44,7 +46,7 @@ static int try_to_freeze_tasks(bool sig_
 	do {
 		todo = 0;
 		read_lock(&tasklist_lock);
-		do_each_thread(g, p) {
+		do_each_thread_all(g, p) {
 			if (frozen(p) || !freezeable(p))
 				continue;
 
@@ -60,7 +62,7 @@ static int try_to_freeze_tasks(bool sig_
 			if (!task_is_stopped_or_traced(p) &&
 			    !freezer_should_skip(p))
 				todo++;
-		} while_each_thread(g, p);
+		} while_each_thread_all(g, p);
 		read_unlock(&tasklist_lock);
 		yield();			/* Yield is okay here */
 		if (time_after(jiffies, end_time))
@@ -84,13 +86,13 @@ static int try_to_freeze_tasks(bool sig_
 				elapsed_csecs / 100, elapsed_csecs % 100, todo);
 		show_state();
 		read_lock(&tasklist_lock);
-		do_each_thread(g, p) {
+		do_each_thread_all(g, p) {
 			task_lock(p);
 			if (freezing(p) && !freezer_should_skip(p))
 				printk(KERN_ERR " %s\n", p->comm);
 			cancel_freezing(p);
 			task_unlock(p);
-		} while_each_thread(g, p);
+		} while_each_thread_all(g, p);
 		read_unlock(&tasklist_lock);
 	} else {
 		printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
@@ -132,7 +134,7 @@ static void thaw_tasks(bool nosig_only)
 	struct task_struct *g, *p;
 
 	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
+	do_each_thread_all(g, p) {
 		if (!freezeable(p))
 			continue;
 
@@ -142,8 +144,10 @@ static void thaw_tasks(bool nosig_only)
 		if (cgroup_freezing_or_frozen(p))
 			continue;
 
-		thaw_process(p);
-	} while_each_thread(g, p);
+		if (!thaw_process(p))
+			printk(KERN_WARNING " Strange, %s not stopped\n",
+				p->comm);
+	} while_each_thread_all(g, p);
 	read_unlock(&tasklist_lock);
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/printk.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/printk.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/printk.c	2014-12-12 23:29:14.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/printk.c	2015-01-21 12:02:44.280198336 +0300
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/kexec.h>
 #include <linux/kmsg_dump.h>
+#include <linux/veprintk.h>
 
 #include <asm/uaccess.h>
 
@@ -137,6 +138,7 @@ EXPORT_SYMBOL(console_set_on_cmdline);
 
 /* Flag: console code may call schedule() */
 static int console_may_schedule;
+int console_silence_loglevel;
 
 #ifdef CONFIG_PRINTK
 
@@ -163,6 +165,19 @@ void log_buf_kexec_setup(void)
 }
 #endif
 
+static int __init setup_console_silencelevel(char *str)
+{
+	int level;
+
+	if (get_option(&str, &level) != 1)
+		return 0;
+
+	console_silence_loglevel = level;
+	return 1;
+}
+
+__setup("silencelevel=", setup_console_silencelevel);
+
 /* requested log_buf_len from kernel cmdline */
 static unsigned long __initdata new_log_buf_len;
 
@@ -210,6 +225,9 @@ void __init setup_log_buf(unsigned long 
 	spin_lock_irqsave(&logbuf_lock, flags);
 	log_buf_len = new_log_buf_len;
 	log_buf = new_log_buf;
+#ifdef CONFIG_VE
+	ve0.log_buf = log_buf;
+#endif
 	new_log_buf_len = 0;
 	free = __LOG_BUF_LEN - log_end;
 
@@ -313,6 +331,9 @@ int do_syslog(int type, char __user *buf
 	char c;
 	int error = 0;
 
+	if (!ve_is_super(get_exec_env()) && (type == 6 || type == 7))
+		goto out;
+
 	error = security_syslog(type);
 	if (error)
 		return error;
@@ -333,15 +354,15 @@ int do_syslog(int type, char __user *buf
 			error = -EFAULT;
 			goto out;
 		}
-		error = wait_event_interruptible(log_wait,
-							(log_start - log_end));
+		error = wait_event_interruptible(ve_log_wait,
+						(ve_log_start - ve_log_end));
 		if (error)
 			goto out;
 		i = 0;
 		spin_lock_irq(&logbuf_lock);
-		while (!error && (log_start != log_end) && i < len) {
-			c = LOG_BUF(log_start);
-			log_start++;
+		while (!error && (ve_log_start != ve_log_end) && i < len) {
+			c = VE_LOG_BUF(ve_log_start);
+			ve_log_start++;
 			spin_unlock_irq(&logbuf_lock);
 			error = __put_user(c,buf);
 			buf++;
@@ -367,15 +388,17 @@ int do_syslog(int type, char __user *buf
 			error = -EFAULT;
 			goto out;
 		}
+		if (ve_log_buf == NULL)
+			goto out;
 		count = len;
-		if (count > log_buf_len)
-			count = log_buf_len;
 		spin_lock_irq(&logbuf_lock);
-		if (count > logged_chars)
-			count = logged_chars;
+		if (count > ve_log_buf_len)
+			count = ve_log_buf_len;
+		if (count > ve_logged_chars)
+			count = ve_logged_chars;
 		if (do_clear)
-			logged_chars = 0;
-		limit = log_end;
+			ve_logged_chars = 0;
+		limit = ve_log_end;
 		/*
 		 * __put_user() could sleep, and while we sleep
 		 * printk() could overwrite the messages
@@ -384,9 +407,9 @@ int do_syslog(int type, char __user *buf
 		 */
 		for (i = 0; i < count && !error; i++) {
 			j = limit-1-i;
-			if (j + log_buf_len < log_end)
+			if (j + ve_log_buf_len < ve_log_end)
 				break;
-			c = LOG_BUF(j);
+			c = VE_LOG_BUF(j);
 			spin_unlock_irq(&logbuf_lock);
 			error = __put_user(c,&buf[count-1-i]);
 			cond_resched();
@@ -410,7 +433,7 @@ int do_syslog(int type, char __user *buf
 		}
 		break;
 	case 5:		/* Clear ring buffer */
-		logged_chars = 0;
+		ve_logged_chars = 0;
 		break;
 	case 6:		/* Disable logging to console */
 		if (saved_console_loglevel == -1)
@@ -427,18 +450,21 @@ int do_syslog(int type, char __user *buf
 		error = -EINVAL;
 		if (len < 1 || len > 8)
 			goto out;
+		error = 0;
+		/* a VE has no console, so return success */
+		if (!ve_is_super(get_exec_env()))
+			goto out;
 		if (len < minimum_console_loglevel)
 			len = minimum_console_loglevel;
 		console_loglevel = len;
 		/* Implicitly re-enable logging to console */
 		saved_console_loglevel = -1;
-		error = 0;
 		break;
 	case 9:		/* Number of chars in the log buffer */
-		error = log_end - log_start;
+		error = ve_log_end - ve_log_start;
 		break;
 	case 10:	/* Size of the log buffer */
-		error = log_buf_len;
+		error = ve_log_buf_len;
 		break;
 	default:
 		error = -EINVAL;
@@ -549,14 +575,14 @@ static void call_console_drivers(unsigne
 
 static void emit_log_char(char c)
 {
-	LOG_BUF(log_end) = c;
-	log_end++;
-	if (log_end - log_start > log_buf_len)
-		log_start = log_end - log_buf_len;
-	if (log_end - con_start > log_buf_len)
-		con_start = log_end - log_buf_len;
-	if (logged_chars < log_buf_len)
-		logged_chars++;
+	VE_LOG_BUF(ve_log_end) = c;
+	ve_log_end++;
+	if (ve_log_end - ve_log_start > ve_log_buf_len)
+		ve_log_start = ve_log_end - ve_log_buf_len;
+	if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len)
+		con_start = ve_log_end - ve_log_buf_len;
+	if (ve_logged_chars < ve_log_buf_len)
+		ve_logged_chars++;
 }
 
 /*
@@ -624,6 +650,30 @@ static int have_callable_console(void)
  * See the vsnprintf() documentation for format string extensions over C99.
  */
 
+static inline int ve_log_init(void)
+{
+#ifdef CONFIG_VE
+	if (ve_log_buf != NULL)
+		return 0;
+
+	if (ve_is_super(get_exec_env())) {
+		ve0._log_wait = &log_wait;
+		ve0._log_start = &log_start;
+		ve0._log_end = &log_end;
+		ve0._logged_chars = &logged_chars;
+		ve0.log_buf = log_buf;
+		return 0;
+	}
+
+	ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC);
+	if (!ve_log_buf)
+		return -ENOMEM;
+
+	memset(ve_log_buf, 0, ve_log_buf_len);
+#endif
+	return 0;
+}
+
 asmlinkage int printk(const char *fmt, ...)
 {
 	va_list args;
@@ -705,13 +755,14 @@ static inline void printk_delay(void)
 	}
 }
 
-asmlinkage int vprintk(const char *fmt, va_list args)
+asmlinkage int __vprintk(const char *fmt, va_list args)
 {
 	int printed_len = 0;
 	int current_log_level = default_message_loglevel;
 	unsigned long flags;
 	int this_cpu;
 	char *p;
+	int err, need_wake;
 
 	boot_delay_msec();
 	printk_delay();
@@ -743,6 +794,13 @@ asmlinkage int vprintk(const char *fmt, 
 	spin_lock(&logbuf_lock);
 	printk_cpu = this_cpu;
 
+	err = ve_log_init();
+	if (err) {
+		spin_unlock(&logbuf_lock);
+		printed_len = err;
+		goto out_lockdep;
+	}
+
 	if (recursion_bug) {
 		recursion_bug = 0;
 		strcpy(printk_buf, recursion_bug_msg);
@@ -833,19 +891,67 @@ asmlinkage int vprintk(const char *fmt, 
 	 * will release 'logbuf_lock' regardless of whether it
 	 * actually gets the semaphore or not.
 	 */
-	if (acquire_console_semaphore_for_printk(this_cpu))
+	if (!ve_is_super(get_exec_env())) {
+		need_wake = (ve_log_start != ve_log_end);
+		printk_cpu = UINT_MAX;
+		spin_unlock(&logbuf_lock);
+		lockdep_on();
+		raw_local_irq_restore(flags);
+		if (!oops_in_progress && need_wake)
+			wake_up_interruptible(&ve_log_wait);
+		goto out_preempt;
+	} else if (acquire_console_semaphore_for_printk(this_cpu))
 		release_console_sem();
 
+out_lockdep:
 	lockdep_on();
 out_restore_irqs:
 	raw_local_irq_restore(flags);
 
+out_preempt:
 	preempt_enable();
 	return printed_len;
 }
 EXPORT_SYMBOL(printk);
 EXPORT_SYMBOL(vprintk);
 
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+	int i;
+	struct ve_struct *env;
+
+	env = set_exec_env(get_ve0());
+	i = __vprintk(fmt, args);
+	(void)set_exec_env(env);
+	return i;
+}
+
+asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args)
+{
+	int printed_len;
+	va_list args2;
+
+	printed_len = 0;
+	va_copy(args2, args);
+	if (ve_is_super(get_exec_env()) || (dst & VE0_LOG))
+		printed_len = vprintk(fmt, args);
+	if (!ve_is_super(get_exec_env()) && (dst & VE_LOG))
+		printed_len = __vprintk(fmt, args2);
+	va_end(args2);
+	return printed_len;
+}
+
+asmlinkage int ve_printk(int dst, const char *fmt, ...)
+{
+	va_list args;
+	int printed_len;
+
+	va_start(args, fmt);
+	printed_len = ve_vprintk(dst, fmt, args);
+	va_end(args);
+	return printed_len;
+}
+EXPORT_SYMBOL(ve_printk);
+
 #else
 
 static void call_console_drivers(unsigned start, unsigned end)
@@ -1105,6 +1211,7 @@ void release_console_sem(void)
 		_con_start = con_start;
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
+		printk_cpu = UINT_MAX;
 		spin_unlock(&logbuf_lock);
 		stop_critical_timings();	/* don't trace print latency */
 		call_console_drivers(_con_start, _log_end);
@@ -1113,6 +1220,7 @@ void release_console_sem(void)
 	}
 	console_locked = 0;
 	up(&console_sem);
+	printk_cpu = UINT_MAX;
 	spin_unlock_irqrestore(&logbuf_lock, flags);
 	if (wake_klogd)
 		wake_up_klogd();
@@ -1578,3 +1686,65 @@ void kmsg_dump(enum kmsg_dump_reason rea
 	spin_unlock_irqrestore(&dump_list_lock, flags);
 }
 #endif
+
+static cpumask_t nmi_show_regs_cpus = CPU_MASK_NONE;
+static unsigned long nmi_show_regs_timeout;
+
+void __attribute__((weak)) send_nmi_ipi_allbutself(void)
+{
+	cpus_clear(nmi_show_regs_cpus);
+}
+
+static void busted_show_regs(struct pt_regs *regs, int in_nmi)
+{
+	if (!regs || (in_nmi && spin_is_locked(&logbuf_lock)))
+		return;
+
+	bust_spinlocks(1);
+	printk("----------- IPI show regs -----------\n");
+	show_regs(regs);
+	bust_spinlocks(-1);
+}
+
+void nmi_show_regs(struct pt_regs *regs, int in_nmi)
+{
+	if (cpus_empty(nmi_show_regs_cpus))
+		goto doit;
+
+	/* Previous request still in progress */
+	if (time_before(jiffies, nmi_show_regs_timeout))
+		return;
+
+	if (!in_nmi || !spin_is_locked(&logbuf_lock)) {
+		int cpu;
+
+		bust_spinlocks(1);
+		printk("previous show regs lost IPI to: ");
+		for_each_cpu_mask(cpu, nmi_show_regs_cpus)
+			printk("%d ", cpu);
+		printk("\n");
+		bust_spinlocks(-1);
+	}
+
+doit:
+	nmi_show_regs_timeout = jiffies + HZ/10;
+	nmi_show_regs_cpus = cpu_online_map;
+	cpu_clear(raw_smp_processor_id(), nmi_show_regs_cpus);
+	busted_show_regs(regs, in_nmi);
+	send_nmi_ipi_allbutself();
+}
+
+/* call only from nmi handler */
+int do_nmi_show_regs(struct pt_regs *regs, int cpu)
+{
+	static DEFINE_SPINLOCK(nmi_show_regs_lock);
+
+	if (!cpu_isset(cpu, nmi_show_regs_cpus))
+		return 0;
+
+	spin_lock(&nmi_show_regs_lock);
+	busted_show_regs(regs, 1);
+	cpu_clear(cpu, nmi_show_regs_cpus);
+	spin_unlock(&nmi_show_regs_lock);
+	return 1;
+}
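
With the per-VE ring buffer above, syslog(2) issued inside a container reads that container's log rather than the host's. A minimal dump via glibc's klogctl() wrapper (action 10 queries the buffer size, action 3 reads it non-destructively):

#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
	int len = klogctl(10, NULL, 0);	/* SYSLOG_ACTION_SIZE_BUFFER */
	char *buf;
	int n;

	if (len <= 0)
		len = 1 << 14;		/* fallback if the size query fails */
	buf = malloc(len);
	if (!buf)
		return 1;
	n = klogctl(3, buf, len);	/* SYSLOG_ACTION_READ_ALL */
	if (n < 0) {
		perror("klogctl");
		return 1;
	}
	fwrite(buf, 1, n, stdout);	/* this container's ring buffer */
	free(buf);
	return 0;
}
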
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ptrace-utrace.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ptrace-utrace.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ptrace-utrace.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ptrace-utrace.c	2015-01-21 12:02:44.596189947 +0300
@@ -681,6 +681,10 @@ int ptrace_attach(struct task_struct *ta
 
 	task_lock(task);
 	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+	if (!retval) {
+		if (!task->mm || task->mm->vps_dumpable == VD_LICDATA_ACCESS)
+			retval = -EACCES;
+	}
 	task_unlock(task);
 	if (retval)
 		goto unlock_creds;
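
Both ptrace_attach() variants patched here (this one and the kernel/ptrace.c one below) refuse attachment when the target's mm is gone or marked VD_LICDATA_ACCESS, and __ptrace_may_access() additionally rejects targets outside the caller's VE. The user-facing side is plain ptrace(2); a minimal probe:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 1;

	/* Attaching to init from inside a CT, to a task outside the
	 * caller's VE, or to a VD_LICDATA_ACCESS mm fails with
	 * EPERM/EACCES under the checks above. */
	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
		perror("PTRACE_ATTACH");
		return 1;
	}
	waitpid(pid, NULL, 0);			/* wait for the stop */
	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return 0;
}
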
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ptrace.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ptrace.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ptrace.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ptrace.c	2015-01-21 12:02:44.596189947 +0300
@@ -37,6 +37,8 @@ int __ptrace_may_access(struct task_stru
 	 * or halting the specified task is impossible.
 	 */
 	int dumpable = 0;
+	int vps_dumpable = 0;
+
 	/* Don't let security modules deny introspection */
 	if (same_thread_group(task, current))
 		return 0;
@@ -54,10 +56,17 @@ int __ptrace_may_access(struct task_stru
 	}
 	rcu_read_unlock();
 	smp_rmb();
-	if (task->mm)
+	if (task->mm) {
 		dumpable = get_dumpable(task->mm);
+		vps_dumpable = (task->mm->vps_dumpable == VD_PTRACE_COREDUMP);
+	}
+
 	if (dumpable != SUID_DUMP_USER && !capable(CAP_SYS_PTRACE))
 		return -EPERM;
+	if (!vps_dumpable && !ve_is_super(get_exec_env()))
+		return -EPERM;
+	if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env()))
+		return -EPERM;
 
 	return security_ptrace_access_check(task, mode);
 }
@@ -176,6 +185,10 @@ static struct task_struct *ptrace_get_ta
 {
 	struct task_struct *child;
 
+	/* ptracing init from inside a CT is dangerous */
+	if (pid == 1 && !capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	rcu_read_lock();
 	child = find_task_by_vpid(pid);
 	if (child)
@@ -455,7 +468,12 @@ int ptrace_attach(struct task_struct *ta
 
 	task_lock(task);
 	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+	if (!retval) {
+		if (!task->mm || task->mm->vps_dumpable == VD_LICDATA_ACCESS)
+			retval = -EACCES;
+	}
 	task_unlock(task);
+
 	if (retval)
 		goto unlock_creds;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/rcupdate.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/rcupdate.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/rcupdate.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/rcupdate.c	2015-01-21 12:02:58.116831024 +0300
@@ -50,7 +50,7 @@
 static struct lock_class_key rcu_lock_key;
 struct lockdep_map rcu_lock_map =
 	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
-EXPORT_SYMBOL_GPL(rcu_lock_map);
+EXPORT_SYMBOL(rcu_lock_map);
 #endif
 
 int rcu_scheduler_active __read_mostly;
@@ -67,6 +67,53 @@ void wakeme_after_rcu(struct rcu_head  *
 	complete(&rcu->completion);
 }
 
+static DEFINE_PER_CPU(struct work_struct, rcu_in_process_work);
+static DEFINE_PER_CPU(struct rcu_head *, rcu_in_process_head);
+
+static void do_rcu_in_process(struct work_struct *work)
+{
+	struct rcu_head *head;
+
+	local_irq_disable();
+	while ((head = __get_cpu_var(rcu_in_process_head))) {
+		__get_cpu_var(rcu_in_process_head) = head->next;
+		local_irq_enable();
+		head->func(head);
+		local_irq_disable();
+	}
+	local_irq_enable();
+}
+
+static int __init init_rcu_in_process(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		INIT_WORK(&per_cpu(rcu_in_process_work, cpu),
+			  do_rcu_in_process);
+	return 0;
+}
+module_init(init_rcu_in_process)
+
+int call_rcu_in_process(struct rcu_head *head,
+		void (*func)(struct rcu_head *head))
+{
+	unsigned long flags;
+
+	if (!in_interrupt())
+		return 0;
+
+	head->func = func;
+	local_irq_save(flags);
+	head->next = __get_cpu_var(rcu_in_process_head);
+	__get_cpu_var(rcu_in_process_head) = head;
+	local_irq_restore(flags);
+	schedule_work_on(smp_processor_id(),
+			&__get_cpu_var(rcu_in_process_work));
+	return 1;
+}
+EXPORT_SYMBOL_GPL(call_rcu_in_process);
+
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
 /**
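Note that call_rcu_in_process() above does not wait for a grace period: it only moves the callback invocation out of interrupt context by pushing the head onto a per-CPU LIFO list and letting a work item run it later, and it returns 0 when the caller is already in process context and must invoke the callback itself. A self-contained model of that queue-and-drain pattern, with the kernel's interrupt masking and per-CPU variables reduced to plain globals:

#include <stdio.h>

struct head {
	struct head *next;
	void (*func)(struct head *);
};

static struct head *pending;	/* models __get_cpu_var(rcu_in_process_head) */

static void queue_in_process(struct head *h, void (*func)(struct head *))
{
	h->func = func;
	h->next = pending;	/* done with interrupts off in the kernel */
	pending = h;
}

static void drain(void)		/* models do_rcu_in_process() */
{
	struct head *h;

	while ((h = pending)) {
		pending = h->next;	/* unlink with "interrupts off" */
		h->func(h);		/* invoke with "interrupts on" */
	}
}

static void hello(struct head *h)
{
	(void)h;
	puts("callback ran in process context");
}

int main(void)
{
	struct head a, b;

	queue_in_process(&a, hello);
	queue_in_process(&b, hello);
	drain();	/* models the scheduled work item running */
	return 0;
}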
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/relay.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/relay.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/relay.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/relay.c	2015-01-21 12:02:42.922234390 +0300
@@ -237,7 +237,6 @@ static void relay_destroy_buf(struct rch
 static void relay_remove_buf(struct kref *kref)
 {
 	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-	buf->chan->cb->remove_buf_file(buf->dentry);
 	relay_destroy_buf(buf);
 }
 
@@ -487,6 +486,7 @@ static void relay_close_buf(struct rchan
 {
 	buf->finalized = 1;
 	del_timer_sync(&buf->timer);
+	buf->chan->cb->remove_buf_file(buf->dentry);
 	kref_put(&buf->kref, relay_remove_buf);
 }
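The relay change moves the remove_buf_file() callback out of the kref release function and into relay_close_buf(), so the buffer's file is torn down deterministically at close time, before the final reference drop, instead of whenever the last kref happens to go away. A toy model of the corrected ordering:

#include <stdio.h>

struct buf {
	int refcount;
};

static void remove_buf_file(struct buf *b) { printf("buf file removed\n"); }
static void destroy_buf(struct buf *b)     { printf("buffer freed\n"); }

static void kref_put_model(struct buf *b)
{
	if (--b->refcount == 0)
		destroy_buf(b);	/* release path now only frees memory */
}

/* models relay_close_buf() after the patch: the file is removed
 * eagerly, before the final reference drop */
static void close_buf(struct buf *b)
{
	remove_buf_file(b);
	kref_put_model(b);
}

int main(void)
{
	struct buf b = { .refcount = 1 };

	close_buf(&b);	/* prints removal, then free, in that order */
	return 0;
}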
 
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sched.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sched.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched.c	2015-01-21 12:02:57.974834794 +0300
@@ -72,6 +72,7 @@
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
 #include <linux/clocksource.h>
+#include <linux/ve_proto.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -254,6 +255,9 @@ struct cfs_bandwidth {
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
 
+#define CFS_IDLE_SCALE 100
+	u64 idle_scale_inv;
+
 	/* statistics */
 	int nr_periods, nr_throttled;
 	u64 throttled_time;
@@ -276,6 +280,8 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+	unsigned long orig_shares;
+	unsigned int min_shares_pct;
 #ifndef __GENKSYMS__
 	atomic_t load_weight;
 #endif
@@ -300,9 +306,25 @@ struct task_group {
 	struct autogroup *autogroup;
 #endif
 #endif
+	struct kernel_cpustat __percpu *cpustat;
+	struct taskstats __percpu *taskstats;
+	unsigned long		avenrun[3];	/* loadavg data */
+	struct timespec start_time;
+
+	struct kernel_cpustat *cpustat_last;
+	struct kernel_cpustat *vcpustat;
+	ktime_t vcpustat_last_update;
+	spinlock_t vcpustat_lock;
 #ifndef __GENKSYMS__
 	struct cfs_bandwidth cfs_bandwidth;
 #endif
+
+#ifdef CONFIG_CFS_CPULIMIT
+#define MAX_CPU_RATE 1024
+	unsigned long cpu_rate;
+	unsigned int nr_cpus;
+	atomic_t nr_cpus_active;
+#endif
 };
 
 #ifdef CONFIG_USER_SCHED
@@ -338,6 +360,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(str
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
+static int tg_multilevel_hierarchy;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 #ifdef CONFIG_USER_SCHED
@@ -354,7 +378,7 @@ static DEFINE_SPINLOCK(task_group_lock);
  * (The default weight is 1024 - so there's no practical
  *  limitation from this.)
  */
-#define MIN_SHARES	2
+#define MIN_SHARES	(1UL <<  1)
 #define MAX_SHARES	(1UL << 18)
 
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
@@ -387,6 +411,48 @@ static inline struct task_group *task_gr
 	return tg;
 }
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int task_nr_cpus(struct task_struct *p)
+{
+	unsigned int nr_cpus = 0;
+	unsigned int max_nr_cpus = num_online_cpus();
+
+	rcu_read_lock();
+	nr_cpus = task_group(p)->nr_cpus;
+	rcu_read_unlock();
+
+	if (!nr_cpus || nr_cpus > max_nr_cpus)
+		nr_cpus = max_nr_cpus;
+
+	return nr_cpus;
+}
+
+unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p) % task_nr_cpus(p);
+}
+
+unsigned int sysctl_sched_cpulimit_scale_cpufreq = 1;
+
+unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	unsigned long rate, max_rate;
+
+	if (!sysctl_sched_cpulimit_scale_cpufreq)
+		return freq;
+
+	rcu_read_lock();
+	rate = task_group(current)->cpu_rate;
+	rcu_read_unlock();
+
+	max_rate = num_online_vcpus() * MAX_CPU_RATE;
+	if (!rate || rate >= max_rate)
+		return freq;
+
+	return div_u64((u64)freq * rate, max_rate); /* avoid 32bit overflow */
+}
+#endif
+
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 {
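With CONFIG_CFS_CPULIMIT, sched_cpulimit_scale_cpufreq() above scales the frequency reported to a container by its group's cpu_rate relative to the full rate of all online vCPUs (MAX_CPU_RATE per vCPU), widening to 64 bits before the multiply. A standalone model with illustrative numbers:

#include <stdio.h>
#include <stdint.h>

#define MAX_CPU_RATE	1024	/* full rate of one vCPU, as in the patch */

static unsigned int scale_cpufreq(unsigned int freq_khz, unsigned long rate,
				  unsigned int online_vcpus)
{
	uint64_t max_rate = (uint64_t)online_vcpus * MAX_CPU_RATE;

	if (!rate || rate >= max_rate)
		return freq_khz;	/* unlimited group: report raw freq */

	/* widen before multiplying, as the patch does with div_u64() */
	return (unsigned int)((uint64_t)freq_khz * rate / max_rate);
}

int main(void)
{
	/* a container capped at 150% of one cpu (rate 1536) on a host
	 * with 2 online vCPUs sees 3/4 of the real 2.4 GHz clock */
	printf("%u kHz\n", scale_cpufreq(2400000, 1536, 2));	/* 1800000 */
	return 0;
}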
@@ -422,6 +488,8 @@ struct cfs_rq {
 #ifndef __GENKSYMS__
 	unsigned long h_nr_running;
 #endif
+	unsigned long nr_iowait;
+	unsigned long nr_unint;
 
 	u64 exec_clock;
 	u64 min_vruntime;
@@ -439,9 +507,12 @@ struct cfs_rq {
 #ifdef __GENKSYMS__
 	struct sched_entity *curr, *next, *last;
 #else
-	struct sched_entity *curr, *next, *last, *skip;
+	struct sched_entity *curr, *next, *last, *skip, *prev;
 #endif
 
+	u64 nr_switches;
+	unsigned long nr_forks;
+
 	unsigned int nr_spread_over;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -508,6 +579,12 @@ struct cfs_rq {
 	u64 throttled_timestamp;
 	int throttled, throttle_count;
 	struct list_head throttled_list;
+
+	struct list_head boosted_entities;
+#endif
+#ifdef CONFIG_CFS_CPULIMIT
+	int active;
+	struct hrtimer active_timer;
 #endif
 #endif
 #endif
@@ -559,6 +636,7 @@ static void init_cfs_bandwidth(struct cf
 	cfs_b->runtime = 0;
 	cfs_b->quota = RUNTIME_INF;
 	cfs_b->period = ns_to_ktime(default_cfs_period());
+	cfs_b->idle_scale_inv = CFS_IDLE_SCALE;
 
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -567,10 +645,17 @@ static void init_cfs_bandwidth(struct cf
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 }
 
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer);
+
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+	INIT_LIST_HEAD(&cfs_rq->boosted_entities);
+#ifdef CONFIG_CFS_CPULIMIT
+	hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_rq->active_timer.function = sched_cfs_active_timer;
+#endif
 }
 
 /* requires cfs_b->lock, may release to reprogram timer */
@@ -727,11 +812,18 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
-#endif
+#ifdef CONFIG_SMP
+	unsigned long h_load_throttle;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	struct list_head leaf_rt_rq_list;
 #endif
 
+	/* nr_running last seen in update_cpu_load() */
+	unsigned long nr_active;
+
 	/*
 	 * This is part of a global counter where only the total sum
 	 * over all CPUs matters. A task can increase this counter on
@@ -739,6 +831,10 @@ struct rq {
 	 * it on another CPU. Always updated under the runqueue lock:
 	 */
 	unsigned long nr_uninterruptible;
+	unsigned long nr_iothrottled;
+
+	unsigned long nr_sleeping;
+	unsigned long nr_stopped;
 
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
@@ -752,10 +848,15 @@ struct rq {
 	struct root_domain *rd;
 	struct sched_domain *sd;
 
+	/* if bit N is set, the cpu is currently responsible
+	 * for load balance on sched domain level N */
+	unsigned long balance_delegate;
+
 	unsigned char idle_at_tick;
 	/* For active balancing */
 	int post_schedule;
 	int active_balance;
+#define ACTIVE_BALANCE_CPULIMIT		0x10000
 	int push_cpu;
 	/* cpu of this runqueue: */
 	int cpu;
@@ -795,6 +896,10 @@ struct rq {
 	struct hrtimer hrtick_timer;
 #endif
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	int cfs_quota_exceeded;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
@@ -834,6 +939,37 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+struct kernel_stat_glob kstat_glob;
+DEFINE_SPINLOCK(kstat_glb_lock);
+EXPORT_SYMBOL(kstat_glob);
+EXPORT_SYMBOL(kstat_glb_lock);
+
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, glob_kstat_lat);
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, glob_kstat_page_in);
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, alloc_kstat_lat[KSTAT_ALLOCSTAT_NR]);
+
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_ttfp);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_cache_reap);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_shrink_icache);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_shrink_dcache);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_refill_inact);
+
+void __init kstat_init(void)
+{
+	int i;
+
+	kstat_glob.sched_lat.cur = &per_cpu_var(glob_kstat_lat);
+	kstat_glob.page_in.cur = &per_cpu_var(glob_kstat_page_in);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		kstat_glob.alloc_lat[i].cur = &per_cpu_var(alloc_kstat_lat[i]);
+
+	kstat_glob.ttfp.cur = &per_cpu_var(kstat_pcpu_ttfp);
+	kstat_glob.cache_reap.cur = &per_cpu_var(kstat_pcpu_cache_reap);
+	kstat_glob.shrink_icache.cur = &per_cpu_var(kstat_pcpu_shrink_icache);
+	kstat_glob.shrink_dcache.cur = &per_cpu_var(kstat_pcpu_shrink_dcache);
+	kstat_glob.refill_inact.cur = &per_cpu_var(kstat_pcpu_refill_inact);
+}
+
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
  * See detach_destroy_domains: synchronize_sched for details.
@@ -1230,6 +1366,41 @@ static inline void task_rq_unlock(struct
 	spin_unlock_irqrestore(&rq->lock, *flags);
 }
 
+#ifdef CONFIG_VE
+static inline void write_wakeup_stamp(struct task_struct *p, u64 now)
+{
+	struct ve_task_info *ti;
+
+	ti = VE_TASK_INFO(p);
+	write_seqcount_begin(&ti->wakeup_lock);
+	ti->wakeup_stamp = now;
+	write_seqcount_end(&ti->wakeup_lock);
+}
+
+static inline void update_sched_lat(struct task_struct *t, u64 now)
+{
+	int cpu;
+	u64 ve_wstamp;
+
+	/* safe due to runqueue lock */
+	cpu = smp_processor_id();
+	ve_wstamp = t->ve_task_info.wakeup_stamp;
+
+	if (ve_wstamp && now > ve_wstamp) {
+		KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat,
+				cpu, now - ve_wstamp);
+		KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve,
+				cpu, now - ve_wstamp);
+	}
+}
+#endif
+
+unsigned long nr_zombie = 0;	/* protected by tasklist_lock */
+EXPORT_SYMBOL(nr_zombie);
+
+atomic_t nr_dead = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_dead);
+
 /*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
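write_wakeup_stamp() above uses a seqcount so readers of wakeup_stamp can detect a concurrent update and retry instead of taking a lock. A single-threaded sketch of that reader/writer protocol; real seqcounts also need memory barriers, which this model omits:

#include <stdio.h>

struct stamp {
	unsigned int seq;
	unsigned long long wakeup_stamp;
};

static void write_stamp(struct stamp *s, unsigned long long now)
{
	s->seq++;		/* odd: a write is in progress */
	s->wakeup_stamp = now;	/* (memory barriers omitted in this model) */
	s->seq++;		/* even again: write complete */
}

static unsigned long long read_stamp(const struct stamp *s)
{
	unsigned int seq;
	unsigned long long v;

	do {
		seq = s->seq;
		v = s->wakeup_stamp;
	} while ((seq & 1) || seq != s->seq);	/* retry on torn read */

	return v;
}

int main(void)
{
	struct stamp s = { 0, 0 };

	write_stamp(&s, 12345);
	printf("%llu\n", read_stamp(&s));
	return 0;
}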
@@ -1572,15 +1743,27 @@ calc_delta_mine(unsigned long delta_exec
 {
 	u64 tmp;
 
+	/*
+	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+	 * 2^SCHED_LOAD_RESOLUTION.
+	 */
+	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+		tmp = (u64)delta_exec * scale_load_down(weight);
+	else
+		tmp = (u64)delta_exec;
+
 	if (!lw->inv_weight) {
-		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+		unsigned long w = scale_load_down(lw->weight);
+
+		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 			lw->inv_weight = 1;
+		else if (unlikely(!w))
+			lw->inv_weight = WMULT_CONST;
 		else
-			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
-				/ (lw->weight+1);
+			lw->inv_weight = 1 + (WMULT_CONST - w/2) / (w + 1);
 	}
 
-	tmp = (u64)delta_exec * weight;
 	/*
 	 * Check whether we'd overflow the 64-bit multiplication:
 	 */
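The calc_delta_mine() rework keeps the same fixed-point identity, delta * weight / lw->weight evaluated as delta * weight * inv_weight >> 32 with inv_weight close to 2^32 / lw->weight, but now derives the inverse from the scaled-down weight. A sketch of the core arithmetic that omits the kernel's overflow check and the SCHED_LOAD_RESOLUTION scaling added by the patch:

#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST	(~0U)	/* ~2^32, as in the kernel */
#define WMULT_SHIFT	32

static uint64_t calc_delta(uint64_t delta_exec, unsigned long weight,
			   unsigned long lw_weight)
{
	/* inv ~ 2^32 / lw_weight, precomputed once per load change */
	uint32_t inv = 1 + (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);

	return delta_exec * weight * inv >> WMULT_SHIFT;
}

int main(void)
{
	/* a nice-0 entity (weight 1024) on a queue of total weight 3072
	 * receives about a third of a 6 ms slice */
	printf("%llu ns\n", (unsigned long long)
	       calc_delta(6000000ULL, 1024, 3072));	/* ~1999360 */
	return 0;
}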
@@ -1682,12 +1865,12 @@ static unsigned long
 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      unsigned long max_load_move, struct sched_domain *sd,
 	      enum cpu_idle_type idle, int *all_pinned,
-	      int *this_best_prio, struct rq_iterator *iterator);
+	      struct rq_iterator *iterator, int force);
 
 static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		   struct sched_domain *sd, enum cpu_idle_type idle,
-		   struct rq_iterator *iterator);
+iter_move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		struct sched_domain *sd, enum cpu_idle_type idle,
+		struct rq_iterator *iterator, int max_nr_migrate, int force);
 #endif
 
 /* Time spent by the tasks of the cpu accounting group executing in ... */
@@ -1831,12 +2014,13 @@ static unsigned long power_of(int cpu)
 	struct sched_group *group = group_of(cpu);
 
 	if (!group)
-		return SCHED_LOAD_SCALE;
+		return SCHED_POWER_SCALE;
 
 	return group->cpu_power;
 }
 
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+static int entity_hot(struct sched_entity *se, u64 now,
+		      struct sched_domain *sd);
 
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
@@ -1878,6 +2062,17 @@ static int tg_load_down(struct task_grou
 
 static void update_h_load(long cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long now = jiffies;
+
+	if (!tg_multilevel_hierarchy)
+		return;
+
+	if (rq->h_load_throttle == now)
+		return;
+
+	rq->h_load_throttle = now;
+
 	rcu_read_lock();
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 	rcu_read_unlock();
@@ -2036,9 +2231,12 @@ void sched_set_stop_task(int cpu, struct
 
 static void set_load_weight(struct task_struct *p)
 {
+	int prio = p->static_prio - MAX_RT_PRIO;
+	struct load_weight *load = &p->se.load;
+
 	if (task_has_rt_policy(p)) {
-		p->se.load.weight = prio_to_weight[0] * 2;
-		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+		load->weight = prio_to_weight[0] * 2;
+		load->inv_weight = prio_to_wmult[0] >> 1;
 		return;
 	}
 
@@ -2046,13 +2244,13 @@ static void set_load_weight(struct task_
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
 	if (p->policy == SCHED_IDLE) {
-		p->se.load.weight = WEIGHT_IDLEPRIO;
-		p->se.load.inv_weight = WMULT_IDLEPRIO;
+		load->weight = scale_load(WEIGHT_IDLEPRIO);
+		load->inv_weight = WMULT_IDLEPRIO;
 		return;
 	}
 
-	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
-	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+	load->weight = scale_load(prio_to_weight[prio]);
+	load->inv_weight = prio_to_wmult[prio];
 }
 
 static void update_avg(u64 *avg, u64 sample)
@@ -2123,15 +2321,41 @@ static int effective_prio(struct task_st
 	return p->prio;
 }
 
+static inline void check_inc_sleeping(struct rq *rq, struct task_struct *t)
+{
+	if (t->state == TASK_INTERRUPTIBLE)
+		rq->nr_sleeping++;
+}
+
+static inline void check_dec_sleeping(struct rq *rq, struct task_struct *t)
+{
+	if (t->state == TASK_INTERRUPTIBLE)
+		rq->nr_sleeping--;
+}
+
 /*
  * activate_task - move a task to the runqueue.
  */
 static void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (task_contributes_to_load(p))
+	u64 now;
+	if (task_contributes_to_load(p)) {
 		rq->nr_uninterruptible--;
+		if (task_iothrottled(p))
+			rq->nr_iothrottled--;
+		task_cfs_rq(p)->nr_unint--;
+	}
+
+	check_dec_sleeping(rq, p);
 
 	enqueue_task(rq, p, flags);
+
+	/* rq->clock is updated in enqueue_task() */
+	now = rq->clock;
+#ifdef CONFIG_VE
+	write_wakeup_stamp(p, now);
+	p->ve_task_info.sleep_time += now;
+#endif
 }
 
 /*
@@ -2139,10 +2363,32 @@ static void activate_task(struct rq *rq,
  */
 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (task_contributes_to_load(p))
+	unsigned int cpu;
+	u64 now;
+
+	cpu = task_cpu(p);
+
+	check_inc_sleeping(rq, p);
+
+#if 0 /* this is broken */
+	if (p->state == TASK_STOPPED) {
+		rq->nr_stopped++;
+	}
+#endif
+
+	if (task_contributes_to_load(p)) {
 		rq->nr_uninterruptible++;
+		if (task_iothrottled(p))
+			rq->nr_iothrottled++;
+		task_cfs_rq(p)->nr_unint++;
+	}
 
 	dequeue_task(rq, p, flags);
+
+	/* rq->clock is updated in dequeue_task() */
+	now = rq->clock;
+	p->ve_task_info.sleep_time -= now;
+
 }
 
 /**
@@ -2231,17 +2477,23 @@ EXPORT_SYMBOL(kthread_bind);
 static int
 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
-	s64 delta;
-
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
+	return entity_hot(&p->se, now, sd);
+}
+
+static int
+entity_hot(struct sched_entity *se, u64 now, struct sched_domain *sd)
+{
+	s64 delta;
+
 	/*
 	 * Buddy candidates are cache hot:
 	 */
 	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
-			(&p->se == cfs_rq_of(&p->se)->next ||
-			 &p->se == cfs_rq_of(&p->se)->last))
+			(se == cfs_rq_of(se)->next ||
+			 se == cfs_rq_of(se)->last))
 		return 1;
 
 	if (sysctl_sched_migration_cost == -1)
@@ -2249,7 +2501,7 @@ task_hot(struct task_struct *p, u64 now,
 	if (sysctl_sched_migration_cost == 0)
 		return 0;
 
-	delta = now - p->se.exec_start;
+	delta = now - se->exec_start;
 
 	return delta < (s64)sysctl_sched_migration_cost;
 }
@@ -2416,6 +2668,7 @@ unsigned long wait_task_inactive(struct 
 
 	return ncsw;
 }
+EXPORT_SYMBOL(wait_task_inactive);
 
 /***
  * kick_process - kick a running thread to enter/exit the kernel
@@ -2583,12 +2836,17 @@ static int try_to_wake_up(struct task_st
 	if (!(p->state & state))
 		goto out;
 
-	if (p->se.on_rq)
+	if (p->se.on_rq) {
+		p->woken_while_running = 1;
 		goto out_running;
+	}
 
 	cpu = task_cpu(p);
 	orig_cpu = cpu;
 
+	if (p->in_iowait && p->sched_class->nr_iowait_dec)
+		p->sched_class->nr_iowait_dec(p);
+
 #ifdef CONFIG_SMP
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
@@ -2600,11 +2858,30 @@ static int try_to_wake_up(struct task_st
 	 * First fix up the nr_uninterruptible count:
 	 */
 	if (task_contributes_to_load(p)) {
-		if (likely(cpu_online(orig_cpu)))
+		if (likely(cpu_online(orig_cpu))) {
 			rq->nr_uninterruptible--;
-		else
+			if (task_iothrottled(p))
+				rq->nr_iothrottled--;
+			task_cfs_rq(p)->nr_unint--;
+		} else {
 			this_rq()->nr_uninterruptible--;
+			if (task_iothrottled(p))
+				this_rq()->nr_iothrottled--;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			task_group(p)->cfs_rq[this_cpu]->nr_unint--;
+#else
+			this_rq()->cfs.nr_unint--;
+#endif
+		}
+	}
+
+	if (p->state == TASK_INTERRUPTIBLE) {
+		if (likely(cpu_online(orig_cpu)))
+			rq->nr_sleeping--;
+		else
+			this_rq()->nr_sleeping--;
 	}
+
 	p->state = TASK_WAKING;
 
 	if (p->sched_class->task_waking) {
@@ -2758,6 +3035,10 @@ static void __sched_fork(struct task_str
 	p->se.on_rq = 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	p->se.boosted = 0;
+#endif
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
@@ -2820,10 +3101,14 @@ void sched_fork(struct task_struct *p, i
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	p->oncpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+#ifdef CONFIG_VE
+	/* cosmetic: sleep till wakeup below */
+	p->ve_task_info.sleep_time -= task_rq(p)->clock;
+#endif
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 
 	put_cpu();
@@ -3003,6 +3288,16 @@ static void finish_task_switch(struct rq
 		kprobe_flush_task(prev);
 		put_task_struct(prev);
 	}
+
+	/* kernel threads don't care about cpuid faulting */
+	if (current->mm)
+		set_cpuid_faulting(!ve_is_super(get_exec_env()));
+}
+
+static inline void task_scheduled(struct rq *rq, struct task_struct *p)
+{
+	if (p->sched_class->task_scheduled)
+		p->sched_class->task_scheduled(rq, p);
 }
 
 #ifdef CONFIG_SMP
@@ -3120,6 +3415,18 @@ context_switch(struct rq *rq, struct tas
 	finish_task_switch(this_rq(), prev);
 }
 
+#define DECLARE_NR_ONLINE(varname)			\
+	unsigned long varname(void)			\
+	{						\
+		unsigned long i, sum = 0;		\
+		for_each_online_cpu(i)			\
+			sum += cpu_rq(i)->varname;	\
+		if (unlikely((long)sum < 0))		\
+			return 0;			\
+		return sum;				\
+	}						\
+	EXPORT_SYMBOL(varname);				\
+
 /*
  * nr_running, nr_uninterruptible and nr_context_switches:
  *
@@ -3127,15 +3434,9 @@ context_switch(struct rq *rq, struct tas
  * threads, current number of uninterruptible-sleeping threads, total
  * number of context switches performed since bootup.
  */
-unsigned long nr_running(void)
-{
-	unsigned long i, sum = 0;
-
-	for_each_online_cpu(i)
-		sum += cpu_rq(i)->nr_running;
-
-	return sum;
-}
+DECLARE_NR_ONLINE(nr_running);
+DECLARE_NR_ONLINE(nr_sleeping);
+DECLARE_NR_ONLINE(nr_stopped);
 
 unsigned long nr_uninterruptible(void)
 {
@@ -3153,6 +3454,7 @@ unsigned long nr_uninterruptible(void)
 
 	return sum;
 }
+EXPORT_SYMBOL(nr_uninterruptible);
 
 unsigned long long nr_context_switches(void)
 {
@@ -3181,12 +3483,32 @@ unsigned long nr_iowait_cpu(int cpu)
 	return atomic_read(&this->nr_iowait);
 }
 
-unsigned long this_cpu_load(void)
+unsigned long nr_active_cpu(void)
 {
 	struct rq *this = this_rq();
-	return this->cpu_load[0];
+	return this->nr_active;
+}
+
+#ifdef CONFIG_VE
+unsigned long nr_running_ve(void)
+{
+	struct task_group *tg = task_group(current);
+	unsigned long nr_running = 0;
+	int i;
+
+	for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		nr_running += tg->cfs_rq[i]->nr_running;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	return nr_running;
 }
 
+#endif
 
 /* Variables and functions for calc_load */
 static atomic_long_t calc_load_tasks;
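nr_running_ve() above reports the runnable count seen by the calling container by summing that task group's own per-cpu cfs and rt runqueue counters rather than the host-wide figures. A flattened model of the per-cpu summation:

#include <stdio.h>

#define NR_CPUS 4

/* models one task group's per-cpu runnable counters */
struct group_model {
	unsigned long cfs_nr_running[NR_CPUS];
	unsigned long rt_nr_running[NR_CPUS];
};

static unsigned long nr_running_group(const struct group_model *tg)
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sum += tg->cfs_nr_running[cpu] + tg->rt_nr_running[cpu];
	return sum;
}

int main(void)
{
	struct group_model ct = {
		.cfs_nr_running = { 2, 0, 1, 0 },
		.rt_nr_running  = { 0, 1, 0, 0 },
	};

	printf("container runnable: %lu\n", nr_running_group(&ct)); /* 4 */
	return 0;
}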
@@ -3209,6 +3531,16 @@ void get_avenrun(unsigned long *loads, u
 	loads[2] = (avenrun[2] + offset) << shift;
 }
 
+void get_avenrun_ve(unsigned long *loads, unsigned long offset, int shift)
+{
+	struct task_group *tg = task_group(current);
+	loads[0] = (tg->avenrun[0] + offset) << shift;
+	loads[1] = (tg->avenrun[1] + offset) << shift;
+	loads[2] = (tg->avenrun[2] + offset) << shift;
+}
+
+
+
 static unsigned long
 calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
@@ -3217,6 +3549,45 @@ calc_load(unsigned long load, unsigned l
 	return load >> FSHIFT;
 }
 
+#ifdef CONFIG_VE
+static void calc_load_ve(void)
+{
+	unsigned long flags, nr_unint, nr_active;
+	struct task_group *tg;
+	int i;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		nr_active = 0;
+		for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			nr_active += tg->cfs_rq[i]->nr_running;
+			nr_active += tg->cfs_rq[i]->nr_unint;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+			nr_active += tg->rt_rq[i]->rt_nr_running;
+#endif
+		}
+		nr_active *= FIXED_1;
+
+		tg->avenrun[0] = calc_load(tg->avenrun[0], EXP_1, nr_active);
+		tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
+		tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
+	}
+	rcu_read_unlock();
+
+	nr_unint = nr_uninterruptible() * FIXED_1;
+	spin_lock_irqsave(&kstat_glb_lock, flags);
+	CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint);
+	CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
+	CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
+	spin_unlock_irqrestore(&kstat_glb_lock, flags);
+
+}
+#else
+#define calc_load_ve()	do { } while (0)
+#endif
+
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
@@ -3236,6 +3607,8 @@ void calc_global_load(void)
 	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
 	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
 
+	calc_load_ve();
+
 	calc_load_update += LOAD_FREQ;
 }
 
@@ -3248,6 +3621,7 @@ static void calc_load_account_active(str
 
 	nr_active = this_rq->nr_running;
 	nr_active += (long) this_rq->nr_uninterruptible;
+	nr_active -= (long) this_rq->nr_iothrottled;
 
 	if (nr_active != this_rq->calc_load_active) {
 		delta = nr_active - this_rq->calc_load_active;
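calc_load_ve() reuses the standard fixed-point exponential moving average, avenrun = (avenrun * exp + active * (FIXED_1 - exp)) >> FSHIFT, fed with each group's runnable plus uninterruptible counts. A runnable sketch using the kernel's FSHIFT and EXP_1 constants:

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5sec/1min), kernel constant */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0;
	unsigned long active = 2 * FIXED_1;	/* 2 runnable tasks */
	int i;

	/* one tick each 5 seconds; after ~10 minutes the average has
	 * converged towards 2.00 (integer truncation leaves 1.99) */
	for (i = 0; i < 120; i++)
		avenrun = calc_load(avenrun, EXP_1, active);

	printf("load ~ %lu.%02lu\n", avenrun >> FSHIFT,
	       (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}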
@@ -3345,6 +3719,7 @@ static void update_cpu_load(struct rq *t
 	this_rq->last_load_update_tick = curr_jiffies;
 
 	/* Update our load: */
+	this_rq->nr_active = this_rq->nr_running;
 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
 		unsigned long old_load, new_load;
@@ -3477,7 +3852,7 @@ static void pull_task(struct rq *src_rq,
 static
 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 		     struct sched_domain *sd, enum cpu_idle_type idle,
-		     int *all_pinned)
+		     int *all_pinned, int force)
 {
 	int tsk_cache_hot = 0;
 	/*
@@ -3504,7 +3879,7 @@ int can_migrate_task(struct task_struct 
 	 */
 
 	tsk_cache_hot = task_hot(p, rq->clock, sd);
-	if (!tsk_cache_hot ||
+	if (!tsk_cache_hot || force ||
 		sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
 		if (tsk_cache_hot) {
@@ -3526,17 +3901,15 @@ static unsigned long
 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      unsigned long max_load_move, struct sched_domain *sd,
 	      enum cpu_idle_type idle, int *all_pinned,
-	      int *this_best_prio, struct rq_iterator *iterator)
+	      struct rq_iterator *iterator, int force)
 {
-	int loops = 0, pulled = 0, pinned = 0;
+	int loops = 0, pulled = 0;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
 
 	if (max_load_move == 0)
 		goto out;
 
-	pinned = 1;
-
 	/*
 	 * Start the load-balancing iterator:
 	 */
@@ -3545,8 +3918,9 @@ next:
 	if (!p || loops++ > sysctl_sched_nr_migrate)
 		goto out;
 
-	if ((p->se.load.weight >> 1) > rem_load_move ||
-	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
+	if (((p->se.load.weight >> 1) > rem_load_move && !force) ||
+	    !can_migrate_task(p, busiest, this_cpu, sd, idle,
+			      all_pinned, force)) {
 		p = iterator->next(iterator->arg);
 		goto next;
 	}
@@ -3569,8 +3943,6 @@ next:
 	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
 	if (rem_load_move > 0) {
-		if (p->prio < *this_best_prio)
-			*this_best_prio = p->prio;
 		p = iterator->next(iterator->arg);
 		goto next;
 	}
@@ -3582,9 +3954,6 @@ out:
 	 */
 	schedstat_add(sd, lb_gained[idle], pulled);
 
-	if (all_pinned)
-		*all_pinned = pinned;
-
 	return max_load_move - rem_load_move;
 }
 
@@ -3602,13 +3971,12 @@ static int move_tasks(struct rq *this_rq
 {
 	const struct sched_class *class = sched_class_highest;
 	unsigned long total_load_moved = 0;
-	int this_best_prio = this_rq->curr->prio;
 
 	do {
 		total_load_moved +=
 			class->load_balance(this_rq, this_cpu, busiest,
 				max_load_move - total_load_moved,
-				sd, idle, all_pinned, &this_best_prio);
+				sd, idle, all_pinned);
 		class = class->next;
 
 #ifdef CONFIG_PREEMPT
@@ -3626,15 +3994,20 @@ static int move_tasks(struct rq *this_rq
 }
 
 static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		   struct sched_domain *sd, enum cpu_idle_type idle,
-		   struct rq_iterator *iterator)
+iter_move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		struct sched_domain *sd, enum cpu_idle_type idle,
+		struct rq_iterator *iterator, int max_nr_migrate, int force)
 {
 	struct task_struct *p = iterator->start(iterator->arg);
+	int nr_migrated = 0;
 	int pinned = 0;
 
+	if (max_nr_migrate <= 0)
+		return 0;
+
 	while (p) {
-		if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
+		if (can_migrate_task(p, busiest, this_cpu, sd, idle,
+				     &pinned, force)) {
 			pull_task(busiest, p, this_rq, this_cpu);
 			/*
 			 * Right now, this is only the second place pull_task()
@@ -3643,12 +4016,13 @@ iter_move_one_task(struct rq *this_rq, i
 			 */
 			schedstat_inc(sd, lb_gained[idle]);
 
-			return 1;
+			if (++nr_migrated >= max_nr_migrate)
+				break;
 		}
 		p = iterator->next(iterator->arg);
 	}
 
-	return 0;
+	return nr_migrated;
 }
 
 /*
@@ -3895,7 +4269,7 @@ static inline int check_power_save_busie
 
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
-	return SCHED_LOAD_SCALE;
+	return SCHED_POWER_SCALE;
 }
 
 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -3934,10 +4308,10 @@ unsigned long scale_rt_power(int cpu)
 		available = total - rq->rt_avg;
 	}
 
-	if (unlikely((s64)total < SCHED_LOAD_SCALE))
-		total = SCHED_LOAD_SCALE;
+	if (unlikely((s64)total < SCHED_POWER_SCALE))
+		total = SCHED_POWER_SCALE;
 
-	total >>= SCHED_LOAD_SHIFT;
+	total >>= SCHED_POWER_SHIFT;
 
 	return div_u64(available, total);
 }
@@ -3945,7 +4319,7 @@ unsigned long scale_rt_power(int cpu)
 static void update_cpu_power(struct sched_domain *sd, int cpu)
 {
 	unsigned long weight = sd->span_weight;
-	unsigned long power = SCHED_LOAD_SCALE;
+	unsigned long power = SCHED_POWER_SCALE;
 	struct sched_group *sdg = sd->groups;
 
 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -3954,7 +4328,7 @@ static void update_cpu_power(struct sche
 		else
 			power *= default_scale_smt_power(sd, cpu);
 
-		power >>= SCHED_LOAD_SHIFT;
+		power >>= SCHED_POWER_SHIFT;
 	}
 
 	sdg->cpu_power_orig = power;
@@ -3964,10 +4338,10 @@ static void update_cpu_power(struct sche
 	else
 		power *= default_scale_freq_power(sd, cpu);
 
-	power >>= SCHED_LOAD_SHIFT;
+	power >>= SCHED_POWER_SHIFT;
 
 	power *= scale_rt_power(cpu);
-	power >>= SCHED_LOAD_SHIFT;
+	power >>= SCHED_POWER_SHIFT;
 
 	if (!power)
 		power = 1;
@@ -4008,7 +4382,7 @@ static inline int
 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 {
 	/*
-	 * Only siblings can have significantly less than SCHED_LOAD_SCALE
+	 * Only siblings can have significantly less than SCHED_POWER_SCALE
 	 */
 	if (sd->level != SD_LV_SIBLING)
 		return 0;
@@ -4043,6 +4417,7 @@ static inline void update_sg_lb_stats(st
 {
 	unsigned long load, max_cpu_load, min_cpu_load;
 	int i;
+	int delegate_cpu = -1;
 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
 	unsigned long avg_load_per_task = 0;
 
@@ -4066,6 +4441,9 @@ static inline void update_sg_lb_stats(st
 				balance_cpu = i;
 			}
 
+			if (test_bit(sd->level, &rq->balance_delegate))
+				delegate_cpu = i;
+
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
@@ -4088,6 +4466,13 @@ static inline void update_sg_lb_stats(st
 	 * to do the newly idle load balance.
 	 */
 	if (idle != CPU_NEWLY_IDLE && local_group) {
+		if (balance_cpu == delegate_cpu)
+			/* we completed a round-robin of delegated
+			 * load balancing, so start backing off */
+			balance_cpu = -1;
+		else if (delegate_cpu >= 0)
+			balance_cpu = delegate_cpu;
+
 		if (balance_cpu != this_cpu) {
 			*balance = 0;
 			return;
@@ -4096,7 +4481,7 @@ static inline void update_sg_lb_stats(st
 	}
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
@@ -4113,8 +4498,8 @@ static inline void update_sg_lb_stats(st
 	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity =
-		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
 }
@@ -4276,7 +4661,7 @@ static int check_asym_packing(struct sch
 		return 0;
 
 	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
-				       SCHED_LOAD_SCALE);
+				       SCHED_POWER_SCALE);
 	return 1;
 }
 
@@ -4305,7 +4690,7 @@ static inline void fix_small_imbalance(s
 			cpu_avg_load_per_task(this_cpu);
 
 	scaled_busy_load_per_task = sds->busiest_load_per_task
-						 * SCHED_LOAD_SCALE;
+					 * SCHED_POWER_SCALE;
 	scaled_busy_load_per_task /= sds->busiest->cpu_power;
 
 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
@@ -4328,10 +4713,10 @@ static inline void fix_small_imbalance(s
 			min(sds->busiest_load_per_task, sds->max_load);
 	pwr_now += sds->this->cpu_power *
 			min(sds->this_load_per_task, sds->this_load);
-	pwr_now /= SCHED_LOAD_SCALE;
+	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
-	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
 		sds->busiest->cpu_power;
 	if (sds->max_load > tmp)
 		pwr_move += sds->busiest->cpu_power *
@@ -4339,15 +4724,15 @@ static inline void fix_small_imbalance(s
 
 	/* Amount of load we'd add */
 	if (sds->max_load * sds->busiest->cpu_power <
-		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+		sds->busiest_load_per_task * SCHED_POWER_SCALE)
 		tmp = (sds->max_load * sds->busiest->cpu_power) /
 			sds->this->cpu_power;
 	else
-		tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
 			sds->this->cpu_power;
 	pwr_move += sds->this->cpu_power *
 			min(sds->this_load_per_task, sds->this_load + tmp);
-	pwr_move /= SCHED_LOAD_SCALE;
+	pwr_move /= SCHED_POWER_SCALE;
 
 	/* Move if we gain throughput */
 	if (pwr_move > pwr_now)
@@ -4389,7 +4774,7 @@ static inline void calculate_imbalance(s
 		load_above_capacity = (sds->busiest_nr_running -
 						sds->busiest_group_capacity);
 
-		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
 
 		load_above_capacity /= sds->busiest->cpu_power;
 	}
@@ -4408,8 +4793,8 @@ static inline void calculate_imbalance(s
 
 	/* How much load to actually move to equalise the imbalance */
 	*imbalance = min(max_pull * sds->busiest->cpu_power, (!sds->this ? 0 :
-		(sds->avg_load - sds->this_load) * sds->this->cpu_power))
-			/ SCHED_LOAD_SCALE;
+		(sds->max_load - sds->this_load) / 2 * sds->this->cpu_power))
+			/ SCHED_POWER_SCALE;
 
 	/*
 	 * if *imbalance is less than the average load per runnable task
@@ -4486,10 +4871,7 @@ find_busiest_group(struct sched_domain *
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
-	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-
-	if (sds.this_load >= sds.avg_load)
-		goto out_balanced;
+	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
 
 	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
@@ -4524,7 +4906,8 @@ find_busiest_queue(struct sched_domain *
 
 	for_each_cpu(i, sched_group_cpus(group)) {
 		unsigned long power = power_of(i);
-		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+		unsigned long capacity = DIV_ROUND_CLOSEST(power,
+							   SCHED_POWER_SCALE);
 		unsigned long wl;
 
 		if (!capacity)
@@ -4549,7 +4932,7 @@ find_busiest_queue(struct sched_domain *
 		 * the load can be moved away from the cpu that is potentially
 		 * running at a lower capacity.
 		 */
-		wl = (wl * SCHED_LOAD_SCALE) / power;
+		wl = (wl * SCHED_POWER_SCALE) / power;
 
 		if (wl > max_load) {
 			max_load = wl;
@@ -4560,6 +4943,32 @@ find_busiest_queue(struct sched_domain *
 	return busiest;
 }
 
+static void delegate_load_balance(int this_cpu, struct rq *this_rq,
+				  struct sched_domain *sd,
+				  int balanced, int all_pinned)
+{
+	int cpu;
+
+	if (!all_pinned)
+		goto out;
+
+	/* load balancing on this cpu got stuck,
+	 * so delegate to the next cpu in this group */
+	cpu = cpumask_next_and(this_cpu, cpu_active_mask,
+			       sched_group_cpus(sd->groups));
+	if (cpu >= nr_cpu_ids)
+		cpu = cpumask_first_and(cpu_active_mask,
+					sched_group_cpus(sd->groups));
+	BUG_ON(cpu >= nr_cpu_ids);
+
+	if (cpu != this_cpu)
+		set_bit(sd->level, &cpu_rq(cpu)->balance_delegate);
+
+out:
+	if (balanced || all_pinned)
+		clear_bit(sd->level, &this_rq->balance_delegate);
+}
+
 /*
  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
  * so long as it is large enough.
@@ -4578,6 +4987,7 @@ static int load_balance(int this_cpu, st
 			int *balance)
 {
 	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+	int balanced = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
@@ -4610,9 +5020,12 @@ redo:
 		goto out_balanced;
 	}
 
+pick_next_queue:
 	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
+		if (cpumask_andnot(cpus, cpus, sched_group_cpus(group)))
+			goto redo;
 		goto out_balanced;
 	}
 
@@ -4628,6 +5041,8 @@ redo:
 		 * still unbalanced. ld_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
+		all_pinned = 1;
+		update_h_load(cpu_of(busiest));
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
 		ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -4644,6 +5059,8 @@ redo:
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(all_pinned)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
+			if (cpumask_intersects(cpus, sched_group_cpus(group)))
+				goto pick_next_queue;
 			if (!cpumask_empty(cpus))
 				goto redo;
 			goto out_balanced;
@@ -4714,6 +5131,7 @@ redo:
 	goto out;
 
 out_balanced:
+	balanced = 1;
 	schedstat_inc(sd, lb_balanced[idle]);
 
 	sd->nr_balance_failed = 0;
@@ -4730,6 +5148,7 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
+	delegate_load_balance(this_cpu, this_rq, sd, balanced, all_pinned);
 	return ld_moved;
 }
 
@@ -4775,10 +5194,13 @@ redo:
 		goto out_balanced;
 	}
 
+pick_next_queue:
 	busiest = find_busiest_queue(sd, group, CPU_NEWLY_IDLE, imbalance, cpus);
 	if (!busiest) {
 		spin_lock(&this_rq->lock);
 		schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
+		if (cpumask_andnot(cpus, cpus, sched_group_cpus(group)))
+			goto redo;
 		goto out_balanced;
 	}
 
@@ -4790,6 +5212,8 @@ redo:
 	spin_lock(&this_rq->lock);
 	if (busiest->nr_running > 1) {
 		/* Attempt to move tasks */
+		all_pinned = 1;
+		update_h_load(cpu_of(busiest));
 		double_lock_balance(this_rq, busiest);
 		ld_moved = move_tasks(this_rq, this_cpu, busiest,
 					imbalance, sd, CPU_NEWLY_IDLE,
@@ -4798,6 +5222,10 @@ redo:
 
 		if (unlikely(all_pinned)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
+			if (cpumask_intersects(cpus, sched_group_cpus(group))) {
+				spin_unlock(&this_rq->lock);
+				goto pick_next_queue;
+			}
 			if (!cpumask_empty(cpus))
 				goto redo;
 		}
@@ -5269,6 +5697,7 @@ static void rebalance_domains(int cpu, e
 	int update_next_balance = 0;
 	int need_serialize, need_decay = 0;
 	u64 max_cost = 0;
+	unsigned long balance_delegate = rq->balance_delegate;
 
 	update_shares(cpu);
 
@@ -5300,6 +5729,9 @@ static void rebalance_domains(int cpu, e
 		}
 
 		interval = sd->balance_interval;
+		if (test_bit(sd->level, &balance_delegate))
+			/* balancing was delegated to us, do not wait long */
+			interval = min(interval, sd->max_interval);
 		if (idle != CPU_IDLE)
 			interval *= sd->busy_factor;
 
@@ -5335,6 +5767,10 @@ out:
 			next_balance = sd->last_balance + interval;
 			update_next_balance = 1;
 		}
+
+		__clear_bit(sd->level, &balance_delegate);
+		if (balance_delegate)
+			balance = 1;
 	}
 	if (need_decay) {
 		/*
@@ -5610,6 +6046,25 @@ unsigned long long thread_group_sched_ru
 	return ns;
 }
 
+static inline void task_group_account_field(struct task_struct *p,
+						u64 tmp, int index)
+{
+#ifdef CONFIG_CGROUP_SCHED
+	struct kernel_cpustat *kcpustat;
+	struct task_group *tg;
+
+	rcu_read_lock();
+	tg = task_group(p);
+	while (tg) {
+		kcpustat = this_cpu_ptr(tg->cpustat);
+		kcpustat->cpustat[index] += tmp;
+		tg = tg->parent;
+	}
+	rcu_read_unlock();
+#endif
+}
+
+
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
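task_group_account_field() above charges a tick's worth of time to the task's group and to every ancestor up to the root, so each level of the container hierarchy sees its cumulative usage. A minimal model of that upward walk, with the per-cpu cpustat reduced to a single counter:

#include <stdio.h>
#include <stddef.h>

struct group {
	struct group *parent;
	unsigned long long user_time;
};

static void account_user(struct group *tg, unsigned long long delta)
{
	for (; tg; tg = tg->parent)
		tg->user_time += delta;	/* per-cpu in the kernel, plain here */
}

int main(void)
{
	struct group root = { NULL, 0 }, ct = { &root, 0 }, child = { &ct, 0 };

	account_user(&child, 10);
	printf("child=%llu ct=%llu root=%llu\n",
	       child.user_time, ct.user_time, root.user_time); /* 10 10 10 */
	return 0;
}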
@@ -5629,10 +6084,13 @@ void account_user_time(struct task_struc
 
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
-	if (TASK_NICE(p) > 0)
+	if (TASK_NICE(p) > 0) {
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
-	else
+		task_group_account_field(p, tmp, NICE);
+	} else {
 		cpustat->user = cputime64_add(cpustat->user, tmp);
+		task_group_account_field(p, tmp, USER);
+	}
 
 	cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
 	/* Account for user time used */
@@ -5696,6 +6154,7 @@ void account_system_time(struct task_str
 	else
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 
+	task_group_account_field(p, tmp, SYSTEM);
 	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
 
 	/* Account for system time used */
@@ -6027,8 +6486,6 @@ EXPORT_SYMBOL(sub_preempt_count);
  */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
-	struct pt_regs *regs = get_irq_regs();
-
 	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
 		prev->comm, prev->pid, preempt_count());
 
@@ -6036,11 +6493,7 @@ static noinline void __schedule_bug(stru
 	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
-
-	if (regs)
-		show_regs(regs);
-	else
-		dump_stack();
+	dump_stack();
 }
 
 /*
@@ -6156,6 +6609,21 @@ need_resched_nonpreemptible:
 		rq->curr = next;
 		++*switch_count;
 
+#ifdef CONFIG_VE
+		prev->ve_task_info.sleep_stamp = rq->clock;
+		if (prev->state == TASK_RUNNING && prev != this_rq()->idle)
+			write_wakeup_stamp(prev, rq->clock);
+		update_sched_lat(next, rq->clock);
+
+		/* because next & prev are protected by the
+		 * runqueue lock, we need not worry about
+		 * wakeup_stamp and sched_time protection
+		 * (the same applies in the 'else' branch below)
+		 */
+		next->ve_task_info.sched_time = rq->clock;
+		write_wakeup_stamp(next, 0);
+#endif
+
 		context_switch(rq, prev, next); /* unlocks the rq */
 		/*
 		 * the context switch might have flipped the stack from under
@@ -6174,6 +6642,8 @@ need_resched_nonpreemptible:
 	preempt_enable_no_resched();
 	if (need_resched())
 		goto need_resched;
+
+	task_scheduled(rq, current);
 }
 EXPORT_SYMBOL(schedule);
 
@@ -6461,7 +6931,8 @@ void complete_all(struct completion *x)
 EXPORT_SYMBOL(complete_all);
 
 static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
+do_wait_for_common(struct completion *x,
+		   long (*action)(long), long timeout, int state)
 {
 	if (!x->done) {
 		DECLARE_WAITQUEUE(wait, current);
@@ -6475,7 +6946,7 @@ do_wait_for_common(struct completion *x,
 			}
 			__set_current_state(state);
 			spin_unlock_irq(&x->wait.lock);
-			timeout = schedule_timeout(timeout);
+			timeout = action(timeout);
 			spin_lock_irq(&x->wait.lock);
 		} while (!x->done && timeout);
 		__remove_wait_queue(&x->wait, &wait);
@@ -6486,17 +6957,30 @@ do_wait_for_common(struct completion *x,
 	return timeout ?: 1;
 }
 
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
+static inline long __sched
+__wait_for_common(struct completion *x,
+		  long (*action)(long), long timeout, int state)
 {
 	might_sleep();
 
 	spin_lock_irq(&x->wait.lock);
-	timeout = do_wait_for_common(x, timeout, state);
+	timeout = do_wait_for_common(x, action, timeout, state);
 	spin_unlock_irq(&x->wait.lock);
 	return timeout;
 }
 
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+	return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+	return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
 /**
  * wait_for_completion: - waits for completion of a task
  * @x:  holds the state of this particular completion
@@ -6513,6 +6997,12 @@ void __sched wait_for_completion(struct 
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+void __sched wait_for_completion_io(struct completion *x)
+{
+	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
 /**
  * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
  * @x:  holds the state of this particular completion
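The completion-wait refactor threads an action callback through the generic loop, so wait_for_common_io() only has to substitute io_schedule_timeout() for schedule_timeout() and the sleep is accounted as iowait. A simplified userspace model of the parameterization, with the return convention reduced to the essentials:

#include <stdio.h>

static long plain_sleep(long timeout)
{
	printf("sleeping (plain)\n");
	return timeout - 1;
}

static long io_sleep(long timeout)
{
	printf("sleeping (accounted as iowait)\n");
	return timeout - 1;
}

/* models do_wait_for_common(): the loop is identical, only the
 * action used to give up the cpu differs */
static long wait_common(const int *done, long (*action)(long), long timeout)
{
	while (!*done && timeout)
		timeout = action(timeout);
	return timeout ? timeout : 1;
}

int main(void)
{
	int done = 0;

	wait_common(&done, plain_sleep, 2);	/* wait_for_common() */
	done = 1;
	wait_common(&done, io_sleep, 2);	/* wait_for_common_io() */
	return 0;
}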
@@ -6982,7 +7472,7 @@ recheck:
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
-	if (user && !capable(CAP_SYS_NICE)) {
+	if (user && !capable(CAP_SYS_ADMIN)) {
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio;
 
@@ -7101,7 +7591,7 @@ int sched_setscheduler(struct task_struc
 {
 	return __sched_setscheduler(p, policy, param, true);
 }
-EXPORT_SYMBOL_GPL(sched_setscheduler);
+EXPORT_SYMBOL(sched_setscheduler);
 
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
@@ -7119,6 +7609,7 @@ int sched_setscheduler_nocheck(struct ta
 {
 	return __sched_setscheduler(p, policy, param, false);
 }
+EXPORT_SYMBOL(sched_setscheduler_nocheck);
 
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -7238,6 +7729,9 @@ long sched_setaffinity(pid_t pid, const 
 	struct task_struct *p;
 	int retval;
 
+	if (!ve_is_super(get_exec_env()))
+		return 0;
+
 	get_online_cpus();
 	rcu_read_lock();
 
@@ -7347,6 +7841,12 @@ long sched_getaffinity(pid_t pid, struct
 	if (retval)
 		goto out_unlock;
 
+	if (!ve_is_super(get_exec_env())) {
+		cpumask_clear(mask);
+		bitmap_fill(cpumask_bits(mask), num_online_vcpus());
+		goto out_unlock;
+	}
+
 	rq = task_rq_lock(p, &flags);
 	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
 	task_rq_unlock(rq, &flags);
@@ -7354,7 +7854,6 @@ long sched_getaffinity(pid_t pid, struct
 out_unlock:
 	rcu_read_unlock();
 	put_online_cpus();
-
 	return retval;
 }
 
@@ -7424,23 +7923,37 @@ static inline int should_resched(void)
 	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
 }
 
-static void __cond_resched(void)
+static void __cond_resched(int may_throttle)
 {
 	add_preempt_count(PREEMPT_ACTIVE);
+	if (may_throttle)
+		current->may_throttle = 1;
 	schedule();
+	if (may_throttle)
+		current->may_throttle = 0;
 	sub_preempt_count(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
 {
 	if (should_resched()) {
-		__cond_resched();
+		__cond_resched(0);
 		return 1;
 	}
 	return 0;
 }
 EXPORT_SYMBOL(_cond_resched);
 
+int __sched _cond_resched_may_throttle(void)
+{
+	if (should_resched()) {
+		__cond_resched(1);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_cond_resched_may_throttle);
+
 /*
  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
@@ -7459,7 +7972,7 @@ int __cond_resched_lock(spinlock_t *lock
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
-			__cond_resched();
+			__cond_resched(0);
 		else
 			cpu_relax();
 		ret = 1;
@@ -7475,7 +7988,7 @@ int __sched __cond_resched_softirq(void)
 
 	if (should_resched()) {
 		local_bh_enable();
-		__cond_resched();
+		__cond_resched(0);
 		local_bh_disable();
 		return 1;
 	}
@@ -7705,26 +8218,17 @@ void sched_show_task(struct task_struct 
 	state = p->state ? __ffs(p->state) + 1 : 0;
 	printk(KERN_INFO "%-13.13s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
-	if (state == TASK_RUNNING)
-		printk(KERN_CONT " running  ");
-	else
-		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
-	if (state == TASK_RUNNING)
-		printk(KERN_CONT "  running task    ");
-	else
-		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
+	printk(KERN_CONT " %p ", p);
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent),
+	printk(KERN_CONT "%5lu %5d %6d %4u 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent), task_veid(p),
 		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
+EXPORT_SYMBOL_GPL(sched_show_task);
 
 void show_state_filter(unsigned long state_filter)
 {
@@ -7732,13 +8236,13 @@ void show_state_filter(unsigned long sta
 
 #if BITS_PER_LONG == 32
 	printk(KERN_INFO
-		"  task                PC stack   pid father\n");
+		"  task          taskaddr stack   pid father veid\n");
 #else
 	printk(KERN_INFO
-		"  task                        PC stack   pid father\n");
+		"  task                  taskaddr stack   pid father veid\n");
 #endif
 	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
+	do_each_thread_all(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take alot of time:
@@ -7746,14 +8250,11 @@ void show_state_filter(unsigned long sta
 		touch_nmi_watchdog();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
-	} while_each_thread(g, p);
+	} while_each_thread_all(g, p);
 
 	touch_all_softlockup_watchdogs();
 	clocksource_touch_watchdog();
 
-#ifdef CONFIG_SCHED_DEBUG
-	sysrq_sched_debug_show();
-#endif
 	read_unlock(&tasklist_lock);
 	/*
 	 * Only show locks if all tasks are dumped:
@@ -7796,7 +8297,7 @@ void __cpuinit init_idle(struct task_str
 	spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT)
+#if defined(CONFIG_PREEMPT_COUNT)
 	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
 #else
 	task_thread_info(idle)->preempt_count = 0;
@@ -8107,6 +8608,8 @@ static void migrate_nr_uninterruptible(s
 	double_rq_lock(rq_src, rq_dest);
 	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
 	rq_src->nr_uninterruptible = 0;
+	rq_dest->nr_iothrottled += rq_src->nr_iothrottled;
+	rq_src->nr_iothrottled = 0;
 	double_rq_unlock(rq_src, rq_dest);
 	local_irq_restore(flags);
 }
@@ -8118,13 +8621,13 @@ static void migrate_live_tasks(int src_c
 
 	read_lock(&tasklist_lock);
 
-	do_each_thread(t, p) {
+	do_each_thread_all(t, p) {
 		if (p == current)
 			continue;
 
 		if (task_cpu(p) == src_cpu)
 			move_task_off_dead_cpu(src_cpu, p);
-	} while_each_thread(t, p);
+	} while_each_thread_all(t, p);
 
 	read_unlock(&tasklist_lock);
 }
@@ -8155,6 +8658,13 @@ void sched_idle_next(void)
 	activate_task(rq, p, 0);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
+
+	/*
+	 * Disable cpuid faulting when a cpu goes offline. Note, it cannot be
+	 * re-enabled when switching to the idle task, because idle tasks do
+	 * not have mm (see finish_task_switch()).
+	 */
+	set_cpuid_faulting(false);
 }
 
 /*
@@ -8467,6 +8977,7 @@ migration_call(struct notifier_block *nf
 
 			set_rq_online(rq);
 		}
+		start_cfs_idle_time_accounting(cpu);
 		spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 
@@ -8507,6 +9018,7 @@ migration_call(struct notifier_block *nf
 		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
 		rq->idle->sched_class = &idle_sched_class;
 		migrate_dead_tasks(cpu);
+		stop_cfs_idle_time_accounting(cpu);
 		spin_unlock_irq(&rq->lock);
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
@@ -8677,7 +9189,7 @@ static int sched_domain_debug_one(struct
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
 		printk(KERN_CONT " %s", str);
-		if (group->cpu_power != SCHED_LOAD_SCALE) {
+		if (group->cpu_power != SCHED_POWER_SCALE) {
 			printk(KERN_CONT " (cpu_power = %d)",
 				group->cpu_power);
 		}
@@ -9233,7 +9745,7 @@ static void init_numa_sched_groups_power
 		return;
 	do {
 		/* Estimate the final value to avoid /0 on weird topologies. */
-		sg->cpu_power = SCHED_LOAD_SCALE * cpumask_weight(
+		sg->cpu_power = SCHED_POWER_SCALE * cpumask_weight(
 		                    sched_group_cpus(sg));
 		for_each_cpu(j, sched_group_cpus(sg)) {
 			struct sched_domain *sd;
@@ -9249,7 +9761,7 @@ static void init_numa_sched_groups_power
 
 			sg->cpu_power += sd->groups->cpu_power;
 			/* Discharge initial estimate of updated cpus. */
-			sg->cpu_power -= SCHED_LOAD_SCALE * cpumask_weight(
+			sg->cpu_power -= SCHED_POWER_SCALE * cpumask_weight(
 			                     sched_group_cpus(sd->groups));
 		}
 		sg = sg->next;
@@ -9288,7 +9800,7 @@ static int build_numa_sched_groups(struc
 	}
 
 	/* Weird topologies might otherwise result in a /0 trap */
-	sg->cpu_power = SCHED_LOAD_SCALE * cpumask_weight(d->nodemask);
+	sg->cpu_power = SCHED_POWER_SCALE * cpumask_weight(d->nodemask);
 	cpumask_copy(sched_group_cpus(sg), d->nodemask);
 	sg->next = sg;
 	cpumask_or(d->covered, d->covered, d->nodemask);
@@ -9311,7 +9823,7 @@ static int build_numa_sched_groups(struc
 			       "Can not alloc domain group for node %d\n", j);
 			return -ENOMEM;
 		}
-		sg->cpu_power = SCHED_LOAD_SCALE * cpumask_weight(d->tmpmask);
+		sg->cpu_power = SCHED_POWER_SCALE * cpumask_weight(d->tmpmask);
 		cpumask_copy(sched_group_cpus(sg), d->tmpmask);
 		sg->next = prev->next;
 		cpumask_or(d->covered, d->covered, d->tmpmask);
@@ -10274,6 +10786,11 @@ static void init_tg_cfs_entry(struct tas
 	se->my_q = cfs_rq;
 	update_load_set(&se->load, 0);
 	se->parent = parent;
+
+#ifdef CONFIG_SCHEDSTATS
+	if (cpu_online(cpu))
+		se->sleep_start = cpu_clock(cpu);
+#endif
 }
 #endif
 
@@ -10367,6 +10884,9 @@ void __init sched_init(void)
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 
+	root_task_group.cpustat = alloc_percpu(struct kernel_cpustat);
+	root_task_group.taskstats = alloc_percpu(struct taskstats);
+
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 #endif
@@ -10388,6 +10908,8 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&init_task_group.children);
 	autogroup_init(&init_task);
 
+	root_task_group.start_time = (struct timespec){0, 0};
+
 #ifdef CONFIG_USER_SCHED
 	INIT_LIST_HEAD(&root_task_group.children);
 	init_task_group.parent = &root_task_group;
@@ -10407,6 +10929,7 @@ void __init sched_init(void)
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.shares = init_task_group_load;
+		init_task_group.orig_shares = init_task_group.shares;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
 #ifdef CONFIG_CGROUP_SCHED
 		/*
@@ -10431,6 +10954,7 @@ void __init sched_init(void)
 		init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
 #elif defined CONFIG_USER_SCHED
 		root_task_group.shares = NICE_0_LOAD;
+		root_task_group.orig_shares = root_task_group.shares;
 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 		/*
@@ -10606,7 +11130,7 @@ void normalize_rt_tasks(void)
 	struct rq *rq;
 
 	read_lock_irqsave(&tasklist_lock, flags);
-	do_each_thread(g, p) {
+	do_each_thread_all(g, p) {
 		/*
 		 * Only normalize user tasks:
 		 */
@@ -10637,7 +11161,7 @@ void normalize_rt_tasks(void)
 
 		__task_rq_unlock(rq);
 		spin_unlock(&p->pi_lock);
-	} while_each_thread(g, p);
+	} while_each_thread_all(g, p);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
 }
@@ -10722,6 +11246,7 @@ int alloc_fair_sched_group(struct task_g
 		goto err;
 
 	tg->shares = NICE_0_LOAD;
+	tg->orig_shares = tg->shares;
 
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
@@ -10854,9 +11379,14 @@ static void free_sched_group(struct task
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
 	autogroup_free(tg);
+	free_percpu(tg->cpustat);
+	free_percpu(tg->taskstats);
+	kfree(tg->cpustat_last);
+	kfree(tg->vcpustat);
 	kfree(tg);
 }
 
+static void update_effective_shares(struct task_group *parent);
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -10873,6 +11403,33 @@ struct task_group *sched_create_group(st
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+	tg->cpustat = alloc_percpu(struct kernel_cpustat);
+	if (!tg->cpustat)
+		goto err;
+
+	tg->taskstats = alloc_percpu(struct taskstats);
+	if (!tg->taskstats)
+		goto err;
+
+	tg->cpustat_last = kcalloc(nr_cpu_ids, sizeof(struct kernel_cpustat),
+				   GFP_KERNEL);
+	if (!tg->cpustat_last)
+		goto err;
+
+	tg->vcpustat = kcalloc(nr_cpu_ids, sizeof(struct kernel_cpustat),
+			       GFP_KERNEL);
+	if (!tg->vcpustat)
+		goto err;
+
+	tg->vcpustat_last_update = ktime_set(0, 0);
+	spin_lock_init(&tg->vcpustat_lock);
+
+	update_effective_shares(parent);
+
+	/* start_time is the saved CT0 uptime */
+	do_posix_clock_monotonic_gettime(&tg->start_time);
+	monotonic_to_bootbased(&tg->start_time);
+
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_add_rcu(&tg->list, &task_groups);
 
@@ -10881,6 +11438,8 @@ struct task_group *sched_create_group(st
 	tg->parent = parent;
 	INIT_LIST_HEAD(&tg->children);
 	list_add_rcu(&tg->siblings, &parent->children);
+	if (parent != &root_task_group)
+		tg_multilevel_hierarchy++;
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	return tg;
@@ -10909,8 +11468,12 @@ void sched_destroy_group(struct task_gro
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
+	if (tg->parent != &root_task_group)
+		tg_multilevel_hierarchy--;
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
+	update_effective_shares(tg->parent);
+
 	/* wait for possible concurrent references to cfs_rqs complete */
 	call_rcu(&tg->rcu, free_sched_group_rcu);
 }
@@ -10931,6 +11494,17 @@ void __sched_move_task(struct task_struc
 
 	if (on_rq)
 		dequeue_task(rq, tsk, 0);
+	else {
+		if (!(tsk->state & TASK_WAKING) && tsk->in_iowait &&
+				tsk->sched_class->nr_iowait_dec)
+			tsk->sched_class->nr_iowait_dec(tsk);
+
+		if (task_contributes_to_load(tsk))
+			task_cfs_rq(tsk)->nr_unint--;
+
+		check_dec_sleeping(rq, tsk);
+	}
+
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 
@@ -10945,6 +11519,16 @@ void __sched_move_task(struct task_struc
 		tsk->sched_class->set_curr_task(rq);
 	if (on_rq)
 		enqueue_task(rq, tsk, 0);
+	else {
+		if (!(tsk->state & TASK_WAKING) && tsk->in_iowait &&
+				tsk->sched_class->nr_iowait_inc)
+			tsk->sched_class->nr_iowait_inc(tsk);
+
+		if (task_contributes_to_load(tsk))
+			task_cfs_rq(tsk)->nr_unint++;
+
+		check_inc_sleeping(rq, tsk);
+	}
 }
 
 void sched_move_task(struct task_struct *tsk)
@@ -10961,27 +11545,11 @@ void sched_move_task(struct task_struct 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static DEFINE_MUTEX(shares_mutex);
 
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+static void propagate_shares_change(struct task_group *tg)
 {
 	int i;
 	unsigned long flags;
 
-	/*
-	 * We can't change the weight of the root cgroup.
-	 */
-	if (!tg->se[0])
-		return -EINVAL;
-
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
-
-	mutex_lock(&shares_mutex);
-	if (tg->shares == shares)
-		goto done;
-
-	tg->shares = shares;
 	for_each_possible_cpu(i) {
 		struct rq *rq = cpu_rq(i);
 		struct sched_entity *se;
@@ -10993,7 +11561,77 @@ int sched_group_set_shares(struct task_g
 			update_cfs_shares(group_cfs_rq(se));
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
+}
+
+static void set_effective_shares(struct task_group *tg, unsigned long total)
+{
+	unsigned long min_shares, shares;
+
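+	/*
+	 * Guarantee the group at least min_shares_pct percent of the
+	 * combined shares of its parent's children, capped at MAX_SHARES.
+	 */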
+	min_shares = total * tg->min_shares_pct / 100;
+	shares = min(max(tg->orig_shares, min_shares), scale_load(MAX_SHARES));
+
+	if (tg->shares != shares) {
+		tg->shares = shares;
+		propagate_shares_change(tg);
+	}
+}
+
+static void update_effective_shares_locked(struct task_group *parent)
+{
+	struct task_group *child;
+	unsigned long total = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(child, &parent->children, siblings)
+		total += child->orig_shares;
+
+	list_for_each_entry_rcu(child, &parent->children, siblings)
+		set_effective_shares(child, total);
+
+	rcu_read_unlock();
+}
+
+static void update_effective_shares(struct task_group *parent)
+{
+	mutex_lock(&shares_mutex);
+	update_effective_shares_locked(parent);
+	mutex_unlock(&shares_mutex);
+}
+
+static void update_effective_shares_group_locked(struct task_group *tg)
+{
+	struct task_group *child;
+	unsigned long total = 0;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(child, &tg->parent->children, siblings)
+		total += child->orig_shares;
+
+	rcu_read_unlock();
 
+	set_effective_shares(tg, total);
+}
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+	/*
+	 * We can't change the weight of the root cgroup.
+	 */
+	if (!tg->se[0])
+		return -EINVAL;
+
+	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
+
+	mutex_lock(&shares_mutex);
+	if (tg->orig_shares == shares)
+		goto done;
+
+	tg->shares = shares;
+	tg->orig_shares = shares;
+	update_effective_shares_locked(tg->parent);
+
+	propagate_shares_change(tg);
 done:
 	mutex_unlock(&shares_mutex);
 	return 0;
@@ -11003,6 +11641,10 @@ unsigned long sched_group_shares(struct 
 {
 	return tg->shares;
 }
+#else
+static void update_effective_shares(struct task_group *parent)
+{
+}
 #endif
 
 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -11026,10 +11668,10 @@ static inline int tg_has_rt_tasks(struct
 {
 	struct task_struct *g, *p;
 
-	do_each_thread(g, p) {
+	do_each_thread_ve(g, p) {
 		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
 			return 1;
-	} while_each_thread(g, p);
+	} while_each_thread_ve(g, p);
 
 	return 0;
 }
@@ -11378,22 +12020,86 @@ cpu_cgroup_exit(struct cgroup_subsys *ss
 		return;
 
 	sched_move_task(task);
+
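+	/*
+	 * Fold the departing thread group's delay accounting into the old
+	 * cgroup's per-cpu taskstats so that it is not lost.
+	 */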
+	if (thread_group_leader(task)) {
+		struct task_group *tg = cgroup_tg(old_cgrp);
+		struct taskstats *stats = get_cpu_ptr(tg->taskstats);
+		struct signal_struct *sig = task->signal;
+
+		if (sig->stats)
+			delayacct_add_stats(stats, sig->stats);
+		else
+			delayacct_add_tsk(stats, task);
+
+		put_cpu_ptr(stats);
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
 {
-	return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+	return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
 }
 
 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
 
-	return (u64) tg->shares;
+	return (u64) scale_load_down(tg->orig_shares);
 }
 
+static u64 cpu_effective_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	return (u64) scale_load_down(tg->shares);
+}
+
+static int cpu_min_shares_pct_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+				u64 min_shares)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (!tg->se[0])
+		return -EINVAL;
+
+	mutex_lock(&shares_mutex);
+	tg->min_shares_pct = min_shares;
+	update_effective_shares_group_locked(tg);
+	mutex_unlock(&shares_mutex);
+
+	return 0;
+}
+
+static u64 cpu_min_shares_pct_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	return (u64) tg->min_shares_pct;
+}
+
+int sched_cgroup_set_shares(struct cgroup *cgrp, unsigned long shares)
+{
+	return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shares));
+}
+
+unsigned long sched_cgroup_get_shares(struct cgroup *cgrp)
+{
+	return scale_load_down(cgroup_tg(cgrp)->orig_shares);
+}
+
+unsigned long sched_cgroup_get_nr_running(struct cgroup *cgrp)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	unsigned long i, sum = 0;
+
+	/* FIXME make it recursive over sub-cgroups */
+	for_each_online_cpu(i)
+		sum += tg->cfs_rq[i]->nr_running;
+
+	return sum;
+}
 #ifdef CONFIG_CFS_BANDWIDTH
 static DEFINE_MUTEX(cfs_constraints_mutex);
 
@@ -11402,7 +12108,8 @@ const u64 min_cfs_quota_period = 1 * NSE
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+/* call with cfs_constraints_mutex held */
+static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
 	int i, ret = 0, runtime_enabled;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
@@ -11426,10 +12133,9 @@ static int tg_set_cfs_bandwidth(struct t
 	if (period > max_cfs_quota_period)
 		return -EINVAL;
 
-	mutex_lock(&cfs_constraints_mutex);
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	runtime_enabled = quota != RUNTIME_INF;
 	spin_lock_irq(&cfs_b->lock);
@@ -11437,6 +12143,7 @@ static int tg_set_cfs_bandwidth(struct t
 	cfs_b->quota = quota;
 
 	__refill_cfs_bandwidth_runtime(cfs_b);
+	update_cfs_bandwidth_idle_scale(cfs_b);
 	/* restart the period timer (if active) to handle new period expiry */
 	if (runtime_enabled && cfs_b->timer_active) {
 		/* force a reprogram */
@@ -11451,13 +12158,24 @@ static int tg_set_cfs_bandwidth(struct t
 
 		spin_lock_irq(&rq->lock);
 		cfs_rq->runtime_enabled = runtime_enabled;
-		cfs_rq->runtime_remaining = 0;
+		cfs_rq->runtime_remaining = 1;
 
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 		spin_unlock_irq(&rq->lock);
 	}
-out_unlock:
+	return ret;
+}
+
+static void tg_update_cpu_limit(struct task_group *tg);
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+	int ret;
+
+	mutex_lock(&cfs_constraints_mutex);
+	ret = __tg_set_cfs_bandwidth(tg, period, quota);
+	tg_update_cpu_limit(tg);
 	mutex_unlock(&cfs_constraints_mutex);
 
 	return ret;
@@ -11624,10 +12342,115 @@ static int cpu_stats_show(struct cgroup 
 
 	return 0;
 }
+
+#ifdef CONFIG_CFS_CPULIMIT
+static void tg_update_cpu_limit(struct task_group *tg)
+{
+	long quota, period;
+	unsigned long rate = 0;
+
+	quota = tg_get_cfs_quota(tg);
+	period = tg_get_cfs_period(tg);
+
+	if (quota > 0 && period > 0) {
+		rate = quota * MAX_CPU_RATE / period;
+		rate = max(rate, 1UL);
+	}
+
+	tg->cpu_rate = rate;
+	tg->nr_cpus = 0;
+}
+
+static int tg_set_cpu_limit(struct task_group *tg,
+			    unsigned long cpu_rate, unsigned int nr_cpus)
+{
+	int ret;
+	unsigned long rate;
+	u64 quota = RUNTIME_INF;
+	u64 period = default_cfs_period();
+
+	rate = (cpu_rate && nr_cpus) ?
+		min_t(unsigned long, cpu_rate, nr_cpus * MAX_CPU_RATE) :
+		max_t(unsigned long, cpu_rate, nr_cpus * MAX_CPU_RATE);
+	if (rate) {
+		quota = div_u64(period * rate, MAX_CPU_RATE);
+		quota = max(quota, min_cfs_quota_period);
+	}
+
+	mutex_lock(&cfs_constraints_mutex);
+	ret = __tg_set_cfs_bandwidth(tg, period, quota);
+	if (!ret) {
+		tg->cpu_rate = cpu_rate;
+		tg->nr_cpus = nr_cpus;
+	}
+	mutex_unlock(&cfs_constraints_mutex);
+
+	return ret;
+}
+
+int sched_cgroup_set_rate(struct cgroup *cgrp, unsigned long rate)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (rate > num_online_cpus() * MAX_CPU_RATE)
+		rate = num_online_cpus() * MAX_CPU_RATE;
+	return tg_set_cpu_limit(tg, rate, tg->nr_cpus);
+}
+
+unsigned long sched_cgroup_get_rate(struct cgroup *cgrp)
+{
+	return cgroup_tg(cgrp)->cpu_rate;
+}
+
+int sched_cgroup_set_nr_cpus(struct cgroup *cgrp, unsigned int nr_cpus)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (nr_cpus > num_online_cpus())
+		nr_cpus = num_online_cpus();
+	return tg_set_cpu_limit(tg, tg->cpu_rate, nr_cpus);
+}
+
+unsigned int sched_cgroup_get_nr_cpus(struct cgroup *cgrp)
+{
+	return cgroup_tg(cgrp)->nr_cpus;
+}
+
+static u64 cpu_rate_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return sched_cgroup_get_rate(cgrp);
+}
+
+static int cpu_rate_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+			      u64 rate)
+{
+	return sched_cgroup_set_rate(cgrp, rate);
+}
+
+static u64 nr_cpus_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return sched_cgroup_get_nr_cpus(cgrp);
+}
+
+static int nr_cpus_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+			     u64 nr_cpus)
+{
+	return sched_cgroup_set_nr_cpus(cgrp, nr_cpus);
+}
+#else
+static void tg_update_cpu_limit(struct task_group *tg)
+{
+}
+#endif /* CONFIG_CFS_CPULIMIT */
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
+int sched_cgroup_set_rt_runtime(struct cgroup *cgrp, long rt_runtime_us)
+{
+	return sched_group_set_rt_runtime(cgroup_tg(cgrp), rt_runtime_us);
+}
+
 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
 				s64 val)
 {
@@ -11651,13 +12474,486 @@ static u64 cpu_rt_period_read_uint(struc
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+void sched_group_set_start_time(struct task_struct *tsk,
+				const struct timespec *ts)
+{
+	task_group(tsk)->start_time = *ts;
+}
+EXPORT_SYMBOL(sched_group_set_start_time);
+
+static u64 cpu_cgroup_usage_cpu(struct task_group *tg, int i)
+{
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SCHEDSTATS)
+	/* root_task_group has no sched entities */
+	if (tg == &root_task_group)
+		return cpu_rq(i)->rq_cpu_time;
+
+	return tg->se[i]->sum_exec_runtime;
+#else
+	return 0;
+#endif
+}
+
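+/*
+ * Translate a group's per-cpu sched_entity schedstats into kernel_cpustat:
+ * runqueue wait time is reported as steal, sleep time as idle and block
+ * time as iowait, including the interval still in progress.
+ */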
+static void cpu_cgroup_update_stat(struct task_group *tg, int i)
+{
+#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
+	struct sched_entity *se = tg->se[i];
+	struct kernel_cpustat *kcpustat = per_cpu_ptr(tg->cpustat, i);
+	u64 now = cpu_clock(i);
+	u64 delta, idle, iowait;
+
+	/* root_task_group has no sched entities */
+	if (tg == &root_task_group)
+		return;
+
+	iowait = se->iowait_sum;
+	idle = se->sum_sleep_runtime;
+	kcpustat->cpustat[STEAL] = se->wait_sum;
+
+	if (idle > iowait)
+		idle -= iowait;
+	else
+		idle = 0;
+
+	if (se->sleep_start) {
+		delta = now - se->sleep_start;
+		if ((s64)delta > 0)
+			idle += delta;
+	} else if (se->block_start) {
+		delta = now - se->block_start;
+		if ((s64)delta > 0)
+			iowait += delta;
+	} else if (se->wait_start) {
+		delta = now - se->wait_start;
+		if ((s64)delta > 0)
+			kcpustat->cpustat[STEAL] += delta;
+	}
+
+	kcpustat->cpustat[IDLE] = max(kcpustat->cpustat[IDLE], idle);
+	kcpustat->cpustat[IOWAIT] = max(kcpustat->cpustat[IOWAIT], iowait);
+
+	kcpustat->cpustat[USED] = cpu_cgroup_usage_cpu(tg, i);
+#endif
+}
+
+static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
+				       struct kernel_cpustat *rem, int ind,
+				       u64 cur_usage, u64 target_usage,
+				       u64 rem_usage)
+{
+	s64 scaled_val;
+	u32 scale_pct = 0;
+
+	/* distribute the delta among USER, NICE, and SYSTEM proportionally */
+	if (cur_usage < target_usage) {
+		if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * rem->cpustat[ind],
+					      rem_usage);
+	} else {
+		if ((s64)cur_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * cur->cpustat[ind],
+					      cur_usage);
+	}
+
+	scaled_val = div_s64(scale_pct * (target_usage - cur_usage), 100);
+
+	cur->cpustat[ind] += scaled_val;
+	if ((s64)cur->cpustat[ind] < 0)
+		cur->cpustat[ind] = 0;
+
+	rem->cpustat[ind] -= scaled_val;
+	if ((s64)rem->cpustat[ind] < 0)
+		rem->cpustat[ind] = 0;
+}
+
+static void calc_vcpustat_delta_idle(struct kernel_cpustat *cur,
+				     int ind, u64 cur_idle, u64 target_idle)
+{
+	/*
+	 * Distribute target_idle between IDLE and IOWAIT proportionally to
+	 * what we initially had on this vcpu.
+	 */
+	if ((s64)cur_idle > 0) {
+		u32 scale_pct = div64_u64(100 * cur->cpustat[ind], cur_idle);
+		cur->cpustat[ind] = div_u64(scale_pct * target_idle, 100);
+	} else {
+		cur->cpustat[ind] = ind == IDLE ? target_idle : 0;
+	}
+}
+
+static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
+				 struct kernel_cpustat *rem,
+				 u64 max_usage)
+{
+	u64 cur_usage, target_usage, rem_usage;
+	u64 cur_idle, target_idle;
+
+	cur_usage = kernel_cpustat_total_usage(cur);
+	rem_usage = kernel_cpustat_total_usage(rem);
+
+	target_usage = min(cur_usage + rem_usage,
+			   max_usage);
+
+	if (cur_usage != target_usage) {
+		fixup_vcpustat_delta_usage(cur, rem, USER,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, NICE,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, SYSTEM,
+				cur_usage, target_usage, rem_usage);
+	}
+
+	cur_idle = kernel_cpustat_total_idle(cur);
+	target_idle = (max_usage - target_usage) * TICK_NSEC;
+
+	if (cur_idle != target_idle) {
+		calc_vcpustat_delta_idle(cur, IDLE,
+					 cur_idle, target_idle);
+		calc_vcpustat_delta_idle(cur, IOWAIT,
+					 cur_idle, target_idle);
+	}
+
+	cur->cpustat[USED] = target_usage * TICK_NSEC;
+
+	/* do not show steal time inside ve */
+	cur->cpustat[STEAL] = 0;
+}
+
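+/*
+ * Fold per-pcpu usage deltas into nr_vcpus virtual cpus: each vcpu is
+ * capped at max_usage, and the excess is redistributed among the
+ * remaining vcpus on a second pass.
+ */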
+static void update_tg_vcpustat(struct task_group *tg)
+{
+	int i, j;
+	int nr_vcpus;
+	int vcpu_rate;
+	ktime_t now;
+	u64 abs_delta_ns, max_usage;
+	struct kernel_cpustat stat_delta, stat_rem;
+	int first_pass = 1;
+
+	spin_lock(&tg->vcpustat_lock);
+
+	now = ktime_get();
+	nr_vcpus = tg->nr_cpus ?: num_online_cpus();
+	vcpu_rate = DIV_ROUND_UP(tg->cpu_rate, nr_vcpus);
+	if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
+		vcpu_rate = MAX_CPU_RATE;
+
+	if (!ktime_to_ns(tg->vcpustat_last_update)) {
+		/*
+		 * On the first read, initialize vcpu i stat as the sum of
+		 * stats over pcpus j such that j % nr_vcpus == i.
+		 */
+		for (i = 0; i < nr_vcpus; i++) {
+			for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+				if (!cpu_possible(j))
+					continue;
+				kernel_cpustat_add(tg->vcpustat + i,
+						   per_cpu_ptr(tg->cpustat, j),
+						   tg->vcpustat + i);
+			}
+		}
+		goto out_update_last;
+	}
+
+	abs_delta_ns = ktime_to_ns(ktime_sub(now, tg->vcpustat_last_update));
+	max_usage = div_u64(abs_delta_ns, TICK_NSEC);
+	max_usage = div_u64(max_usage * vcpu_rate, MAX_CPU_RATE);
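+	/* the most ticks a single vcpu may have consumed since the last update */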
+	/* don't update stats too often, to avoid calculation errors */
+	if (max_usage < 10)
+		goto out_unlock;
+
+	/* temporarily copy per cpu usage delta to tg->cpustat_last */
+	for_each_possible_cpu(i)
+		kernel_cpustat_sub(per_cpu_ptr(tg->cpustat, i),
+				   tg->cpustat_last + i,
+				   tg->cpustat_last + i);
+
+	/* proceed to calculating per vcpu delta */
+	kernel_cpustat_zero(&stat_rem);
+
+again:
+	for (i = 0; i < nr_vcpus; i++) {
+		int exceeds_max;
+
+		kernel_cpustat_zero(&stat_delta);
+		for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+			if (!cpu_possible(j))
+				continue;
+			kernel_cpustat_add(&stat_delta,
+					   tg->cpustat_last + j, &stat_delta);
+		}
+
+		exceeds_max = kernel_cpustat_total_usage(&stat_delta) >=
+								max_usage;
+		/*
+		 * On the first pass calculate delta for vcpus with usage >
+		 * max_usage in order to accumulate excess in stat_rem.
+		 *
+		 * Once the remainder is accumulated, proceed to the rest of
+		 * vcpus so that it will be distributed among them.
+		 */
+		if (exceeds_max != first_pass)
+			continue;
+
+		fixup_vcpustat_delta(&stat_delta, &stat_rem, max_usage);
+		kernel_cpustat_add(tg->vcpustat + i, &stat_delta,
+				   tg->vcpustat + i);
+	}
+
+	if (first_pass) {
+		first_pass = 0;
+		goto again;
+	}
+out_update_last:
+	for_each_possible_cpu(i)
+		tg->cpustat_last[i] = *per_cpu_ptr(tg->cpustat, i);
+	tg->vcpustat_last_update = now;
+out_unlock:
+	spin_unlock(&tg->vcpustat_lock);
+}
+
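+/*
+ * Render a /proc/stat-like view of the cgroup: inside a container the
+ * per-pcpu numbers are folded into nr_vcpus virtual cpus and steal time
+ * is not exposed.
+ */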
+int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
+				struct seq_file *p)
+{
+	int i;
+	unsigned long jif;
+	cputime64_t user, nice, system, idle, iowait, steal;
+	struct timespec boottime;
+	struct task_group *tg = cgroup_tg(cgrp);
+	int virt = !ve_is_super(get_exec_env()) && tg != &root_task_group;
+	int nr_vcpus = tg->nr_cpus ?: num_online_cpus();
+	struct kernel_cpustat *kcpustat;
+	unsigned long tg_nr_running = 0;
+	unsigned long tg_nr_iowait = 0;
+	unsigned long long tg_nr_switches = 0;
+	unsigned long tg_nr_forks = 0;
+	int virtual = !ve_is_super(get_exec_env());
+
+	getboottime(&boottime);
+	jif = boottime.tv_sec + tg->start_time.tv_sec;
+
+	for_each_possible_cpu(i) {
+		cpu_cgroup_update_stat(tg, i);
+
+		/* root task group has autogrouping, so this doesn't hold */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		tg_nr_running += tg->cfs_rq[i]->nr_running;
+		tg_nr_iowait += tg->cfs_rq[i]->nr_iowait;
+		tg_nr_switches += tg->cfs_rq[i]->nr_switches;
+		tg_nr_forks += tg->cfs_rq[i]->nr_forks;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		tg_nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	if (virt)
+		update_tg_vcpustat(tg);
+
+	user = nice = system = idle = iowait = steal = cputime64_zero;
+
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_possible(i))
+			continue;
+		kcpustat = virt ? tg->vcpustat + i :
+			per_cpu_ptr(tg->cpustat, i);
+		user += kcpustat->cpustat[USER];
+		nice += kcpustat->cpustat[NICE];
+		system += kcpustat->cpustat[SYSTEM];
+		idle += kcpustat->cpustat[IDLE];
+		iowait += kcpustat->cpustat[IOWAIT];
+		steal += kcpustat->cpustat[STEAL];
+	}
+
+	if (virtual)
+		steal = cputime64_zero;
+
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu 0 0 %llu\n",
+		(unsigned long long)cputime64_to_clock_t(user),
+		(unsigned long long)cputime64_to_clock_t(nice),
+		(unsigned long long)cputime64_to_clock_t(system),
+		(unsigned long long)nsec_to_clock_t(idle),
+		(unsigned long long)nsec_to_clock_t(iowait),
+		(unsigned long long)nsec_to_clock_t(steal));
+
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_online(i))
+			continue;
+		kcpustat = virt ? tg->vcpustat + i :
+			per_cpu_ptr(tg->cpustat, i);
+		user = kcpustat->cpustat[USER];
+		nice = kcpustat->cpustat[NICE];
+		system = kcpustat->cpustat[SYSTEM];
+		idle = kcpustat->cpustat[IDLE];
+		iowait = kcpustat->cpustat[IOWAIT];
+		steal = kcpustat->cpustat[STEAL];
+
+		if (virtual)
+			steal = cputime64_zero;
+
+		seq_printf(p,
+			"cpu%d %llu %llu %llu %llu %llu 0 0 %llu\n",
+			i,
+			(unsigned long long)cputime64_to_clock_t(user),
+			(unsigned long long)cputime64_to_clock_t(nice),
+			(unsigned long long)cputime64_to_clock_t(system),
+			(unsigned long long)nsec_to_clock_t(idle),
+			(unsigned long long)nsec_to_clock_t(iowait),
+			(unsigned long long)nsec_to_clock_t(steal));
+	}
+	seq_printf(p, "intr 0\nswap 0 0\n");
+
+	seq_printf(p,
+		"\nctxt %llu\n"
+		"btime %lu\n"
+		"processes %lu\n"
+		"procs_running %lu\n"
+		"procs_blocked %lu\n",
+		tg_nr_switches,
+		(unsigned long)jif,
+		tg_nr_forks,
+		tg_nr_running,
+		tg_nr_iowait);
+
+	return 0;
+}
+
+void cpu_cgroup_get_stat(struct cgroup *cgrp, struct kernel_cpustat *kstat)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	int nr_vcpus = tg->nr_cpus ?: num_online_cpus();
+	int i;
+
+	for_each_possible_cpu(i)
+		cpu_cgroup_update_stat(tg, i);
+
+	update_tg_vcpustat(tg);
+
+	kernel_cpustat_zero(kstat);
+	for (i = 0; i < nr_vcpus; i++)
+		kernel_cpustat_add(tg->vcpustat + i, kstat, kstat);
+}
+
+int cpu_cgroup_get_avenrun(struct cgroup *cgrp, unsigned long *avenrun)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (tg == &root_task_group)
+		return -ENOSYS;
+
+	avenrun[0] = tg->avenrun[0];
+	avenrun[1] = tg->avenrun[1];
+	avenrun[2] = tg->avenrun[2];
+
+	return 0;
+}
+
+static const char *cpuacct_stat_desc[] = {
+	[CPUACCT_STAT_USER] = "user",
+	[CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpu_cgroup_stats_show(struct cgroup *cgrp, struct cftype *cft,
+		struct cgroup_map_cb *cb)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	int cpu;
+	s64 user = 0, sys = 0;
+
+	for_each_present_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(tg->cpustat, cpu);
+		user += kcpustat->cpustat[USER];
+		user += kcpustat->cpustat[NICE];
+		sys += kcpustat->cpustat[SYSTEM];
+	}
+
+	user = cputime64_to_clock_t(user);
+	sys = cputime64_to_clock_t(sys);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], user);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], sys);
+
+	return 0;
+}
+
+static u64 cpu_cgroup_cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	u64 totalcpuusage = 0;
+	int i;
+
+	for_each_present_cpu(i)
+		totalcpuusage += cpu_cgroup_usage_cpu(tg, i);
+
+	return totalcpuusage;
+}
+
+static int cpu_cgroup_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+				      struct seq_file *m)
+{
+	struct task_group *tg = cgroup_tg(cgroup);
+	u64 percpu;
+	int i;
+
+	for_each_present_cpu(i) {
+		percpu = cpu_cgroup_usage_cpu(tg, i);
+		seq_printf(m, "%llu ", (unsigned long long) percpu);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static int cpu_cgroup_delay_show(struct cgroup *cgrp, struct cftype *cft,
+				 struct cgroup_map_cb *cb)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct taskstats stats;
+	struct cgroup_iter it;
+	struct task_struct *p;
+	int cpu;
+
+	memset(&stats, 0, sizeof stats);
+
+	for_each_present_cpu(cpu)
+		delayacct_add_stats(&stats, per_cpu_ptr(tg->taskstats, cpu));
+
+	cgroup_iter_start(cgrp, &it);
+	while ((p = cgroup_iter_next(cgrp, &it))) {
+		if (thread_group_leader(p) && p->signal->stats)
+			delayacct_add_stats(&stats, p->signal->stats);
+		delayacct_add_tsk(&stats, p);
+	}
+	cgroup_iter_end(cgrp, &it);
+
+	cb->fill(cb, "cpu_count", stats.cpu_count);
+	cb->fill(cb, "cpu_delay", stats.cpu_delay_total);
+	cb->fill(cb, "cpu_run_real", stats.cpu_run_real_total);
+	cb->fill(cb, "cpu_run_virtual", stats.cpu_run_virtual_total);
+	cb->fill(cb, "cpu_scaled_run_real", stats.cpu_scaled_run_real_total);
+	cb->fill(cb, "blkio_count", stats.blkio_count);
+	cb->fill(cb, "blkio_delay", stats.blkio_delay_total);
+	cb->fill(cb, "swapin_count", stats.swapin_count);
+	cb->fill(cb, "swapin_delay", stats.swapin_delay_total);
+	cb->fill(cb, "freepages_count", stats.freepages_count);
+	cb->fill(cb, "freepages_delay", stats.freepages_delay_total);
+
+	return 0;
+}
+
 static struct cftype cpu_files[] = {
+	{
+		.name = "proc.stat",
+		.read_seq_string = cpu_cgroup_proc_stat,
+	},
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
 		.read_u64 = cpu_shares_read_u64,
 		.write_u64 = cpu_shares_write_u64,
 	},
+	{
+		.name = "effective_shares",
+		.read_u64 = cpu_effective_shares_read_u64,
+	},
+	{
+		.name = "min_shares_pct",
+		.read_u64 = cpu_min_shares_pct_read_u64,
+		.write_u64 = cpu_min_shares_pct_write_u64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
@@ -11675,6 +12971,18 @@ static struct cftype cpu_files[] = {
 		.read_map = cpu_stats_show,
 	},
 #endif
+#ifdef CONFIG_CFS_CPULIMIT
+	{
+		.name = "rate",
+		.read_u64 = cpu_rate_read_u64,
+		.write_u64 = cpu_rate_write_u64,
+	},
+	{
+		.name = "nr_cpus",
+		.read_u64 = nr_cpus_read_u64,
+		.write_u64 = nr_cpus_write_u64,
+	},
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
@@ -11687,6 +12995,22 @@ static struct cftype cpu_files[] = {
 		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
+	{
+		.name = "acct.stat",
+		.read_map = cpu_cgroup_stats_show,
+	},
+	{
+		.name = "usage",
+		.read_u64 = cpu_cgroup_cpuusage_read,
+	},
+	{
+		.name = "usage_percpu",
+		.read_seq_string = cpu_cgroup_percpu_seq_read,
+	},
+	{
+		.name = "delayacct.total",
+		.read_map = cpu_cgroup_delay_show,
+	},
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -11870,11 +13194,6 @@ static int cpuacct_percpu_seq_read(struc
 	return 0;
 }
 
-static const char *cpuacct_stat_desc[] = {
-	[CPUACCT_STAT_USER] = "user",
-	[CPUACCT_STAT_SYSTEM] = "system",
-};
-
 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
 		struct cgroup_map_cb *cb)
 {
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sched_debug.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_debug.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sched_debug.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_debug.c	2015-01-21 12:02:54.283932765 +0300
@@ -151,12 +151,12 @@ static void print_rq(struct seq_file *m,
 
 	read_lock_irqsave(&tasklist_lock, flags);
 
-	do_each_thread(g, p) {
+	do_each_thread_all(g, p) {
 		if (!p->se.on_rq || task_cpu(p) != rq_cpu)
 			continue;
 
 		print_task(m, rq, p);
-	} while_each_thread(g, p);
+	} while_each_thread_all(g, p);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
 }
@@ -224,6 +224,11 @@ void print_cfs_rq(struct seq_file *m, in
 		   atomic_read(&tg->load_weight));
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+	SEQ_printf(m, "  .%-30s: %d\n", "nr_cpus_active",
+		   atomic_read(&tg->nr_cpus_active));
+#endif
+
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
@@ -361,9 +366,11 @@ static int sched_debug_show(struct seq_f
 	return 0;
 }
 
-static void sysrq_sched_debug_show(void)
+void show_sched_debug(void)
 {
+	read_lock(&tasklist_lock);
 	sched_debug_show(NULL, NULL);
+	read_unlock(&tasklist_lock);
 }
 
 static int sched_debug_open(struct inode *inode, struct file *filp)
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sched_fair.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_fair.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sched_fair.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_fair.c	2015-01-21 12:02:54.649923051 +0300
@@ -109,6 +109,10 @@ unsigned int __read_mostly sysctl_sched_
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
+#endif
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
@@ -325,6 +329,126 @@ find_matching_se(struct sched_entity **s
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->throttled;
+}
+
+static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
+{
+	return !list_empty(&cfs_rq->boosted_entities);
+}
+
+static inline int entity_boosted(struct sched_entity *se)
+{
+	return se->boosted;
+}
+#else
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int entity_boosted(struct sched_entity *se)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_CFS_CPULIMIT
+static inline int cfs_rq_active(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->active;
+}
+
+static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+	/* if we canceled delayed dec, there is no need to do inc */
+	if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
+		atomic_inc(&cfs_rq->tg->nr_cpus_active);
+	cfs_rq->active = 1;
+}
+
+static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+	if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
+		postpone = 0;
+
+	if (!postpone) {
+		cfs_rq->active = 0;
+		atomic_dec(&cfs_rq->tg->nr_cpus_active);
+	} else {
+		__hrtimer_start_range_ns(&cfs_rq->active_timer,
+				ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
+				HRTIMER_MODE_REL_PINNED, 0);
+	}
+}
+
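+/* delayed deactivation: drop an idle cfs_rq from the group's active cpu count */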
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+	struct cfs_rq *cfs_rq =
+		container_of(timer, struct cfs_rq, active_timer);
+	struct rq *rq = rq_of(cfs_rq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	cfs_rq->active = !!cfs_rq->task_weight;
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	atomic_dec(&cfs_rq->tg->nr_cpus_active);
+
+	return HRTIMER_NORESTART;
+}
+
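+/*
+ * Returns 1 if the group may activate more cpus, 0 if it is at its cpu
+ * limit but target_cpu is already active, and -1 if the limit is
+ * exceeded or target_cpu cannot be used without exceeding it.
+ */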
+static inline int check_cpulimit_spread(struct cfs_rq *cfs_rq, int target_cpu)
+{
+	struct task_group *tg = cfs_rq->tg;
+	int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
+	int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
+
+	nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
+		min_t(int, nr_cpus_limit, tg->nr_cpus) :
+		max_t(int, nr_cpus_limit, tg->nr_cpus);
+
+	if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
+		return 1;
+
+	if (nr_cpus_active > nr_cpus_limit)
+		return -1;
+
+	return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
+}
+#else /* !CONFIG_CFS_CPULIMIT */
+static inline int cfs_rq_active(struct cfs_rq *cfs_rq)
+{
+	return 1;
+}
+
+static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+}
+
+static inline enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+	return HRTIMER_NORESTART;
+}
+
+static inline int check_cpulimit_spread(struct cfs_rq *cfs_rq, int target_cpu)
+{
+	return 1;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 				   unsigned long delta_exec);
 
@@ -682,6 +806,98 @@ update_stats_curr_start(struct cfs_rq *c
 	se->exec_start = rq_of(cfs_rq)->clock;
 }
 
+#ifdef CONFIG_CFS_BANDWIDTH
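+/*
+ * A task is boosted when it was woken while running (BOOST_WAKEUPS) or
+ * preempted while not throttleable (BOOST_PREEMPT); a group entity stays
+ * boosted as long as it has boosted children.
+ */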
+static inline void update_entity_boost(struct sched_entity *se)
+{
+	if (!entity_is_task(se))
+		se->boosted = cfs_rq_has_boosted_entities(group_cfs_rq(se));
+	else {
+		struct task_struct *p = task_of(se);
+
+		if (!(preempt_count() & PREEMPT_ACTIVE)) {
+			se->boosted = sched_feat(BOOST_WAKEUPS) &&
+					p->woken_while_running;
+			p->woken_while_running = 0;
+		} else {
+			se->boosted = sched_feat(BOOST_PREEMPT) &&
+					!p->may_throttle;
+		}
+	}
+}
+
+static int check_enqueue_boost(struct rq *rq, struct task_struct *p, int flags)
+{
+	if (sched_feat(BOOST_WAKEUPS) && (flags & ENQUEUE_WAKEUP))
+		p->se.boosted = 1;
+	return p->se.boosted;
+}
+
+static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se)
+{
+	list_add(&se->boost_node, &cfs_rq->boosted_entities);
+}
+
+static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se)
+{
+	list_del(&se->boost_node);
+}
+
+static int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+				  struct sched_entity *se)
+{
+	if (entity_is_task(se) || !entity_boosted(se)) {
+		if (se != cfs_rq->curr)
+			__enqueue_boosted_entity(cfs_rq, se);
+		se->boosted = 1;
+		return 1;
+	}
+
+	return 0;
+}
+
+static int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+				  struct sched_entity *se)
+{
+	if (entity_is_task(se) ||
+	    !cfs_rq_has_boosted_entities(group_cfs_rq(se))) {
+		if (se != cfs_rq->curr)
+			__dequeue_boosted_entity(cfs_rq, se);
+		if (!entity_is_task(se))
+			se->boosted = 0;
+		return 1;
+	}
+
+	return 0;
+}
+#else
+static inline void update_entity_boost(struct sched_entity *se) {}
+
+static inline int check_enqueue_boost(struct rq *rq,
+				      struct task_struct *p, int flags)
+{
+	return 0;
+}
+
+static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se) {}
+static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se) {}
+
+static inline int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					 struct sched_entity *se)
+{
+	return 0;
+}
+
+static inline int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					 struct sched_entity *se)
+{
+	return 0;
+}
+#endif
+
 /**************************************************
  * Scheduling class queueing methods:
  */
@@ -879,12 +1095,13 @@ static void enqueue_sleeper(struct cfs_r
 			se->sleep_max = delta;
 
 		se->sleep_start = 0;
-		se->sum_sleep_runtime += delta;
 
 		if (tsk) {
 			account_scheduler_latency(tsk, delta >> 10, 1);
 			trace_sched_stat_sleep(tsk, delta);
 		}
+
+		se->sum_sleep_runtime += delta;
 	}
 	if (se->block_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -896,7 +1113,6 @@ static void enqueue_sleeper(struct cfs_r
 			se->block_max = delta;
 
 		se->block_start = 0;
-		se->sum_sleep_runtime += delta;
 
 		if (tsk) {
 			if (tsk->in_iowait) {
@@ -918,11 +1134,63 @@ static void enqueue_sleeper(struct cfs_r
 						delta >> 20);
 			}
 			account_scheduler_latency(tsk, delta >> 10, 0);
-		}
+		} else
+			se->iowait_sum += delta;
+
+		se->sum_sleep_runtime += delta;
 	}
 #endif
 }
 
+static void dequeue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHEDSTATS
+	if (entity_is_task(se)) {
+		struct task_struct *tsk = task_of(se);
+
+		if (tsk->state & TASK_INTERRUPTIBLE)
+			se->sleep_start = rq_of(cfs_rq)->clock;
+		if (tsk->state & TASK_UNINTERRUPTIBLE)
+			se->block_start = rq_of(cfs_rq)->clock;
+		if (tsk->in_iowait)
+			cfs_rq->nr_iowait++;
+	} else if (!cfs_rq_throttled(group_cfs_rq(se))) {
+		if (group_cfs_rq(se)->nr_iowait)
+			se->block_start = rq_of(cfs_rq)->clock;
+		else
+			se->sleep_start = rq_of(cfs_rq)->clock;
+	}
+#endif
+}
+
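+/* close out the open idle/iowait intervals of all empty task groups on this cpu */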
+static void stop_cfs_idle_time_accounting(int cpu)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+
+	list_for_each_entry(tg, &task_groups, list) {
+		if (tg != &root_task_group &&
+		    !tg->cfs_rq[cpu]->nr_running) {
+			se = tg->se[cpu];
+			enqueue_sleeper(cfs_rq_of(se), se);
+		}
+	}
+}
+
+static void start_cfs_idle_time_accounting(int cpu)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+
+	list_for_each_entry(tg, &task_groups, list) {
+		if (tg != &root_task_group &&
+		    !tg->cfs_rq[cpu]->nr_running) {
+			se = tg->se[cpu];
+			dequeue_sleeper(cfs_rq_of(se), se);
+		}
+	}
+}
+
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -1003,7 +1271,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
 
 	if (cfs_rq->nr_running == 1) {
 		list_add_leaf_cfs_rq(cfs_rq);
-		check_enqueue_throttle(cfs_rq);
+		if (!(flags & ENQUEUE_BOOST))
+			check_enqueue_throttle(cfs_rq);
 	}
 }
 
@@ -1063,21 +1332,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 	update_curr(cfs_rq);
 
 	update_stats_dequeue(cfs_rq, se);
-	if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
-		if (entity_is_task(se)) {
-			struct task_struct *tsk = task_of(se);
-
-			if (tsk->state & TASK_INTERRUPTIBLE)
-				se->sleep_start = rq_of(cfs_rq)->clock;
-			if (tsk->state & TASK_UNINTERRUPTIBLE)
-				se->block_start = rq_of(cfs_rq)->clock;
-		}
-#endif
-	}
+	if (flags & DEQUEUE_SLEEP)
+		dequeue_sleeper(cfs_rq, se);
 
 	clear_buddies(cfs_rq, se);
 
+	if (cfs_rq->prev == se)
+		cfs_rq->prev = NULL;
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
@@ -1095,9 +1357,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq);
 
-	/* return excess runtime on last dequeue */
-	if (!cfs_rq->nr_running)
+	if (!cfs_rq->nr_running) {
+		/* return excess runtime on last dequeue */
 		return_cfs_rq_runtime(cfs_rq);
+		/* account switch to idle task */
+		cfs_rq->nr_switches++;
+	}
 }
 
 /*
@@ -1155,10 +1420,14 @@ set_next_entity(struct cfs_rq *cfs_rq, s
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
+		if (entity_boosted(se))
+			__dequeue_boosted_entity(cfs_rq, se);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
 	cfs_rq->curr = se;
+	if (cfs_rq->prev != se)
+		cfs_rq->nr_switches++;
 #ifdef CONFIG_SCHEDSTATS
 	/*
 	 * Track our maximum slice length, if the CPU's load is at
@@ -1183,7 +1452,7 @@ wakeup_preempt_entity(struct sched_entit
  * 3) pick the "last" process, for cache locality
  * 4) do not run the "skip" process, if something else is available
  */
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *do_pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *left = se;
@@ -1215,7 +1484,43 @@ static struct sched_entity *pick_next_en
 	return se;
 }
 
+#ifdef CONFIG_CFS_BANDWIDTH
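+/*
+ * Once quota is exhausted, prefer boosted entities; a non-boosted task
+ * picked here is immediately marked for reschedule.
+ */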
+static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = NULL;
+
+	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) {
+		rq->cfs_quota_exceeded = 1;
+		if (cfs_rq_has_boosted_entities(cfs_rq)) {
+			se = list_first_entry(&cfs_rq->boosted_entities,
+					      struct sched_entity, boost_node);
+			clear_buddies(cfs_rq, se);
+		}
+	}
+
+	if (!se)
+		se = do_pick_next_entity(cfs_rq);
+
+	if (entity_is_task(se) &&
+	    !entity_boosted(se) && rq->cfs_quota_exceeded)
+		resched_task(task_of(se));
+
+	return se;
+}
+
+static void task_scheduled_fair(struct rq *rq, struct task_struct *p)
+{
+	if (rq->cfs_quota_exceeded)
+		set_tsk_need_resched(p);
+	rq->cfs_quota_exceeded = 0;
+}
+#else
+# define pick_next_entity do_pick_next_entity
+#endif
+
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
@@ -1226,6 +1531,14 @@ static void put_prev_entity(struct cfs_r
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
+	update_entity_boost(prev);
+	if (entity_boosted(prev) && prev->on_rq) {
+		__enqueue_boosted_entity(cfs_rq, prev);
+		if (unlikely(cfs_rq_throttled(cfs_rq)))
+			/* prev was moved to throttled cfs_rq */
+			unthrottle_cfs_rq(cfs_rq);
+	}
+
 	/* throttle cfs_rqs exceeding runtime */
 	check_cfs_rq_runtime(cfs_rq);
 
@@ -1234,7 +1547,9 @@ static void put_prev_entity(struct cfs_r
 		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
-	}
+		cfs_rq->prev = prev;
+	} else
+		cfs_rq->prev = NULL;
 	cfs_rq->curr = NULL;
 }
 
@@ -1292,6 +1607,55 @@ static inline u64 sched_cfs_bandwidth_sl
 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
 }
 
+static void restart_tg_idle_time_accounting(struct task_group *tg)
+{
+#ifdef CONFIG_SCHEDSTATS
+	int cpu;
+
+	if (tg == &root_task_group)
+		return;
+
+	/*
+	 * XXX: We call enqueue_sleeper/dequeue_sleeper without rq lock for
+	 * the sake of performance, because in the worst case this can only
+	 * lead to an idle/iowait period lost in stats.
+	 */
+	for_each_online_cpu(cpu) {
+		struct sched_entity *se = tg->se[cpu];
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+
+		if (!cfs_rq->load.weight) {
+			enqueue_sleeper(cfs_rq_of(se), se);
+			dequeue_sleeper(cfs_rq_of(se), se);
+		}
+	}
+#endif
+}
+
+static void update_cfs_bandwidth_idle_scale(struct cfs_bandwidth *cfs_b)
+{
+	u64 runtime = cfs_b->runtime;
+	u64 quota = cfs_b->quota;
+	u64 max_quota = ktime_to_ns(cfs_b->period) * num_online_cpus();
+	struct task_group *tg =
+		container_of(cfs_b, struct task_group, cfs_bandwidth);
+
+	restart_tg_idle_time_accounting(tg);
+
+	/*
+	 * idle_scale = quota_left / (period * nr_idle_cpus)
+	 * nr_idle_cpus = nr_cpus - nr_busy_cpus
+	 * nr_busy_cpus = (quota - quota_left) / period
+	 */
+	if (quota == RUNTIME_INF || quota >= max_quota)
+		cfs_b->idle_scale_inv = CFS_IDLE_SCALE;
+	else if (runtime)
+		cfs_b->idle_scale_inv = div64_u64(CFS_IDLE_SCALE *
+				(max_quota - quota + runtime), runtime);
+	else
+		cfs_b->idle_scale_inv = 0;
+}
+
 /*
  * Replenish runtime according to assigned quota and update expiration time.
  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
@@ -1303,6 +1667,8 @@ static void __refill_cfs_bandwidth_runti
 {
 	u64 now;
 
+	update_cfs_bandwidth_idle_scale(cfs_b);
+
 	if (cfs_b->quota == RUNTIME_INF)
 		return;
 
@@ -1412,11 +1778,6 @@ static void account_cfs_rq_runtime(struc
 		resched_task(rq_of(cfs_rq)->curr);
 }
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return cfs_rq->throttled;
-}
-
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
@@ -1490,6 +1851,8 @@ static void throttle_cfs_rq(struct cfs_r
 			  (void *)(long)rq_of(cfs_rq)->cpu);
 	rcu_read_unlock();
 
+	cfs_rq->throttled = 1;
+
 	task_delta = cfs_rq->h_nr_running;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -1508,7 +1871,6 @@ static void throttle_cfs_rq(struct cfs_r
 	if (!se)
 		rq->nr_running -= task_delta;
 
-	cfs_rq->throttled = 1;
 	cfs_rq->throttled_timestamp = rq->clock;
 	spin_lock(&cfs_b->lock);
 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
@@ -1551,6 +1913,9 @@ static void check_cfs_rq_runtime(struct 
 	if (cfs_rq_throttled(cfs_rq))
 		return;
 
+	if (cfs_rq_has_boosted_entities(cfs_rq))
+		return;
+
 	throttle_cfs_rq(cfs_rq);
 }
 
@@ -1812,7 +2177,6 @@ static void do_sched_cfs_slack_timer(str
 		cfs_b->runtime = runtime;
 	spin_unlock(&cfs_b->lock);
 }
-
 #else
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 		unsigned long delta_exec) {}
@@ -1820,11 +2184,6 @@ static void check_cfs_rq_runtime(struct 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return 0;
-}
-
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
 	return 0;
@@ -1907,11 +2266,17 @@ enqueue_task_fair(struct rq *rq, struct 
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int boost = check_enqueue_boost(rq, p, flags);
+
+	if (!task_cfs_rq(p)->task_weight)
+		inc_nr_active_cfs_rqs(cfs_rq_of(&p->se));
 
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
 		cfs_rq = cfs_rq_of(se);
+		if (boost)
+			flags |= ENQUEUE_BOOST;
 		enqueue_entity(cfs_rq, se, flags);
 
 		/*
@@ -1924,6 +2289,9 @@ enqueue_task_fair(struct rq *rq, struct 
 			break;
 		cfs_rq->h_nr_running++;
 
+		if (boost)
+			boost = enqueue_boosted_entity(cfs_rq, se);
+
 		flags = ENQUEUE_WAKEUP;
 	}
        for_each_sched_entity(se) {
@@ -1933,12 +2301,23 @@ enqueue_task_fair(struct rq *rq, struct 
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
+		if (boost)
+			boost = enqueue_boosted_entity(cfs_rq, se);
+
                update_cfs_load(cfs_rq, 0);
                update_cfs_shares(cfs_rq);
        }
 
 	if (!se)
 		inc_nr_running(rq);
+	else if (boost)
+		for_each_sched_entity(se) {
+			cfs_rq = cfs_rq_of(se);
+			if (!enqueue_boosted_entity(cfs_rq, se))
+				break;
+			if (cfs_rq_throttled(cfs_rq))
+				unthrottle_cfs_rq(cfs_rq);
+		}
 	hrtick_update(rq);
 }
 
@@ -1953,6 +2332,7 @@ static void dequeue_task_fair(struct rq 
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int boosted = entity_boosted(se);
 	int task_sleep = flags & DEQUEUE_SLEEP;
 
 	for_each_sched_entity(se) {
@@ -1969,6 +2349,9 @@ static void dequeue_task_fair(struct rq 
 			break;
 		cfs_rq->h_nr_running--;
 
+		if (boosted)
+			boosted = dequeue_boosted_entity(cfs_rq, se);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			/*
@@ -1990,6 +2373,9 @@ static void dequeue_task_fair(struct rq 
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
+		if (boosted)
+			boosted = dequeue_boosted_entity(cfs_rq, se);
+
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
@@ -1997,6 +2383,9 @@ static void dequeue_task_fair(struct rq 
 	if (!se)
 		dec_nr_running(rq);
 	hrtick_update(rq);
+
+	if (!task_cfs_rq(p)->task_weight)
+		dec_nr_active_cfs_rqs(cfs_rq_of(&p->se), task_sleep);
 }
 
 #ifdef CONFIG_SMP
@@ -2177,7 +2566,7 @@ find_idlest_group(struct sched_domain *s
 		}
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+		avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -2266,6 +2655,33 @@ static int select_idle_sibling(struct ta
 	return target;
 }
 
+static inline int cpu_is_runnable(struct task_struct *p, int cpu)
+{
+	return cfs_rq_active(cpu_cfs_rq(task_cfs_rq(p), cpu));
+}
+
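+/* fall back to a cpu where the task's group is still allowed to run */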
+static int select_runnable_cpu(struct task_struct *p, int new_cpu)
+{
+	struct sched_domain *sd;
+	int prev_cpu = task_cpu(p);
+	int cpu;
+
+	if (cpu_is_runnable(p, new_cpu))
+		return new_cpu;
+
+	if (cpu_is_runnable(p, prev_cpu))
+		return prev_cpu;
+
+	for_each_domain(new_cpu, sd) {
+		for_each_cpu_and(cpu, sched_domain_span(sd), &p->cpus_allowed) {
+			if (cpu_is_runnable(p, cpu))
+				return cpu;
+		}
+	}
+
+	return new_cpu;
+}
+
 /*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -2313,7 +2729,7 @@ select_task_rq_fair(struct rq *rq, struc
 				nr_running += cpu_rq(i)->cfs.nr_running;
 			}
 
-			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+			capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 
 			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
 				nr_running /= 2;
@@ -2343,11 +2759,17 @@ select_task_rq_fair(struct rq *rq, struc
 
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-			return select_idle_sibling(p, cpu);
+			new_cpu = cpu;
 		else
-			return select_idle_sibling(p, prev_cpu);
+			new_cpu = prev_cpu;
 	}
 
+	if (check_cpulimit_spread(task_cfs_rq(p), new_cpu) <= 0)
+		return select_runnable_cpu(p, new_cpu);
+
+	if (affine_sd)
+		return select_idle_sibling(p, new_cpu);
+
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
 		struct sched_group *group;
@@ -2577,6 +2999,32 @@ static struct task_struct *pick_next_tas
 	return p;
 }
 
+static int trigger_cpulimit_balance(struct rq *this_rq, struct task_struct *p)
+{
+#ifdef CONFIG_CFS_CPULIMIT
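+	/*
+	 * The group occupies more cpus than its limit allows: ask the
+	 * migration thread to push the task to a cpu where the group is
+	 * already active.
+	 */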
+	struct cfs_rq *this_cfs_rq = task_cfs_rq(p);
+	int this_cpu = cpu_of(this_rq);
+	int cpu;
+
+	spin_unlock(&this_rq->lock);
+	for_each_online_cpu(cpu) {
+		struct cfs_rq *cfs_rq = cpu_cfs_rq(this_cfs_rq, cpu);
+
+		if (cpu != this_cpu && cfs_rq_active(cfs_rq) &&
+		    cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+			wake_up_process(this_rq->migration_thread);
+			spin_lock(&this_rq->lock);
+			this_rq->active_balance = ACTIVE_BALANCE_CPULIMIT;
+			this_rq->push_cpu = cpu;
+			return 1;
+		}
+	}
+	spin_lock(&this_rq->lock);
+
+	return 0;
+#else
+	return 0;
+#endif
+}
+
 /*
  * Account for a descheduled task:
  */
@@ -2589,6 +3037,10 @@ static void put_prev_task_fair(struct rq
 		cfs_rq = cfs_rq_of(se);
 		put_prev_entity(cfs_rq, se);
 	}
+
+	if (prev->se.on_rq &&
+	    check_cpulimit_spread(task_cfs_rq(prev), cpu_of(rq)) < 0)
+		trigger_cpulimit_balance(rq, prev);
 }
 
 /*
@@ -2682,8 +3134,8 @@ static struct task_struct *load_balance_
 static unsigned long
 __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		unsigned long max_load_move, struct sched_domain *sd,
-		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
-		struct cfs_rq *cfs_rq)
+		enum cpu_idle_type idle, int *all_pinned,
+		struct cfs_rq *cfs_rq, int force)
 {
 	struct rq_iterator cfs_rq_iterator;
 
@@ -2693,7 +3145,7 @@ __load_balance_fair(struct rq *this_rq, 
 
 	return balance_tasks(this_rq, this_cpu, busiest,
 			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &cfs_rq_iterator);
+			&cfs_rq_iterator, force);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2744,24 +3196,51 @@ static void update_shares(int cpu)
 	rcu_read_unlock();
 }
 
+static int
+can_migrate_cfs_rq_tasks(struct cfs_rq *cfs_rq, struct rq *rq,
+			 int this_cpu, struct sched_domain *sd,
+			 int *all_pinned, int force)
+{
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+	struct sched_entity *iter;
+	struct task_struct *p;
+
+	if (unlikely(!se))
+		return 0;
+
+	list_for_each_entry(iter, &cfs_rq->tasks, group_node) {
+		p = task_of(iter);
+		if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed))
+			return 0;
+	}
+	*all_pinned = 0;
+
+	if (cfs_rq_of(se)->curr == se)
+		return 0;
+
+	if (sd->nr_balance_failed > sd->cache_nice_tries ||
+	    force || !entity_hot(se, rq->clock, sd))
+		return 1;
+
+	return 0;
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
+		  int *all_pinned)
 {
 	long rem_load_move = max_load_move;
 	int busiest_cpu = cpu_of(busiest);
-	struct task_group *tg;
-
-	rcu_read_lock();
-	update_h_load(busiest_cpu);
+	struct cfs_rq *busiest_cfs_rq;
 
-	list_for_each_entry_rcu(tg, &task_groups, list) {
-		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
-		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+	for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
+		struct task_group *tg = busiest_cfs_rq->tg;
 		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+		unsigned long busiest_h_load;
 		u64 rem_load, moved_load;
+		int force = 0;
 
 		/*
 		 * empty group or part of a throttled hierarchy
@@ -2770,12 +3249,28 @@ load_balance_fair(struct rq *this_rq, in
 		    throttled_lb_pair(tg, busiest_cpu, this_cpu))
 			continue;
 
+		if (tg == &root_task_group)
+			busiest_h_load = busiest_weight;
+		else if (tg->parent == &root_task_group)
+			busiest_h_load = tg->se[busiest_cpu]->load.weight;
+		else
+			busiest_h_load = busiest_cfs_rq->h_load;
+
 		rem_load = (u64)rem_load_move * busiest_weight;
 		rem_load = div_u64(rem_load, busiest_h_load + 1);
 
+		if (check_cpulimit_spread(busiest_cfs_rq, this_cpu) < 0) {
+			if (rem_load <= busiest_cfs_rq->task_weight / 2 ||
+			    !can_migrate_cfs_rq_tasks(busiest_cfs_rq, busiest,
+					this_cpu, sd, all_pinned, 0))
+				continue;
+			rem_load = busiest_cfs_rq->task_weight;
+			force = 1;
+		}
+
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-				rem_load, sd, idle, all_pinned, this_best_prio,
-				tg->cfs_rq[busiest_cpu]);
+				rem_load, sd, idle, all_pinned,
+				tg->cfs_rq[busiest_cpu], force);
 
 		if (!moved_load)
 			continue;
@@ -2787,7 +3282,6 @@ load_balance_fair(struct rq *this_rq, in
 		if (rem_load_move < 0)
 			break;
 	}
-	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
@@ -2800,11 +3294,11 @@ static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
+		  int *all_pinned)
 {
 	return __load_balance_fair(this_rq, this_cpu, busiest,
 			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &busiest->cfs);
+			&busiest->cfs, 0);
 }
 #endif
 
@@ -2814,23 +3308,41 @@ move_one_task_fair(struct rq *this_rq, i
 {
 	struct cfs_rq *busy_cfs_rq;
 	struct rq_iterator cfs_rq_iterator;
+	int cpulimit_balance =
+		busiest->active_balance & ACTIVE_BALANCE_CPULIMIT;
+	int pinned = 0;
 
 	cfs_rq_iterator.start = load_balance_start_fair;
 	cfs_rq_iterator.next = load_balance_next_fair;
 
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+		struct cfs_rq *this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		int max_nr_migrate = 1;
+		int force = 0;
+
 #ifdef CONFIG_CFS_BANDWIDTH
 		if (throttled_lb_pair(busy_cfs_rq->tg, busiest->cpu, this_cpu))
 			continue;
 #endif
+
+		if (check_cpulimit_spread(busy_cfs_rq, this_cpu) < 0) {
+			if ((cpulimit_balance && !cfs_rq_active(this_cfs_rq)) ||
+			    !can_migrate_cfs_rq_tasks(busy_cfs_rq, busiest,
+					this_cpu, sd, &pinned, 1))
+				continue;
+			max_nr_migrate = busy_cfs_rq->nr_running;
+			force = 1;
+		} else if (cpulimit_balance)
+			continue;
+
 		/*
 		 * pass busy_cfs_rq argument into
 		 * load_balance_[start|next]_fair iterators
 		 */
 		cfs_rq_iterator.arg = busy_cfs_rq;
-		if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
-				       &cfs_rq_iterator))
-		    return 1;
+		if (iter_move_tasks(this_rq, this_cpu, busiest, sd, idle,
+				    &cfs_rq_iterator, max_nr_migrate, force))
+			return 1;
 	}
 
 	return 0;
@@ -2909,6 +3421,8 @@ static void task_fork_fair(struct task_s
 
 	se->vruntime -= cfs_rq->min_vruntime;
 
+	cfs_rq->nr_forks++;
+
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -3020,6 +3534,69 @@ unsigned int get_rr_interval_fair(struct
 	return rr_interval;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
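+/*
+ * Group iowait bookkeeping: while a group has tasks in iowait, its
+ * entity's sleep time is accounted as block (iowait); once the count
+ * drops to zero it is accounted as plain sleep, and vice versa.
+ */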
+static void nr_iowait_dec_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = p->se.parent;
+
+	cfs_rq->nr_iowait--;
+
+#ifdef CONFIG_SCHEDSTATS
+	if (!cfs_rq->nr_iowait && se && se->block_start) {
+		u64 delta;
+		struct rq *rq = rq_of(cfs_rq);
+
+		update_rq_clock(rq);
+
+		delta = rq->clock - se->block_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		if (unlikely(delta > se->block_max))
+			se->block_max = delta;
+
+		se->block_start = 0;
+		se->sleep_start = rq->clock;
+
+		se->iowait_sum += delta;
+		se->sum_sleep_runtime += delta;
+	}
+#endif
+}
+
+static void nr_iowait_inc_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = p->se.parent;
+
+	cfs_rq->nr_iowait++;
+
+#ifdef CONFIG_SCHEDSTATS
+	if (cfs_rq->nr_iowait && se && se->sleep_start) {
+		u64 delta;
+		struct rq *rq = rq_of(cfs_rq);
+
+		update_rq_clock(rq);
+
+		delta = rq->clock - se->sleep_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		if (unlikely(delta > se->sleep_max))
+			se->sleep_max = delta;
+
+		se->sleep_start = 0;
+		se->block_start = rq->clock;
+
+		se->sum_sleep_runtime += delta;
+	}
+#endif
+}
+#endif
+
 /*
  * All the scheduling class methods:
  */
@@ -3057,6 +3634,11 @@ static const struct sched_class fair_sch
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.moved_group		= task_move_group_fair,
+	.nr_iowait_inc		= nr_iowait_inc_fair,
+	.nr_iowait_dec		= nr_iowait_dec_fair,
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	.task_scheduled		= task_scheduled_fair,
 #endif
 };
 
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sched_features.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_features.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/sched_features.h	2014-12-12 23:29:05.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_features.h	2015-01-21 12:02:54.352930934 +0300
@@ -59,3 +59,6 @@ SCHED_FEAT(LB_BIAS, 1)
  * release the lock. Decreases scheduling overhead.
  */
 SCHED_FEAT(OWNER_SPIN, 1)
+
+SCHED_FEAT(BOOST_WAKEUPS, 1)
+SCHED_FEAT(BOOST_PREEMPT, 1)
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sched_idletask.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_idletask.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sched_idletask.c	2014-12-12 23:28:59.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_idletask.c	2015-01-21 12:02:42.716239859 +0300
@@ -50,7 +50,7 @@ static unsigned long
 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
+		  int *all_pinned)
 {
 	return 0;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sched_rt.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_rt.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sched_rt.c	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_rt.c	2015-01-21 12:02:42.748239011 +0300
@@ -1198,7 +1198,12 @@ static struct task_struct *pick_next_hig
 		if (next && next->prio < idx)
 			continue;
 		list_for_each_entry(rt_se, array->queue + idx, run_list) {
-			struct task_struct *p = rt_task_of(rt_se);
+			struct task_struct *p;
+
+			if (!rt_entity_is_task(rt_se))
+				continue;
+
+			p = rt_task_of(rt_se);
 			if (pick_rt_task(rq, p, cpu)) {
 				next = p;
 				break;
@@ -1549,7 +1554,7 @@ static unsigned long
 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		unsigned long max_load_move,
 		struct sched_domain *sd, enum cpu_idle_type idle,
-		int *all_pinned, int *this_best_prio)
+		int *all_pinned)
 {
 	/* don't touch RT tasks */
 	return 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sched_stoptask.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_stoptask.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sched_stoptask.c	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sched_stoptask.c	2015-01-21 12:02:54.493927192 +0300
@@ -19,7 +19,7 @@ static unsigned long
 load_balance_stop(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
+		  int *all_pinned)
 {
 	return 0;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/signal.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/signal.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/signal.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/signal.c	2015-01-21 12:02:58.023833492 +0300
@@ -27,6 +27,7 @@
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
+#include <linux/interrupt.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
 
@@ -34,6 +35,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/siginfo.h>
+#include <bc/kmem.h>
 #include "audit.h"	/* audit_signal_info() */
 
 /*
@@ -42,6 +44,28 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
+static inline int is_si_special(const struct siginfo *info)
+{
+	return info <= SEND_SIG_FORCED;
+}
+
+static int sig_ve_ignored(int sig, struct siginfo *info, struct task_struct *t)
+{
+	struct ve_struct *ve;
+
+	/* always allow signals from the kernel */
+	if (info == SEND_SIG_FORCED ||
+		       (!is_si_special(info) && SI_FROMKERNEL(info)))
+		return 0;
+
+	ve = current->ve_task_info.owner_env;
+	if (get_env_init(ve) != t)
+		return 0;
+	if (ve_is_super(get_exec_env()))
+		return 0;
+	return !sig_user_defined(t, sig) || sig_kernel_only(sig);
+}
+
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
 	return t->sighand->action[sig - 1].sa.sa_handler;
@@ -119,7 +143,7 @@ static inline int has_pending_signals(si
 
 #define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
 
-static int recalc_sigpending_tsk(struct task_struct *t)
+int recalc_sigpending_tsk(struct task_struct *t)
 {
 	if (t->signal->group_stop_count > 0 ||
 	    PENDING(&t->pending, &t->blocked) ||
@@ -134,6 +158,7 @@ static int recalc_sigpending_tsk(struct 
 	 */
 	return 0;
 }
+EXPORT_SYMBOL(recalc_sigpending_tsk);
 
 /*
  * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
@@ -194,7 +219,7 @@ int next_signal(struct sigpending *pendi
  * - this may be called without locks if and only if t == current, otherwise an
  *   appopriate lock must be held to stop the target task from exiting
  */
-static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
+struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
 {
 	struct sigqueue *q = NULL;
@@ -210,8 +235,15 @@ static struct sigqueue *__sigqueue_alloc
 	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
 	    atomic_read(&user->sigpending) <=
-			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
-		q = kmem_cache_alloc(sigqueue_cachep, flags);
+			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
+		struct user_beancounter *ub = user->user_ub;
+
+		if (!charge_beancounter_fast(ub, UB_NUMSIGINFO, 1, UB_HARD)) {
+			q = ub_kmem_alloc(ub, sigqueue_cachep, flags);
+			if (!q)
+				uncharge_beancounter_fast(ub, UB_NUMSIGINFO, 1);
+		}
+	}
 	if (unlikely(q == NULL)) {
 		atomic_dec(&user->sigpending);
 		free_uid(user);
@@ -223,14 +255,19 @@ static struct sigqueue *__sigqueue_alloc
 
 	return q;
 }
+EXPORT_SYMBOL_GPL(__sigqueue_alloc);
 
 static void __sigqueue_free(struct sigqueue *q)
 {
+	struct user_struct *user;
+
 	if (q->flags & SIGQUEUE_PREALLOC)
 		return;
-	atomic_dec(&q->user->sigpending);
-	free_uid(q->user);
-	kmem_cache_free(sigqueue_cachep, q);
+	user = q->user;
+	ub_kmem_free(user->user_ub, sigqueue_cachep, q);
+	uncharge_beancounter_fast(user->user_ub, UB_NUMSIGINFO, 1);
+	atomic_dec(&user->sigpending);
+	free_uid(user);
 }
 
 void flush_sigqueue(struct sigpending *queue)
@@ -244,6 +281,7 @@ void flush_sigqueue(struct sigpending *q
 		__sigqueue_free(q);
 	}
 }
+EXPORT_SYMBOL(flush_sigqueue);
 
 /*
  * Flush all pending signals for a task.
@@ -413,7 +451,18 @@ still_pending:
 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 			siginfo_t *info)
 {
-	int sig = next_signal(pending, mask);
+	int sig = 0;
+
+	/* SIGKILL must take priority: otherwise a task could make
+	 * itself unkillable simply by flooding itself with signals
+	 * numbered below SIGKILL */
+	if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+		if (!sigismember(mask, SIGKILL))
+			sig = SIGKILL;
+	}
+
+	if (likely(!sig))
+		sig = next_signal(pending, mask);
 
 	if (sig) {
 		if (current->notifier) {
@@ -536,6 +585,7 @@ void signal_wake_up(struct task_struct *
 	if (!wake_up_state(t, mask))
 		kick_process(t);
 }
+EXPORT_SYMBOL(signal_wake_up);
 
 /*
  * Remove signals in mask from the pending set and queue.
@@ -588,11 +638,6 @@ static int rm_from_queue(unsigned long m
 	return 1;
 }
 
-static inline int is_si_special(const struct siginfo *info)
-{
-	return info <= SEND_SIG_FORCED;
-}
-
 static inline bool si_fromuser(const struct siginfo *info)
 {
 	return info == SEND_SIG_NOINFO ||
@@ -674,7 +719,7 @@ static bool prepare_signal(int sig, stru
 		t = p;
 		do {
 			rm_from_queue(sigmask(SIGCONT), &t->pending);
-		} while_each_thread(p, t);
+		} while_each_thread_all(p, t);
 	} else if (sig == SIGCONT) {
 		unsigned int why;
 		/*
@@ -706,7 +751,7 @@ static bool prepare_signal(int sig, stru
 				state |= TASK_INTERRUPTIBLE;
 			}
 			wake_up_state(t, state);
-		} while_each_thread(p, t);
+		} while_each_thread_all(p, t);
 
 		/*
 		 * Notify the parent with CLD_CONTINUED if we were stopped.
@@ -828,7 +873,7 @@ static void complete_signal(int sig, str
 			do {
 				sigaddset(&t->pending.signal, SIGKILL);
 				signal_wake_up(t, 1);
-			} while_each_thread(p, t);
+			} while_each_thread_all(p, t);
 			return;
 		}
 	}
@@ -1103,6 +1148,7 @@ struct sighand_struct *lock_task_sighand
 
 	return sighand;
 }
+EXPORT_SYMBOL(lock_task_sighand);
 
 /*
  * send signal info to all the members of a group
@@ -1113,7 +1159,8 @@ int group_send_sig_info(int sig, struct 
 	int ret = check_kill_permission(sig, info, p);
 
 	if (!ret && sig)
-		ret = do_send_sig_info(sig, info, p, true);
+		ret = sig_ve_ignored(sig, info, p) ? 0 :
+			do_send_sig_info(sig, info, p, true);
 
 	return ret;
 }
@@ -1237,7 +1284,7 @@ static int kill_something_info(int sig, 
 		int retval = 0, count = 0;
 		struct task_struct * p;
 
-		for_each_process(p) {
+		for_each_process_ve(p) {
 			if (task_pid_vnr(p) > 1 &&
 					!same_thread_group(p, current)) {
 				int err = group_send_sig_info(sig, info, p);
@@ -1432,6 +1479,14 @@ int do_notify_parent(struct task_struct 
 	BUG_ON(!task_ptrace(tsk) &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
+#ifdef CONFIG_VE
+	/* Only SIGCHLD may be sent to a parent in a different VE */
+	if (sig != SIGCHLD &&
+			tsk->ve_task_info.owner_env !=
+			tsk->parent->ve_task_info.owner_env)
+		sig = SIGCHLD;
+#endif
+
 	info.si_signo = sig;
 	info.si_errno = 0;
 	/*
@@ -1468,9 +1523,10 @@ int do_notify_parent(struct task_struct 
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
-	if (!task_ptrace(tsk) && sig == SIGCHLD &&
+	if ((!task_ptrace(tsk) && sig == SIGCHLD &&
 	    (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
-	     (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
+	     (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) ||
+	     tsk->flags & PF_EXIT_RESTART) {
 		/*
 		 * We are exiting and our parent doesn't care.  POSIX.1
 		 * defines special semantics for setting SIGCHLD to SIG_IGN
@@ -1756,7 +1812,9 @@ static int do_signal_stop(int signr)
 
 	/* Now we don't run again until woken by SIGCONT or SIGKILL */
 	do {
+		set_stop_state(current);
 		schedule();
+		clear_stop_state(current);
 	} while (try_to_freeze());
 
 	tracehook_finish_jctl();
@@ -2327,7 +2385,8 @@ do_send_specific(pid_t tgid, pid_t pid, 
 		 * probe.  No signal is actually delivered.
 		 */
 		if (!error && sig) {
-			error = do_send_sig_info(sig, info, p, false);
+			if (!sig_ve_ignored(sig, info, p))
+				error = do_send_sig_info(sig, info, p, false);
 			/*
 			 * If lock_task_sighand() failed we pretend the task
 			 * dies after receiving the signal. The window is tiny,
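The __dequeue_signal() hunk above is needed because next_signal() prefers lower-numbered signals, so a task continually queueing itself a signal below 9 could starve SIGKILL, as the in-line comment notes. A userspace sketch of the corrected selection order, using plain sigset_t operations as stand-ins for the kernel's pending-queue scan:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>

static int lowest_pending(const sigset_t *pending, const sigset_t *blocked)
{
	int sig;

	for (sig = 1; sig < NSIG; sig++)
		if (sigismember(pending, sig) && !sigismember(blocked, sig))
			return sig;
	return 0;
}

static int dequeue_one(const sigset_t *pending, const sigset_t *blocked)
{
	/* SIGKILL first, before the usual lowest-number-first scan */
	if (sigismember(pending, SIGKILL) && !sigismember(blocked, SIGKILL))
		return SIGKILL;
	return lowest_pending(pending, blocked);
}

int main(void)
{
	sigset_t pending, blocked;

	sigemptyset(&pending);
	sigemptyset(&blocked);
	sigaddset(&pending, SIGHUP);	/* signal 1 */
	sigaddset(&pending, SIGKILL);	/* signal 9 */

	/* lowest-first would pick SIGHUP; the fixed order picks SIGKILL */
	printf("dequeued signal %d\n", dequeue_one(&pending, &blocked));
	return 0;
}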
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/smp.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/smp.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/smp.c	2014-12-12 23:29:05.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/smp.c	2015-01-21 12:02:41.316277028 +0300
@@ -31,6 +31,7 @@ struct call_function_data {
 	struct call_single_data	csd;
 	atomic_t		refs;
 	cpumask_var_t		cpumask;
+	cpumask_var_t		cpumask_ipi;
 };
 
 struct call_single_queue {
@@ -52,6 +53,9 @@ hotplug_cfd(struct notifier_block *nfb, 
 		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
 				cpu_to_node(cpu)))
 			return NOTIFY_BAD;
+		if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+				cpu_to_node(cpu)))
+			return notifier_from_errno(-ENOMEM);
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -61,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, 
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		free_cpumask_var(cfd->cpumask);
+		free_cpumask_var(cfd->cpumask_ipi);
 		break;
 #endif
 	};
@@ -482,6 +487,12 @@ void smp_call_function_many(const struct
 		return;
 	}
 
+	/*
+	 * After we put the entry on the list, data->cpumask may be
+	 * cleared again when another CPU sends another IPI for an SMP
+	 * function call, so the IPI below must use this private copy.
+	 */
+	cpumask_copy(data->cpumask_ipi, data->cpumask);
 	spin_lock_irqsave(&call_function.lock, flags);
 	/*
 	 * Place entry at the _HEAD_ of the list, so that any cpu still
@@ -505,7 +516,7 @@ void smp_call_function_many(const struct
 	smp_mb();
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi_mask(data->cpumask);
+	arch_send_call_function_ipi_mask(data->cpumask_ipi);
 
 	/* Optionally wait for the CPUs to complete */
 	if (wait)
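The cpumask_ipi copy added above is a snapshot-before-publish rule: once the request is visible on the shared list, other CPUs may clear bits in data->cpumask (or a later call may reuse it), so the IPI has to be raised from a private copy taken beforehand. A rough single-threaded illustration, with a plain integer mask in place of cpumask_t and no memory barriers:

#include <stdint.h>
#include <stdio.h>

struct call_req {
	uint64_t cpumask;	/* cleared bit-by-bit by responders */
	uint64_t cpumask_ipi;	/* immutable snapshot for the sender */
};

static void publish(struct call_req *req, uint64_t targets)
{
	req->cpumask = targets;
	req->cpumask_ipi = req->cpumask;	/* snapshot first */
	/* ...insert into the shared list; responders may run now... */
}

static void responder(struct call_req *req, int cpu)
{
	req->cpumask &= ~(1ULL << cpu);	/* may race with the sender */
}

int main(void)
{
	struct call_req req;

	publish(&req, 0x6);	/* CPUs 1 and 2 */
	responder(&req, 1);	/* an early responder already answered */

	/* sending from req.cpumask would now miss CPU 1 */
	printf("shared mask %#llx, IPI snapshot %#llx\n",
	       (unsigned long long)req.cpumask,
	       (unsigned long long)req.cpumask_ipi);
	return 0;
}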
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/softirq.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/softirq.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/softirq.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/softirq.c	2015-01-21 12:02:43.690214001 +0300
@@ -25,6 +25,8 @@
 #include <linux/smp.h>
 #include <linux/tick.h>
 
+#include <bc/beancounter.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
@@ -206,10 +208,14 @@ EXPORT_SYMBOL(local_bh_enable_ip);
 
 asmlinkage void __do_softirq(void)
 {
+	struct user_beancounter *ub;
 	struct softirq_action *h;
 	__u32 pending;
 	int max_restart = MAX_SOFTIRQ_RESTART;
 	int cpu;
+	struct ve_struct *envid;
+
+	envid = set_exec_env(get_ve0());
 
 	pending = local_softirq_pending();
 	account_system_vtime(current);
@@ -227,6 +233,7 @@ restart:
 
 	h = softirq_vec;
 
+	ub = set_exec_ub(get_ub0());
 	do {
 		if (pending & 1) {
 			int prev_count = preempt_count();
@@ -249,6 +256,7 @@ restart:
 		h++;
 		pending >>= 1;
 	} while (pending);
+	(void)set_exec_ub(ub);
 
 	local_irq_disable();
 
@@ -262,6 +270,7 @@ restart:
 	lockdep_softirq_exit();
 
 	account_system_vtime(current);
+	(void)set_exec_env(envid);
 	__local_bh_enable(SOFTIRQ_OFFSET);
 }
 
@@ -321,6 +330,7 @@ void irq_exit(void)
 {
 	account_system_vtime(current);
 	trace_hardirq_exit();
+	restore_context();
 	sub_preempt_count(IRQ_EXIT_OFFSET);
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();
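The __do_softirq() changes above apply a save/override/restore discipline: softirq handlers always run in the host context (VE0 and its beancounter) regardless of which task was interrupted, and the previous values are put back on exit. A tiny sketch of that discipline, where struct ctx and set_ctx() are merely stand-ins for the ve_struct/beancounter pair and set_exec_env()/set_exec_ub():

#include <stdio.h>

struct ctx { const char *name; };

static struct ctx host = { "ve0" };
static struct ctx guest = { "ve101" };
static struct ctx *cur = &guest;

static struct ctx *set_ctx(struct ctx *new)
{
	struct ctx *old = cur;

	cur = new;
	return old;		/* caller restores this later */
}

static void do_softirq_work(void)
{
	printf("handler runs in context %s\n", cur->name);
}

int main(void)
{
	struct ctx *saved = set_ctx(&host);	/* pin to the host */

	do_softirq_work();
	(void)set_ctx(saved);			/* restore on exit */
	printf("back in context %s\n", cur->name);
	return 0;
}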
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sys.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sys.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sys.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sys.c	2015-01-21 12:02:57.855837951 +0300
@@ -10,6 +10,7 @@
 #include <linux/mman.h>
 #include <linux/smp_lock.h>
 #include <linux/notifier.h>
+#include <linux/virtinfo.h>
 #include <linux/reboot.h>
 #include <linux/prctl.h>
 #include <linux/highuid.h>
@@ -117,6 +118,125 @@ EXPORT_SYMBOL(cad_pid);
 
 void (*pm_power_off_prepare)(void);
 
+DECLARE_MUTEX(virtinfo_sem);
+EXPORT_SYMBOL(virtinfo_sem);
+static struct vnotifier_block *virtinfo_chain[VIRT_TYPES];
+
+void __virtinfo_notifier_register(int type, struct vnotifier_block *nb)
+{
+	struct vnotifier_block **p;
+
+	for (p = &virtinfo_chain[type];
+	     *p != NULL && nb->priority < (*p)->priority;
+	     p = &(*p)->next);
+	nb->next = *p;
+	smp_wmb();
+	*p = nb;
+}
+
+EXPORT_SYMBOL(__virtinfo_notifier_register);
+
+void virtinfo_notifier_register(int type, struct vnotifier_block *nb)
+{
+	down(&virtinfo_sem);
+	__virtinfo_notifier_register(type, nb);
+	up(&virtinfo_sem);
+}
+
+EXPORT_SYMBOL(virtinfo_notifier_register);
+
+struct virtinfo_cnt_struct {
+	volatile unsigned long exit[NR_CPUS];
+	volatile unsigned long entry;
+};
+static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt);
+
+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb)
+{
+	struct vnotifier_block **p;
+	int entry_cpu, exit_cpu;
+	unsigned long cnt, ent;
+	struct rcu_synchronize rcu;
+
+	down(&virtinfo_sem);
+	for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next);
+	*p = nb->next;
+	smp_mb();
+
+	init_completion(&rcu.completion);
+	call_rcu_sched(&rcu.head, wakeme_after_rcu);
+
+	for_each_cpu_mask(entry_cpu, cpu_possible_map) {
+		while (1) {
+			cnt = 0;
+			for_each_cpu_mask(exit_cpu, cpu_possible_map)
+				cnt +=
+				    per_cpu(virtcnt, entry_cpu).exit[exit_cpu];
+			smp_rmb();
+			ent = per_cpu(virtcnt, entry_cpu).entry;
+			if (cnt == ent)
+				break;
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(HZ / 100);
+		}
+	}
+
+	wait_for_completion(&rcu.completion);
+
+	up(&virtinfo_sem);
+}
+
+EXPORT_SYMBOL(virtinfo_notifier_unregister);
+
+static int do_virtinfo_notifier_call(int type, unsigned long n, void *data)
+{
+	int ret;
+	struct vnotifier_block *nb;
+
+	nb = virtinfo_chain[type];
+	ret = NOTIFY_DONE;
+	while (nb) {
+		ret = nb->notifier_call(nb, n, data, ret);
+
+		if (ret & NOTIFY_STOP_MASK) {
+			ret &= ~NOTIFY_STOP_MASK;
+			break;
+		}
+		nb = nb->next;
+	}
+
+	return ret;
+}
+
+int virtinfo_notifier_call(int type, unsigned long n, void *data)
+{
+	int ret;
+	int entry_cpu, exit_cpu;
+
+	entry_cpu = get_cpu();
+	per_cpu(virtcnt, entry_cpu).entry++;
+	smp_wmb();
+	put_cpu();
+
+	ret = do_virtinfo_notifier_call(type, n, data);
+
+	exit_cpu = get_cpu();
+	smp_wmb();
+	per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++;
+	put_cpu();
+
+	return ret;
+}
+EXPORT_SYMBOL(virtinfo_notifier_call);
+
+int virtinfo_notifier_call_irq(int type, unsigned long n, void *data)
+{
+	if (!in_interrupt())
+		return virtinfo_notifier_call(type, n, data);
+	return do_virtinfo_notifier_call(type, n, data);
+}
+EXPORT_SYMBOL(virtinfo_notifier_call_irq);
+
 /*
  * set the priority of a task
  * - the caller must hold the RCU read lock
@@ -192,10 +312,10 @@ SYSCALL_DEFINE3(setpriority, int, which,
 				 !(user = find_user(who)))
 				goto out_unlock;	/* No processes for this user */
 
-			do_each_thread(g, p)
+			do_each_thread_ve(g, p) {
 				if (__task_cred(p)->uid == who)
 					error = set_one_prio(p, niceval, error);
-			while_each_thread(g, p);
+			} while_each_thread_ve(g, p);
 			if (who != cred->uid)
 				free_uid(user);		/* For find_user() */
 			break;
@@ -255,13 +375,13 @@ SYSCALL_DEFINE2(getpriority, int, which,
 				 !(user = find_user(who)))
 				goto out_unlock;	/* No processes for this user */
 
-			do_each_thread(g, p)
+			do_each_thread_ve(g, p)
 				if (__task_cred(p)->uid == who) {
 					niceval = 20 - task_nice(p);
 					if (niceval > retval)
 						retval = niceval;
 				}
-			while_each_thread(g, p);
+			while_each_thread_ve(g, p);
 			if (who != cred->uid)
 				free_uid(user);		/* for find_user() */
 			break;
@@ -381,6 +501,28 @@ SYSCALL_DEFINE4(reboot, int, magic1, int
 	                magic2 != LINUX_REBOOT_MAGIC2C))
 		return -EINVAL;
 
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()))
+		switch (cmd) {
+		case LINUX_REBOOT_CMD_RESTART:
+		case LINUX_REBOOT_CMD_RESTART2:
+			set_bit(VE_REBOOT, &get_exec_env()->flags);
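+			/* fall through */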
+
+		case LINUX_REBOOT_CMD_HALT:
+		case LINUX_REBOOT_CMD_POWER_OFF:
+			force_sig(SIGKILL, get_exec_env_init());
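+			/* fall through */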
+
+		case LINUX_REBOOT_CMD_CAD_ON:
+		case LINUX_REBOOT_CMD_CAD_OFF:
+			return 0;
+
+		default:
+			return -EINVAL;
+		}
+#endif
+
 	/*
 	 * If pid namespaces are enabled and the current task is in a child
 	 * pid_namespace, the command is handled by reboot_pid_ns() which will
@@ -939,8 +1079,27 @@ void do_sys_times(struct tms *tms)
 	tms->tms_cstime = cputime_to_clock_t(cstime);
 }
 
+#ifdef CONFIG_VE
+unsigned long long ve_relative_clock(struct timespec *ts)
+{
+	unsigned long long offset = 0;
+
+	if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec ||
+	    (ts->tv_sec == get_exec_env()->start_timespec.tv_sec &&
+	     ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec))
+		offset = (unsigned long long)(ts->tv_sec -
+			get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC
+			+ ts->tv_nsec - get_exec_env()->start_timespec.tv_nsec;
+	return nsec_to_clock_t(offset);
+}
+#endif
+
 SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
 {
+#ifdef CONFIG_VE
+	struct timespec now;
+#endif
+
 	if (tbuf) {
 		struct tms tmp;
 
@@ -948,8 +1107,15 @@ SYSCALL_DEFINE1(times, struct tms __user
 		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
 			return -EFAULT;
 	}
+#ifndef CONFIG_VE
 	force_successful_syscall_return();
 	return (long) jiffies_64_to_clock_t(get_jiffies_64());
+#else
+	/* Compare to calculation in fs/proc/array.c */
+	do_posix_clock_monotonic_gettime(&now);
+	force_successful_syscall_return();
+	return ve_relative_clock(&now);
+#endif
 }
 
 /*
@@ -1132,6 +1298,7 @@ out:
 }
 
 DECLARE_RWSEM(uts_sem);
+EXPORT_SYMBOL_GPL(uts_sem);
 
 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
 {
@@ -1149,7 +1316,7 @@ SYSCALL_DEFINE2(sethostname, char __user
 	int errno;
 	char tmp[__NEW_UTS_LEN];
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_VE_SYS_ADMIN))
 		return -EPERM;
 	if (len < 0 || len > __NEW_UTS_LEN)
 		return -EINVAL;
@@ -1198,7 +1365,7 @@ SYSCALL_DEFINE2(setdomainname, char __us
 	int errno;
 	char tmp[__NEW_UTS_LEN];
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_VE_SYS_ADMIN))
 		return -EPERM;
 	if (len < 0 || len > __NEW_UTS_LEN)
 		return -EINVAL;
@@ -1615,6 +1782,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_SET_DATA_CSUM:
+			switch (arg2) {
+			case PR_DATA_CSUM_OFF:
+			case PR_DATA_CSUM_ON:
+				current->data_csum_enabled = arg2;
+				break;
+			default:
+				error = -EINVAL;
+				break;
+			}
+			break;
 		default:
 			error = -EINVAL;
 			break;
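virtinfo_notifier_unregister() above waits for in-flight notifier calls with per-CPU counters: a caller bumps the entry counter of the CPU it starts on and, after running the chain (possibly having migrated), bumps that same slot's exit counter indexed by the CPU it finishes on; the chain is quiet once every entry CPU's exits sum up to its entries. A single-threaded sketch of the bookkeeping (no barriers, illustrative sizes):

#include <stdio.h>

#define NR_CPUS 4

struct cnt {
	unsigned long entry;
	unsigned long exit[NR_CPUS];	/* indexed by the CPU we finish on */
};

static struct cnt virtcnt[NR_CPUS];

static void caller(int entry_cpu, int exit_cpu)
{
	virtcnt[entry_cpu].entry++;
	/* ...notifier chain runs, possibly migrating CPUs... */
	virtcnt[entry_cpu].exit[exit_cpu]++;
}

static int quiesced(void)
{
	int e, x;

	for (e = 0; e < NR_CPUS; e++) {
		unsigned long done = 0;

		for (x = 0; x < NR_CPUS; x++)
			done += virtcnt[e].exit[x];
		if (done != virtcnt[e].entry)
			return 0;	/* a call that began on e is live */
	}
	return 1;
}

int main(void)
{
	caller(0, 2);			/* started on CPU 0, finished on 2 */
	printf("quiesced: %d\n", quiesced());

	virtcnt[1].entry++;		/* a call in flight on CPU 1 */
	printf("quiesced: %d\n", quiesced());
	return 0;
}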
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sys_ni.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sys_ni.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sys_ni.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sys_ni.c	2015-01-21 12:02:53.810945322 +0300
@@ -185,5 +185,27 @@ cond_syscall(compat_sys_timerfd_gettime)
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
 
+/* open by handle */
+cond_syscall(sys_name_to_handle_at);
+cond_syscall(sys_open_by_handle_at);
+cond_syscall(compat_sys_open_by_handle_at);
+
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
+cond_syscall(sys_getluid);
+cond_syscall(sys_setluid);
+cond_syscall(sys_setublimit);
+cond_syscall(compat_sys_setublimit);
+cond_syscall(sys_ubstat);
+
+/* fairsched compat */
+cond_syscall(sys_fairsched_mknod);
+cond_syscall(sys_fairsched_rmnod);
+cond_syscall(sys_fairsched_mvpr);
+cond_syscall(sys_fairsched_vcpus);
+cond_syscall(sys_fairsched_chwt);
+cond_syscall(sys_fairsched_rate);
+cond_syscall(sys_fairsched_cpumask);
+cond_syscall(sys_fairsched_nodemask);
+
+cond_syscall(compat_sys_lutime);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sysctl.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sysctl.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sysctl.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sysctl.c	2015-01-21 12:02:58.891810455 +0300
@@ -53,6 +53,10 @@
 #include <linux/perf_event.h>
 #include <linux/kprobes.h>
 #include <linux/kmod.h>
+#include <linux/cpuset.h>
+#include <linux/ve_task.h>
+#include <linux/mmgang.h>
+#include <linux/mnt_namespace.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -78,22 +82,41 @@ extern int sysctl_panic_on_oom;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_would_have_oomkilled;
+extern int sysctl_oom_relaxation;
 extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
 extern char core_pattern[];
 extern unsigned int core_pipe_limit;
-extern int pid_max;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
+int ve_allow_kthreads = 1;
+EXPORT_SYMBOL(ve_allow_kthreads);
+int ve_allow_module_load = 1;
+EXPORT_SYMBOL(ve_allow_module_load);
+
+#ifdef CONFIG_MAGIC_SYSRQ
+extern int sysrq_key_scancode;
+#endif
+extern unsigned relatime_interval; /* fs/inode.c */
+
+extern int alloc_fail_warn;
+int decode_call_traces = 1;
+
+#ifdef CONFIG_VE
+int glob_ve_meminfo = 0;
+#endif
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifndef CONFIG_MMU
 extern int sysctl_nr_trim_pages;
 #endif
 extern int kexec_load_disabled;
+extern int kexec_preserve_uptime;
+extern int kexec_reuse_crash;
+extern int pramcache_ploop_nosync;
 /* bz790921 */
 int unmap_area_factor_sysctl_handler(ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos);
@@ -112,6 +135,7 @@ int exec_shield = (1<<0);
  * (1<<2) 4: vdso just below .text of main (unless too low)
  * (1<<3) 8: vdso just below .text of PT_INTERP (unless too low)
  * are ignored because the vdso is placed completely randomly
+ * (1<<4) 16: strict exec area randomization on ia32
  */
 
 static int __init setup_exec_shield(char *str)
@@ -140,6 +164,7 @@ static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
+static int two_hundred = 200;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -196,6 +221,16 @@ extern int unaligned_dump_stack;
 extern int max_lock_depth;
 #endif
 
+#ifdef CONFIG_MEMORY_GANGS
+extern int vm_usage_factor;
+extern int vm_shadow_factor;
+extern int vm_age_factor;
+extern unsigned long commitment_for_unlimited_containers;
+extern int commitment_for_unlimited_containers_handler(struct ctl_table *table,
+		int write, void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int vm_force_scan_thresh;
+#endif /* CONFIG_MEMORY_GANGS */
+
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -217,9 +252,56 @@ static struct ctl_table_header root_tabl
 	.root = &sysctl_table_root,
 	.set = &sysctl_table_root.default_set,
 };
-static struct ctl_table_root sysctl_table_root = {
+
+#ifdef CONFIG_VE
+static int sysctl_root_perms(struct ctl_table_root *root,
+			struct nsproxy *namespaces, struct ctl_table *table)
+{
+	if (ve_is_super(get_exec_env()))
+		return table->mode;
+	else if (table->mode & S_ISVTX)
+		return table->mode;
+	else
+		return table->mode & ~0222;
+}
+
+struct ctl_table *sysctl_ve_table(struct ctl_table *orig,
+		struct ctl_table *onstack, int write)
+{
+	if (!(orig->mode & S_ISVTX))
+		return orig;
+
+	*onstack = *orig;
+	if (orig->extra1 != NULL) { /* per-ve_struct variable */
+		onstack->data = (void *)get_exec_env() +
+			(unsigned long)orig->extra1;
+		onstack->extra1 = NULL;
+	} else if (write && !ve_is_super(get_exec_env()))
+		/* S_ISVTX without a per-VE copy: read-only in VE */
+		return NULL;
+
+	return onstack;
+}
+
+static struct ctl_table_root sysctl_table_groot = {
 	.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
+	.default_set.list = LIST_HEAD_INIT(sysctl_table_groot.default_set.list),
+	.default_set.parent = &sysctl_table_root.default_set,
+};
+#else
+#define sysctl_root_perms NULL
+#define sysctl_table_groot sysctl_table_root
+struct ctl_table *sysctl_ve_table(struct ctl_table *orig,
+		struct ctl_table *onstack, int write)
+{
+	return orig;
+}
+#endif
+
+static struct ctl_table_root sysctl_table_root = {
+	.root_list = LIST_HEAD_INIT(sysctl_table_groot.root_list),
 	.default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+	.permissions = sysctl_root_perms,
 };
 
 static struct ctl_table kern_table[];
@@ -234,6 +316,9 @@ extern struct ctl_table inotify_table[];
 #ifdef CONFIG_EPOLL
 extern struct ctl_table epoll_table[];
 #endif
+#ifdef CONFIG_PRAMCACHE
+extern struct ctl_table pramcache_table[];
+#endif
 
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 int sysctl_legacy_va_layout;
@@ -296,6 +381,16 @@ static int min_extfrag_threshold;
 static int max_extfrag_threshold = 1000;
 #endif
 
+static int proc_dointvec_pidmax(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tmp;
+
+	tmp = *table;
+	tmp.data = &current->nsproxy->pid_ns->pid_max;
+	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+}
+
 static struct ctl_table kern_table[] = {
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -453,6 +548,25 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#ifdef CONFIG_CFS_CPULIMIT
+	{
+		.procname	= "sched_vcpu_hotslice",
+		.data		= &sysctl_sched_vcpu_hotslice,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "sched_cpulimit_scale_cpufreq",
+		.data		= &sysctl_sched_cpulimit_scale_cpufreq,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -588,6 +702,20 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.procname	= "silence-level",
+		.data		= &console_silence_loglevel,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "alloc_fail_warn",
+		.data		= &alloc_fail_warn,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef __hppa__
 	{
 		.ctl_name	= KERN_HPPA_PWRSW,
@@ -678,6 +806,30 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 		.extra2		= &one,
 	},
+#ifdef CONFIG_PRAM
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "kexec_preserve_uptime",
+		.data		= &kexec_preserve_uptime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
+#endif
+#ifdef CONFIG_KEXEC_REUSE_CRASH
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "kexec_reuse_crash",
+		.data		= &kexec_reuse_crash,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 	{
@@ -685,7 +837,7 @@ static struct ctl_table kern_table[] = {
 		.procname	= "hotplug",
 		.data		= &uevent_helper,
 		.maxlen		= UEVENT_HELPER_PATH_LEN,
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
 	},
@@ -793,14 +945,41 @@ static struct ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_PIDMAX,
 		.procname	= "pid_max",
-		.data		= &pid_max,
+		.data		= NULL,
 		.maxlen		= sizeof (int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= &proc_dointvec_pidmax,
 		.strategy	= sysctl_intvec,
 		.extra1		= &pid_max_min,
 		.extra2		= &pid_max_max,
 	},
+#ifdef CONFIG_MAGIC_SYSRQ
+	{
+		.procname	= "sysrq-key",
+		.data		= &sysrq_key_scancode,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_VE
+	{
+		.procname	= "ve_meminfo",
+		.data		= &glob_ve_meminfo,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "ve_allow_module_load",
+		.data		= &ve_allow_module_load,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{
 		.ctl_name	= KERN_PANIC_ON_OOPS,
 		.procname	= "panic_on_oops",
@@ -815,7 +994,7 @@ static struct ctl_table kern_table[] = {
 		.procname	= "printk",
 		.data		= &console_loglevel,
 		.maxlen		= 4*sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= &proc_dointvec,
 	},
 	{
@@ -984,10 +1163,13 @@ static struct ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_RANDOMIZE,
 		.procname	= "randomize_va_space",
-		.data		= &randomize_va_space,
+		.data		= &_randomize_va_space,
+		.extra1		= (void *)offsetof(struct ve_struct,
+							_randomize_va_space),
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_data,
 	},
 #endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
@@ -1066,6 +1248,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_doulongvec_minmax,
 		.strategy	= &sysctl_intvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hung_task_verbosity",
+		.data		= &sysctl_hung_task_verbosity,
+		.maxlen		= sizeof(sysctl_hung_task_verbosity),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+	},
 #endif
 #ifdef CONFIG_COMPAT
 	{
@@ -1236,6 +1427,14 @@ static struct ctl_table vm_table[] = {
 		.proc_handler   = &proc_dointvec,
 	},
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "oom_relaxation",
+		.data		= &sysctl_oom_relaxation,
+		.maxlen		= sizeof(sysctl_oom_relaxation),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_ms_jiffies,
+	},
+	{
 		.ctl_name	= VM_OVERCOMMIT_RATIO,
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
@@ -1333,7 +1532,18 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec_minmax,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
-		.extra2		= &one_hundred,
+		.extra2		= &two_hundred,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sync_reclaim",
+		.data		= &vm_sync_reclaim,
+		.maxlen		= sizeof(vm_sync_reclaim),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
 	},
 #ifdef CONFIG_HUGETLB_PAGE
 	{
@@ -1559,7 +1769,7 @@ static struct ctl_table vm_table[] = {
 		.procname	= "mmap_min_addr",
 		.data		= &dac_mmap_min_addr,
 		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= &mmap_min_addr_handler,
 	},
 #endif
@@ -1632,6 +1842,112 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
+#ifdef CONFIG_CPUSETS
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "strict_mem_cpuset",
+		.data		= &sysctl_strict_mem_cpuset,
+		.maxlen		= sizeof(sysctl_strict_mem_cpuset),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+	{
+		.procname	= "gangs_migration_max_isolate",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &gangs_migration_max_isolate,
+		.maxlen		= sizeof(gangs_migration_max_isolate),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+	},
+	{
+		.procname	= "gangs_migration_min_batch",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &gangs_migration_min_batch,
+		.maxlen		= sizeof(gangs_migration_min_batch),
+		.mode		= 0644,
+		.proc_handler	= &gangs_migration_batch_sysctl_handler,
+	},
+	{
+		.procname	= "gangs_migration_max_batch",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &gangs_migration_max_batch,
+		.maxlen		= sizeof(gangs_migration_max_batch),
+		.mode		= 0644,
+		.proc_handler	= &gangs_migration_batch_sysctl_handler,
+	},
+	{
+		.procname	= "gangs_migration_interval",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &gangs_migration_interval,
+		.maxlen		= sizeof(gangs_migration_interval),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+#endif /* CONFIG_MEMORY_GANGS_MIGRATION */
+#ifdef CONFIG_MEMORY_GANGS
+	{
+		.procname	= "usage_factor",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &vm_usage_factor,
+		.maxlen		= sizeof(vm_usage_factor),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "shadow_factor",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &vm_shadow_factor,
+		.maxlen		= sizeof(vm_shadow_factor),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "age_factor",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &vm_age_factor,
+		.maxlen		= sizeof(vm_age_factor),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "commitment_for_unlimited_containers",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &commitment_for_unlimited_containers,
+		.maxlen		= sizeof(commitment_for_unlimited_containers),
+		.mode		= 0644,
+		.proc_handler	= commitment_for_unlimited_containers_handler,
+	},
+	{
+		.procname	= "force_scan_thresh",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &vm_force_scan_thresh,
+		.maxlen		= sizeof(vm_force_scan_thresh),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif /* CONFIG_MEMORY_GANGS */
+#ifdef CONFIG_PSWAP
+	{
+		.procname	= "prune_pswap",
+		.ctl_name	= CTL_UNNUMBERED,
+		.data		= &sysctl_prune_pswap,
+		.maxlen		= sizeof(sysctl_prune_pswap),
+		.mode		= 0644,
+		.proc_handler	= prune_pswap_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+	},
+#endif
 
 /*
  * NOTE: do not add new entries to this table unless you have read
@@ -1648,6 +1964,14 @@ static struct ctl_table binfmt_misc_tabl
 
 static struct ctl_table fs_table[] = {
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "relatime_interval",
+		.data		= &relatime_interval,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= FS_NRINODE,
 		.procname	= "inode-nr",
 		.data		= &inodes_stat,
@@ -1753,16 +2077,16 @@ static struct ctl_table fs_table[] = {
 #ifdef CONFIG_AIO
 	{
 		.procname	= "aio-nr",
-		.data		= &aio_nr,
-		.maxlen		= sizeof(aio_nr),
-		.mode		= 0444,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0444 | S_ISVTX,
+		.extra1		= (void *)offsetof(struct ve_struct, aio_nr),
 		.proc_handler	= &proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "aio-max-nr",
-		.data		= &aio_max_nr,
-		.maxlen		= sizeof(aio_max_nr),
-		.mode		= 0644,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644 | S_ISVTX,
+		.extra1		= (void *)offsetof(struct ve_struct, aio_max_nr),
 		.proc_handler	= &proc_doulongvec_minmax,
 	},
 #endif /* CONFIG_AIO */
@@ -1801,6 +2125,36 @@ static struct ctl_table fs_table[] = {
 		.child		= binfmt_misc_table,
 	},
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "odirect_enable",
+		.maxlen		= sizeof(int),
+		.extra1		= (void *)offsetof(struct ve_struct, odirect_enable),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname	= "ve-mount-nr",
+		.data		= &sysctl_ve_mount_nr,
+		.maxlen		= sizeof(sysctl_ve_mount_nr),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#ifdef CONFIG_PRAMCACHE
+	{
+		.procname	= "pramcache",
+		.mode		= 0555,
+		.child		= pramcache_table,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "pramcache_ploop_nosync",
+		.data		= &pramcache_ploop_nosync,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
@@ -1809,6 +2163,13 @@ static struct ctl_table fs_table[] = {
 };
 
 static struct ctl_table debug_table[] = {
+	{
+		.procname	= "decode_call_traces",
+		.data		= &decode_call_traces,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #if defined(CONFIG_X86) || defined(CONFIG_PPC)
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -1997,6 +2358,7 @@ static int do_sysctl_strategy(struct ctl
 			void __user *newval, size_t newlen)
 {
 	int op = 0, rc;
+	struct ctl_table onstack;
 
 	if (oldval)
 		op |= MAY_READ;
@@ -2006,6 +2368,10 @@ static int do_sysctl_strategy(struct ctl
 		return -EPERM;
 
 	if (table->strategy) {
+		table = sysctl_ve_table(table, &onstack, op & MAY_WRITE);
+		if (table == NULL)
+			return 0;
+
 		rc = table->strategy(table, oldval, oldlenp, newval, newlen);
 		if (rc < 0)
 			return rc;
@@ -2370,10 +2736,28 @@ struct ctl_table_header *__register_sysc
 struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 						struct ctl_table *table)
 {
+	if (!ve_is_super(get_exec_env())) {
+		WARN_ON(1);
+		return NULL;
+	}
+
 	return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
 					path, table);
 }
 
+struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path,
+		struct ctl_table *table, int virtual_handler)
+{
+	if (!ve_is_super(get_exec_env())) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	return __register_sysctl_paths(&sysctl_table_groot, current->nsproxy,
+					path, table);
+}
+EXPORT_SYMBOL(register_sysctl_glob_paths);
+
 /**
  * register_sysctl_table - register a sysctl table hierarchy
  * @table: the top-level table structure
@@ -2390,6 +2774,14 @@ struct ctl_table_header *register_sysctl
 	return register_sysctl_paths(null_path, table);
 }
 
+struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table,
+		int virtual_handler)
+{
+	static const struct ctl_path null_path[] = { {} };
+
+	return register_sysctl_glob_paths(null_path, table, virtual_handler);
+}
+
 /**
  * unregister_sysctl_table - unregister a sysctl table hierarchy
  * @header: the header returned from register_sysctl_table
@@ -2451,6 +2843,18 @@ struct ctl_table_header *register_sysctl
 	return NULL;
 }
 
+struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table,
+		int vh)
+{
+	return NULL;
+}
+
+struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path,
+						struct ctl_table *table, int vh)
+{
+	return NULL;
+}
+
 void unregister_sysctl_table(struct ctl_table_header * table)
 {
 }
@@ -2817,6 +3221,14 @@ int proc_dointvec(struct ctl_table *tabl
 		    	    NULL,NULL);
 }
 
+int proc_dointvec_once(struct ctl_table *table, int write,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	if (write && *(int *)table->data)
+		return 0;
+	return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+
 /*
  * Taint values can only be increased
  * This means we can safely use a temporary.
@@ -3382,6 +3794,12 @@ int proc_dointvec(struct ctl_table *tabl
 	return -ENOSYS;
 }
 
+int proc_dointvec_once(struct ctl_table *table, int write,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -3702,11 +4120,62 @@ static int deprecated_sysctl_warning(str
 	return 0;
 }
 
+#ifdef CONFIG_PID_NS
+#include <linux/pid_namespace.h>
+
+static int proc_pid_ns_hide_child(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int tmp, res;
+
+	tmp = (current->nsproxy->pid_ns->flags & PID_NS_HIDE_CHILD) ? 1 : 0;
+
+	res = __do_proc_dointvec(&tmp, table, write, buffer,
+			       lenp, ppos, NULL, NULL);
+	if (res || !write)
+		return res;
+
+	if (tmp)
+		current->nsproxy->pid_ns->flags |= PID_NS_HIDE_CHILD;
+	else
+		current->nsproxy->pid_ns->flags &= ~PID_NS_HIDE_CHILD;
+	return 0;
+}
+
+static struct ctl_table pid_ns_kern_table[] = {
+	{
+		.procname	= "pid_ns_hide_child",
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_pid_ns_hide_child,
+	},
+	{}
+};
+
+static struct ctl_table pid_ns_root_table[] = {
+	{
+		.ctl_name	= CTL_KERN,
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= pid_ns_kern_table,
+	},
+	{}
+};
+
+static __init int pid_ns_sysctl_init(void)
+{
+	register_sysctl_table(pid_ns_root_table);
+	return 0;
+}
+postcore_initcall(pid_ns_sysctl_init);
+#endif /* CONFIG_PID_NS */
+
 /*
  * No sense putting this after each symbol definition, twice,
  * exception granted :-)
  */
 EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_dointvec_once);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
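The S_ISVTX convention introduced above lets one ctl_table serve every container: a virtualized entry leaves .data NULL and stores offsetof(struct ve_struct, field) in .extra1, and sysctl_ve_table() rebuilds the entry on the stack with .data rebased against the calling VE. A userspace sketch of that rebasing trick, with stand-in structures rather than the kernel's:

#include <stddef.h>
#include <stdio.h>

struct ve {			/* stand-in for struct ve_struct */
	int odirect_enable;
	unsigned long aio_max_nr;
};

struct ctl {			/* stand-in for struct ctl_table */
	const char *name;
	void *data;		/* NULL for per-VE entries */
	void *extra1;		/* offsetof() into struct ve */
};

static void *ve_data(struct ve *env, const struct ctl *tbl)
{
	return (char *)env + (size_t)tbl->extra1;
}

int main(void)
{
	struct ve ve0 = { 1, 65536 }, ve101 = { 0, 1024 };
	struct ctl tbl = {
		.name   = "aio-max-nr",
		.extra1 = (void *)offsetof(struct ve, aio_max_nr),
	};

	printf("%s in ve0:   %lu\n", tbl.name,
	       *(unsigned long *)ve_data(&ve0, &tbl));
	printf("%s in ve101: %lu\n", tbl.name,
	       *(unsigned long *)ve_data(&ve101, &tbl));
	return 0;
}

sysctl_check_table() is relaxed accordingly in the next file, so a NULL .data is legal whenever S_ISVTX is set and .extra1 carries the offset.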
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/sysctl_check.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sysctl_check.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/sysctl_check.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/sysctl_check.c	2015-01-21 12:02:46.992126337 +0300
@@ -1506,7 +1506,8 @@ int sysctl_check_table(struct nsproxy *n
 			    (table->proc_handler == proc_dointvec_ms_jiffies) ||
 			    (table->proc_handler == proc_doulongvec_minmax) ||
 			    (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
-				if (!table->data)
+				if (!table->data &&
+				    !((table->mode & S_ISVTX) && table->extra1))
 					set_fail(&fail, table, "No data");
 				if (!table->maxlen)
 					set_fail(&fail, table, "No maxlen");
@@ -1530,7 +1531,7 @@ int sysctl_check_table(struct nsproxy *n
 			sysctl_check_leaf(namespaces, table, &fail);
 		}
 		sysctl_check_bin_path(table, &fail);
-		if (table->mode > 0777)
+		if (table->mode & ~(0777 | S_ISVTX))
 			set_fail(&fail, table, "bogus .mode");
 		if (fail) {
 			set_fail(&fail, table, NULL);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/taskstats.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/taskstats.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/taskstats.c	2014-12-12 23:29:11.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/taskstats.c	2015-01-21 12:02:43.808210868 +0300
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
 #include <linux/tsacct_kern.h>
+#include <linux/pid_namespace.h>
 #include <linux/delayacct.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
@@ -44,6 +45,7 @@ static struct genl_family family = {
 	.name		= TASKSTATS_GENL_NAME,
 	.version	= TASKSTATS_GENL_VERSION,
 	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
+	.netnsok	= true,
 };
 
 static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
@@ -254,7 +256,7 @@ static int fill_tgid(pid_t tgid, struct 
 
 		stats->nvcsw += tsk->nvcsw;
 		stats->nivcsw += tsk->nivcsw;
-	} while_each_thread(first, tsk);
+	} while_each_thread_all(first, tsk);
 
 	unlock_task_sighand(first, &flags);
 	rc = 0;
@@ -299,6 +301,10 @@ static int add_del_listener(pid_t pid, c
 	if (!cpumask_subset(mask, cpu_possible_mask))
 		return -EINVAL;
 
+	/* send_cpu_listeners() isn't pidns aware */
+	if (task_active_pid_ns(current) != &init_pid_ns)
+		return -EINVAL;
+
 	s = NULL;
 	if (isadd == REGISTER) {
 		for_each_cpu(cpu, mask) {
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/time/timekeeping.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/time/timekeeping.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/time/timekeeping.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/time/timekeeping.c	2015-01-21 12:02:58.097831530 +0300
@@ -20,6 +20,7 @@
 #include <linux/time.h>
 #include <linux/tick.h>
 #include <linux/stop_machine.h>
+#include <linux/pram.h>
 
 /* Structure holding internal timekeeping values. */
 struct timekeeper {
@@ -281,7 +282,7 @@ ktime_t ktime_get(void)
 	 */
 	return ktime_add_ns(ktime_set(secs, 0), nsecs);
 }
-EXPORT_SYMBOL_GPL(ktime_get);
+EXPORT_SYMBOL(ktime_get);
 
 /**
  * ktime_get_ts - get the monotonic clock in timespec format
@@ -554,6 +555,80 @@ void __attribute__((weak)) read_boot_clo
 	ts->tv_nsec = 0;
 }
 
+#if defined(CONFIG_KEXEC) && defined(CONFIG_PRAM)
+int kexec_preserve_uptime = 1;
+
+static struct timespec preserved_uptime;
+
+static inline void add_preserved_uptime(struct timespec *ts)
+{
+	*ts = timespec_add_safe(*ts, preserved_uptime);
+}
+
+static inline void sub_preserved_uptime(struct timespec *ts)
+{
+	*ts = timespec_sub(*ts, preserved_uptime);
+}
+
+#define PRESERVED_UPTIME_PRAM		"uptime"
+
+static void preserve_uptime(void)
+{
+	struct pram_stream stream;
+	static struct timespec uptime;
+	__u64 uptime_raw;
+	int err;
+
+	if (!kexec_preserve_uptime)
+		return;
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	monotonic_to_bootbased(&uptime);
+	uptime_raw = (((__u64)uptime.tv_sec) << 32) + uptime.tv_nsec;
+
+	err = pram_open(PRESERVED_UPTIME_PRAM, PRAM_WRITE, &stream);
+	if (err)
+		goto out;
+	if (pram_write(&stream, &uptime_raw, 8) != 8)
+		err = -EIO;
+	pram_close(&stream, err);
+out:
+	if (err)
+		printk(KERN_ERR "Failed to preserve uptime: %d\n", err);
+	else
+		printk(KERN_INFO "Uptime preserved (%llu)\n",
+		       (unsigned long long)uptime_raw);
+}
+
+static void __init init_preserved_uptime(void)
+{
+	struct pram_stream stream;
+	__u64 uptime_raw;
+	int err;
+
+	err = pram_open(PRESERVED_UPTIME_PRAM, PRAM_READ, &stream);
+	if (err)
+		goto out;
+	if (pram_read(&stream, &uptime_raw, 8) != 8)
+		err = -EIO;
+	pram_close(&stream, err);
+out:
+	if (err && err != -ENOENT)
+		printk(KERN_ERR "Failed to restore uptime: %d\n", err);
+	if (!err) {
+		preserved_uptime.tv_sec = uptime_raw >> 32;
+		preserved_uptime.tv_nsec = uptime_raw & 0xFFFFFFFF;
+		printk(KERN_INFO "Uptime restored (%llu)\n",
+		       (unsigned long long)uptime_raw);
+	}
+}
+#else
+static inline void add_preserved_uptime(struct timespec *ts) { }
+static inline void sub_preserved_uptime(struct timespec *ts) { }
+static inline void preserve_uptime(void) { }
+static inline void init_preserved_uptime(void) { }
+#endif
+
 /*
  * timekeeping_init - Initializes the clocksource and common timekeeping values
  */
@@ -603,6 +678,14 @@ void __init timekeeping_init(void)
 	timekeeper.total_sleep_time.tv_sec = 0;
 	timekeeper.total_sleep_time.tv_nsec = 0;
 	write_sequnlock_irqrestore(&timekeeper.lock, flags);
+
+	init_preserved_uptime();
+}
+
+static int timekeeping_shutdown(struct sys_device *dev)
+{
+	preserve_uptime();
+	return 0;
 }
 
 /* time in seconds when suspend began */
@@ -753,6 +836,7 @@ static int timekeeping_suspend(struct sy
 /* sysfs resume/suspend bits for timekeeping */
 static struct sysdev_class timekeeping_sysclass = {
 	.name		= "timekeeping",
+	.shutdown	= timekeeping_shutdown,
 	.resume		= timekeeping_resume,
 	.suspend	= timekeeping_suspend,
 };
@@ -1114,7 +1198,7 @@ out:
  * basically means that however wrong your real time clock is at boot time,
  * you get the right time here).
  */
-void getboottime(struct timespec *ts)
+void getrealboottime(struct timespec *ts)
 {
 	struct timespec boottime = {
 		.tv_sec = timekeeper.wall_to_monotonic.tv_sec +
@@ -1125,8 +1209,14 @@ void getboottime(struct timespec *ts)
 
 	set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
 }
-EXPORT_SYMBOL_GPL(getboottime);
+EXPORT_SYMBOL_GPL(getrealboottime);
 
+void getboottime(struct timespec *ts)
+{
+	getrealboottime(ts);
+	sub_preserved_uptime(ts);
+}
+EXPORT_SYMBOL_GPL(getboottime);
 
 /**
  * get_monotonic_boottime - Returns monotonic time since boot
@@ -1183,6 +1273,7 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
 void monotonic_to_bootbased(struct timespec *ts)
 {
 	*ts = timespec_add(*ts, timekeeper.total_sleep_time);
+	add_preserved_uptime(ts);
 }
 EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
 
@@ -1196,6 +1287,7 @@ struct timespec __current_kernel_time(vo
 {
 	return timekeeper.xtime;
 }
+EXPORT_SYMBOL(__current_kernel_time);
 
 struct timespec current_kernel_time(void)
 {
@@ -1288,7 +1380,7 @@ void get_xtime_and_monotonic_and_sleep_o
  *
  * RHEL6: We do not have real vs boot clocks in RHEL.
  */
-ktime_t ktime_get_update_offsets(ktime_t *offs_real)
+ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
 {
 	ktime_t now;
 	unsigned int seq;
@@ -1304,6 +1396,7 @@ ktime_t ktime_get_update_offsets(ktime_t
 		nsecs += arch_gettimeoffset();
 
 		*offs_real = timekeeper.offs_real;
+		*offs_boot = timekeeper.offs_boot;
 	} while (read_seqretry(&timekeeper.lock, seq));
 
 	now = ktime_add_ns(ktime_set(secs, 0), nsecs);
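preserve_uptime() and init_preserved_uptime() above squeeze a timespec into one __u64 for the PRAM stream: seconds in the high 32 bits, nanoseconds in the low 32 (tv_nsec is always below 10^9, so it fits; tv_sec silently truncates past 2^32 seconds). A round-trip sketch of the encoding:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t pack_uptime(const struct timespec *ts)
{
	return ((uint64_t)ts->tv_sec << 32) + (uint64_t)ts->tv_nsec;
}

static struct timespec unpack_uptime(uint64_t raw)
{
	struct timespec ts;

	ts.tv_sec = raw >> 32;		/* truncates past ~136 years */
	ts.tv_nsec = raw & 0xFFFFFFFF;
	return ts;
}

int main(void)
{
	struct timespec up = { .tv_sec = 123456, .tv_nsec = 789000000 };
	struct timespec back = unpack_uptime(pack_uptime(&up));

	printf("%ld.%09ld -> %ld.%09ld\n",
	       (long)up.tv_sec, up.tv_nsec,
	       (long)back.tv_sec, back.tv_nsec);
	return 0;
}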
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/time.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/time.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/time.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/time.c	2015-01-21 12:02:41.403274720 +0300
@@ -602,10 +602,12 @@ EXPORT_SYMBOL(jiffies_to_clock_t);
 unsigned long clock_t_to_jiffies(unsigned long x)
 {
 #if (HZ % USER_HZ)==0
+	WARN_ON((long)x < 0);
 	if (x >= ~0UL / (HZ / USER_HZ))
 		return ~0UL;
 	return x * (HZ / USER_HZ);
 #else
+	WARN_ON((long)x < 0);
 	/* Don't worry about loss of precision here .. */
 	if (x >= ~0UL / HZ * USER_HZ)
 		return ~0UL;
@@ -618,6 +620,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies);
 
 u64 jiffies_64_to_clock_t(u64 x)
 {
+	WARN_ON((s64)x < 0);
 #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
 # if HZ < USER_HZ
 	x = div_u64(x * USER_HZ, HZ);
@@ -640,6 +643,7 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t);
 
 u64 nsec_to_clock_t(u64 x)
 {
+	WARN_ON((s64)x < 0);
 #if (NSEC_PER_SEC % USER_HZ) == 0
 	return div_u64(x, NSEC_PER_SEC / USER_HZ);
 #elif (USER_HZ % 512) == 0
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/timer.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/timer.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/timer.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/timer.c	2015-01-21 12:02:54.167935844 +0300
@@ -39,6 +39,7 @@
 #include <linux/kallsyms.h>
 #include <linux/irq_work.h>
 #include <linux/sched.h>
+#include <linux/virtinfo.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1009,6 +1010,7 @@ static inline void __run_timers(struct t
 			spin_unlock_irq(&base->lock);
 			{
 				int preempt_count = preempt_count();
+				struct ve_struct *ve;
 
 #ifdef CONFIG_LOCKDEP
 				/*
@@ -1032,7 +1034,9 @@ static inline void __run_timers(struct t
 				lock_map_acquire(&lockdep_map);
 
 				trace_timer_expire_entry(timer);
+				ve = set_exec_env(get_ve0());
 				fn(data);
+				(void)set_exec_env(ve);
 				trace_timer_expire_exit(timer);
 
 				lock_map_release(&lockdep_map);
@@ -1290,7 +1294,7 @@ SYSCALL_DEFINE0(getppid)
 	int pid;
 
 	rcu_read_lock();
-	pid = task_tgid_vnr(current->real_parent);
+	pid = ve_task_ppid_nr_ns(current, current->nsproxy->pid_ns);
 	rcu_read_unlock();
 
 	return pid;
@@ -1444,19 +1448,34 @@ int do_sysinfo(struct sysinfo *info)
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
 	struct timespec tp;
+	struct ve_struct *ve;
 
 	memset(info, 0, sizeof(struct sysinfo));
+	si_meminfo(info);
+	si_swapinfo(info);
+
+#ifdef CONFIG_BEANCOUNTERS
+	if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, info)
+			& NOTIFY_FAIL)
+		return -ENOMSG;
+#endif
+	ve = get_exec_env();
 
 	ktime_get_ts(&tp);
 	monotonic_to_bootbased(&tp);
 	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+	if (ve_is_super(ve)) {
+		get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
-	info->procs = nr_threads;
+		info->procs = nr_threads;
+	} else {
+		info->uptime -= ve->real_start_timespec.tv_sec;
 
-	si_meminfo(info);
-	si_swapinfo(info);
+		info->procs = ve->pcounter;
+
+		get_avenrun_ve(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+	}
 
 	/*
 	 * If the sum of all the available memory (i.e. ram + swap)
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/trace/ftrace.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/trace/ftrace.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/trace/ftrace.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/trace/ftrace.c	2015-01-21 12:02:43.798211132 +0300
@@ -3871,7 +3871,7 @@ static int alloc_retstack_tasklist(struc
 	}
 
 	read_lock_irqsave(&tasklist_lock, flags);
-	do_each_thread(g, t) {
+	do_each_thread_all(g, t) {
 		if (start == end) {
 			ret = -EAGAIN;
 			goto unlock;
@@ -3885,7 +3885,7 @@ static int alloc_retstack_tasklist(struc
 			smp_wmb();
 			t->ret_stack = ret_stack_list[start++];
 		}
-	} while_each_thread(g, t);
+	} while_each_thread_all(g, t);
 
 unlock:
 	read_unlock_irqrestore(&tasklist_lock, flags);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/trace/trace.h linux-2.6.32-504.3.3.el6-042stab103_6/kernel/trace/trace.h
--- linux-2.6.32-504.3.3.el6.orig/kernel/trace/trace.h	2014-12-12 23:29:17.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/trace/trace.h	2015-01-21 12:02:41.329276684 +0300
@@ -816,7 +816,8 @@ extern const char *__stop___trace_bprint
 
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, id, tstruct, print)		\
-	extern struct ftrace_event_call event_##call;
+	extern struct ftrace_event_call					\
+	__attribute__((__aligned__(4))) event_##call;
 #undef FTRACE_ENTRY_DUP
 #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print)		\
 	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/tracepoint.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/tracepoint.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/tracepoint.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/tracepoint.c	2015-01-21 12:02:45.791158222 +0300
@@ -596,11 +596,11 @@ void syscall_regfunc(void)
 
 	if (!sys_tracepoint_refcount) {
 		read_lock_irqsave(&tasklist_lock, flags);
-		do_each_thread(g, t) {
+		do_each_thread_all(g, t) {
 			/* Skip kernel threads. */
 			if (t->mm)
 				set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
-		} while_each_thread(g, t);
+		} while_each_thread_all(g, t);
 		read_unlock_irqrestore(&tasklist_lock, flags);
 	}
 	sys_tracepoint_refcount++;
@@ -614,9 +614,9 @@ void syscall_unregfunc(void)
 	sys_tracepoint_refcount--;
 	if (!sys_tracepoint_refcount) {
 		read_lock_irqsave(&tasklist_lock, flags);
-		do_each_thread(g, t) {
+		do_each_thread_all(g, t) {
 			clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
-		} while_each_thread(g, t);
+		} while_each_thread_all(g, t);
 		read_unlock_irqrestore(&tasklist_lock, flags);
 	}
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/user.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/user.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/user.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/user.c	2015-01-21 12:02:57.975834767 +0300
@@ -18,6 +18,8 @@
 #include <linux/user_namespace.h>
 #include "cred-internals.h"
 
+#include <bc/kmem.h>
+
 struct user_namespace init_user_ns = {
 	.kref = {
 		.refcount	= ATOMIC_INIT(2),
@@ -56,6 +58,7 @@ struct user_struct root_user = {
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
 	.user_ns	= &init_user_ns,
+	.user_ub	= &ub0,
 #ifdef CONFIG_USER_SCHED
 	.tg		= &init_task_group,
 #endif
@@ -112,8 +115,11 @@ static struct user_struct *uid_hash_find
 	hlist_for_each_entry(user, h, hashent, uidhash_node) {
 		if (user->uid == uid) {
 			/* possibly resurrect an "almost deleted" object */
-			if (atomic_inc_return(&user->__count) == 1)
+			if (atomic_inc_return(&user->__count) == 1) {
+				user->user_ub = get_beancounter(get_exec_ub());
+				ub_kmem_charge(user->user_ub, uid_cachep->objuse, __GFP_NOFAIL);
 				cancel_delayed_work(&user->work);
+			}
 			return user;
 		}
 	}
@@ -330,6 +336,8 @@ done:
  */
 static void free_user(struct user_struct *up, unsigned long flags)
 {
+	ub_kmem_uncharge(up->user_ub, uid_cachep->objuse);
+	put_beancounter(up->user_ub);
 	INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
 	schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
 	spin_unlock_irqrestore(&uidhash_lock, flags);
@@ -363,12 +371,15 @@ static inline void uids_mutex_unlock(voi
  */
 static void free_user(struct user_struct *up, unsigned long flags)
 {
+	struct user_beancounter *ub = up->user_ub;
+
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 	sched_destroy_user(up);
 	key_put(up->uid_keyring);
 	key_put(up->session_keyring);
-	kmem_cache_free(uid_cachep, up);
+	ub_kmem_free(ub, uid_cachep, up);
+	put_beancounter(ub);
 }
 
 #endif
@@ -422,6 +433,7 @@ void free_uid(struct user_struct *up)
 	else
 		local_irq_restore(flags);
 }
+EXPORT_SYMBOL(free_uid);
 
 struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 {
@@ -438,9 +450,12 @@ struct user_struct *alloc_uid(struct use
 	spin_unlock_irq(&uidhash_lock);
 
 	if (!up) {
-		new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
+		struct user_beancounter *ub = get_exec_ub();
+
+		new = ub_kmem_alloc(ub, uid_cachep, GFP_KERNEL | __GFP_ZERO);
 		if (!new)
 			goto out_unlock;
+		new->user_ub = ub;
 
 		new->uid = uid;
 		atomic_set(&new->__count, 1);
@@ -467,8 +482,9 @@ struct user_struct *alloc_uid(struct use
 			 */
 			key_put(new->uid_keyring);
 			key_put(new->session_keyring);
-			kmem_cache_free(uid_cachep, new);
+			ub_kmem_free(new->user_ub, uid_cachep, new);
 		} else {
+			get_beancounter(new->user_ub);
 			uid_hash_insert(new, hashent);
 			up = new;
 		}
@@ -483,11 +499,12 @@ out_destoy_sched:
 	sched_destroy_user(new);
 	put_user_ns(new->user_ns);
 out_free_user:
-	kmem_cache_free(uid_cachep, new);
+	ub_kmem_free(new->user_ub, uid_cachep, new);
 out_unlock:
 	uids_mutex_unlock();
 	return NULL;
 }
+EXPORT_SYMBOL(alloc_uid);
 
 static int __init uid_cache_init(void)
 {
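
The beancounter accounting added to kernel/user.c pairs strictly: every path that makes a user_struct live (the fresh allocation in alloc_uid() and the resurrect path in uid_hash_find()) takes a beancounter reference and charges kmem, and free_user() is the one place that uncharges and drops that reference. A minimal userspace model of the invariant (names modeled on the patch; __GFP_NOFAIL semantics and locking omitted):

#include <assert.h>
#include <stdio.h>

struct user_beancounter { int refcnt; long kmem; };

static struct user_beancounter *get_beancounter(struct user_beancounter *ub)
{
	ub->refcnt++;
	return ub;
}

static void put_beancounter(struct user_beancounter *ub) { ub->refcnt--; }
static void ub_kmem_charge(struct user_beancounter *ub, long sz) { ub->kmem += sz; }
static void ub_kmem_uncharge(struct user_beancounter *ub, long sz) { ub->kmem -= sz; }

int main(void)
{
	struct user_beancounter ub0 = { .refcnt = 1 };	/* host beancounter */
	long objuse = 128;				/* uid_cachep->objuse stand-in */

	/* alloc_uid() / uid_hash_find() resurrect path: take a ref, charge */
	ub_kmem_charge(get_beancounter(&ub0), objuse);

	/* free_user(): uncharge the same amount, drop the same ref */
	ub_kmem_uncharge(&ub0, objuse);
	put_beancounter(&ub0);

	assert(ub0.refcnt == 1 && ub0.kmem == 0);	/* balanced */
	puts("beancounter charges balanced");
	return 0;
}
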
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/user_namespace.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/user_namespace.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/user_namespace.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/user_namespace.c	2015-01-21 12:02:43.575217054 +0300
@@ -60,6 +60,7 @@ int create_user_ns(struct cred *new)
 
 	return 0;
 }
+EXPORT_SYMBOL(create_user_ns);
 
 /*
  * Deferred destructor for a user namespace.  This is required because
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/utrace.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/utrace.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/utrace.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/utrace.c	2015-01-21 12:02:49.109070137 +0300
@@ -350,6 +350,15 @@ struct utrace_engine *utrace_attach_pid(
 }
 EXPORT_SYMBOL_GPL(utrace_attach_pid);
 
+int task_utrace_attached(struct task_struct *task)
+{
+	struct utrace *utrace = task_utrace_struct(task);
+
+	return utrace && (!list_empty(&utrace->attached) ||
+			  !list_empty(&utrace->attaching));
+}
+EXPORT_SYMBOL_GPL(task_utrace_attached);
+
 /*
  * When an engine is detached, the target thread may still see it and
  * make callbacks until it quiesces.  We install a special ops vector
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/utsname.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/utsname.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/utsname.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/utsname.c	2015-01-21 12:02:48.163095252 +0300
@@ -16,14 +16,25 @@
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
 #include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
 
 static struct uts_namespace *create_uts_ns(void)
 {
 	struct uts_namespace *uts_ns;
 
 	uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
-	if (uts_ns)
+	if (uts_ns) {
+#ifdef CONFIG_X86
+#ifdef CONFIG_X86_64
+		memset(&uts_ns->vdso, 0, sizeof(uts_ns->vdso));
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+		memset(&uts_ns->vdso32, 0, sizeof(uts_ns->vdso32));
+#endif
+#endif
 		kref_init(&uts_ns->kref);
+	}
 	return uts_ns;
 }
 
@@ -81,6 +92,27 @@ void free_uts_ns(struct kref *kref)
 
 	ns = container_of(kref, struct uts_namespace, kref);
 	proc_free_inum(ns->proc_inum);
+#ifdef CONFIG_X86
+#ifdef CONFIG_X86_64
+	if (ns->vdso.pages) {
+		int i;
+
+		vunmap(ns->vdso.addr);
+		for (i = 0; i < ns->vdso.nr_pages; i++)
+			put_page(ns->vdso.pages[i]);
+		kfree(ns->vdso.pages);
+	}
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+	if (ns->vdso32.pages) {
+		int i;
+
+		for (i = 0; i < ns->vdso32.nr_pages; i++)
+			put_page(ns->vdso32.pages[i]);
+		kfree(ns->vdso32.pages);
+	}
+#endif
+#endif
 	kfree(ns);
 }
 
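
The teardown order in the free_uts_ns() hunk matters: the vmap()ed alias of the per-namespace vDSO copy is dropped with vunmap() before the individual page references are put, and the pointer array is freed last (the 32-bit copy has no vmap alias and skips the first step). A userspace sketch of the same ordering, with malloc/free standing in for the page and vmap primitives:

#include <stdio.h>
#include <stdlib.h>

struct vdso_copy {
	void **pages;		/* stands in for the struct page * array */
	int nr_pages;
	void *addr;		/* stands in for the vmap()ed alias */
};

static void free_vdso_copy(struct vdso_copy *v)
{
	int i;

	if (!v->pages)
		return;			/* namespace never got a private copy */
	free(v->addr);			/* vunmap() analogue: drop the alias first */
	for (i = 0; i < v->nr_pages; i++)
		free(v->pages[i]);	/* put_page() analogue, page by page */
	free(v->pages);			/* kfree() of the pointer array, last */
	v->pages = NULL;
}

int main(void)
{
	struct vdso_copy v = { .nr_pages = 2 };

	v.pages = calloc(2, sizeof(void *));
	v.pages[0] = malloc(4096);
	v.pages[1] = malloc(4096);
	v.addr = malloc(8192);		/* pretend alias of the two pages */

	free_vdso_copy(&v);
	free_vdso_copy(&v);		/* second call is a harmless no-op */
	puts("per-ns vdso copy released");
	return 0;
}
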
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/utsname_sysctl.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/utsname_sysctl.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/utsname_sysctl.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/utsname_sysctl.c	2015-01-21 12:02:44.059204204 +0300
@@ -26,6 +26,14 @@ static void *get_uts(ctl_table *table, i
 		down_read(&uts_sem);
 	else
 		down_write(&uts_sem);
+
+	if (table->data == &virt_utsname.release) {
+		if (uts_ns == &init_uts_ns)
+			return virt_utsname.release;
+		else
+			return uts_ns->name.release;
+	}
+
 	return which;
 }
 
@@ -126,19 +134,27 @@ static struct ctl_table uts_kern_table[]
 	{}
 };
 
-static struct ctl_table uts_root_table[] = {
+static struct ctl_table uts_virt_osrelease_table[] = {
 	{
-		.ctl_name	= CTL_KERN,
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= uts_kern_table,
+		.procname	= "virt_osrelease",
+		.data		= virt_utsname.release,
+		.maxlen		= sizeof(virt_utsname.release),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_uts_string,
+		.strategy	= sysctl_uts_string,
 	},
 	{}
 };
 
+static struct ctl_path uts_path[] = {
+	{ .ctl_name = CTL_KERN, .procname = "kernel", },
+	{ }
+};
+
 static int __init utsname_sysctl_init(void)
 {
-	register_sysctl_table(uts_root_table);
+	register_sysctl_glob_paths(uts_path, uts_kern_table, 1);
+	register_sysctl_paths(uts_path, uts_virt_osrelease_table);
 	return 0;
 }
 
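
The virt_osrelease knob added above lets the host publish a template kernel release string; get_uts() redirects reads of that sysctl to the global template in the init namespace and to the container's own utsname otherwise. A compressed userspace model of that lookup (semantics inferred from the hunk; write handling and locking omitted):

#include <stdio.h>
#include <string.h>

struct uts { char release[65]; };

static struct uts init_uts = { "2.6.32-042stab103.6" };	/* the real kernel */
static struct uts virt_utsname = { "2.6.32" };		/* template shown to CTs */

static const char *get_virt_release(const struct uts *ns)
{
	if (ns == &init_uts)			/* host context ... */
		return virt_utsname.release;	/* ... sees the template */
	return ns->release;			/* a CT sees its own copy */
}

int main(void)
{
	struct uts ct;

	/* a freshly created CT starts from the template */
	strcpy(ct.release, virt_utsname.release);
	printf("host sees: %s\n", get_virt_release(&init_uts));
	printf("ct sees:   %s\n", get_virt_release(&ct));
	return 0;
}
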
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ve/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/Makefile
--- linux-2.6.32-504.3.3.el6.orig/kernel/ve/Makefile	2015-01-21 12:02:44.083203567 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/Makefile	2015-01-21 12:02:58.949808915 +0300
@@ -0,0 +1,19 @@
+#
+#
+#  kernel/ve/Makefile
+#
+#  Copyright (C) 2000-2005  SWsoft
+#  All rights reserved.
+#
+#  Licensing governed by "linux/COPYING.SWsoft" file.
+
+obj-$(CONFIG_VE) = ve.o veowner.o hooks.o
+obj-$(CONFIG_VZ_WDOG) += vzwdog.o
+obj-$(CONFIG_VE_CALLS) += vzmon.o
+
+vzmon-objs = vecalls.o
+
+obj-$(CONFIG_VZ_DEV) += vzdev.o
+obj-$(CONFIG_VZ_EVENT) += vzevent.o
+
+obj-$(CONFIG_VZ_IOLIMIT) += vziolimit.o
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ve/hooks.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/hooks.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ve/hooks.c	2015-01-21 12:02:44.083203567 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/hooks.c	2015-01-21 12:02:44.531191673 +0300
@@ -0,0 +1,114 @@
+/*
+ *  linux/kernel/ve/hooks.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/ve.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ve_proto.h>
+#include <linux/module.h>
+
+static struct list_head ve_hooks[VE_MAX_CHAINS];
+static DECLARE_RWSEM(ve_hook_sem);
+
+void ve_hook_register(int chain, struct ve_hook *vh)
+{
+	struct list_head *lh;
+	struct ve_hook *tmp;
+
+	BUG_ON(chain >= VE_MAX_CHAINS);
+
+	down_write(&ve_hook_sem);
+	list_for_each(lh, &ve_hooks[chain]) {
+		tmp = list_entry(lh, struct ve_hook, list);
+		if (vh->priority < tmp->priority)
+			break;
+	}
+
+	list_add_tail(&vh->list, lh);
+	up_write(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_register);
+
+void ve_hook_unregister(struct ve_hook *vh)
+{
+	down_write(&ve_hook_sem);
+	list_del(&vh->list);
+	up_write(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_unregister);
+
+static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve)
+{
+	int err;
+
+	err = 0;
+	if (vh->init != NULL && try_module_get(vh->owner)) {
+		err = vh->init(ve);
+		module_put(vh->owner);
+	}
+	return err;
+}
+
+static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve)
+{
+	if (vh->fini != NULL && try_module_get(vh->owner)) {
+		vh->fini(ve);
+		module_put(vh->owner);
+	}
+}
+
+int ve_hook_iterate_init(int chain, void *ve)
+{
+	struct ve_hook *vh;
+	int err;
+
+	err = 0;
+
+	down_read(&ve_hook_sem);
+	list_for_each_entry(vh, &ve_hooks[chain], list)
+		if ((err = ve_hook_init(vh, ve)) < 0)
+			break;
+
+	if (err)
+		list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list)
+			ve_hook_fini(vh, ve);
+
+	up_read(&ve_hook_sem);
+	return err;
+}
+
+EXPORT_SYMBOL(ve_hook_iterate_init);
+
+void ve_hook_iterate_fini(int chain, void *ve)
+{
+	struct ve_hook *vh;
+
+	down_read(&ve_hook_sem);
+	list_for_each_entry_reverse(vh, &ve_hooks[chain], list)
+		ve_hook_fini(vh, ve);
+	up_read(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_iterate_fini);
+
+static int __init ve_hooks_init(void)
+{
+	int i;
+
+	for (i = 0; i < VE_MAX_CHAINS; i++)
+		INIT_LIST_HEAD(&ve_hooks[i]);
+	return 0;
+}
+
+core_initcall(ve_hooks_init);
+
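
ve_hook_iterate_init() above is ordered bring-up with rollback: hooks registered at lower priority run first, and when one fails, the hooks that already succeeded are finalized in reverse via list_for_each_entry_continue_reverse() before the error propagates. A standalone sketch of that unwind pattern:

#include <stdio.h>

#define NHOOKS 3

static int init_hook(int i, int fail_at)
{
	if (i == fail_at)
		return -1;		/* this subsystem failed to start */
	printf("init %d\n", i);
	return 0;
}

static void fini_hook(int i)
{
	printf("fini %d\n", i);
}

static int iterate_init(int fail_at)
{
	int i, err = 0;

	for (i = 0; i < NHOOKS; i++)
		if ((err = init_hook(i, fail_at)) < 0)
			break;

	if (err)
		while (--i >= 0)	/* continue_reverse analogue */
			fini_hook(i);

	return err;
}

int main(void)
{
	/* init 0, init 1, hook 2 fails, then unwind: fini 1, fini 0 */
	return iterate_init(2) ? 1 : 0;
}
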
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ve/ve.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/ve.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ve/ve.c	2015-01-21 12:02:44.083203567 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/ve.c	2015-01-21 12:02:57.863837739 +0300
@@ -0,0 +1,202 @@
+/*
+ *  linux/kernel/ve/ve.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+/*
+ * 've.c' is a helper file performing VE sub-system initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/capability.h>
+#include <linux/ve.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/freezer.h>
+
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/sys.h>
+#include <linux/kdev_t.h>
+#include <linux/termios.h>
+#include <linux/tty_driver.h>
+#include <linux/netdevice.h>
+#include <linux/utsname.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/ve_proto.h>
+#include <linux/devpts_fs.h>
+#include <linux/user_namespace.h>
+#include <linux/init_task.h>
+#include <linux/mutex.h>
+
+#include <linux/vzcalluser.h>
+
+unsigned long vz_rstamp = 0x37e0f59d;
+EXPORT_SYMBOL(vz_rstamp);
+
+#ifdef CONFIG_MODULES
+struct module no_module = { .state = MODULE_STATE_GOING };
+EXPORT_SYMBOL(no_module);
+#endif
+
+#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS)
+void (*do_env_free_hook)(struct ve_struct *ve);
+EXPORT_SYMBOL(do_env_free_hook);
+
+void do_env_free(struct ve_struct *env)
+{
+	BUG_ON(env->pcounter > 0);
+	BUG_ON(env->is_running);
+
+	preempt_disable();
+	do_env_free_hook(env);
+	preempt_enable();
+}
+EXPORT_SYMBOL(do_env_free);
+#endif
+
+int (*do_ve_enter_hook)(struct ve_struct *ve, unsigned int flags);
+EXPORT_SYMBOL(do_ve_enter_hook);
+
+struct ve_struct ve0 = {
+	.counter		= ATOMIC_INIT(1),
+	.pcounter		= 1,
+	.ve_list		= LIST_HEAD_INIT(ve0.ve_list),
+	.vetask_lh		= LIST_HEAD_INIT(ve0.vetask_lh),
+	.vetask_auxlist		= LIST_HEAD_INIT(ve0.vetask_auxlist),
+	.start_jiffies		= INITIAL_JIFFIES,
+	.ve_ns			= &init_nsproxy,
+	.ve_netns		= &init_net,
+	.user_ns		= &init_user_ns,
+	.is_running		= 1,
+	.op_sem			= __RWSEM_INITIALIZER(ve0.op_sem),
+#ifdef CONFIG_VE_IPTABLES
+	.ipt_mask		= VE_IP_ALL,	/* everything is allowed */
+	._iptables_modules	= VE_IP_NONE,	/* but nothing yet loaded */
+#endif
+	.features		= -1,
+	.meminfo_val		= VE_MEMINFO_SYSTEM,
+	._randomize_va_space	=
+#ifdef CONFIG_COMPAT_BRK
+					1,
+#else
+					2,
+#endif
+	.proc_fstype		= &proc_fs_type,
+	.devices		= LIST_HEAD_INIT(ve0.devices),
+	.init_cred		= &init_cred,
+	.fsync_enable		= FSYNC_FILTERED,
+	.sync_mutex		= __MUTEX_INITIALIZER(ve0.sync_mutex),
+	.mnt_nr			= ATOMIC_INIT(0),
+	.aio_nr			= 0,
+	.aio_max_nr		= AIO_MAX_NR_DEFAULT,
+};
+
+EXPORT_SYMBOL(ve0);
+
+LIST_HEAD(ve_list_head);
+DEFINE_MUTEX(ve_list_lock);
+
+struct ve_struct *__find_ve_by_id(envid_t veid)
+{
+	struct ve_struct *ve;
+
+	for_each_ve(ve) {
+		if (ve->veid == veid)
+			return ve;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(__find_ve_by_id);
+
+struct ve_struct *get_ve_by_id(envid_t veid)
+{
+	struct ve_struct *ve;
+	mutex_lock(&ve_list_lock);
+	ve = __find_ve_by_id(veid);
+	get_ve(ve);
+	mutex_unlock(&ve_list_lock);
+	return ve;
+}
+EXPORT_SYMBOL(get_ve_by_id);
+
+LIST_HEAD(ve_cleanup_list);
+DEFINE_SPINLOCK(ve_cleanup_lock);
+struct task_struct *ve_cleanup_thread;
+
+EXPORT_SYMBOL(ve_list_lock);
+EXPORT_SYMBOL(ve_list_head);
+EXPORT_SYMBOL(ve_cleanup_lock);
+EXPORT_SYMBOL(ve_cleanup_list);
+EXPORT_SYMBOL(ve_cleanup_thread);
+
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, ve0_lat_stats);
+
+void init_ve0(void)
+{
+	struct ve_struct *ve;
+
+	ve = get_ve0();
+	ve->sched_lat_ve.cur = &per_cpu_var(ve0_lat_stats);
+	list_add(&ve->ve_list, &ve_list_head);
+	INIT_LIST_HEAD(&ve->_kthread_create_list);
+	spin_lock_init(&ve->aio_nr_lock);
+}
+
+void ve_cleanup_schedule(struct ve_struct *ve)
+{
+	BUG_ON(ve_cleanup_thread == NULL);
+
+	spin_lock(&ve_cleanup_lock);
+	list_add_tail(&ve->cleanup_list, &ve_cleanup_list);
+	spin_unlock(&ve_cleanup_lock);
+
+	wake_up_process(ve_cleanup_thread);
+}
+
+int ve_freeze(struct ve_struct *env)
+{
+	int err;
+
+	down_write(&env->op_sem);
+	err = -ESRCH;
+	if (!env->is_running)
+		goto out;
+	err = -EBUSY;
+	if (env->is_locked)
+		goto out;
+	env->is_locked = 1;
+	up_write(&env->op_sem);
+
+	err = freezer_change_state(env->ve_cgroup, CGROUP_FROZEN);
+	if (err)
+		ve_thaw(env);
+
+	return err;
+
+out:
+	up_write(&env->op_sem);
+	return err;
+}
+EXPORT_SYMBOL(ve_freeze);
+
+void ve_thaw(struct ve_struct *env)
+{
+	freezer_change_state(env->ve_cgroup, CGROUP_THAWED);
+
+	down_write(&env->op_sem);
+	WARN_ON(!env->is_locked);
+	env->is_locked = 0;
+	up_write(&env->op_sem);
+}
+EXPORT_SYMBOL(ve_thaw);
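
ve_freeze() above deliberately drops op_sem before calling into the cgroup freezer: the semaphore only guards the is_running/is_locked checks, and because the container is marked locked first, concurrent control operations fail fast with -EBUSY instead of sleeping on the semaphore for the whole freeze. A userspace sketch of the protocol (a plain mutex stands in for the rwsem taken for write):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct env { pthread_mutex_t op; int is_running, is_locked; };

static int env_freeze(struct env *e)
{
	pthread_mutex_lock(&e->op);
	if (!e->is_running) {
		pthread_mutex_unlock(&e->op);
		return -ESRCH;
	}
	if (e->is_locked) {
		pthread_mutex_unlock(&e->op);
		return -EBUSY;		/* fail fast, do not sleep on the lock */
	}
	e->is_locked = 1;		/* claim the container ... */
	pthread_mutex_unlock(&e->op);	/* ... then drop the lock early */

	/* the slow freezer_change_state() work would happen here, unlocked */
	return 0;
}

static void env_thaw(struct env *e)
{
	pthread_mutex_lock(&e->op);
	e->is_locked = 0;
	pthread_mutex_unlock(&e->op);
}

int main(void)
{
	struct env e = { PTHREAD_MUTEX_INITIALIZER, 1, 0 };

	printf("freeze: %d\n", env_freeze(&e));	/* 0: freeze succeeds */
	printf("again:  %d\n", env_freeze(&e));	/* -EBUSY: already locked */
	env_thaw(&e);
	return 0;
}
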
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ve/vecalls.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vecalls.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ve/vecalls.c	2015-01-21 12:02:44.084203541 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vecalls.c	2015-01-21 12:02:58.723814914 +0300
@@ -0,0 +1,2910 @@
+/*
+ *  linux/kernel/ve/vecalls.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ */
+
+/*
+ * 'vecalls.c' is the file with basic VE support. It provides basic
+ * primitives along with the initialization code.
+ */
+
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/capability.h>
+#include <linux/ve.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sys.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/mnt_namespace.h>
+#include <linux/termios.h>
+#include <linux/tty_driver.h>
+#include <linux/netdevice.h>
+#include <linux/wait.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/utsname.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/devpts_fs.h>
+#include <linux/shmem_fs.h>
+#include <linux/user_namespace.h>
+#include <linux/sysfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/rcupdate.h>
+#include <linux/in.h>
+#include <linux/idr.h>
+#include <linux/pid.h>
+#include <net/pkt_sched.h>
+#include <bc/beancounter.h>
+#include <linux/nsproxy.h>
+#include <linux/kobject.h>
+#include <linux/freezer.h>
+#include <linux/pid_namespace.h>
+#include <linux/tty.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/oom.h>
+#include <linux/aio.h>
+#include <linux/workqueue.h>
+#include <linux/audit.h>
+
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
+#include <net/arp.h>
+#include <net/ipv6.h>
+
+#include <linux/ve_proto.h>
+#include <linux/venet.h>
+#include <linux/vzctl.h>
+#include <linux/vzcalluser.h>
+#include <linux/fairsched.h>
+
+#include <linux/virtinfo.h>
+#include <linux/utsrelease.h>
+#include <linux/major.h>
+
+#include <bc/dcache.h>
+
+int nr_ve = 1;	/* One VE always exists. Compatibility with vestat */
+EXPORT_SYMBOL(nr_ve);
+
+static int	do_env_enter(struct ve_struct *ve, unsigned int flags);
+static int	alloc_ve_tty_drivers(struct ve_struct* ve);
+static void	free_ve_tty_drivers(struct ve_struct* ve);
+static int	register_ve_tty_drivers(struct ve_struct* ve);
+static void	unregister_ve_tty_drivers(struct ve_struct* ve);
+static int	init_ve_tty_drivers(struct ve_struct *);
+static void	fini_ve_tty_drivers(struct ve_struct *);
+static int	init_ve_vtty(struct ve_struct *ve);
+static void	fini_ve_vtty(struct ve_struct *ve);
+static void	clear_termios(struct tty_driver* driver );
+
+static void vecalls_exit(void);
+
+static int alone_in_pgrp(struct task_struct *tsk);
+
+/*
+ * real_put_ve() MUST be used instead of put_ve() inside vecalls.
+ */
+static void real_do_env_free(struct ve_struct *ve);
+static inline void real_put_ve(struct ve_struct *ve)
+{
+	if (ve && atomic_dec_and_test(&ve->counter)) {
+		BUG_ON(ve->pcounter > 0);
+		BUG_ON(ve->is_running);
+		real_do_env_free(ve);
+	}
+}
+static s64 ve_get_uptime(struct ve_struct *ve)
+{
+	struct timespec uptime;
+	do_posix_clock_monotonic_gettime(&uptime);
+	monotonic_to_bootbased(&uptime);
+	uptime = timespec_sub(uptime, ve->real_start_timespec);
+	return timespec_to_ns(&uptime);
+}
+
+static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf)
+{
+	struct ve_struct *ve;
+	struct vz_cpu_stat *vstat;
+	int retval;
+	int i;
+	unsigned long tmp;
+	unsigned long avenrun[3];
+	struct kernel_cpustat kstat;
+
+	if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid))
+		return -EPERM;
+	if (veid == 0)
+		return -ESRCH;
+
+	vstat = kzalloc(sizeof(*vstat), GFP_KERNEL);
+	if (!vstat)
+		return -ENOMEM;
+
+	retval = fairsched_get_cpu_stat(veid, &kstat);
+	if (retval)
+		goto out_free;
+
+	retval = fairsched_get_cpu_avenrun(veid, avenrun);
+	if (retval)
+		goto out_free;
+
+	retval = -ESRCH;
+	mutex_lock(&ve_list_lock);
+	ve = __find_ve_by_id(veid);
+	if (ve == NULL)
+		goto out_unlock;
+
+	vstat->user_jif += (unsigned long)cputime64_to_clock_t(kstat.cpustat[USER]);
+	vstat->nice_jif += (unsigned long)cputime64_to_clock_t(kstat.cpustat[NICE]);
+	vstat->system_jif += (unsigned long)cputime64_to_clock_t(kstat.cpustat[SYSTEM]);
+	vstat->idle_clk += kstat.cpustat[IDLE];
+
+	vstat->uptime_clk = ve_get_uptime(ve);
+
+	vstat->uptime_jif = (unsigned long)cputime64_to_clock_t(
+				get_jiffies_64() - ve->start_jiffies);
+	for (i = 0; i < 3; i++) {
+		tmp = avenrun[i] + (FIXED_1/200);
+		vstat->avenrun[i].val_int = LOAD_INT(tmp);
+		vstat->avenrun[i].val_frac = LOAD_FRAC(tmp);
+	}
+	mutex_unlock(&ve_list_lock);
+
+	retval = 0;
+	if (copy_to_user(buf, vstat, sizeof(*vstat)))
+		retval = -EFAULT;
+out_free:
+	kfree(vstat);
+	return retval;
+
+out_unlock:
+	mutex_unlock(&ve_list_lock);
+	goto out_free;
+}
+
+extern int ve_devt_add(struct ve_struct *ve, unsigned type, dev_t devt,
+		       unsigned mask);
+
+static int real_setdevperms(envid_t veid, unsigned type,
+		dev_t dev, unsigned mask)
+{
+	struct ve_struct *ve;
+	int err;
+
+	if (!capable_setveid() || veid == 0)
+		return -EPERM;
+
+	if ((ve = get_ve_by_id(veid)) == NULL)
+		return -ESRCH;
+
+	down_read(&ve->op_sem);
+	err = -ESRCH;
+	if (ve->is_running)
+		err = ve_devt_add(ve, type, dev, mask);
+	up_read(&ve->op_sem);
+	real_put_ve(ve);
+	return err;
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE start: subsystems
+ *
+ **********************************************************************
+ **********************************************************************/
+
+static int prepare_proc_root(struct ve_struct *ve)
+{
+	struct proc_dir_entry *de;
+
+	de = kzalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL);
+	if (de == NULL)
+		return -ENOMEM;
+
+	memcpy(de + 1, "/proc", 6);
+	de->name = (char *)(de + 1);
+	de->namelen = 5;
+	de->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	de->nlink = 2;
+	atomic_set(&de->count, 1);
+
+	ve->proc_root = de;
+	return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static int init_ve_proc(struct ve_struct *ve)
+{
+	int err;
+
+	err = prepare_proc_root(ve);
+	if (err)
+		goto out_root;
+
+	err = register_ve_fs_type(ve, &proc_fs_type,
+			&ve->proc_fstype, NULL);
+	if (err)
+		goto out_reg;
+
+	err = pid_ns_prepare_proc(ve->ve_ns->pid_ns);
+	if (err)
+		goto out_prep_proc;
+
+	ve->proc_mnt = mntget(ve->ve_ns->pid_ns->proc_mnt);
+
+#ifdef CONFIG_PRINTK
+	if (proc_create("kmsg", S_IRUSR, ve->proc_root,
+				&proc_kmsg_operations) == NULL)
+		goto out_kmsg;
+#endif
+	if (proc_mkdir("vz", ve->proc_root) == NULL)
+		goto out_vz;
+
+	if (proc_mkdir("fs", ve->proc_root) == NULL)
+		goto out_fs;
+
+	if (proc_create("partitions", 0, ve->proc_root, NULL) == NULL)
+		goto out_parts;
+
+	return 0;
+
+out_parts:
+	remove_proc_entry("fs", ve->proc_root);
+out_fs:
+	remove_proc_entry("vz", ve->proc_root);
+out_vz:
+	remove_proc_entry("kmsg", ve->proc_root);
+out_kmsg:
+	mntput(ve->proc_mnt);
+	ve->proc_mnt = NULL;
+
+	pid_ns_release_proc(ve->ve_ns->pid_ns);
+out_prep_proc:
+	unregister_ve_fs_type(ve->proc_fstype, NULL);
+out_reg:
+	/* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */
+	;
+out_root:
+	return err;
+}
+
+static LIST_HEAD(ve_proc_entries);
+static DECLARE_MUTEX(ve_proc_entries_lock);
+
+struct ve_proc_dir_entry
+{
+	struct list_head list;
+	struct proc_dir_entry *de;
+	struct ve_struct *ve;
+};
+
+static void cleanup_ve_proc_entries(struct ve_struct *ve, struct list_head *list)
+{
+	struct ve_proc_dir_entry *ve_de, *t;
+	list_for_each_entry_safe(ve_de, t, list, list) {
+		if (ve_de->ve != ve)
+			continue;
+		remove_proc_entry(ve_de->de->name, ve_de->de->parent);
+	}
+}
+
+static void fini_ve_proc_entries(struct ve_struct *ve)
+{
+
+	down(&ve_proc_entries_lock);
+	cleanup_ve_proc_entries(ve, &ve_proc_entries);
+	up(&ve_proc_entries_lock);
+}
+
+static void fini_ve_proc(struct ve_struct *ve)
+{
+	remove_proc_entry("partitions", ve->proc_root);
+	remove_proc_entry("fs", ve->proc_root);
+	remove_proc_entry("vz", ve->proc_root);
+	remove_proc_entry("kmsg", ve->proc_root);
+	fini_ve_proc_entries(ve);
+	unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt);
+	ve->proc_mnt = NULL;
+}
+
+static void free_ve_proc(struct ve_struct *ve)
+{
+	/* proc filesystem frees proc_dir_entries on remove_proc_entry() only,
+	   so we check that everything was removed and not lost */
+	if (ve->proc_root && ve->proc_root->subdir) {
+		struct proc_dir_entry *p = ve->proc_root;
+		printk(KERN_WARNING "CT: %d: proc entry /proc", ve->veid);
+		while ((p = p->subdir) != NULL)
+			printk("/%s", p->name);
+		printk(" is not removed!\n");
+	}
+
+	kfree(ve->proc_root);
+	kfree(ve->proc_fstype);
+
+	ve->proc_fstype = NULL;
+	ve->proc_root = NULL;
+}
+#else
+#define init_ve_proc(ve)	(0)
+#define fini_ve_proc(ve)	do { } while (0)
+#define free_ve_proc(ve)	do { } while (0)
+#endif
+
+#ifdef CONFIG_UNIX98_PTYS
+#include <linux/devpts_fs.h>
+
+/*
+ * DEVPTS needs virtualization: each environment should see its own list of
+ * pseudo-terminals.
+ * To implement it we need to have separate devpts superblocks for each
+ * VE, and each VE should mount its own one.
+ * Thus, separate vfsmount structures are required.
+ * To minimize intrusion into vfsmount lookup code, separate file_system_type
+ * structures are created.
+ *
+ * In addition to this, a patch for the character device itself is required,
+ * as the file system itself is used only for MINOR/MAJOR lookup.
+ */
+
+static int init_ve_devpts(struct ve_struct *ve)
+{
+	ve->devpts_mnt = kern_mount(&devpts_fs_type);
+	if (IS_ERR(ve->devpts_mnt))
+		return PTR_ERR(ve->devpts_mnt);
+	return 0;
+}
+
+static void fini_ve_devpts(struct ve_struct *ve)
+{
+	kern_umount(ve->devpts_mnt);
+}
+#else
+#define init_ve_devpts(ve)	(0)
+#define fini_ve_devpts(ve)	do { } while (0)
+#endif
+
+static int init_ve_shmem(struct ve_struct *ve)
+{
+	return register_ve_fs_type_data_flags(ve,
+					      &shmem_fs_type,
+					      &ve->shmem_fstype,
+					      &ve->shmem_mnt,
+					      NULL, MS_NOUSER);
+}
+
+static void fini_ve_shmem(struct ve_struct *ve)
+{
+	unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt);
+	/* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */
+	ve->shmem_mnt = NULL;
+}
+
+#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
+extern struct device_attribute ve_net_class_attributes[];
+static inline int init_ve_netclass(void)
+{
+	struct class *nc;
+	int err;
+
+	nc = kzalloc(sizeof(*nc), GFP_KERNEL);
+	if (!nc)
+		return -ENOMEM;
+
+	nc->name = net_class.name;
+	nc->dev_release = net_class.dev_release;
+	nc->dev_uevent = net_class.dev_uevent;
+	nc->dev_attrs = ve_net_class_attributes;
+
+	err = class_register(nc);
+	if (!err) {
+		get_exec_env()->net_class = nc;
+		return 0;
+	}
+	kfree(nc);	
+	return err;
+}
+
+static inline void fini_ve_netclass(void)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	class_unregister(ve->net_class);
+	kfree(ve->net_class);
+	ve->net_class = NULL;
+}
+#else
+static inline int init_ve_netclass(void) { return 0; }
+static inline void fini_ve_netclass(void) { ; }
+#endif
+
+static const struct {
+	unsigned	minor;
+	char		*name;
+} mem_class_devices [] = {
+	{3, "null"},
+	{5, "zero"},
+	{7, "full"},
+	{8, "random"},
+	{9, "urandom"},
+	{0, NULL},
+};
+
+extern char *mem_devnode(struct device *dev, mode_t *mode);
+static int init_ve_mem_class(void)
+{
+	int i;
+	struct class *ve_mem_class;
+
+	ve_mem_class = class_create(THIS_MODULE, "mem");
+	if (IS_ERR(ve_mem_class))
+		return -ENOMEM;
+	ve_mem_class->devnode = mem_devnode;
+
+	for (i = 0; mem_class_devices[i].name; i++)
+		device_create(ve_mem_class, NULL,
+				MKDEV(MEM_MAJOR, mem_class_devices[i].minor),
+				NULL, mem_class_devices[i].name);
+
+	get_exec_env()->mem_class = ve_mem_class;
+	return 0;
+}
+
+void fini_ve_mem_class(void)
+{
+	int i;
+	struct class *ve_mem_class = get_exec_env()->mem_class;
+
+	for (i = 0; mem_class_devices[i].name; i++)
+		device_destroy(ve_mem_class,
+				MKDEV(MEM_MAJOR, mem_class_devices[i].minor));
+	class_destroy(ve_mem_class);
+}
+
+static void fini_ve_sysfs_fs(struct ve_struct *ve)
+{
+	kobject_put(ve->cgroup_kobj);
+	kobject_put(ve->fs_kobj);
+}
+
+static int init_ve_sysfs_fs(struct ve_struct *ve)
+{
+	ve->fs_kobj = kobject_create_and_add("fs", NULL);
+	if (!ve->fs_kobj)
+		goto err;
+	ve->cgroup_kobj = kobject_create_and_add("cgroup", ve->fs_kobj);
+	if (!ve->cgroup_kobj)
+		goto err;
+	return 0;
+err:
+	fini_ve_sysfs_fs(ve);
+	return -ENOMEM;
+}
+
+static int init_ve_ksysfs(struct ve_struct *ve)
+{
+#if defined(CONFIG_HOTPLUG)
+	return ksysfs_init_ve(ve, &ve->kernel_kobj);
+#else
+	return 0;
+#endif
+}
+
+static void fini_ve_ksysfs(struct ve_struct *ve)
+{
+#if defined(CONFIG_HOTPLUG)
+	ksysfs_fini_ve(ve, &ve->kernel_kobj);
+#endif
+}
+
+static void fini_ve_sysfs_cpu(struct ve_struct *ve)
+{
+	struct kobject *kobj, *kobjn;
+
+	if (ve->cpu_kset) {
+		list_for_each_entry_safe(kobj, kobjn,
+				&ve->cpu_kset->list, entry)
+			kobject_put(kobj);
+		kset_put(ve->cpu_kset);
+	}
+}
+
+static int init_ve_sysfs_cpu(struct ve_struct *ve)
+{
+	int i, nr_cpus;
+	struct kobject *kobj;
+
+	ve->cpu_kset = kset_create_and_add("cpu", NULL, ve->_system_dir);
+	if (!ve->cpu_kset)
+		goto out;
+
+	nr_cpus = num_possible_cpus();
+	nr_cpus = max(nr_cpus, 2);
+	for (i = 0; i < nr_cpus; i++) {
+		kobj = kobject_create();
+		if (!kobj)
+			goto out;
+		kobj->kset = ve->cpu_kset;
+		if (kobject_add(kobj, NULL, "cpu%d", i)) {
+			kobject_put(kobj);
+			goto out;
+		}
+	}
+
+	return 0;
+out:
+	fini_ve_sysfs_cpu(ve);
+	return -ENOMEM;
+}
+
+static void fini_ve_sysfs_system(struct ve_struct *ve)
+{
+	fini_ve_sysfs_cpu(ve);
+	kobject_put(ve->_system_dir);
+}
+
+static int init_ve_sysfs_system(struct ve_struct *ve)
+{
+	int err;
+
+	err = -ENOMEM;
+	ve->_system_dir = kobject_create_and_add("system",
+						 &ve->devices_kset->kobj);
+	if (!ve->_system_dir)
+		goto out;
+
+	err = init_ve_sysfs_cpu(ve);
+	if (err)
+		goto out;
+
+	return 0;
+out:
+	fini_ve_sysfs_system(ve);
+	return err;
+}
+
+static int init_ve_devtmpfs(struct ve_struct *ve)
+{
+#ifdef CONFIG_DEVTMPFS
+	char opts[] = "mode=0755";
+	return register_ve_fs_type_data(ve, &dev_fs_type,
+			&ve->devtmpfs_fstype, &ve->devtmpfs_mnt, opts);
+#else
+	return 0;
+#endif
+}
+
+static void fini_ve_devtmpfs(struct ve_struct *ve)
+{
+#ifdef CONFIG_DEVTMPFS
+	unregister_ve_fs_type(ve->devtmpfs_fstype, ve->devtmpfs_mnt);
+	ve->devtmpfs_mnt = NULL;
+#endif
+}
+
+static int init_ve_sysfs(struct ve_struct *ve)
+{
+	int err;
+
+#ifdef CONFIG_SYSFS
+	err = 0;
+	if (ve->features & VE_FEATURE_SYSFS) {
+		err = init_ve_sysfs_root(ve);
+		if (err != 0)
+			goto out;
+		err = register_ve_fs_type(ve,
+				   &sysfs_fs_type,
+				   &ve->sysfs_fstype,
+				   &ve->sysfs_mnt);
+		if (err != 0)
+			goto out_fs_type;
+	}
+#endif
+
+	err = classes_init();
+	if (err != 0)
+		goto err_classes;
+
+	err = devices_init();
+	if (err != 0)
+		goto err_devices;
+
+	err = init_ve_netclass();
+	if (err != 0)
+		goto err_net;
+
+	err = init_ve_tty_class();
+	if (err != 0)
+		goto err_tty;
+
+	err = init_ve_mem_class();
+	if (err != 0)
+		goto err_mem;
+
+	err = init_ve_sysfs_fs(ve);
+	if (err != 0)
+		goto err_fs;
+
+	err = init_ve_sysfs_system(ve);
+	if (err != 0)
+		goto err_sys;
+
+	err = init_ve_ksysfs(ve);
+	if (err != 0)
+		goto err_ksys;
+
+	return 0;
+
+err_ksys:
+	fini_ve_sysfs_system(ve);
+err_sys:
+	fini_ve_sysfs_fs(ve);
+err_fs:
+	fini_ve_mem_class();
+err_mem:
+	fini_ve_tty_class();
+err_tty:
+	fini_ve_netclass();
+err_net:
+	devices_fini();
+err_devices:
+	classes_fini();
+err_classes:
+#ifdef CONFIG_SYSFS
+	unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt);
+	/* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */
+out_fs_type:
+	sysfs_put(ve->_sysfs_root);
+	ve->_sysfs_root = NULL;
+out:
+#endif
+	return err;
+}
+
+static void fini_ve_sysfs(struct ve_struct *ve)
+{
+	fini_ve_ksysfs(ve);
+	fini_ve_sysfs_system(ve);
+	fini_ve_sysfs_fs(ve);
+	fini_ve_mem_class();
+	fini_ve_tty_class();
+	fini_ve_netclass();
+	devices_fini();
+	classes_fini();
+#ifdef CONFIG_SYSFS
+	unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt);
+	ve->sysfs_mnt = NULL;
+	sysfs_put(ve->_sysfs_root);
+	ve->_sysfs_root = NULL;
+	/* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */
+#endif
+}
+
+static void free_ve_filesystems(struct ve_struct *ve)
+{
+#ifdef CONFIG_SYSFS
+	kfree(ve->sysfs_fstype);
+	ve->sysfs_fstype = NULL;
+#endif
+	kfree(ve->shmem_fstype);
+	ve->shmem_fstype = NULL;
+
+#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE)
+	BUG_ON(ve->fuse_fs_type && !list_empty(&ve->_fuse_conn_list));
+	kfree(ve->fuse_fs_type);
+	ve->fuse_fs_type = NULL;
+
+	kfree(ve->fuse_ctl_fs_type);
+	ve->fuse_ctl_fs_type = NULL;
+#endif
+
+#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
+	kfree(ve->bm_fs_type);
+	ve->bm_fs_type = NULL;
+#endif
+
+	free_ve_proc(ve);
+}
+
+static int init_printk(struct ve_struct *ve)
+{
+	struct ve_prep_printk {
+		wait_queue_head_t       log_wait;
+		unsigned		log_start;
+		unsigned		log_end;
+		unsigned		logged_chars;
+	} *tmp;
+
+	tmp = kzalloc(sizeof(struct ve_prep_printk), GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	init_waitqueue_head(&tmp->log_wait);
+	ve->_log_wait = &tmp->log_wait;
+	ve->_log_start = &tmp->log_start;
+	ve->_log_end = &tmp->log_end;
+	ve->_logged_chars = &tmp->logged_chars;
+	/* ve->log_buf will be initialized later by ve_log_init() */
+	return 0;
+}
+
+static void fini_printk(struct ve_struct *ve)
+{
+	/* 
+	 * there is no spinlock protection here because nobody can use
+	 * log_buf at the moments when this code is called. 
+	 */
+	kfree(ve->log_buf);
+	kfree(ve->_log_wait);
+}
+
+static void fini_venet(struct ve_struct *ve)
+{
+#ifdef CONFIG_INET
+	tcp_v4_kill_ve_sockets(ve);
+	synchronize_net();
+#endif
+}
+
+static int init_ve_sched(struct ve_struct *ve, unsigned int vcpus)
+{
+	int err;
+
+	err = fairsched_new_node(ve->veid, vcpus);
+
+	return err;
+}
+
+static void fini_ve_sched(struct ve_struct *ve, int leave)
+{
+	fairsched_drop_node(ve->veid, leave);
+}
+
+/*
+ * Namespaces
+ */
+
+static inline int init_ve_namespaces(struct ve_struct *ve,
+		struct nsproxy **old)
+{
+	int err;
+	struct task_struct *tsk;
+	struct nsproxy *cur;
+
+	tsk = current;
+	cur = tsk->nsproxy;
+
+	err = copy_namespaces(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID,
+			tsk, 1);
+	if (err < 0)
+		return err;
+
+	ve->ve_ns = get_nsproxy(tsk->nsproxy);
+	memcpy(ve->ve_ns->uts_ns->name.release, virt_utsname.release,
+			sizeof(virt_utsname.release));
+
+	if (cur->pid_ns->flags & PID_NS_HIDE_CHILD)
+		ve->ve_ns->pid_ns->flags |= PID_NS_HIDDEN;
+
+	*old = cur;
+	return 0;
+}
+
+static inline void fini_ve_namespaces(struct ve_struct *ve,
+		struct nsproxy *old)
+{
+	struct task_struct *tsk = current;
+	struct nsproxy *tmp;
+
+	if (old) {
+		tmp = tsk->nsproxy;
+		tsk->nsproxy = get_nsproxy(old);
+		put_nsproxy(tmp);
+		tmp = ve->ve_ns;
+		ve->ve_ns = get_nsproxy(old);
+		put_nsproxy(tmp);
+	} else {
+		put_cred(ve->init_cred);
+		put_nsproxy(ve->ve_ns);
+		ve->ve_ns = NULL;
+	}
+}
+
+static int init_ve_netns(struct ve_struct *ve, struct nsproxy **old)
+{
+	int err;
+	struct task_struct *tsk;
+	struct nsproxy *cur;
+
+	tsk = current;
+	cur = tsk->nsproxy;
+
+	err = copy_namespaces(CLONE_NEWNET, tsk, 1);
+	if (err < 0)
+		return err;
+
+	put_nsproxy(ve->ve_ns);
+	ve->ve_ns = get_nsproxy(tsk->nsproxy);
+	ve->ve_netns = get_net(ve->ve_ns->net_ns);
+	*old = cur;
+	return 0;
+}
+
+static void fini_ve_netns(struct ve_struct *ve)
+{
+	struct net *net;
+	DECLARE_COMPLETION_ONSTACK(sysfs_completion);
+
+	net = ve->ve_netns;
+	net->sysfs_completion = &sysfs_completion;
+	put_net(net);
+	wait_for_completion(&sysfs_completion);
+}
+
+static inline void switch_ve_namespaces(struct ve_struct *ve,
+		struct task_struct *tsk)
+{
+	struct nsproxy *old_ns;
+	struct nsproxy *new_ns;
+
+	BUG_ON(tsk != current);
+	old_ns = tsk->nsproxy;
+	new_ns = ve->ve_ns;
+
+	if (old_ns != new_ns) {
+		tsk->nsproxy = get_nsproxy(new_ns);
+		put_nsproxy(old_ns);
+	}
+}
+
+static __u64 get_ve_features(env_create_param_t *data, int datalen)
+{
+	__u64 known_features;
+
+	if (datalen < sizeof(struct env_create_param3))
+		/* this version of vzctl is aware of VE_FEATURES_OLD only */
+		known_features = VE_FEATURES_OLD;
+	else
+		known_features = data->known_features;
+
+	/*
+	 * features known to the caller are set as requested,
+	 * yet-unknown features are set as in VE_FEATURES_DEF
+	 */
+	return (data->feature_mask & known_features) |
+		(VE_FEATURES_DEF & ~known_features);
+}
+
+static int init_ve_struct(struct ve_struct *ve, envid_t veid,
+		u32 class_id, env_create_param_t *data, int datalen)
+{
+	(void)get_ve(ve);
+	ve->veid = veid;
+	ve->class_id = class_id;
+	ve->features = get_ve_features(data, datalen);
+	INIT_LIST_HEAD(&ve->vetask_lh);
+	init_rwsem(&ve->op_sem);
+
+	ve->start_timespec = current->start_time;
+	ve->real_start_timespec = current->real_start_time;
+	/* The value is wrong, but it is never compared to process
+	 * start times */
+	ve->start_jiffies = get_jiffies_64();
+
+	ve->_randomize_va_space = ve0._randomize_va_space;
+	INIT_LIST_HEAD(&ve->vetask_auxlist);
+	INIT_LIST_HEAD(&ve->devices);
+
+	ve->odirect_enable = 2;
+	ve->fsync_enable = 2;
+
+	INIT_LIST_HEAD(&ve->ve_list);
+	init_waitqueue_head(&ve->ve_list_wait);
+	mutex_init(&ve->sync_mutex);
+
+	INIT_LIST_HEAD(&ve->devmnt_list);
+	mutex_init(&ve->devmnt_mutex);
+
+	idr_init(&ve->_posix_timers_id);
+	spin_lock_init(&ve->posix_timers_lock);
+
+	atomic_set(&ve->mnt_nr, 0);
+
+	spin_lock_init(&ve->aio_nr_lock);
+	ve->aio_nr = 0;
+	ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
+	return 0;
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * /proc/meminfo virtualization
+ *
+ **********************************************************************
+ **********************************************************************/
+static int ve_set_meminfo(envid_t veid, unsigned long val)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	struct ve_struct *ve;
+
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -EINVAL;
+
+	if (val == 0)
+		val = VE_MEMINFO_SYSTEM;
+	else if (val == 1)
+		val = VE_MEMINFO_DEFAULT;
+	else if (val == 2)
+		val = VE_MEMINFO_COMPLETE;
+
+	ve->meminfo_val = val;
+	real_put_ve(ve);
+	return 0;
+#else
+	return -ENOTTY;
+#endif
+}
+
+static int init_ve_meminfo(struct ve_struct *ve)
+{
+	ve->meminfo_val = VE_MEMINFO_DEFAULT;
+	return 0;
+}
+
+static inline void fini_ve_meminfo(struct ve_struct *ve)
+{
+}
+
+static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk)
+{
+	get_fs_root(tsk->fs, &ve->root_path);
+	/* mark_tree_virtual(&ve->root_path); */
+	ub_dcache_set_owner(ve->root_path.dentry, get_exec_ub());
+}
+
+static void put_ve_root(struct ve_struct *ve)
+{
+	path_put(&ve->root_path);
+}
+
+static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk)
+{
+	/* required for real_setdevperms from register_ve_<fs> above */
+	memcpy(&ve->ve_cap_bset, &tsk->cred->cap_effective, sizeof(kernel_cap_t));
+}
+
+static int ve_list_add(struct ve_struct *ve)
+{
+	mutex_lock(&ve_list_lock);
+	if (__find_ve_by_id(ve->veid) != NULL)
+		goto err_exists;
+
+	list_add(&ve->ve_list, &ve_list_head);
+	nr_ve++;
+	mutex_unlock(&ve_list_lock);
+	return 0;
+
+err_exists:
+	mutex_unlock(&ve_list_lock);
+	return -EEXIST;
+}
+
+static void ve_list_del(struct ve_struct *ve)
+{
+	mutex_lock(&ve_list_lock);
+	list_del_init(&ve->ve_list);
+	nr_ve--;
+	mutex_unlock(&ve_list_lock);
+	wake_up_all(&ve->ve_list_wait);
+}
+
+static void init_ve_cred(struct ve_struct *ve, struct cred *new)
+{
+	const struct cred *cur;
+	kernel_cap_t bset;
+
+	bset = ve->ve_cap_bset;
+	cur = current_cred();
+	new->cap_effective = cap_intersect(cur->cap_effective, bset);
+	new->cap_inheritable = cap_intersect(cur->cap_inheritable, bset);
+	new->cap_permitted = cap_intersect(cur->cap_permitted, bset);
+	new->cap_bset = cap_intersect(cur->cap_bset, bset);
+
+	ve->init_cred = new;
+	ve->user_ns = new->user->user_ns;
+}
+
+static void ve_move_task(struct ve_struct *new)
+{
+	struct task_struct *tsk = current;
+	struct ve_struct *old;
+
+	might_sleep();
+	BUG_ON(!(thread_group_leader(tsk) && thread_group_empty(tsk)));
+
+	/* this prohibits ptracing of a task entered into a VE from the host system */
+	if (tsk->mm)
+		tsk->mm->vps_dumpable = VD_VE_ENTER_TASK;
+	/* setup capabilities before enter */
+	if (commit_creds(get_new_cred(new->init_cred)))
+		BUG();
+
+	/* Reset OOM score adjustment */
+	tsk->signal->oom_adj = 0;
+	test_set_oom_score_adj(OOM_SCORE_ADJ_UNSET);
+
+	/* Reset loginuid */
+	audit_set_loginuid(current, (uid_t)-1);
+
+	/* Adjust cpuid faulting */
+	set_cpuid_faulting(!ve_is_super(new));
+
+	old = tsk->ve_task_info.owner_env;
+	tsk->ve_task_info.owner_env = new;
+
+	/* set ve fs_struct for kernel threads */
+	if (current->flags & PF_KTHREAD)
+		daemonize_fs_struct();
+
+	write_lock_irq(&tasklist_lock);
+	list_move_tail(&tsk->ve_task_info.vetask_list, &new->vetask_lh);
+	list_move_tail(&tsk->ve_task_info.aux_list, &new->vetask_auxlist);
+	old->pcounter--;
+	new->pcounter++;
+	write_unlock_irq(&tasklist_lock);
+
+	real_put_ve(old);
+	get_ve(new);
+
+	cgroup_kernel_attach(new->ve_cgroup, tsk);
+}
+
+#ifdef CONFIG_VE_IPTABLES
+
+static __u64 setup_iptables_mask(__u64 init_mask)
+{
+	/* Remove when userspace will start supplying IPv6-related bits. */
+	init_mask &= ~VE_IP_IPTABLES6;
+	init_mask &= ~VE_IP_FILTER6;
+	init_mask &= ~VE_IP_MANGLE6;
+	init_mask &= ~VE_IP_IPTABLE_NAT_MOD;
+	init_mask &= ~VE_NF_CONNTRACK_MOD;
+
+	if (mask_ipt_allow(init_mask, VE_IP_IPTABLES))
+		init_mask |= VE_IP_IPTABLES6;
+	if (mask_ipt_allow(init_mask, VE_IP_FILTER))
+		init_mask |= VE_IP_FILTER6;
+	if (mask_ipt_allow(init_mask, VE_IP_MANGLE))
+		init_mask |= VE_IP_MANGLE6;
+	if (mask_ipt_allow(init_mask, VE_IP_NAT))
+		init_mask |= VE_IP_IPTABLE_NAT;
+	if (mask_ipt_allow(init_mask, VE_IP_CONNTRACK))
+		init_mask |= VE_NF_CONNTRACK;
+
+	return init_mask;
+}
+
+#endif
+
+static inline int init_ve_cpustats(struct ve_struct *ve)
+{
+	ve->sched_lat_ve.cur = alloc_percpu(struct kstat_lat_pcpu_snap_struct);
+	if (ve->sched_lat_ve.cur == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static inline void free_ve_cpustats(struct ve_struct *ve)
+{
+	free_percpu(ve->sched_lat_ve.cur);
+	ve->sched_lat_ve.cur = NULL;
+}
+
+static int alone_in_pgrp(struct task_struct *tsk)
+{
+	struct task_struct *p;
+	int alone = 0;
+
+	read_lock(&tasklist_lock);
+	do_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p) {
+		if (p != tsk)
+			goto out;
+	} while_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p);
+	do_each_pid_task(task_pid(tsk), PIDTYPE_SID, p) {
+		if (p != tsk)
+			goto out;
+	} while_each_pid_task(task_pid(tsk), PIDTYPE_SID, p);
+	alone = 1;
+out:
+	read_unlock(&tasklist_lock);
+	return alone;
+}
+
+#ifdef CONFIG_CGROUP_DEVICE
+
+static struct vfsmount *ve_cgroup_mnt;
+static struct cgroup *ve_cgroup_root;
+
+static int init_ve_cgroups(struct ve_struct *ve)
+{
+	char name[16];
+
+	snprintf(name, sizeof(name), "%u", ve->veid);
+	ve->ve_cgroup = cgroup_kernel_open(ve_cgroup_root,
+			CGRP_CREAT|CGRP_WEAK, name);
+	if (IS_ERR(ve->ve_cgroup))
+		return PTR_ERR(ve->ve_cgroup);
+	return ve_prep_devcgroup(ve);
+}
+
+static void fini_ve_cgroups(struct ve_struct *ve)
+{
+	cgroup_kernel_close(ve->ve_cgroup);
+	ve->ve_cgroup = NULL;
+}
+
+static int __init init_vecalls_cgroups(void)
+{
+	struct cgroup_sb_opts opts = {
+		.name		= "container",
+		.subsys_bits	=
+			(1ul << devices_subsys_id) |
+			(1ul << freezer_subsys_id),
+	};
+
+	ve_cgroup_mnt = cgroup_kernel_mount(&opts);
+	if (IS_ERR(ve_cgroup_mnt))
+		return PTR_ERR(ve_cgroup_mnt);
+	ve_cgroup_root = cgroup_get_root(ve_cgroup_mnt);
+	get_ve0()->ve_cgroup = ve_cgroup_root;
+	return 0;
+}
+
+static void fini_vecalls_cgroups(void)
+{
+	kern_umount(ve_cgroup_mnt);
+}
+#else
+static int init_ve_cgroups(struct ve_struct *ve) { return 0; }
+static void fini_ve_cgroups(struct ve_struct *ve) { }
+static int init_vecalls_cgroups(void) { return 0; }
+static void fini_vecalls_cgroups(void) { ; }
+#endif /* CONFIG_CGROUP_DEVICE */
+
+void fini_kthreadd(struct ve_struct *ve)
+{
+	long delay = 1;
+
+	if (ve->khelper_wq)
+		destroy_workqueue(ve->khelper_wq);
+	kthreadd_stop(ve);
+
+	while (ve->pcounter > 1) {
+		schedule_timeout(delay);
+		delay = (delay < HZ) ? (delay << 1) : HZ;
+	}
+}
+
+int init_kthreadd(struct ve_struct *ve)
+{
+	int err;
+
+	err = kthreadd_create();
+	if (err < 0)
+		return err;
+
+	ve->khelper_wq = create_singlethread_workqueue_ve("khelper", ve);
+	if (ve->khelper_wq == NULL) {
+		fini_kthreadd(ve);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int do_env_create(envid_t veid, unsigned int flags, u32 class_id,
+			 env_create_param_t *data, int datalen)
+{
+	struct task_struct *tsk;
+	struct cred *new_creds;
+	struct ve_struct *old;
+	struct ve_struct *old_exec;
+	struct ve_struct *ve;
+	__u64 init_mask;
+	int err;
+	struct nsproxy *old_ns, *old_ns_net;
+
+	tsk = current;
+	old = VE_TASK_INFO(tsk)->owner_env;
+
+	if (!thread_group_leader(tsk) || !thread_group_empty(tsk))
+		return -EINVAL;
+
+	if (tsk->signal->tty) {
+		printk("ERR: CT init has controlling terminal\n");
+		return -EINVAL;
+	}
+	if (task_pgrp(tsk) != task_pid(tsk) ||
+			task_session(tsk) != task_pid(tsk)) {
+		int may_setsid;
+
+		read_lock(&tasklist_lock);
+		may_setsid = !tsk->signal->leader &&
+			!pid_task(find_pid_ns(task_pid_nr(tsk), &init_pid_ns), PIDTYPE_PGID);
+		read_unlock(&tasklist_lock);
+
+		if (!may_setsid) {
+			printk("ERR: CT init is process group leader\n");
+			return -EINVAL;
+		}
+	}
+	/* Check that the process is not a leader of non-empty group/session.
+	 * If it is, we cannot virtualize its PID and must fail. */
+	if (!alone_in_pgrp(tsk)) {
+		printk("ERR: CT init is not alone in process group\n");
+		return -EINVAL;
+	}
+
+	VZTRACE("%s: veid=%d classid=%d pid=%d\n",
+		__FUNCTION__, veid, class_id, current->pid);
+
+	err = -ENOMEM;
+	ve = kzalloc(sizeof(struct ve_struct), GFP_KERNEL);
+	if (ve == NULL)
+		goto err_struct;
+
+	init_ve_struct(ve, veid, class_id, data, datalen);
+	__module_get(THIS_MODULE);
+	down_write(&ve->op_sem);
+	if (flags & VE_LOCK)
+		ve->is_locked = 1;
+
+	/*
+	 * this should be done before adding the ve to the list,
+	 * because if calc_load_ve finds this ve in the list
+	 * it will be very surprised
+	 */
+	if ((err = init_ve_cpustats(ve)) < 0)
+		goto err_cpu_stats;
+
+	if ((err = init_ve_cgroups(ve)))
+		goto err_cgroup;
+
+	if ((err = ve_list_add(ve)) < 0)
+		goto err_exist;
+
+	/* this should be done before context switching */
+	if ((err = init_printk(ve)) < 0)
+		goto err_log_wait;
+
+	old_exec = set_exec_env(ve);
+
+	if ((err = init_ve_sched(ve, data->total_vcpus)) < 0)
+		goto err_sched;
+
+	set_ve_root(ve, tsk);
+
+	if ((err = init_ve_devtmpfs(ve)))
+		goto err_devtmpfs;
+
+	if ((err = init_ve_sysfs(ve)))
+		goto err_sysfs;
+
+	init_mask = data ? data->iptables_mask : VE_IP_DEFAULT;
+
+#ifdef CONFIG_VE_IPTABLES
+	/* Set up ipt_mask as it will be used during
+	 * net namespace initialization
+	 */
+	init_mask = setup_iptables_mask(init_mask);
+	ve->ipt_mask = init_mask;
+#endif
+
+	if ((err = init_ve_namespaces(ve, &old_ns)))
+		goto err_ns;
+
+	if ((err = init_ve_proc(ve)))
+		goto err_proc;
+
+	if ((err = init_ve_netns(ve, &old_ns_net)))
+		goto err_netns;
+
+	if ((err = init_ve_tty_drivers(ve)) < 0)
+		goto err_tty;
+
+	if ((err = init_ve_vtty(ve)))
+		goto err_vtty;
+
+	if ((err = init_ve_shmem(ve)))
+		goto err_shmem;
+
+	if ((err = init_ve_devpts(ve)))
+		goto err_devpts;
+
+	if((err = init_ve_meminfo(ve)))
+		goto err_meminf;
+
+	set_ve_caps(ve, tsk);
+
+	if ((err = pid_ns_attach_init(ve->ve_ns->pid_ns, tsk)) < 0)
+		goto err_vpid;
+
+	err = -ENOMEM;
+	new_creds = prepare_creds();
+	if (new_creds == NULL)
+		goto err_creds;
+
+	if ((err = create_user_ns(new_creds)) < 0)
+		goto err_uns;
+
+	init_ve_cred(ve, new_creds);
+
+	ve_move_task(ve);
+
+	if ((err = init_kthreadd(ve)) < 0)
+		goto err_kthreadd;
+
+	if ((err = ve_hook_iterate_init(VE_SS_CHAIN, ve)) < 0)
+		goto err_ve_hook;
+
+	put_nsproxy(old_ns);
+	put_nsproxy(old_ns_net);
+
+	ve->is_running = 1;
+	up_write(&ve->op_sem);
+
+	printk(KERN_INFO "CT: %d: started\n", veid);
+	return veid;
+
+err_ve_hook:
+	fini_kthreadd(ve);
+err_kthreadd:
+	ve_move_task(old);
+	/* creds will put user and user ns */
+err_uns:
+	put_cred(new_creds);
+err_creds:
+	mntget(ve->proc_mnt);
+err_vpid:
+	fini_venet(ve);
+	fini_ve_meminfo(ve);
+err_meminf:
+	fini_ve_devpts(ve);
+err_devpts:
+	fini_ve_shmem(ve);
+err_shmem:
+	fini_ve_vtty(ve);
+err_vtty:
+	fini_ve_tty_drivers(ve);
+err_tty:
+	fini_ve_namespaces(ve, old_ns_net);
+	put_nsproxy(old_ns_net);
+	fini_ve_netns(ve);
+err_netns:
+	/*
+	 * If process hasn't become VE's init, proc_mnt won't be put during
+	 * pidns death, so this mntput by hand is needed. If it has, we
+	 * compensate with mntget above.
+	 */
+	mntput(ve->proc_mnt);
+	fini_ve_proc(ve);
+err_proc:
+	/* free_ve_utsname() is called inside real_put_ve() */
+	fini_ve_namespaces(ve, old_ns);
+	put_nsproxy(old_ns);
+	/*
+	 * We need to compensate, because fini_ve_namespaces() assumes
+	 * ve->ve_ns will continue to be used after, but VE will be freed soon
+	 * (in kfree() sense).
+	 */
+	put_nsproxy(ve->ve_ns);
+err_ns:
+	fini_ve_sysfs(ve);
+err_sysfs:
+	fini_ve_devtmpfs(ve);
+err_devtmpfs:
+	put_ve_root(ve);
+
+	/* It is safe to restore current->envid here because
+	 * ve_fairsched_detach does not use current->envid. */
+	/* In fact, the fairsched code uses current->envid only in
+	 * sys_fairsched_mknod.  That is correct if sys_fairsched_mknod is
+	 * called from userspace.  If sys_fairsched_mknod is called from
+	 * ve_fairsched_attach, then node->envid and node->parent_node->envid
+	 * are explicitly set to valid values after the call. */
+	/* FIXME */
+	VE_TASK_INFO(tsk)->owner_env = old;
+	VE_TASK_INFO(tsk)->exec_env = old_exec;
+
+	fini_ve_sched(ve, 1);
+err_sched:
+	(void)set_exec_env(old_exec);
+
+	/* we can jump here having incorrect envid */
+	VE_TASK_INFO(tsk)->owner_env = old;
+	fini_printk(ve);
+err_log_wait:
+	/* cpustats will be freed in do_env_free */
+	ve_list_del(ve);
+	up_write(&ve->op_sem);
+
+	real_put_ve(ve);
+err_struct:
+	printk(KERN_INFO "CT: %d: failed to start with err=%d\n", veid, err);
+	return err;
+
+err_exist:
+	fini_ve_cgroups(ve);
+err_cgroup:
+	free_ve_cpustats(ve);
+err_cpu_stats:
+	kfree(ve);
+	module_put(THIS_MODULE);
+	goto err_struct;
+}
+
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE start/stop callbacks
+ *
+ **********************************************************************
+ **********************************************************************/
+
+int real_env_create(envid_t veid, unsigned flags, u32 class_id,
+			env_create_param_t *data, int datalen)
+{
+	int status;
+	struct ve_struct *ve;
+
+	if (!flags) {
+		status = get_exec_env()->veid;
+		goto out;
+	}
+
+	status = -EPERM;
+	if (!capable_setveid())
+		goto out;
+
+	status = -EINVAL;
+	if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE)))
+		goto out;
+
+	status = -EINVAL;
+	ve = get_ve_by_id(veid);
+	if (ve) {
+		if (flags & VE_TEST) {
+			status = 0;
+			goto out_put;
+		}
+		if (flags & VE_EXCLUSIVE) {
+			status = -EACCES;
+			goto out_put;
+		}
+		if (flags & VE_CREATE) {
+			flags &= ~VE_CREATE;
+			flags |= VE_ENTER;
+		}
+	} else {
+		if (flags & (VE_TEST|VE_ENTER)) {
+			status = -ESRCH;
+			goto out;
+		}
+	}
+
+	if (flags & VE_CREATE) {
+		status = do_env_create(veid, flags, class_id, data, datalen);
+		goto out;
+	} else if (flags & VE_ENTER)
+		status = do_env_enter(ve, flags);
+
+	/* else: returning EINVAL */
+
+out_put:
+	real_put_ve(ve);
+out:
+	return status;
+}
+EXPORT_SYMBOL(real_env_create);
+
+static int do_env_enter(struct ve_struct *ve, unsigned int flags)
+{
+	struct task_struct *tsk = current;
+	int err;
+
+	VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid);
+
+	err = -EBUSY;
+	down_read(&ve->op_sem);
+	if (!ve->is_running)
+		goto out_up;
+	if (ve->is_locked && !(flags & VE_SKIPLOCK))
+		goto out_up;
+	err = -EINVAL;
+	if (!thread_group_leader(tsk) || !thread_group_empty(tsk))
+		goto out_up;
+
+#ifdef CONFIG_VZ_FAIRSCHED
+	err = fairsched_move_task(ve->veid, current);
+	if (err)
+		goto out_up;
+#endif
+	switch_ve_namespaces(ve, tsk);
+	set_exec_env(ve);
+	ve_move_task(ve);
+
+	if (alone_in_pgrp(tsk) && !(flags & VE_SKIPLOCK))
+		pid_ns_attach_task(ve->ve_ns->pid_ns, tsk);
+
+	/* Unlike VE_CREATE, we do not setsid() in VE_ENTER.
+	 * The process is allowed to be in an external group/session.
+	 * If the userspace caller wants, it can do setsid() after
+	 * VE_ENTER.
+	 */
+	err = VE_TASK_INFO(tsk)->owner_env->veid;
+	tsk->did_ve_enter = 1;
+
+out_up:
+	up_read(&ve->op_sem);
+	return err;
+}
+
+extern void fini_ve_devices(struct ve_struct *ve);
+
+static void env_cleanup(struct ve_struct *ve)
+{
+	struct ve_struct *old_ve;
+
+	VZTRACE("real_do_env_cleanup\n");
+
+	down_read(&ve->op_sem);
+	old_ve = set_exec_env(ve);
+
+	fini_venet(ve);
+
+	/* no new packets in flight beyond this point */
+
+	fini_ve_sched(ve, 0);
+
+	fini_ve_devpts(ve);
+	fini_ve_shmem(ve);
+	fini_ve_vtty(ve);
+	unregister_ve_tty_drivers(ve);
+	fini_ve_meminfo(ve);
+
+	fini_ve_devices(ve);
+
+	fini_ve_namespaces(ve, NULL);
+	fini_ve_netns(ve);
+	fini_ve_proc(ve);
+	fini_ve_sysfs(ve);
+	fini_ve_devtmpfs(ve);
+
+	ve_hook_iterate_fini(VE_CLEANUP_CHAIN, ve);
+
+	put_ve_root(ve);
+
+	(void)set_exec_env(old_ve);
+	fini_printk(ve);	/* no printk can happen in ve context anymore */
+
+	ve_list_del(ve);
+	up_read(&ve->op_sem);
+
+	real_put_ve(ve);
+}
+
+static DECLARE_COMPLETION(vzmond_complete);
+static int vzmond_helper(void *arg)
+{
+	char name[18];
+	struct ve_struct *ve;
+
+	ve = (struct ve_struct *)arg;
+	snprintf(name, sizeof(name), "vzmond/%d", ve->veid);
+	daemonize(name);
+	env_cleanup(ve);
+	module_put_and_exit(0);
+}
+
+static void do_pending_env_cleanups(void)
+{
+	int err;
+	struct ve_struct *ve;
+
+	spin_lock(&ve_cleanup_lock);
+	while (1) {
+		if (list_empty(&ve_cleanup_list) || need_resched())
+			break;
+
+		ve = list_first_entry(&ve_cleanup_list,
+				struct ve_struct, cleanup_list);
+		list_del(&ve->cleanup_list);
+		spin_unlock(&ve_cleanup_lock);
+
+		__module_get(THIS_MODULE);
+		err = kernel_thread(vzmond_helper, (void *)ve, 0);
+		if (err < 0) {
+			env_cleanup(ve);
+			module_put(THIS_MODULE);
+		}
+
+		spin_lock(&ve_cleanup_lock);
+	}
+	spin_unlock(&ve_cleanup_lock);
+}
+
+static inline int have_pending_cleanups(void)
+{
+	return !list_empty(&ve_cleanup_list);
+}
+
+static int vzmond(void *arg)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop() || have_pending_cleanups()) {
+		schedule();
+		try_to_freeze();
+		if (signal_pending(current))
+			flush_signals(current);
+
+		do_pending_env_cleanups();
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (have_pending_cleanups())
+			__set_current_state(TASK_RUNNING);
+	}
+
+	__set_task_state(current, TASK_RUNNING);
+	complete_and_exit(&vzmond_complete, 0);
+}
+
+static int __init init_vzmond(void)
+{
+	ve_cleanup_thread = kthread_run(vzmond, NULL, "vzmond");
+	if (IS_ERR(ve_cleanup_thread))
+		return PTR_ERR(ve_cleanup_thread);
+	else
+		return 0;
+}
+
+static void fini_vzmond(void)
+{
+	kthread_stop(ve_cleanup_thread);
+	WARN_ON(!list_empty(&ve_cleanup_list));
+}
+
+static void ve_devmnt_free(struct ve_devmnt *devmnt)
+{
+	if (!devmnt)
+		return;
+
+	kfree(devmnt->allowed_options);
+	kfree(devmnt->hidden_options);
+	kfree(devmnt);
+}
+
+static void free_ve_devmnts(struct ve_struct *ve)
+{
+	while (!list_empty(&ve->devmnt_list)) {
+		struct ve_devmnt *devmnt;
+
+		devmnt = list_first_entry(&ve->devmnt_list, struct ve_devmnt, link);
+		list_del(&devmnt->link);
+		ve_devmnt_free(devmnt);
+	}
+}
+
+static void real_do_env_free(struct ve_struct *ve)
+{
+	VZTRACE("real_do_env_free\n");
+
+	idr_destroy(&ve->_posix_timers_id);
+	fini_ve_cgroups(ve);
+	free_ve_tty_drivers(ve);
+	free_ve_filesystems(ve);
+	free_ve_cpustats(ve);
+	free_ve_devmnts(ve);
+	printk(KERN_INFO "CT: %d: stopped\n", VEID(ve));
+	kfree(ve);
+
+	module_put(THIS_MODULE);
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE TTY handling
+ *
+ **********************************************************************
+ **********************************************************************/
+
+static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base,
+					   struct ve_struct *ve)
+{
+	size_t size;
+	struct tty_driver *driver;
+
+	/* FIXME: make it a normal way (or wait till ms version) */
+
+	driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL_UBC);
+	if (!driver)
+		goto out;
+
+	memcpy(driver, base, sizeof(struct tty_driver));
+
+	driver->driver_state = NULL;
+
+	size = base->num * 3 * sizeof(void *);
+	if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) {
+		void **p;
+		p = kzalloc(size, GFP_KERNEL_UBC);
+		if (!p)
+			goto out_free;
+
+		driver->ttys = (struct tty_struct **)p;
+		driver->termios = (struct ktermios **)(p + driver->num);
+		driver->termios_locked = (struct ktermios **)
+			(p + driver->num * 2);
+	} else {
+		driver->ttys = NULL;
+		driver->termios = NULL;
+		driver->termios_locked = NULL;
+	}
+
+	driver->owner_env = ve;
+	driver->flags |= TTY_DRIVER_INSTALLED;
+	kref_init(&driver->kref);
+
+	return driver;
+
+out_free:
+	kfree(driver);
+out:
+	return NULL;
+}
+
+static void free_ve_tty_driver(struct tty_driver *driver)
+{
+	if (!driver)
+		return;
+
+	clear_termios(driver);
+	kfree(driver->ttys);
+	kfree(driver);
+}
+
+static int alloc_ve_tty_drivers(struct ve_struct* ve)
+{
+#ifdef CONFIG_LEGACY_PTYS
+	/* Traditional BSD devices */
+	ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve);
+	if (!ve->pty_driver)
+		goto out_mem;
+
+	ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve);
+	if (!ve->pty_slave_driver)
+		goto out_mem;
+
+	ve->pty_driver->other       = ve->pty_slave_driver;
+	ve->pty_slave_driver->other = ve->pty_driver;
+#endif	
+	return 0;
+
+out_mem:
+	free_ve_tty_drivers(ve);
+	return -ENOMEM;
+}
+
+static void free_ve_tty_drivers(struct ve_struct* ve)
+{
+#ifdef CONFIG_LEGACY_PTYS
+	free_ve_tty_driver(ve->pty_driver);
+	free_ve_tty_driver(ve->pty_slave_driver);
+	ve->pty_driver = ve->pty_slave_driver = NULL;
+#endif	
+}
+
+static inline void __register_tty_driver(struct tty_driver *driver)
+{
+	list_add(&driver->tty_drivers, &tty_drivers);
+}
+
+static inline void __unregister_tty_driver(struct tty_driver *driver)
+{
+	if (!driver)
+		return;
+	list_del(&driver->tty_drivers);
+}
+
+static int register_ve_tty_drivers(struct ve_struct* ve)
+{
+	mutex_lock(&tty_mutex);
+#ifdef CONFIG_LEGACY_PTYS
+	__register_tty_driver(ve->pty_driver);
+	__register_tty_driver(ve->pty_slave_driver);
+#endif	
+	mutex_unlock(&tty_mutex);
+
+	return 0;
+}
+
+static void unregister_ve_tty_drivers(struct ve_struct* ve)
+{
+	VZTRACE("unregister_ve_tty_drivers\n");
+
+	mutex_lock(&tty_mutex);
+#ifdef CONFIG_LEGACY_PTYS
+	__unregister_tty_driver(ve->pty_driver);
+	__unregister_tty_driver(ve->pty_slave_driver);
+#endif
+	mutex_unlock(&tty_mutex);
+}
+
+static int init_ve_tty_drivers(struct ve_struct *ve)
+{
+	int err;
+
+	if ((err = alloc_ve_tty_drivers(ve)))
+		goto err_ttyalloc;
+	if ((err = register_ve_tty_drivers(ve)))
+		goto err_ttyreg;
+	return 0;
+
+err_ttyreg:
+	free_ve_tty_drivers(ve);
+err_ttyalloc:
+	return err;
+}
+
+static void fini_ve_tty_drivers(struct ve_struct *ve)
+{
+	unregister_ve_tty_drivers(ve);
+	free_ve_tty_drivers(ve);
+}
+
+static void fini_ve_vtty(struct ve_struct *ve)
+{
+	int minor;
+
+	for (minor = 0; minor <= MAX_NR_VTTY; minor++)
+		device_destroy(ve->tty_class, MKDEV(TTY_MAJOR, minor));
+}
+
+static int init_ve_vtty(struct ve_struct *ve)
+{
+	int err, minor;
+	struct device *dev;
+
+	for (minor = 0; minor <= MAX_NR_VTTY; minor++) {
+		err = set_device_perms_ve(ve, S_IFCHR | VE_USE_MAJOR | VE_USE_MINOR,
+				MKDEV(TTY_MAJOR, minor), 06);
+		if (err)
+			goto out;
+		dev = device_create(ve->tty_class, NULL,
+				MKDEV(TTY_MAJOR, minor), NULL, "tty%d", minor);
+		err = PTR_ERR(dev);
+		if (IS_ERR(dev))
+			goto out;
+	}
+
+	return 0;
+
+out:
+	fini_ve_vtty(ve);
+	return err;
+}
+
+/*
+ * Free the termios and termios_locked structures because
+ * we don't want to get memory leaks when modular tty
+ * drivers are removed from the kernel.
+ */
+static void clear_termios(struct tty_driver *driver)
+{
+	int i;
+	struct ktermios *tp;
+
+	if (driver->termios == NULL)
+		return;
+	for (i = 0; i < driver->num; i++) {
+		tp = driver->termios[i];
+		if (tp) {
+			driver->termios[i] = NULL;
+			kfree(tp);
+		}
+		tp = driver->termios_locked[i];
+		if (tp) {
+			driver->termios_locked[i] = NULL;
+			kfree(tp);
+		}
+	}
+}
+
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * Pieces of VE network
+ *
+ **********************************************************************
+ **********************************************************************/
+
+#ifdef CONFIG_NET
+#include <asm/uaccess.h>
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#endif
+
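+/*
+ * Move a network device from the host namespace (init_net) into the
+ * netns of container 'veid'; ve_dev_del() below does the reverse.
+ */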
+static int ve_dev_add(envid_t veid, char *dev_name)
+{
+	struct net_device *dev;
+	struct ve_struct *dst_ve;
+	struct net *dst_net;
+	int err = -ESRCH;
+
+	dst_ve = get_ve_by_id(veid);
+	if (dst_ve == NULL)
+		goto out;
+
+	dst_net = dst_ve->ve_netns;
+
+	rtnl_lock();
+	read_lock(&dev_base_lock);
+	dev = __dev_get_by_name(&init_net, dev_name);
+	read_unlock(&dev_base_lock);
+	if (dev == NULL)
+		goto out_unlock;
+
+	err = __dev_change_net_namespace(dev, dst_net, dev_name, get_exec_ub());
+out_unlock:
+	rtnl_unlock();
+	real_put_ve(dst_ve);
+
+	if (dev == NULL)
+		printk(KERN_WARNING "%s: device %s not found\n",
+			__func__, dev_name);
+out:
+	return err;
+}
+
+static int ve_dev_del(envid_t veid, char *dev_name)
+{
+	struct net_device *dev;
+	struct ve_struct *src_ve;
+	struct net *src_net;
+	int err = -ESRCH;
+
+	src_ve = get_ve_by_id(veid);
+	if (src_ve == NULL)
+		goto out;
+
+	src_net = src_ve->ve_netns;
+
+	rtnl_lock();
+
+	read_lock(&dev_base_lock);
+	dev = __dev_get_by_name(src_net, dev_name);
+	read_unlock(&dev_base_lock);
+	if (dev == NULL)
+		goto out_unlock;
+
+	err = __dev_change_net_namespace(dev, &init_net, dev_name,
+					 netdev_bc(dev)->owner_ub);
+out_unlock:
+	rtnl_unlock();
+	real_put_ve(src_ve);
+
+	if (dev == NULL)
+		printk(KERN_WARNING "%s: device %s not found\n",
+			__func__, dev_name);
+out:
+	return err;
+}
+
+int real_ve_dev_map(envid_t veid, int op, char *dev_name)
+{
+	if (!capable_setveid())
+		return -EPERM;
+	switch (op) {
+	case VE_NETDEV_ADD:
+		return ve_dev_add(veid, dev_name);
+	case VE_NETDEV_DEL:
+		return ve_dev_del(veid, dev_name);
+	default:
+		return -EINVAL;
+	}
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE information via /proc
+ *
+ **********************************************************************
+ **********************************************************************/
+#ifdef CONFIG_PROC_FS
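+/*
+ * One line of /proc/vz/vestat: the field widths differ between 32-bit
+ * and 64-bit kernels, and VESTAT_LINE_WIDTH must match the format
+ * strings since it sizes the header line.
+ */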
+#if BITS_PER_LONG == 32
+#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21)
+#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n"
+#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n"
+#else
+#define VESTAT_LINE_WIDTH (12 * 21)
+#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n"
+#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n"
+#endif
+
+static int vestat_seq_show(struct seq_file *m, void *v)
+{
+	struct list_head *entry;
+	struct ve_struct *ve;
+	struct ve_struct *curve;
+	int ret;
+	unsigned long user_ve, nice_ve, system_ve;
+	unsigned long long uptime;
+	u64 uptime_cycles, idle_time, strv_time, used;
+	struct kernel_cpustat kstat;
+
+	entry = (struct list_head *)v;
+	ve = list_entry(entry, struct ve_struct, ve_list);
+
+	curve = get_exec_env();
+	if (entry == ve_list_head.next ||
+	    (!ve_is_super(curve) && ve == curve)) {
+		/* print header */
+		seq_printf(m, "%-*s\n",
+			VESTAT_LINE_WIDTH - 1,
+			"Version: 2.2");
+		seq_printf(m, VESTAT_HEAD_FMT, "VEID",
+					"user", "nice", "system",
+					"uptime", "idle",
+					"strv", "uptime", "used",
+					"maxlat", "totlat", "numsched");
+	}
+
+	if (ve == get_ve0())
+		return 0;
+
+	ret = fairsched_get_cpu_stat(ve->veid, &kstat);
+	if (ret)
+		return ret;
+
+	strv_time = 0;
+	user_ve = kstat.cpustat[USER];
+	nice_ve = kstat.cpustat[NICE];
+	system_ve = kstat.cpustat[SYSTEM];
+	used = kstat.cpustat[USED];
+	idle_time = kstat.cpustat[IDLE];
+
+	uptime_cycles = ve_get_uptime(ve);
+	uptime = get_jiffies_64() - ve->start_jiffies;
+
+	seq_printf(m, VESTAT_LINE_FMT, ve->veid,
+				user_ve, nice_ve, system_ve,
+				(unsigned long long)uptime,
+				(unsigned long long)idle_time,
+				(unsigned long long)strv_time,
+				(unsigned long long)uptime_cycles,
+				(unsigned long long)used,
+				(unsigned long long)ve->sched_lat_ve.last.maxlat,
+				(unsigned long long)ve->sched_lat_ve.last.totlat,
+				ve->sched_lat_ve.last.count);
+	return 0;
+}
+
+void *ve_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct ve_struct *curve;
+
+	curve = get_exec_env();
+	mutex_lock(&ve_list_lock);
+	if (!ve_is_super(curve)) {
+		if (*pos != 0)
+			return NULL;
+		return curve;
+	}
+
+	return seq_list_start(&ve_list_head, *pos);
+}
+EXPORT_SYMBOL(ve_seq_start);
+
+void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	if (!ve_is_super(get_exec_env()))
+		return NULL;
+	else
+		return seq_list_next(v, &ve_list_head, pos);
+}
+EXPORT_SYMBOL(ve_seq_next);
+
+void ve_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&ve_list_lock);
+}
+EXPORT_SYMBOL(ve_seq_stop);
+
+static struct seq_operations vestat_seq_op = {
+	.start	= ve_seq_start,
+	.next	= ve_seq_next,
+	.stop	= ve_seq_stop,
+	.show	= vestat_seq_show
+};
+
+static int vestat_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &vestat_seq_op);
+}
+
+static struct file_operations proc_vestat_operations = {
+	.open	 = vestat_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release
+};
+
+static struct seq_operations devperms_seq_op = {
+	.start  = ve_seq_start,
+	.next   = ve_seq_next,
+	.stop   = ve_seq_stop,
+	.show   = devperms_seq_show,
+};
+
+static int devperms_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &devperms_seq_op);
+}
+
+static struct file_operations proc_devperms_ops = {
+	.open           = devperms_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
+static int vz_version_show(struct seq_file *file, void* v)
+{
+	static const char ver[] = VZVERSION "\n";
+
+	return seq_puts(file, ver);
+}
+
+static int vz_version_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, vz_version_show, NULL);
+}
+
+static struct file_operations proc_vz_version_operations = {
+	.open    = vz_version_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+/* /proc/vz/veinfo */
+
+static ve_seq_print_t veaddr_seq_print_cb;
+
+void vzmon_register_veaddr_print_cb(ve_seq_print_t cb)
+{
+	rcu_assign_pointer(veaddr_seq_print_cb, cb);
+}
+EXPORT_SYMBOL(vzmon_register_veaddr_print_cb);
+
+void vzmon_unregister_veaddr_print_cb(ve_seq_print_t cb)
+{
+	rcu_assign_pointer(veaddr_seq_print_cb, NULL);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(vzmon_unregister_veaddr_print_cb);
+
+static int veinfo_seq_show(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve;
+	ve_seq_print_t veaddr_seq_print;
+
+	ve = list_entry((struct list_head *)v, struct ve_struct, ve_list);
+
+	seq_printf(m, "%10u %5u %5u", ve->veid, ve->class_id, ve->pcounter);
+
+	rcu_read_lock();
+	veaddr_seq_print = rcu_dereference(veaddr_seq_print_cb);
+	if (veaddr_seq_print)
+		veaddr_seq_print(m, ve);
+	rcu_read_unlock();
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static struct seq_operations veinfo_seq_op = {
+	.start	= ve_seq_start,
+	.next	= ve_seq_next,
+	.stop	= ve_seq_stop,
+	.show	= veinfo_seq_show,
+};
+
+static int veinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &veinfo_seq_op);
+}
+
+static struct file_operations proc_veinfo_operations = {
+	.open		= veinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init init_vecalls_proc(void)
+{
+	struct proc_dir_entry *de;
+
+	de = proc_create("vestat", S_IFREG | S_IRUSR, glob_proc_vz_dir,
+			&proc_vestat_operations);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make vestat proc entry\n");
+
+	de = proc_create("devperms", S_IFREG | S_IRUSR, proc_vz_dir,
+			&proc_devperms_ops);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make devperms proc entry\n");
+
+	de = proc_create("version", S_IFREG | S_IRUGO, proc_vz_dir,
+			&proc_vz_version_operations);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make version proc entry\n");
+
+	de = proc_create("veinfo", S_IFREG | S_IRUSR, glob_proc_vz_dir,
+			&proc_veinfo_operations);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make veinfo proc entry\n");
+
+	return 0;
+}
+
+static void fini_vecalls_proc(void)
+{
+	remove_proc_entry("version", proc_vz_dir);
+	remove_proc_entry("devperms", proc_vz_dir);
+	remove_proc_entry("vestat", glob_proc_vz_dir);
+	remove_proc_entry("veinfo", glob_proc_vz_dir);
+}
+#else
+#define init_vecalls_proc()	(0)
+#define fini_vecalls_proc()	do { } while (0)
+#endif /* CONFIG_PROC_FS */
+
+static int init_ve_osrelease(struct ve_struct *ve, char *release)
+{
+	if (!release)
+		return -ENODATA;
+
+	if (strlen(release) >= sizeof(ve->ve_ns->uts_ns->name.release))
+		return -EMSGSIZE;
+
+	down_write(&uts_sem);
+	strcpy(ve->ve_ns->uts_ns->name.release, release);
+	up_write(&uts_sem);
+
+	return 0;
+}
+
+static struct proc_dir_entry *ve_proc_mkdir(struct ve_struct *ve, char *name,
+						struct proc_dir_entry *parent,
+						struct list_head *list)
+{
+	struct proc_dir_entry *de;
+	struct ve_proc_dir_entry *ve_de;
+
+	ve_de = kmalloc(sizeof(*ve_de), GFP_KERNEL);
+	if (!ve_de)
+		return ERR_PTR(-ENOMEM);
+
+	de = proc_mkdir(name, parent);
+	if (!de) {
+		kfree(ve_de);
+		return ERR_PTR(-EINVAL);
+	}
+
+	ve_de->de = de;
+	ve_de->ve = ve;
+	list_add(&ve_de->list, list);
+
+	return de;
+}
+
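+/*
+ * Walk 'path' component by component under ve->proc_root, creating any
+ * missing directories; e.g. a hypothetical path "fs/nfsd" looks up or
+ * creates "fs" first, then "nfsd" inside it.  Entries created here are
+ * collected on 'list' so they can be undone on failure.
+ */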
+static struct proc_dir_entry *ve_proc_mkdir_recursive(struct ve_struct *ve,
+				char *path, struct list_head *list)
+{
+	struct proc_dir_entry *parent, *de;
+	char *name, *end;
+
+	INIT_LIST_HEAD(list);
+
+	name = path;
+	parent = ve->proc_root;
+	while (1) {
+		end = strchr(name, '/');
+		if (end)
+			*end = '\0';
+		de = __proc_lookup(parent, name, strlen(name));
+		if (de == NULL) {
+			parent = ve_proc_mkdir(ve, name, parent, list);
+			if (IS_ERR(parent))
+				goto out_err;
+		} else
+			parent = de;
+		if (end)
+			*end = '/';
+		else
+			break;
+		name = end + 1;
+	}
+	return parent;
+out_err:
+	cleanup_ve_proc_entries(ve, list);
+	return parent;
+}
+
+/*
+ * data is a buffer with two strings, the first is name of a new entry and
+ * the second is path to the target entry.
+ */
+static int ve_configure_make_proc_link(struct ve_struct *ve, mode_t mode,
+					unsigned int size, char *data)
+{
+	struct proc_dir_entry *de, *link_de, *parent;
+	struct ve_proc_dir_entry *ve_de;
+	char *link, *name, *end;
+	LIST_HEAD(list);
+	int ret = 0;
+
+	if (data[size - 1] != '\0')
+		return -EINVAL;
+
+	name = data;
+	link = strchr(data, '\0');
+	if (link == data + size - 1)
+		return -EINVAL;
+	link++;
+
+	down(&ve_proc_entries_lock);
+	parent = ve->proc_root;
+	end = strrchr(name, '/');
+	if (end) {
+		*end = '\0';
+		parent = ve_proc_mkdir_recursive(ve, name, &list);
+		*end = '/';
+		if (IS_ERR(parent)) {
+			ret = PTR_ERR(parent);
+			goto out_unlock;
+		}
+		name = end + 1;
+	}
+
+	de = __proc_lookup(parent, name, strlen(name));
+	if (de) {
+		ret = -EEXIST;
+		goto out_unlock;
+	}
+
+	ve_de = kmalloc(sizeof(*ve_de), GFP_KERNEL);
+	if (!ve_de) {
+		ret = -ENOMEM;
+		goto out_dir;
+	}
+
+	link_de = proc_lookup_entry(link, get_ve0()->proc_root);
+	if (!link_de) {
+		ret = -ENOENT;
+		goto out_free;
+	}
+
+	de = create_proc_hardlink(name, mode, parent, link_de);
+	if (!de) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+
+	ve_de->de = de;
+	ve_de->ve = ve;
+	list_splice_init(&list, &ve_proc_entries);
+	list_add(&ve_de->list, &ve_proc_entries);
+out_free:
+	if (ret)
+		kfree(ve_de);
+out_dir:
+	if (ret)
+		cleanup_ve_proc_entries(ve, &list);
+out_unlock:
+	up(&ve_proc_entries_lock);
+	return ret;
+}
+
+/*
+ * 'data' for VE_CONFIGURE_MOUNT_OPTIONS is a zero-terminated string
+ * consisting of substrings separated by MNTOPT_DELIM.
+ */
+#define MNTOPT_DELIM ';'
+
+/*
+ * Each substring has the form of "<type> <comma-separated-list-of-options>"
+ * where types are:
+ */
+enum {
+	MNTOPT_HIDDEN = 1,
+	MNTOPT_ALLOWED = 2,
+};
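+
+/*
+ * For illustration, a hypothetical 'data' buffer
+ *
+ *	"1 nosuid,nodev;2 barrier=1"
+ *
+ * marks "nosuid,nodev" as hidden options and "barrier=1" as allowed
+ * options for the device.
+ */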
+
+/*
+ * 'ptr' points to the first character of buffer to parse
+ * 'endp' points to the last character of buffer to parse
+ */
+static int ve_parse_mount_options(char *ptr, char *endp,
+				  struct ve_devmnt *devmnt)
+{
+	while (*ptr) {
+		char *delim = strchr(ptr, MNTOPT_DELIM) ? : endp;
+		char *space = strchr(ptr, ' ');
+		int type;
+		char *options, *p;
+		int options_size = delim - space;
+		char **opts_pp = NULL; /* where to store 'options' */
+
+		if (delim == ptr || !space || options_size <= 1)
+			return -EINVAL;
+
+		type = simple_strtoul(ptr, &p, 10);
+		if (p != space)
+			return -EINVAL;
+
+		options = kmalloc(options_size, GFP_KERNEL);
+		if (!options)
+			return -ENOMEM;
+
+		strncpy(options, space + 1, options_size - 1);
+		options[options_size - 1] = 0;
+
+		switch (type) {
+		case MNTOPT_ALLOWED:
+			opts_pp = &devmnt->allowed_options;
+			break;
+		case MNTOPT_HIDDEN:
+			opts_pp = &devmnt->hidden_options;
+			break;
+		}
+
+		/* wrong type or already set */
+		if (!opts_pp || *opts_pp) {
+			kfree(options);
+			return -EINVAL;
+		}
+
+		*opts_pp = options;
+
+		if (!*delim)
+			break;
+
+		ptr = delim + 1;
+	}
+
+	return 0;
+}
+
+static int ve_configure_mount_options(struct ve_struct *ve, unsigned int val,
+				      unsigned int size, char *data)
+{
+	struct ve_devmnt *devmnt;
+	int err;
+
+	if (size <= 1)
+		return -EINVAL; /* TODO: remove devmnt from list by dev */
+
+	data[size - 1] = 0;
+
+	devmnt = kzalloc(sizeof(*devmnt), GFP_KERNEL);
+	if (!devmnt)
+		return -ENOMEM;
+
+	devmnt->dev = new_decode_dev(val);
+
+	err = ve_parse_mount_options(data, data + size - 1, devmnt);
+	if (err) {
+		ve_devmnt_free(devmnt);
+		return err;
+	}
+
+	mutex_lock(&ve->devmnt_mutex);
+	list_add(&devmnt->link, &ve->devmnt_list);
+	mutex_unlock(&ve->devmnt_mutex);
+
+	return 0;
+}
+
+static int ve_configure(envid_t veid, unsigned int key,
+			unsigned int val, unsigned int size, char *data)
+{
+	struct ve_struct *ve;
+	int err = -ENOKEY;
+
+	switch(key) {
+	case VE_CONFIGURE_OPEN_TTY:
+		return vtty_open_master(veid, val);
+	}
+
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -EINVAL;
+
+	switch(key) {
+	case VE_CONFIGURE_OS_RELEASE:
+		err = init_ve_osrelease(ve, data);
+		break;
+	case VE_CONFIGURE_CREATE_PROC_LINK:
+		err = ve_configure_make_proc_link(ve, val, size, data);
+		break;
+	case VE_CONFIGURE_MOUNT_OPTIONS:
+		err = ve_configure_mount_options(ve, val, size, data);
+		break;
+	}
+
+	real_put_ve(ve);
+	return err;
+}
+
+static int ve_configure_ioctl(struct vzctl_ve_configure *arg)
+{
+	int err;
+	struct vzctl_ve_configure s;
+	char *data = NULL;
+
+	err = -EFAULT;
+	if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+		goto out;
+	if (s.size) {
+		if (s.size > PAGE_SIZE)
+			return -EMSGSIZE;
+
+		data = kzalloc(s.size + 1, GFP_KERNEL);
+		if (unlikely(!data))
+			return -ENOMEM;
+
+		if (copy_from_user(data, (void __user *) &arg->data, s.size))
+			goto out;
+	}
+	err = ve_configure(s.veid, s.key, s.val, s.size, data);
+out:
+	kfree(data);
+	return err;
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * User ctl
+ *
+ **********************************************************************
+ **********************************************************************/
+
+int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	err = -ENOTTY;
+	switch(cmd) {
+	    case VZCTL_MARK_ENV_TO_DOWN: {
+		        /* Compatibility issue */
+		        err = 0;
+		}
+		break;
+	    case VZCTL_SETDEVPERMS: {
+			/* Device type was mistakenly declared as dev_t
+			 * in the old user-kernel interface.
+			 * That's wrong, dev_t is a kernel internal type.
+			 * I use `unsigned' not having anything better in mind.
+			 * 2001/08/11  SAW  */
+			struct vzctl_setdevperms s;
+			err = -EFAULT;
+			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+				break;
+			err = real_setdevperms(s.veid, s.type,
+					new_decode_dev(s.dev), s.mask);
+		}
+		break;
+#ifdef CONFIG_INET
+	    case VZCTL_VE_NETDEV: {
+			struct vzctl_ve_netdev d;
+			char *s;
+			err = -EFAULT;
+			if (copy_from_user(&d, (void __user *)arg, sizeof(d)))
+				break;
+			err = -ENOMEM;
+			s = kmalloc(IFNAMSIZ+1, GFP_KERNEL);
+			if (s == NULL)
+				break;
+			err = -EFAULT;
+			if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) {
+				s[IFNAMSIZ] = 0;
+				err = real_ve_dev_map(d.veid, d.op, s);
+			}
+			kfree(s);
+		}
+		break;
+#endif
+	    case VZCTL_ENV_CREATE: {
+			struct vzctl_env_create s;
+			err = -EFAULT;
+			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+				break;
+			err = real_env_create(s.veid, s.flags, s.class_id,
+				NULL, 0);
+		}
+		break;
+	    case VZCTL_ENV_CREATE_DATA: {
+			struct vzctl_env_create_data s;
+			env_create_param_t *data;
+			err = -EFAULT;
+			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+				break;
+			err = -EINVAL;
+			if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN ||
+			    s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN ||
+			    s.data == 0)
+				break;
+			err = -ENOMEM;
+			data = kzalloc(sizeof(*data), GFP_KERNEL);
+			if (!data)
+				break;
+
+			err = -EFAULT;
+			if (copy_from_user(data, (void __user *)s.data,
+						s.datalen))
+				goto free_data;
+			err = real_env_create(s.veid, s.flags, s.class_id,
+				data, s.datalen);
+free_data:
+			kfree(data);
+		}
+		break;
+	    case VZCTL_GET_CPU_STAT: {
+			struct vzctl_cpustatctl s;
+			err = -EFAULT;
+			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+				break;
+			err = ve_get_cpu_stat(s.veid, s.cpustat);
+		}
+		break;
+	    case VZCTL_VE_MEMINFO: {
+			struct vzctl_ve_meminfo s;
+			err = -EFAULT;
+			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+				break;
+			err = ve_set_meminfo(s.veid, s.val);
+		}
+		break;
+	    case VZCTL_VE_CONFIGURE:
+		err = ve_configure_ioctl((struct vzctl_ve_configure *)arg);
+		break;
+	}
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_vzcalls_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	int err;
+
+	switch(cmd) {
+	case VZCTL_GET_CPU_STAT: {
+		/* FIXME: no compat conversion here yet; falls through */
+	}
+	case VZCTL_COMPAT_ENV_CREATE_DATA: {
+		struct compat_vzctl_env_create_data cs;
+		struct vzctl_env_create_data __user *s;
+
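+		/*
+		 * Build a native struct in compat user space from the
+		 * 32-bit layout, then re-dispatch to the 64-bit handler.
+		 */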
+		s = compat_alloc_user_space(sizeof(*s));
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+
+		if (put_user(cs.veid, &s->veid) ||
+		    put_user(cs.flags, &s->flags) ||
+		    put_user(cs.class_id, &s->class_id) ||
+		    put_user(compat_ptr(cs.data), &s->data) ||
+		    put_user(cs.datalen, &s->datalen))
+			break;
+		err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA,
+						(unsigned long)s);
+		break;
+	}
+#ifdef CONFIG_NET
+	case VZCTL_COMPAT_VE_NETDEV: {
+		struct compat_vzctl_ve_netdev cs;
+		struct vzctl_ve_netdev __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+
+		if (put_user(cs.veid, &s->veid) ||
+		    put_user(cs.op, &s->op) ||
+		    put_user(compat_ptr(cs.dev_name), &s->dev_name))
+			break;
+		err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s);
+		break;
+	}
+#endif
+	case VZCTL_COMPAT_VE_MEMINFO: {
+		struct compat_vzctl_ve_meminfo cs;
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+		err = ve_set_meminfo(cs.veid, cs.val);
+		break;
+	}
+	default:
+		err = vzcalls_ioctl(file, cmd, arg);
+		break;
+	}
+	return err;
+}
+#endif
+
+static struct vzioctlinfo vzcalls = {
+	.type		= VZCTLTYPE,
+	.ioctl		= vzcalls_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_vzcalls_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * Init/exit stuff
+ *
+ **********************************************************************
+ **********************************************************************/
+
+static inline __init int init_vecalls_ioctls(void)
+{
+	vzioctl_register(&vzcalls);
+	return 0;
+}
+
+static inline void fini_vecalls_ioctls(void)
+{
+	vzioctl_unregister(&vzcalls);
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *table_header;
+
+static ctl_table kernel_table[] = {
+	{
+		.procname	= "ve_allow_kthreads",
+		.data		= &ve_allow_kthreads,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ 0 }
+};
+
+static ctl_table root_table[] =  {
+	{CTL_KERN, "kernel",  NULL, 0, 0555, kernel_table},
+	{ 0 }
+};
+
+static int init_vecalls_sysctl(void)
+{
+	table_header = register_sysctl_table(root_table);
+	if (!table_header)
+		return -ENOMEM;
+	return 0;
+}
+
+static void fini_vecalls_sysctl(void)
+{
+	unregister_sysctl_table(table_header);
+}
+#else
+static int init_vecalls_sysctl(void) { return 0; }
+static void fini_vecalls_sysctl(void) { }
+#endif
+
+static int __init vecalls_init(void)
+{
+	int err;
+
+	err = init_vecalls_cgroups();
+	if (err)
+		goto out_cgroups;
+
+	err = init_vecalls_sysctl();
+	if (err)
+		goto out_vzmond;
+
+	err = init_vzmond();
+	if (err < 0)
+		goto out_sysctl;
+
+	err = init_vecalls_proc();
+	if (err < 0)
+		goto out_proc;
+
+	err = init_vecalls_ioctls();
+	if (err < 0)
+		goto out_ioctls;
+
+	/* This hook can safely be dereferenced while a VE is running,
+	 * because in that case the vzmon refcount is > 0
+	 */
+	do_ve_enter_hook = do_env_enter;
+	/*
+	 * This one can also be safely dereferenced, since any VE that
+	 * has not been freed yet holds a reference on this module
+	 */
+	do_env_free_hook = real_do_env_free;
+
+	return 0;
+
+out_ioctls:
+	fini_vecalls_proc();
+out_proc:
+	fini_vzmond();
+out_sysctl:
+	fini_vecalls_sysctl();
+out_vzmond:
+	fini_vecalls_cgroups();
+out_cgroups:
+	return err;
+}
+
+static void __exit vecalls_exit(void)
+{
+	do_env_free_hook = NULL;
+	do_ve_enter_hook = NULL;
+	fini_vecalls_ioctls();
+	fini_vecalls_proc();
+	fini_vzmond();
+	fini_vecalls_sysctl();
+	fini_vecalls_cgroups();
+}
+
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Virtuozzo Control");
+MODULE_LICENSE("GPL v2");
+
+module_init(vecalls_init)
+module_exit(vecalls_exit)
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ve/veowner.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/veowner.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ve/veowner.c	2015-01-21 12:02:44.084203541 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/veowner.c	2015-01-21 12:02:47.157121959 +0300
@@ -0,0 +1,153 @@
+/*
+ *  kernel/ve/veowner.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/ipc.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/inetdevice.h>
+#include <linux/pid_namespace.h>
+#include <linux/xattr.h>
+#include <asm/system.h>
+#include <asm/io.h>
+
+#include <net/tcp.h>
+
+/*
+ * ------------------------------------------------------------------------
+ * proc entries
+ * ------------------------------------------------------------------------
+ */
+
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry *proc_vz_dir;
+EXPORT_SYMBOL(proc_vz_dir);
+
+struct proc_dir_entry *glob_proc_vz_dir;
+EXPORT_SYMBOL(glob_proc_vz_dir);
+
+static void prepare_proc(void)
+{
+	proc_vz_dir = proc_mkdir("vz", NULL);
+	if (!proc_vz_dir)
+		panic("Can't create /proc/vz dir\n");
+
+	glob_proc_vz_dir = proc_mkdir("vz", &glob_proc_root);
+	if (!glob_proc_vz_dir)
+		panic("Can't create global /proc/vz dir\n");
+	proc_create("container", S_IFDIR|S_IRUSR|S_IXUSR, proc_vz_dir, NULL);
+}
+#endif
+
+/*
+ * ------------------------------------------------------------------------
+ * OpenVZ sysctl
+ * ------------------------------------------------------------------------
+ */
+int ve_xattr_policy = VE_XATTR_POLICY_ACCEPT;
+static int ve_area_access_check;
+
+#ifdef CONFIG_INET
+static struct ctl_table vz_ipv4_route_table[] = {
+	{
+		.procname	= "src_check",
+		.data		= &ip_rt_src_check,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ 0 }
+};
+
+static struct ctl_path net_ipv4_route_path[] = {
+	{ .ctl_name = CTL_NET, .procname = "net", },
+	{ .ctl_name = NET_IPV4, .procname = "ipv4", },
+	{ .ctl_name = NET_IPV4_ROUTE, .procname = "route", },
+	{ }
+};
+#endif
+
+static struct ctl_table vz_fs_table[] = {
+	{
+		.procname	= "ve-area-access-check",
+		.data		= &ve_area_access_check,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ve-xattr-policy",
+		.data		= &ve_xattr_policy,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "fsync-enable",
+		.extra1		= (void *)offsetof(struct ve_struct, fsync_enable),
+		.maxlen		= sizeof(int),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ 0 }
+};
+
+static struct ctl_path fs_path[] = {
+	{ .ctl_name = CTL_FS, .procname = "fs", },
+	{ }
+};
+
+static void prepare_sysctl(void)
+{
+#ifdef CONFIG_INET
+	register_sysctl_paths(net_ipv4_route_path, vz_ipv4_route_table);
+#endif
+	register_sysctl_paths(fs_path, vz_fs_table);
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * XXX init_ve_system
+ * ------------------------------------------------------------------------
+ */
+
+void init_ve_system(void)
+{
+	struct task_struct *init_entry;
+	struct ve_struct *ve;
+	struct path root;
+
+	ve = get_ve0();
+
+	init_entry = init_pid_ns.child_reaper;
+	/* if a ve_move_task() to VE0 occurs (e.g. in the cpt code),
+	 * ve_cap_bset must be set on VE0 */
+	ve->ve_cap_bset = CAP_INIT_EFF_SET;
+
+	get_fs_root(init_entry->fs, &root);
+	ve->root_path = root;
+
+#ifdef CONFIG_PROC_FS
+	prepare_proc();
+#endif
+	prepare_sysctl();
+}
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ve/vzdev.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vzdev.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ve/vzdev.c	2015-01-21 12:02:44.085203514 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vzdev.c	2015-01-21 12:02:47.861103269 +0300
@@ -0,0 +1,154 @@
+/*
+ *  kernel/ve/vzdev.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/vzctl.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/vzcalluser.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <linux/device.h>
+#include <linux/smp_lock.h>
+
+#define VZCTL_MAJOR 126
+#define VZCTL_NAME "vzctl"
+
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Virtuozzo Interface");
+MODULE_LICENSE("GPL v2");
+
+static LIST_HEAD(ioctls);
+static DEFINE_SPINLOCK(ioctl_lock);
+
+static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd)
+{
+	struct vzioctlinfo *h;
+
+	spin_lock(&ioctl_lock);
+	list_for_each_entry(h, &ioctls, list) {
+		if (h->type == _IOC_TYPE(cmd))
+			goto found;
+	}
+	h = NULL;
+found:
+	if (h && !try_module_get(h->owner))
+		h = NULL;
+	spin_unlock(&ioctl_lock);
+	return h;
+}
+
+static void vzctl_put_handler(struct vzioctlinfo *h)
+{
+	if (!h)
+		return;
+
+	module_put(h->owner);
+}
+
+long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct vzioctlinfo *h;
+	int err;
+
+	err = -ENOTTY;
+	h = vzctl_get_handler(cmd);
+	if (h && h->ioctl)
+		err = (*h->ioctl)(file, cmd, arg);
+	vzctl_put_handler(h);
+
+	return err;
+}
+
+long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct vzioctlinfo *h;
+	int err;
+
+	err = -ENOIOCTLCMD;
+	h = vzctl_get_handler(cmd);
+	if (h && h->compat_ioctl)
+		err = (*h->compat_ioctl)(file, cmd, arg);
+	vzctl_put_handler(h);
+
+	return err;
+}
+
+void vzioctl_register(struct vzioctlinfo *inf)
+{
+	spin_lock(&ioctl_lock);
+	list_add(&inf->list, &ioctls);
+	spin_unlock(&ioctl_lock);
+}
+EXPORT_SYMBOL(vzioctl_register);
+
+void vzioctl_unregister(struct vzioctlinfo *inf)
+{
+	spin_lock(&ioctl_lock);
+	list_del_init(&inf->list);
+	spin_unlock(&ioctl_lock);
+}
+EXPORT_SYMBOL(vzioctl_unregister);
+
+/*
+ * Init/exit stuff.
+ */
+static struct file_operations vzctl_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl	= vzctl_ioctl,
+	.compat_ioctl	= compat_vzctl_ioctl,
+};
+
+static struct class *vzctl_class;
+
+static void __exit vzctl_exit(void)
+{
+	device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0));
+	class_destroy(vzctl_class);
+	unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
+}
+
+static int __init vzctl_init(void)
+{
+	int ret;
+	struct device *class_err;
+
+	ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops);
+	if (ret < 0)
+		goto out;
+
+	vzctl_class = class_create(THIS_MODULE, "vzctl");
+	if (IS_ERR(vzctl_class)) {
+		ret = PTR_ERR(vzctl_class);
+		goto out_cleandev;
+	}
+
+	class_err = device_create(vzctl_class, NULL,
+			MKDEV(VZCTL_MAJOR, 0), NULL, VZCTL_NAME);
+	if (IS_ERR(class_err)) {
+		ret = PTR_ERR(class_err);
+		goto out_rmclass;
+	}
+
+	goto out;
+
+out_rmclass:
+	class_destroy(vzctl_class);
+out_cleandev:
+	unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
+out:
+	return ret;
+}
+
+module_init(vzctl_init)
+module_exit(vzctl_exit);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ve/vzevent.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vzevent.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ve/vzevent.c	2015-01-21 12:02:44.085203514 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vzevent.c	2015-01-21 12:02:44.177201070 +0300
@@ -0,0 +1,139 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/errno.h>
+#include <linux/ve_proto.h>
+#include <linux/vzevent.h>
+
+#define NETLINK_UEVENT	31
+#define VZ_EVGRP_ALL	0x01
+
+static int reboot_event;
+module_param(reboot_event, int, 0644);
+MODULE_PARM_DESC(reboot_event, "Enable reboot events");
+
+/*
+ * NOTE: the original idea was to send events via kobject_uevent(),
+ * but it turned out to have unwanted side effects, such as starting
+ * /sbin/hotplug, which tries to react to our events in an inadequate manner.
+ */
+
+static struct sock *vzev_sock;
+
+static char *action_to_string(int action)
+{
+	switch (action) {
+	case VE_EVENT_MOUNT:
+		return "ve-mount";
+	case VE_EVENT_UMOUNT:
+		return "ve-umount";
+	case VE_EVENT_START:
+		return "ve-start";
+	case VE_EVENT_STOP:
+		return "ve-stop";
+	case VE_EVENT_REBOOT:
+		return "ve-reboot";
+	default:
+		return NULL;
+	}
+}
+
+static int do_vzevent_send(int event, char *msg, int len)
+{
+	struct sk_buff *skb;
+	char *buf, *action;
+	int alen;
+
+	action = action_to_string(event);
+	if (!action)
+		return -EINVAL;
+
+	alen = strlen(action);
+
+	skb = alloc_skb(len + 1 + alen, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	buf = skb_put(skb, len + 1 + alen);
+	memcpy(buf, action, alen);
+	buf[alen] = '@';
+	memcpy(buf + alen + 1, msg, len);
+	(void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL);
+	return 0;
+}
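+
+/*
+ * E.g. a start event for a hypothetical CT 101 is broadcast to group
+ * VZ_EVGRP_ALL on the NETLINK_UEVENT socket as the message "ve-start@101".
+ */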
+
+int vzevent_send(int event, const char *attrs_fmt, ...)
+{
+	va_list args;
+	int len, err;
+	struct ve_struct *ve;
+	char *page;
+
+	err = -ENOMEM;
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		goto out;
+
+	va_start(args, attrs_fmt);
+	len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args);
+	va_end(args);
+
+	ve = set_exec_env(get_ve0());
+	err = do_vzevent_send(event, page, len);
+	(void)set_exec_env(ve);
+	free_page((unsigned long)page);
+out:
+	return err;
+}
+EXPORT_SYMBOL(vzevent_send);
+
+static int ve_start(void *data)
+{
+	struct ve_struct *ve;
+
+	ve = (struct ve_struct *)data;
+	vzevent_send(VE_EVENT_START, "%d", ve->veid);
+	return 0;
+}
+
+static void ve_stop(void *data)
+{
+	struct ve_struct *ve;
+	int event = VE_EVENT_STOP;
+
+	if (test_and_clear_bit(VE_REBOOT, &get_exec_env()->flags) &&
+		reboot_event)
+		event = VE_EVENT_REBOOT;
+
+	ve = (struct ve_struct *)data;
+	vzevent_send(event, "%d", ve->veid);
+}
+
+static struct ve_hook ve_start_stop_hook = {
+	.init		= ve_start,
+	.fini		= ve_stop,
+	.owner		= THIS_MODULE,
+	.priority	= HOOK_PRIO_AFTERALL,
+};
+
+static int __init init_vzevent(void)
+{
+	vzev_sock = netlink_kernel_create(&init_net, NETLINK_UEVENT, 0, NULL, NULL, THIS_MODULE);
+	if (vzev_sock == NULL)
+		return -ENOMEM;
+	ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook);
+	return 0;
+}
+
+static void __exit exit_vzevent(void)
+{
+	ve_hook_unregister(&ve_start_stop_hook);
+	netlink_kernel_release(vzev_sock);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_vzevent);
+module_exit(exit_vzevent);
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ve/vziolimit.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vziolimit.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ve/vziolimit.c	2015-01-21 12:02:58.949808915 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vziolimit.c	2015-01-21 12:02:58.994807720 +0300
@@ -0,0 +1,353 @@
+/*
+ *  kernel/ve/vziolimit.c
+ *
+ *  Copyright (C) 2010, Parallels inc.
+ *  All rights reserved.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/virtinfo.h>
+#include <linux/vzctl.h>
+#include <linux/vziolimit.h>
+#include <asm/uaccess.h>
+#include <bc/beancounter.h>
+
+struct throttle {
+	unsigned speed;		/* maximum speed, units per second */
+	unsigned burst;		/* maximum burst, units */
+	unsigned latency;	/* maximum wait delay, jiffies */
+	unsigned remain;	/* units/HZ */
+	unsigned long time;	/* wall time in jiffies */
+	long long state;	/* current state in units */
+};
+
+/**
+ * set throttler initial state, externally serialized
+ * @speed	maximum speed (1/sec)
+ * @burst	maximum burst chunk
+ * @latency	maximum timeout (ms)
+ */
+static void throttle_setup(struct throttle *th, unsigned speed,
+		unsigned burst, unsigned latency)
+{
+	th->time = jiffies;
+	th->burst = burst;
+	th->latency = msecs_to_jiffies(latency);
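+	/* publish the setup before ->speed; pairs with throttle_timeout()'s rmb() */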
+	wmb();
+	th->speed = speed;
+}
+
+/* externally serialized */
+static void throttle_charge(struct throttle *th, long long charge)
+{
+	unsigned long time, now = jiffies;
+	long long step, ceiling = charge + th->burst;
+
+	if (time_before(th->time, now)) {
+		step = (u64)th->speed * (now - th->time);
+		do_div(step, HZ);
+		step += th->state;
+		/* feed throttler as much as we can */
+		if (step <= ceiling)
+			th->state = step;
+		else if (th->state < ceiling)
+			th->state = ceiling;
+		th->time = now;
+	}
+
+	if (charge > th->state) {
+		charge -= th->state;
+		step = charge * HZ;
+		if (do_div(step, th->speed))
+			step++;
+		time = th->time + step;
+		/* limit maximum latency */
+		if (time_after(time, now + th->latency))
+			time = now + th->latency;
+		th->time = time;
+		step *= th->speed;
+		step += th->remain;
+		th->remain = do_div(step, HZ);
+		th->state += step;
+	}
+}
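+
+/*
+ * A worked example with illustrative numbers, assuming HZ=1000: with
+ * speed=1000 units/sec, state=0 and th->time == jiffies, charging 500
+ * units leaves a 500-unit deficit, so th->time is advanced by
+ * 500 * HZ / speed = 500 jiffies (capped by ->latency) and 500 units
+ * are credited back to ->state; the caller consumes them and tasks
+ * then sleep until th->time, as reported by throttle_timeout().
+ */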
+
+/* lockless */
+static unsigned long throttle_timeout(struct throttle *th, unsigned long now)
+{
+	unsigned long time;
+
+	if (!th->speed)
+		return 0;
+	rmb();
+	time = th->time;
+	if (time_before(time, now))
+		return 0;
+	return min(time - now, (unsigned long)th->latency);
+}
+
+struct iolimit {
+	struct throttle throttle;
+	struct throttle iops;
+	wait_queue_head_t wq;
+};
+
+static void iolimit_wait(struct iolimit *iolimit, unsigned long timeout)
+{
+	DEFINE_WAIT(wait);
+
+	do {
+		prepare_to_wait(&iolimit->wq, &wait,
+				TASK_KILLABLE | TASK_IOTHROTTLED);
+		timeout = schedule_timeout(timeout);
+		if (fatal_signal_pending(current))
+			break;
+		if (unlikely(timeout))
+			timeout = min(throttle_timeout(&iolimit->throttle,
+						jiffies), timeout);
+	} while (timeout);
+	finish_wait(&iolimit->wq, &wait);
+}
+
+static unsigned long iolimit_timeout(struct iolimit *iolimit)
+{
+	unsigned long now = jiffies;
+
+	return max(throttle_timeout(&iolimit->throttle, now),
+			throttle_timeout(&iolimit->iops, now));
+}
+
+static void iolimit_balance_dirty(struct iolimit *iolimit,
+				  struct user_beancounter *ub,
+				  unsigned long write_chunk)
+{
+	struct throttle *th = &iolimit->throttle;
+	unsigned long flags, dirty, state;
+
+	if (!th->speed)
+		return;
+
+	/* this read may be non-atomic on i386, but it is only a hint */
+	state = th->state >> PAGE_SHIFT;
+	dirty = ub_stat_get(ub, dirty_pages) + write_chunk;
+	/* protect against ub-stat percpu drift */
+	if (dirty + UB_STAT_BATCH * num_possible_cpus() < state)
+		return;
+	/* get the exact value for smooth throttling */
+	dirty = ub_stat_get_exact(ub, dirty_pages) + write_chunk;
+	if (dirty < state)
+		return;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	/* precharge dirty pages */
+	throttle_charge(th, (long long)dirty << PAGE_SHIFT);
+	/* set dirty_exceeded for smooth throttling */
+	set_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+static int iolimit_virtinfo(struct vnotifier_block *nb,
+		unsigned long cmd, void *arg, int old_ret)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	struct iolimit *iolimit = ub->private_data2;
+	unsigned long flags, timeout;
+
+	if (!iolimit)
+		return old_ret;
+
+	if (!iolimit->throttle.speed && !iolimit->iops.speed)
+		return NOTIFY_OK;
+
+	switch (cmd) {
+		case VIRTINFO_IO_ACCOUNT:
+			if (!iolimit->throttle.speed)
+				break;
+			spin_lock_irqsave(&ub->ub_lock, flags);
+			if (iolimit->throttle.speed) {
+				long long charge = *(size_t*)arg;
+
+				throttle_charge(&iolimit->throttle, charge);
+				iolimit->throttle.state -= charge;
+			}
+			spin_unlock_irqrestore(&ub->ub_lock, flags);
+			break;
+		case VIRTINFO_IO_FUSE_REQ:
+		case VIRTINFO_IO_OP_ACCOUNT:
+			if (!iolimit->iops.speed)
+				break;
+			spin_lock_irqsave(&ub->ub_lock, flags);
+			if (iolimit->iops.speed) {
+				throttle_charge(&iolimit->iops, 1);
+				/*
+				 * Writeback doesn't use last iops from stash
+				 * to avoid choking future sync operations.
+				 */
+				if (iolimit->iops.state > 1 ||
+				    !(current->flags & PF_FLUSHER))
+					iolimit->iops.state--;
+			}
+			spin_unlock_irqrestore(&ub->ub_lock, flags);
+			break;
+		case VIRTINFO_IO_PREPARE:
+		case VIRTINFO_IO_JOURNAL:
+			if (current->flags & PF_FLUSHER)
+				break;
+			timeout = iolimit_timeout(iolimit);
+			if (timeout && !fatal_signal_pending(current))
+				iolimit_wait(iolimit, timeout);
+			break;
+		case VIRTINFO_IO_READAHEAD:
+		case VIRTINFO_IO_CONGESTION:
+			timeout = iolimit_timeout(iolimit);
+			if (timeout)
+				return NOTIFY_FAIL;
+			break;
+		case VIRTINFO_IO_BALANCE_DIRTY:
+			iolimit_balance_dirty(iolimit, ub, (unsigned long)arg);
+			break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct vnotifier_block iolimit_virtinfo_nb = {
+	.notifier_call = iolimit_virtinfo,
+};
+
+
+static void throttle_state(struct user_beancounter *ub,
+		struct throttle *throttle, struct iolimit_state *state)
+{
+	spin_lock_irq(&ub->ub_lock);
+	state->speed = throttle->speed;
+	state->burst = throttle->burst;
+	state->latency = jiffies_to_msecs(throttle->latency);
+	spin_unlock_irq(&ub->ub_lock);
+}
+
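+/*
+ * Lazily allocate the per-beancounter iolimit state: the recheck
+ * under ub_lock lets a racing allocation win and frees the loser's
+ * copy.
+ */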
+static struct iolimit *iolimit_get(struct user_beancounter *ub)
+{
+	struct iolimit *iolimit = ub->private_data2;
+
+	if (iolimit)
+		return iolimit;
+
+	iolimit = kzalloc(sizeof(struct iolimit), GFP_KERNEL);
+	if (!iolimit)
+		return NULL;
+	init_waitqueue_head(&iolimit->wq);
+
+	spin_lock_irq(&ub->ub_lock);
+	if (ub->private_data2) {
+		kfree(iolimit);
+		iolimit = ub->private_data2;
+	} else
+		ub->private_data2 = iolimit;
+	spin_unlock_irq(&ub->ub_lock);
+
+	return iolimit;
+}
+
+static int iolimit_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct user_beancounter *ub;
+	struct iolimit *iolimit;
+	struct iolimit_state state;
+	int err;
+
+	if (cmd != VZCTL_SET_IOLIMIT && cmd != VZCTL_GET_IOLIMIT &&
+	    cmd != VZCTL_SET_IOPSLIMIT && cmd != VZCTL_GET_IOPSLIMIT)
+		return -ENOTTY;
+
+	if (copy_from_user(&state, (void __user *)arg, sizeof(state)))
+		return -EFAULT;
+
+	ub = get_beancounter_byuid(state.id, 0);
+	if (!ub)
+		return -ENOENT;
+
+	iolimit = ub->private_data2;
+
+	switch (cmd) {
+		case VZCTL_SET_IOLIMIT:
+			iolimit = iolimit_get(ub);
+			err = -ENOMEM;
+			if (!iolimit)
+				break;
+			spin_lock_irq(&ub->ub_lock);
+			throttle_setup(&iolimit->throttle, state.speed,
+					state.burst, state.latency);
+			spin_unlock_irq(&ub->ub_lock);
+			wake_up_all(&iolimit->wq);
+			err = 0;
+			break;
+		case VZCTL_SET_IOPSLIMIT:
+			iolimit = iolimit_get(ub);
+			err = -ENOMEM;
+			if (!iolimit)
+				break;
+			spin_lock_irq(&ub->ub_lock);
+			throttle_setup(&iolimit->iops, state.speed,
+					state.burst, state.latency);
+			spin_unlock_irq(&ub->ub_lock);
+			wake_up_all(&iolimit->wq);
+			err = 0;
+			break;
+		case VZCTL_GET_IOLIMIT:
+			err = -ENXIO;
+			if (!iolimit)
+				break;
+			throttle_state(ub, &iolimit->throttle, &state);
+			err = -EFAULT;
+			if (copy_to_user((void __user *)arg, &state, sizeof(state)))
+				break;
+			err = 0;
+			break;
+		case VZCTL_GET_IOPSLIMIT:
+			err = -ENXIO;
+			if (!iolimit)
+				break;
+			throttle_state(ub, &iolimit->iops, &state);
+			err = -EFAULT;
+			if (copy_to_user((void __user *)arg, &state, sizeof(state)))
+				break;
+			err = 0;
+			break;
+		default:
+			err = -ENOTTY;
+	}
+
+	put_beancounter_longterm(ub);
+	return err;
+}
+
+static struct vzioctlinfo iolimit_vzioctl = {
+	.type		= VZIOLIMITTYPE,
+	.ioctl		= iolimit_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= iolimit_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+static int __init iolimit_init(void)
+{
+	virtinfo_notifier_register(VITYPE_IO, &iolimit_virtinfo_nb);
+	vzioctl_register(&iolimit_vzioctl);
+
+	return 0;
+}
+
+static void __exit iolimit_exit(void)
+{
+	vzioctl_unregister(&iolimit_vzioctl);
+	virtinfo_notifier_unregister(VITYPE_IO, &iolimit_virtinfo_nb);
+}
+
+module_init(iolimit_init)
+module_exit(iolimit_exit)
+
+MODULE_LICENSE("GPL");
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/ve/vzwdog.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vzwdog.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/ve/vzwdog.c	2015-01-21 12:02:44.085203514 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/ve/vzwdog.c	2015-01-21 12:02:53.925942269 +0300
@@ -0,0 +1,361 @@
+/*
+ *  kernel/ve/vzwdog.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/kobject.h>
+#include <linux/genhd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/ve.h>
+#include <linux/vzstat.h>
+#include <asm/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+/* Stuff regarding the kernel thread that polls VE validity */
+static int sleep_timeout = 60;
+static struct task_struct *wdog_thread_tsk;
+
+static struct file *intr_file;
+static char page[PAGE_SIZE];
+
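+/*
+ * Dump /proc/interrupts, skipping IRQ lines whose counters are all
+ * zero; lines without a colon (such as the CPU header) are always
+ * printed.
+ */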
+static void parse_irq_list(int len)
+{
+	int i, k, skip;
+	for (i = 0; i < len; ) {
+		k = i;
+		while (i < len && page[i] != '\n' && page[i] != ':')
+			i++;
+		skip = 0;
+		if (i < len && page[i] != '\n') {
+			i++; /* skip ':' */
+			while (i < len && (page[i] == ' ' || page[i] == '0'))
+				i++;
+			skip = (i < len && (page[i] < '0' || page[i] > '9'));
+			while (i < len && page[i] != '\n')
+				i++;
+		}
+		if (!skip)
+			printk("%.*s\n", i - k, page + k);
+		if (i < len)
+			i++; /* skip '\n' */
+	}
+}
+
+extern loff_t vfs_llseek(struct file *file, loff_t, int);
+extern ssize_t vfs_read(struct file *file, char __user *, size_t, loff_t *);
+extern struct file *filp_open(const char *filename, int flags, int mode);
+extern int filp_close(struct file *filp, fl_owner_t id);
+static void show_irq_list(void)
+{
+	mm_segment_t fs;
+	int r;
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	vfs_llseek(intr_file, 0, 0);
+	r = vfs_read(intr_file, (void __user *)page, sizeof(page),
+			&intr_file->f_pos);
+	set_fs(fs);
+
+	if (r > 0)
+		parse_irq_list(r);
+}
+
+static u64 max_sched_lat;
+static u64 max_alloc_lat[KSTAT_ALLOCSTAT_NR];
+
+static void update_max_alloc_latency(void)
+{
+	int i;
+
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		max_alloc_lat[i] = max(max_alloc_lat[i],
+				kstat_glob.alloc_lat[i].last.maxlat);
+}
+
+static void update_max_schedule_latency(void)
+{
+	max_sched_lat = max(max_sched_lat, kstat_glob.sched_lat.last.maxlat);
+}
+
+static void update_max_latencies(void)
+{
+	spin_lock_irq(&kstat_glb_lock);
+	update_max_alloc_latency();
+	update_max_schedule_latency();
+	spin_unlock_irq(&kstat_glb_lock);
+}
+
+static void reset_max_latencies(void)
+{
+	max_sched_lat = 0;
+	memset(max_alloc_lat, 0, sizeof(max_alloc_lat));
+}
+
+static void show_alloc_latency(void)
+{
+	static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = {
+		"A0",
+		"L0",
+		"H0",
+		"L1",
+		"H1"
+	};
+	int i;
+
+	printk("lat: ");
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
+		struct kstat_lat_pcpu_struct *p;
+		u64 maxlat, avg0, avg1, avg2;
+
+		p = &kstat_glob.alloc_lat[i];
+		spin_lock_irq(&kstat_glb_lock);
+		maxlat = p->last.maxlat;
+		avg0 = p->avg[0];
+		avg1 = p->avg[1];
+		avg2 = p->avg[2];
+		spin_unlock_irq(&kstat_glb_lock);
+
+		printk("%s %Lu %Lu (%Lu %Lu %Lu)",
+				alloc_descr[i],
+				(unsigned long long)max_alloc_lat[i],
+				(unsigned long long)maxlat,
+				(unsigned long long)avg0,
+				(unsigned long long)avg1,
+				(unsigned long long)avg2);
+	}
+	printk("\n");
+}
+
+static void show_schedule_latency(void)
+{
+	struct kstat_lat_pcpu_struct *p;
+	cycles_t maxlat, totlat, avg0, avg1, avg2;
+	unsigned long count;
+
+	p = &kstat_glob.sched_lat;
+	spin_lock_irq(&kstat_glb_lock);
+	maxlat = p->last.maxlat;
+	totlat = p->last.totlat;
+	count = p->last.count;
+	avg0 = p->avg[0];
+	avg1 = p->avg[1];
+	avg2 = p->avg[2];
+	spin_unlock_irq(&kstat_glb_lock);
+
+	printk("sched lat: %Lu/%Lu/%Lu/%lu (%Lu %Lu %Lu)\n",
+			(unsigned long long)max_sched_lat,
+			(unsigned long long)maxlat,
+			(unsigned long long)totlat,
+			count,
+			(unsigned long long)avg0,
+			(unsigned long long)avg1,
+			(unsigned long long)avg2);
+}
+
+static void show_header(void)
+{
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	preempt_disable();
+	printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n",
+			tv.tv_sec, (long)tv.tv_usec,
+			(unsigned long long)get_jiffies_64(),
+			smp_processor_id());
+	printk("*** jiffies_per_second %u ***\n", HZ);
+	preempt_enable();
+}
+
+static void show_pgdatinfo(void)
+{
+	pg_data_t *pgdat;
+
+	printk("pgdat:");
+	for_each_online_pgdat(pgdat) {
+		printk(" %d: %lu,%lu,%lu",
+				pgdat->node_id,
+				pgdat->node_start_pfn,
+				pgdat->node_present_pages,
+				pgdat->node_spanned_pages);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+		printk(",%p", pgdat->node_mem_map);
+#endif
+	}
+	printk("\n");
+}
+
+static int show_partitions_io(struct gendisk *gp)
+{
+	struct disk_part_iter piter;
+	struct hd_struct *hd;
+	char buf[BDEVNAME_SIZE];
+	int cpu;
+
+	/*
+	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
+		seq_puts(seqf,	"major minor name"
+				"     rio rmerge rsect ruse wio wmerge "
+				"wsect wuse running use aveq"
+				"\n\n");
+	*/
+
+	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
+	while ((hd = disk_part_iter_next(&piter))) {
+		cpu = part_stat_lock();
+		part_round_stats(cpu, hd);
+		part_stat_unlock();
+		printk("%4d %7d %s %lu %lu %llu "
+			   "%u %lu %lu %llu %u %u %u %u\n",
+			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
+			   disk_name(gp, hd->partno, buf),
+			   part_stat_read(hd, ios[0]),
+			   part_stat_read(hd, merges[0]),
+			   (unsigned long long)part_stat_read(hd, sectors[0]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[0])),
+			   part_stat_read(hd, ios[1]),
+			   part_stat_read(hd, merges[1]),
+			   (unsigned long long)part_stat_read(hd, sectors[1]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
+			   part_in_flight(hd),
+			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
+			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
+			);
+	}
+	disk_part_iter_exit(&piter);
+
+	return 0;
+}
+
+static int show_one_disk_io(struct device *dev, void *x)
+{
+	char *name;
+	char buf[BDEVNAME_SIZE];
+	struct gendisk *gd;
+
+	if (dev->type != &disk_type)
+		return 0;
+
+	gd = dev_to_disk(dev);
+
+	name = disk_name(gd, 0, buf);
+	if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) &&
+			isdigit(name[4]))
+		return 0;
+
+	if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) &&
+			isdigit(name[3]))
+		return 0;
+
+	show_partitions_io(gd);
+
+	return 0;
+}
+
+static void show_diskio(void)
+{
+	printk("disk_io: ");
+	class_for_each_device(&block_class, NULL, NULL, show_one_disk_io);
+	printk("\n");
+}
+
+static void show_nrprocs(void)
+{
+	unsigned long _nr_running, _nr_sleeping,
+			_nr_unint, _nr_zombie, _nr_dead, _nr_stopped;
+
+	_nr_running = nr_running();
+	_nr_unint = nr_uninterruptible();
+	_nr_sleeping = nr_sleeping();
+	_nr_zombie = nr_zombie;
+	_nr_dead = atomic_read(&nr_dead);
+	_nr_stopped = nr_stopped();
+
+	printk("VEnum: %d, proc R %lu, S %lu, D %lu, "
+		"Z %lu, X %lu, T %lu (tot %d)\n",
+		nr_ve,	_nr_running, _nr_sleeping, _nr_unint,
+		_nr_zombie, _nr_dead, _nr_stopped, nr_threads);
+}
+
+static void wdog_print(void)
+{
+	show_header();
+	show_irq_list();
+	show_pgdatinfo();
+	show_mem(SHOW_MEM_FILTER_NODES);
+	show_diskio();
+	show_schedule_latency();
+	show_alloc_latency();
+	show_nrprocs();
+}
+
+static int wdog_loop(void *data)
+{
+	unsigned long next_print;
+	long timeout;
+
+	next_print = jiffies;
+	while (1) {
+		update_max_latencies();
+		if (time_is_before_eq_jiffies(next_print)) {
+			wdog_print();
+			reset_max_latencies();
+			next_print = jiffies + sleep_timeout * HZ;
+		}
+		try_to_freeze();
+
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+		timeout = clamp_t(long, next_print - jiffies, 0, LOAD_FREQ);
+		schedule_timeout(timeout);
+	}
+	return 0;
+}
+
+static int __init wdog_init(void)
+{
+	struct file *file;
+
+	file = filp_open("/proc/interrupts", 0, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	intr_file = file;
+
+	wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog");
+	if (IS_ERR(wdog_thread_tsk)) {
+		filp_close(intr_file, NULL);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+static void __exit wdog_exit(void)
+{
+	kthread_stop(wdog_thread_tsk);
+	filp_close(intr_file, NULL);
+}
+
+module_param(sleep_timeout, int, 0660);
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Virtuozzo WDOG");
+MODULE_LICENSE("GPL v2");
+
+module_init(wdog_init)
+module_exit(wdog_exit)
diff -upr linux-2.6.32-504.3.3.el6.orig/kernel/workqueue.c linux-2.6.32-504.3.3.el6-042stab103_6/kernel/workqueue.c
--- linux-2.6.32-504.3.3.el6.orig/kernel/workqueue.c	2014-12-12 23:29:28.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/kernel/workqueue.c	2015-01-21 12:02:58.127830734 +0300
@@ -66,6 +66,7 @@ struct workqueue_struct {
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map lockdep_map;
 #endif
+	struct ve_struct *owner_env;
 };
 
 /* Serializes the accesses to the list of workqueues. */
@@ -313,6 +314,9 @@ static int worker_thread(void *__cwq)
 {
 	struct cpu_workqueue_struct *cwq = __cwq;
 	DEFINE_WAIT(wait);
+	struct ve_struct *orig_ve;
+
+	orig_ve = set_exec_env(cwq->wq->owner_env);
 
 	if (cwq->wq->freezeable)
 		set_freezable();
@@ -333,6 +337,8 @@ static int worker_thread(void *__cwq)
 		run_workqueue(cwq);
 	}
 
+	(void)set_exec_env(orig_ve);
+
 	return 0;
 }
 
@@ -719,6 +725,7 @@ int schedule_on_each_cpu(work_func_t fun
 	free_percpu(works);
 	return 0;
 }
+EXPORT_SYMBOL(schedule_on_each_cpu);
 
 void flush_scheduled_work(void)
 {
@@ -786,14 +793,14 @@ init_cpu_workqueue(struct workqueue_stru
 	return cwq;
 }
 
-static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu, struct ve_struct *ve)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct workqueue_struct *wq = cwq->wq;
 	const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
 	struct task_struct *p;
 
-	p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
+	p = kthread_create_ve(ve, worker_thread, cwq, fmt, wq->name, cpu);
 	/*
 	 * Nobody can add the work_struct to this cwq,
 	 *	if (caller is __create_workqueue)
@@ -829,11 +836,14 @@ struct workqueue_struct *__create_workqu
 						int freezeable,
 						int rt,
 						struct lock_class_key *key,
-						const char *lock_name)
+						const char *lock_name,
+						void *ve)
 {
 	struct workqueue_struct *wq;
 	struct cpu_workqueue_struct *cwq;
 	int err = 0, cpu;
+	char *dname;
+	struct ve_struct *env;
 
 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
@@ -845,7 +855,22 @@ struct workqueue_struct *__create_workqu
 		return NULL;
 	}
 
-	wq->name = name;
+	if ((ve == NULL) || ve_is_super(ve)) {
+		env = get_ve0();
+		dname = (char *)name;
+	} else {
+		dname = kmalloc(strlen(name) + 32, GFP_KERNEL);
+		if (dname == NULL) {
+			free_percpu(wq->cpu_wq);
+			kfree(wq);
+			return NULL;
+		}
+		env = get_ve(ve);
+		sprintf(dname, "%s/%d", name, env->veid);
+	}
+
+	wq->name = dname;
+	wq->owner_env = env;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	wq->singlethread = singlethread;
 	wq->freezeable = freezeable;
@@ -854,7 +879,7 @@ struct workqueue_struct *__create_workqu
 
 	if (singlethread) {
 		cwq = init_cpu_workqueue(wq, singlethread_cpu);
-		err = create_workqueue_thread(cwq, singlethread_cpu);
+		err = create_workqueue_thread(cwq, singlethread_cpu, env);
 		start_workqueue_thread(cwq, -1);
 	} else {
 		cpu_maps_update_begin();
@@ -877,7 +902,7 @@ struct workqueue_struct *__create_workqu
 			cwq = init_cpu_workqueue(wq, cpu);
 			if (err || !cpu_online(cpu))
 				continue;
-			err = create_workqueue_thread(cwq, cpu);
+			err = create_workqueue_thread(cwq, cpu, env);
 			start_workqueue_thread(cwq, cpu);
 		}
 		cpu_maps_update_done();
@@ -944,6 +969,10 @@ void destroy_workqueue(struct workqueue_
 		cpu_maps_update_done();
 	}
 	free_percpu(wq->cpu_wq);
+	if (!ve_is_super(wq->owner_env)) {
+		kfree(wq->name);
+		put_ve(wq->owner_env);
+	}
 	kfree(wq);
 }
 EXPORT_SYMBOL_GPL(destroy_workqueue);
@@ -996,7 +1025,7 @@ undo:
 
 		switch (action) {
 		case CPU_UP_PREPARE:
-			if (!create_workqueue_thread(cwq, cpu))
+			if (!create_workqueue_thread(cwq, cpu, wq->owner_env))
 				break;
 			printk(KERN_ERR "workqueue [%s] for %i failed\n",
 				wq->name, cpu);
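
The workqueue hunks above tie each queue to its owning VE: the struct gains an owner_env pointer, worker threads switch into that environment for their lifetime via set_exec_env(), and queues created for a non-root VE get a "name/veid" thread name so they are attributable from ps output. A minimal sketch of that naming step as ordinary userspace C; ve_wq_name() and the malloc-based allocation are hypothetical stand-ins, while the "%s/%d" format and the strlen(name) + 32 slack come straight from the hunk:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical analogue of the naming in __create_workqueue: 32 extra
 * bytes are plenty for '/', any 32-bit decimal veid and the NUL. */
static char *ve_wq_name(const char *name, int veid)
{
	size_t len = strlen(name) + 32;
	char *dname = malloc(len);

	if (!dname)
		return NULL;
	snprintf(dname, len, "%s/%d", name, veid);
	return dname;
}

int main(void)
{
	char *n = ve_wq_name("events", 101);

	if (n) {
		puts(n);		/* "events/101" */
		free(n);
	}
	return 0;
}

As in the patch, the composed name is owned by the workqueue, which is why destroy_workqueue() checks ve_is_super() before kfree(wq->name).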
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/Kconfig.debug linux-2.6.32-504.3.3.el6-042stab103_6/lib/Kconfig.debug
--- linux-2.6.32-504.3.3.el6.orig/lib/Kconfig.debug	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/Kconfig.debug	2015-01-21 12:02:54.129936853 +0300
@@ -135,6 +135,15 @@ config DEBUG_SECTION_MISMATCH
 	  - Enable verbose reporting from modpost to help solving
 	    the section mismatches reported.
 
+config SYSRQ_DEBUG
+	bool "Debugging via sysrq keys"
+	depends on MAGIC_SYSRQ
+	default y
+	help
+	  Say Y if you want to extend the functionality of the magic SysRq
+	  key. It provides debugging facilities such as dumping and writing
+	  memory, resolving symbols, and a few others.
+
 config DEBUG_KERNEL
 	bool "Kernel debugging"
 	help
@@ -322,7 +331,7 @@ config SCHEDSTATS
 config DEBUG_NMI_TIMEOUT
 	int "Number of seconds before NMI timeout"
 	depends on X86
-	default 5
+	default 30
 	help
 	  This value is the number of seconds the NMI watchdog will tick
 	  before it decides the machine has hung.
@@ -628,6 +637,7 @@ config TRACE_IRQFLAGS
 
 config DEBUG_SPINLOCK_SLEEP
 	bool "Spinlock debugging: sleep-inside-spinlock checking"
+	select PREEMPT_COUNT
 	depends on DEBUG_KERNEL
 	help
 	  If you say Y here, various routines which may sleep will become very
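
The Kconfig hunks add a SYSRQ_DEBUG switch gated on MAGIC_SYSRQ, raise the default NMI watchdog timeout from 5 to 30 seconds, and make the sleep-inside-spinlock checker pull in PREEMPT_COUNT. A hypothetical .config fragment enabling the new debugging pieces (values are illustrative, not from the source tree):

CONFIG_MAGIC_SYSRQ=y
CONFIG_SYSRQ_DEBUG=y
CONFIG_DEBUG_NMI_TIMEOUT=30
CONFIG_DEBUG_SPINLOCK_SLEEP=y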
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/bust_spinlocks.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/bust_spinlocks.c
--- linux-2.6.32-504.3.3.el6.orig/lib/bust_spinlocks.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/bust_spinlocks.c	2015-01-21 12:02:41.651268135 +0300
@@ -17,11 +17,12 @@
 
 void __attribute__((weak)) bust_spinlocks(int yes)
 {
-	if (yes) {
+	if (yes > 0) {
 		++oops_in_progress;
 	} else {
 #ifdef CONFIG_VT
-		unblank_screen();
+		if (!yes)
+			unblank_screen();
 #endif
 		console_unblank();
 		if (--oops_in_progress == 0)
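
The bust_spinlocks() change turns the boolean argument into a three-way contract: positive enters oops mode, zero leaves it and unblanks the console, and a negative value leaves it while skipping unblank_screen() (console_unblank() still runs). A sketch of that contract with hypothetical names:

/* Hypothetical encoding of the argument after the patch */
enum bust_mode {
	BUST_ENTER  =  1,	/* ++oops_in_progress */
	BUST_LEAVE  =  0,	/* unblank_screen(), console_unblank(),
				 * then --oops_in_progress */
	BUST_SILENT = -1,	/* same, but without unblank_screen() */
};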
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/debug_locks.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/debug_locks.c
--- linux-2.6.32-504.3.3.el6.orig/lib/debug_locks.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/debug_locks.c	2015-01-21 12:02:58.137830469 +0300
@@ -42,6 +42,13 @@ int debug_locks_off(void)
 			console_verbose();
 			return 1;
 		}
+
+		/*
+	 * We want to taint the kernel so that tests can easily detect
+	 * that a lockdep-related problem was reported.
+		 */
+
+		add_taint(TAINT_CRAP);
 	}
 	return 0;
 }
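
Tainting on debug_locks_off() leaves a persistent marker that test automation can poll instead of scraping dmesg. Assuming TAINT_CRAP is bit 10, as in mainline 2.6.32, a hypothetical userspace probe could look like:

#include <stdio.h>

/* Hypothetical harness check; the bit position is an assumption. */
static int lockdep_problem_reported(void)
{
	unsigned long taint = 0;
	FILE *f = fopen("/proc/sys/kernel/tainted", "r");

	if (!f)
		return -1;
	if (fscanf(f, "%lu", &taint) != 1)
		taint = 0;
	fclose(f);
	return (int)((taint >> 10) & 1);	/* TAINT_CRAP */
}

int main(void)
{
	printf("lockdep problem reported: %d\n", lockdep_problem_reported());
	return 0;
}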
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/idr.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/idr.c
--- linux-2.6.32-504.3.3.el6.orig/lib/idr.c	2014-12-12 23:29:11.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/idr.c	2015-01-21 12:02:50.306038362 +0300
@@ -36,6 +36,12 @@
 #include <linux/idr.h>
 #include <linux/spinlock.h>
 
+/* Leave the possibility of an incomplete final layer */
+#define MAX_LEVEL ((MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS)
+
+/* Number of idr_layer structs to leave in free list */
+#define IDR_FREE_MAX (MAX_LEVEL + MAX_LEVEL)
+
 static struct kmem_cache *idr_layer_cache;
 static DEFINE_SPINLOCK(simple_ida_lock);
 
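
MAX_LEVEL above is a ceiling division: enough IDR_BITS-wide layers to cover MAX_ID_SHIFT bits of id space. Assuming the usual 2.6.32 definitions (MAX_ID_SHIFT = 31; IDR_BITS = 6 on 64-bit, 5 on 32-bit), that gives (31 + 6 - 1) / 6 = 6 layers on 64-bit and (31 + 5 - 1) / 5 = 7 on 32-bit, with IDR_FREE_MAX keeping twice that many idr_layer structs on the free list. A compile-and-run check of the arithmetic:

#include <assert.h>

int main(void)
{
	assert((31 + 6 - 1) / 6 == 6);	/* 64-bit: 6 layers */
	assert((31 + 5 - 1) / 5 == 7);	/* 32-bit: 7 layers */
	return 0;
}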
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/is_single_threaded.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/is_single_threaded.c
--- linux-2.6.32-504.3.3.el6.orig/lib/is_single_threaded.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/is_single_threaded.c	2015-01-21 12:02:45.792158195 +0300
@@ -30,7 +30,7 @@ bool current_is_single_threaded(void)
 
 	ret = false;
 	rcu_read_lock();
-	for_each_process(p) {
+	for_each_process_all(p) {
 		if (unlikely(p->flags & PF_KTHREAD))
 			continue;
 		if (unlikely(p == task->group_leader))
@@ -48,7 +48,7 @@ bool current_is_single_threaded(void)
 			 * forked before exiting.
 			 */
 			smp_rmb();
-		} while_each_thread(p, t);
+		} while_each_thread_all(p, t);
 	}
 	ret = true;
 found:
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/kernel_lock.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/kernel_lock.c
--- linux-2.6.32-504.3.3.el6.orig/lib/kernel_lock.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/kernel_lock.c	2015-01-21 12:02:54.126936933 +0300
@@ -93,6 +93,7 @@ static inline void __lock_kernel(void)
  */
 static inline void __lock_kernel(void)
 {
+	preempt_disable();
 	_raw_spin_lock(&kernel_flag);
 }
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/kobject.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/kobject.c
--- linux-2.6.32-504.3.3.el6.orig/lib/kobject.c	2014-12-12 23:29:02.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/kobject.c	2015-01-21 12:02:44.826183840 +0300
@@ -147,6 +147,7 @@ static void kobject_init_internal(struct
 	if (!kobj)
 		return;
 	kref_init(&kobj->kref);
+	INIT_LIST_HEAD(&kobj->env_head);
 	INIT_LIST_HEAD(&kobj->entry);
 	kobj->state_in_sysfs = 0;
 	kobj->state_add_uevent_sent = 0;
@@ -630,6 +631,7 @@ struct kobject *kobject_create(void)
 	kobject_init(kobj, &dynamic_kobj_ktype);
 	return kobj;
 }
+EXPORT_SYMBOL(kobject_create);
 
 /**
  * kobject_create_and_add - create a struct kobject dynamically and register it with sysfs
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/kobject_uevent.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/kobject_uevent.c
--- linux-2.6.32-504.3.3.el6.orig/lib/kobject_uevent.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/kobject_uevent.c	2015-01-21 12:02:45.735159708 +0300
@@ -25,11 +25,11 @@
 #include <net/sock.h>
 
 
+#ifndef CONFIG_VE
+/* Virtualized for all VEs, but is shown only in VE0 */
 u64 uevent_seqnum;
-char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH;
-#if defined(CONFIG_NET)
-static struct sock *uevent_sock;
 #endif
+char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH;
 
 /* This lock protects uevent_seqnum and uevent_sock_list */
 static DEFINE_MUTEX(uevent_sock_mutex);
@@ -40,6 +40,8 @@ static const char *kobject_actions[] = {
 	[KOBJ_REMOVE] =		"remove",
 	[KOBJ_CHANGE] =		"change",
 	[KOBJ_MOVE] =		"move",
+	[KOBJ_START] =		"start",
+	[KOBJ_STOP] =		"stop",
 	[KOBJ_ONLINE] =		"online",
 	[KOBJ_OFFLINE] =	"offline",
 };
@@ -88,7 +90,7 @@ out:
  * Returns 0 if kobject_uevent() is completed with success or the
  * corresponding error when it fails.
  */
-int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
+int kobject_uevent_env_one(struct kobject *kobj, enum kobject_action action,
 		       char *envp_ext[])
 {
 	struct kobj_uevent_env *env;
@@ -203,7 +205,8 @@ int kobject_uevent_env(struct kobject *k
 
 	mutex_lock(&uevent_sock_mutex);
 	/* we will send an event, so request a new sequence number */
-	retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
+	retval = add_uevent_var(env, "SEQNUM=%llu",
+				(unsigned long long)++ve_uevent_seqnum);
 
 	if (retval) {
 		mutex_unlock(&uevent_sock_mutex);
@@ -212,9 +215,10 @@ int kobject_uevent_env(struct kobject *k
 
 #if defined(CONFIG_NET)
 	/* send netlink message */
-	if (uevent_sock) {
+	if (get_exec_env()->ve_netns && get_exec_env()->ve_netns->uevent_sock) {
 		struct sk_buff *skb;
 		size_t len;
+		struct sock *uevent_sock = get_exec_env()->ve_netns->uevent_sock;
 
 		/* allocate message with the maximum possible size */
 		len = strlen(action_string) + strlen(devpath) + 2;
@@ -271,6 +275,21 @@ exit:
 }
 EXPORT_SYMBOL_GPL(kobject_uevent_env);
 
+extern int ve_kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
+			char *envp_ext[]);
+int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
+		       char *envp_ext[])
+{
+	int err, ret = 0;
+
+	ret = kobject_uevent_env_one(kobj, action, envp_ext);
+
+	err = ve_kobject_uevent_env(kobj, action, envp_ext);
+	if (ret && err < 0)
+		ret = err;
+	return ret;
+}
+
 /**
  * kobject_uevent - notify userspace by sending an uevent
  *
@@ -322,15 +341,45 @@ int add_uevent_var(struct kobj_uevent_en
 EXPORT_SYMBOL_GPL(add_uevent_var);
 
 #if defined(CONFIG_NET)
-static int __init kobject_uevent_init(void)
+
+static int __net_init kobject_uevent_net_init(struct net *net)
 {
-	uevent_sock = netlink_kernel_create(&init_net, NETLINK_KOBJECT_UEVENT,
+	struct sock *sk;
+
+	sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT,
 					    1, NULL, NULL, THIS_MODULE);
-	if (!uevent_sock) {
+	if (!sk) {
 		printk(KERN_ERR
 		       "kobject_uevent: unable to create netlink socket!\n");
 		return -ENODEV;
 	}
+
+	net->uevent_sock = sk;
+
+	return 0;
+}
+
+static void __net_exit kobject_uevent_net_exit(struct net *net)
+{
+	netlink_kernel_release(net->uevent_sock);
+	net->uevent_sock = NULL;
+}
+
+
+static struct pernet_operations kobject_uevent_net_ops = {
+	.init = kobject_uevent_net_init,
+	.exit = kobject_uevent_net_exit,
+};
+
+
+static int __init kobject_uevent_init(void)
+{
+	int res;
+
+	res = register_pernet_subsys(&kobject_uevent_net_ops);
+	if (res < 0)
+		return res;
+
 	netlink_set_nonroot(NETLINK_KOBJECT_UEVENT, NL_NONROOT_RECV);
 	return 0;
 }
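
kobject_uevent_env() is now a wrapper: the event is first delivered in the calling context by kobject_uevent_env_one(), then broadcast to containers by ve_kobject_uevent_env(), and the two return codes are folded so that local success always wins and a broadcast error surfaces only when the local delivery itself failed. A self-contained sketch of exactly that folding rule (combine() is a hypothetical name):

#include <stdio.h>

static int combine(int local_ret, int ve_err)
{
	int ret = local_ret;

	if (ret && ve_err < 0)	/* same test as the wrapper above */
		ret = ve_err;
	return ret;
}

int main(void)
{
	printf("%d\n", combine(0, -12));	/* 0: local success wins */
	printf("%d\n", combine(-5, -12));	/* -12: broadcast error replaces it */
	printf("%d\n", combine(-5, 0));		/* -5: local error kept */
	return 0;
}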
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/libcrc32c.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/libcrc32c.c
--- linux-2.6.32-504.3.3.el6.orig/lib/libcrc32c.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/libcrc32c.c	2015-01-21 12:02:53.063965151 +0300
@@ -36,10 +36,11 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/crc32c.h>
 
 static struct crypto_shash *tfm;
 
-u32 crc32c(u32 crc, const void *address, unsigned int length)
+static u32 __crc32c(u32 crc, const void *address, unsigned int length)
 {
 	struct {
 		struct shash_desc shash;
@@ -57,6 +58,13 @@ u32 crc32c(u32 crc, const void *address,
 	return *(u32 *)desc.ctx;
 }
 
+u32 crc32c(u32 crc, const void *address, unsigned int length)
+{
+	if (unlikely(!tfm))
+		return crc32c_generic(crc, address, length);
+	return __crc32c(crc, address, length);
+}
+
 EXPORT_SYMBOL(crc32c);
 
 static int __init libcrc32c_mod_init(void)
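
The crc32c() guard covers the window before libcrc32c_mod_init() has allocated the crypto transform (and the case where allocation failed): callers fall back to a software CRC instead of dereferencing a NULL tfm. A runnable sketch of the same guard-and-fallback shape; the backends here are hypothetical, and the bitwise routine uses the reflected CRC-32C polynomial 0x82F63B78 with an inversion convention chosen for illustration only:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t (*crc_fn)(uint32_t, const void *, size_t);

static uint32_t slow_crc(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;
	int k;

	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (k = 0; k < 8; k++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78u : 0u);
	}
	return ~crc;
}

static crc_fn fast_crc;	/* NULL until an accelerated engine is set up */

static uint32_t crc32c_like(uint32_t crc, const void *buf, size_t len)
{
	if (!fast_crc)			/* engine not ready: fall back */
		return slow_crc(crc, buf, len);
	return fast_crc(crc, buf, len);
}

int main(void)
{
	static const char msg[] = "123456789";

	/* prints e3069283, the well-known CRC-32C check value */
	printf("%08x\n", crc32c_like(0, msg, sizeof(msg) - 1));
	return 0;
}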
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/nlattr.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/nlattr.c
--- linux-2.6.32-504.3.3.el6.orig/lib/nlattr.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/nlattr.c	2015-01-21 12:02:43.838210071 +0300
@@ -196,7 +196,7 @@ int nla_parse(struct nlattr *tb[], int m
 	}
 
 	if (unlikely(rem > 0))
-		printk(KERN_WARNING "netlink: %d bytes leftover after parsing "
+		ve_printk(VE_LOG, KERN_WARNING "netlink: %d bytes leftover after parsing "
 		       "attributes.\n", rem);
 
 	err = 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/radix-tree.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/radix-tree.c
--- linux-2.6.32-504.3.3.el6.orig/lib/radix-tree.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/radix-tree.c	2015-01-21 12:02:43.139228628 +0300
@@ -47,6 +47,8 @@
 #define RADIX_TREE_TAG_LONGS	\
 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
+#define RADIX_ROOT_TAG_MASK	(((1<<RADIX_TREE_MAX_TAGS)-1) << __GFP_BITS_SHIFT)
+
 struct radix_tree_node {
 	unsigned int	height;		/* Height from the bottom */
 	unsigned int	count;
@@ -127,9 +129,15 @@ static inline void root_tag_clear(struct
 	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
 }
 
+static inline void root_tag_move_all_to_prev(struct radix_tree_root *root)
+{
+	root->gfp_mask = (root->gfp_mask & __GFP_BITS_MASK) |
+		(root->gfp_mask & RADIX_ROOT_TAG_MASK) << RADIX_TREE_MAX_TAGS;
+}
+
 static inline void root_tag_clear_all(struct radix_tree_root *root)
 {
-	root->gfp_mask &= __GFP_BITS_MASK;
+	root->gfp_mask &= (__force gfp_t)~RADIX_ROOT_TAG_MASK;
 }
 
 static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
@@ -137,6 +145,26 @@ static inline int root_tag_get(struct ra
 	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
 }
 
+static inline void prev_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask |= (1 << (tag + RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT));
+}
+
+static inline void prev_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask &= ~(1 << (tag + RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT));
+}
+
+static inline void prev_tag_clear_all(struct radix_tree_root *root)
+{
+	root->gfp_mask &= __GFP_BITS_MASK | RADIX_ROOT_TAG_MASK;
+}
+
+static inline int prev_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	return root->gfp_mask & (1 << (tag + RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT));
+}
+
 /*
  * Returns 1 if any slot in the node has this tag set.
  * Otherwise returns 0.
@@ -498,6 +526,8 @@ void *radix_tree_tag_set(struct radix_tr
 {
 	unsigned int height, shift;
 	struct radix_tree_node *slot;
+	int prev = 0; /* suppress warning */
+	int right_prev = radix_tree_tag_get(root, index, tag);
 
 	height = root->height;
 	BUG_ON(index > radix_tree_maxindex(height));
@@ -505,11 +535,15 @@ void *radix_tree_tag_set(struct radix_tr
 	slot = indirect_to_ptr(root->rnode);
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 
+	if (!height)
+		prev = root_tag_get(root, tag);
+
 	while (height > 0) {
 		int offset;
 
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		if (!tag_get(slot, tag, offset))
+		prev = tag_get(slot, tag, offset);
+		if (!prev)
 			tag_set(slot, tag, offset);
 		slot = slot->slots[offset];
 		BUG_ON(slot == NULL);
@@ -517,6 +551,13 @@ void *radix_tree_tag_set(struct radix_tr
 		height--;
 	}
 
+	if (prev)
+		prev_tag_set(root, tag);
+	else
+		prev_tag_clear(root, tag);
+
+	BUG_ON(!prev != !right_prev);
+
 	/* set the root's tag bit */
 	if (slot && !root_tag_get(root, tag))
 		root_tag_set(root, tag);
@@ -549,6 +590,8 @@ void *radix_tree_tag_clear(struct radix_
 	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
 	struct radix_tree_node *slot = NULL;
 	unsigned int height, shift;
+	int prev = 0; /* suppress warning */
+	int right_prev = radix_tree_tag_get(root, index, tag);
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
@@ -558,6 +601,13 @@ void *radix_tree_tag_clear(struct radix_
 	pathp->node = NULL;
 	slot = indirect_to_ptr(root->rnode);
 
+	if (!height) {
+		prev = root_tag_get(root, tag);
+		if (prev)
+			root_tag_clear(root, tag);
+		goto out;
+	}
+
 	while (height > 0) {
 		int offset;
 
@@ -577,7 +627,8 @@ void *radix_tree_tag_clear(struct radix_
 		goto out;
 
 	while (pathp->node) {
-		if (!tag_get(pathp->node, tag, pathp->offset))
+		prev = tag_get(pathp->node, tag, pathp->offset);
+		if (!prev)
 			goto out;
 		tag_clear(pathp->node, tag, pathp->offset);
 		if (any_tag_set(pathp->node, tag))
@@ -590,6 +641,13 @@ void *radix_tree_tag_clear(struct radix_
 		root_tag_clear(root, tag);
 
 out:
+	if (prev)
+		prev_tag_set(root, tag);
+	else
+		prev_tag_clear(root, tag);
+
+	BUG_ON(!prev != !right_prev);
+
 	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
@@ -1417,14 +1475,18 @@ void *radix_tree_delete(struct radix_tre
 	unsigned int height, shift;
 	int tag;
 	int offset;
+	int right_prev[RADIX_TREE_MAX_TAGS] = {0,};
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
-		goto out;
+		goto out_none;
+
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		right_prev[tag] = radix_tree_tag_get(root, index, tag);
 
 	slot = root->rnode;
 	if (height == 0) {
-		root_tag_clear_all(root);
+		root_tag_move_all_to_prev(root);
 		root->rnode = NULL;
 		goto out;
 	}
@@ -1435,7 +1497,7 @@ void *radix_tree_delete(struct radix_tre
 
 	do {
 		if (slot == NULL)
-			goto out;
+			goto out_none;
 
 		pathp++;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
@@ -1447,7 +1509,7 @@ void *radix_tree_delete(struct radix_tre
 	} while (height > 0);
 
 	if (slot == NULL)
-		goto out;
+		goto out_none;
 
 	/*
 	 * Clear all tags associated with the just-deleted item
@@ -1455,6 +1517,8 @@ void *radix_tree_delete(struct radix_tre
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 		if (tag_get(pathp->node, tag, pathp->offset))
 			radix_tree_tag_clear(root, index, tag);
+		else
+			prev_tag_clear(root, tag);
 	}
 
 	to_free = NULL;
@@ -1487,7 +1551,12 @@ void *radix_tree_delete(struct radix_tre
 		radix_tree_node_free(to_free);
 
 out:
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		BUG_ON(!right_prev[tag] != !prev_tag_get(root, tag));
 	return slot;
+out_none:
+	prev_tag_clear_all(root);
+	goto out;
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
@@ -1502,6 +1571,19 @@ int radix_tree_tagged(struct radix_tree_
 }
 EXPORT_SYMBOL(radix_tree_tagged);
 
+/**
+ *	radix_tree_prev_tag_get - get the previous tag state of the last changed item
+ *			valid right after radix_tree_tag_set/clear for the
+ *			changed tag, and after radix_tree_delete for all tags
+ *	@root:		radix tree root
+ *	@tag:		tag to test
+ */
+int radix_tree_prev_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	return prev_tag_get(root, tag);
+}
+EXPORT_SYMBOL(radix_tree_prev_tag_get);
+
 static void
 radix_tree_node_ctor(void *node)
 {
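
The prev_tag machinery packs a second bank of tag bits into the root's gfp_mask word: the low __GFP_BITS_SHIFT bits stay gfp flags, the next RADIX_TREE_MAX_TAGS bits are the live root tags, and the bank above that remembers each tag's state before the last set/clear/delete. That is what radix_tree_prev_tag_get() reads, and what later lets __remove_from_page_cache() settle dirty/writeback accounting after the slot is already gone. A self-contained model of the two-bank packing; SHIFT and NTAGS are stand-ins for __GFP_BITS_SHIFT and RADIX_TREE_MAX_TAGS:

#include <assert.h>

#define SHIFT		22			/* __GFP_BITS_SHIFT stand-in */
#define NTAGS		3			/* RADIX_TREE_MAX_TAGS stand-in */
#define FLAGS_MASK	((1u << SHIFT) - 1)
#define CUR(tag)	(1u << ((tag) + SHIFT))
#define PREV(tag)	(1u << ((tag) + NTAGS + SHIFT))
#define CUR_MASK	(((1u << NTAGS) - 1) << SHIFT)

static unsigned int word;

/* Same idea as root_tag_move_all_to_prev(): keep the flag bits and
 * shift the whole current bank up into the previous bank. */
static void move_all_cur_to_prev(void)
{
	word = (word & FLAGS_MASK) | ((word & CUR_MASK) << NTAGS);
}

int main(void)
{
	word = 0x5 | CUR(1);		/* flags plus current tag 1 */
	move_all_cur_to_prev();
	assert(!(word & CUR(1)));	/* current bank cleared */
	assert(word & PREV(1));		/* remembered in previous bank */
	assert((word & FLAGS_MASK) == 0x5);
	return 0;
}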
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/sha1.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/sha1.c
--- linux-2.6.32-504.3.3.el6.orig/lib/sha1.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/sha1.c	2015-01-21 12:02:52.174988749 +0300
@@ -1,31 +1,73 @@
 /*
- * SHA transform algorithm, originally taken from code written by
- * Peter Gutmann, and placed in the public domain.
+ * SHA1 routine optimized to do word accesses rather than byte accesses,
+ * and to avoid unnecessary copies into the context array.
+ *
+ * This was based on the git SHA1 implementation.
  */
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/bitops.h>
 #include <linux/cryptohash.h>
+#include <asm/unaligned.h>
+
+/*
+ * If you have 32 registers or more, the compiler can (and should)
+ * try to change the array[] accesses into registers. However, on
+ * machines with less than ~25 registers, that won't really work,
+ * and at least gcc will make an unholy mess of it.
+ *
+ * So to avoid that mess which just slows things down, we force
+ * the stores to memory to actually happen (we might be better off
+ * with a 'W(t)=(val);asm("":"+m" (W(t)))' there instead, as
+ * suggested by Artur Skawina - that will also make gcc unable to
+ * try to do the silly "optimize away loads" part because it won't
+ * see what the value will be).
+ *
+ * Ben Herrenschmidt reports that on PPC, the C version comes close
+ * to the optimized asm with this (ie on PPC you don't want that
+ * 'volatile', since there are lots of registers).
+ *
+ * On ARM we get the best code generation by forcing a full memory barrier
+ * between each SHA_ROUND, otherwise gcc happily gets wild with spilling and
+ * the stack frame size simply explodes and performance goes down the drain.
+ */
 
-/* The SHA f()-functions.  */
+#ifdef CONFIG_X86
+  #define setW(x, val) (*(volatile __u32 *)&W(x) = (val))
+#elif defined(CONFIG_ARM)
+  #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0)
+#else
+  #define setW(x, val) (W(x) = (val))
+#endif
 
-#define f1(x,y,z)   (z ^ (x & (y ^ z)))		/* x ? y : z */
-#define f2(x,y,z)   (x ^ y ^ z)			/* XOR */
-#define f3(x,y,z)   ((x & y) + (z & (x ^ y)))	/* majority */
-
-/* The SHA Mysterious Constants */
-
-#define K1  0x5A827999L			/* Rounds  0-19: sqrt(2) * 2^30 */
-#define K2  0x6ED9EBA1L			/* Rounds 20-39: sqrt(3) * 2^30 */
-#define K3  0x8F1BBCDCL			/* Rounds 40-59: sqrt(5) * 2^30 */
-#define K4  0xCA62C1D6L			/* Rounds 60-79: sqrt(10) * 2^30 */
+/* This "rolls" over the 512-bit array */
+#define W(x) (array[(x)&15])
+
+/*
+ * Where do we get the source from? The first 16 iterations get it from
+ * the input data, the next mix it from the 512-bit array.
+ */
+#define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t)
+#define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1)
+
+#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
+	__u32 TEMP = input(t); setW(t, TEMP); \
+	E += TEMP + rol32(A,5) + (fn) + (constant); \
+	B = ror32(B, 2); } while (0)
+
+#define T_0_15(t, A, B, C, D, E)  SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
+#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
+#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E )
+#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E )
+#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) ,  0xca62c1d6, A, B, C, D, E )
 
 /**
  * sha_transform - single block SHA1 transform
  *
  * @digest: 160 bit digest to update
  * @data:   512 bits of data to hash
- * @W:      80 words of workspace (see note)
+ * @array:  16 words of workspace (see note)
  *
  * This function generates a SHA1 digest for a single 512-bit block.
  * Be warned, it does not handle padding and message digest, do not
@@ -36,47 +78,111 @@
  * to clear the workspace. This is left to the caller to avoid
  * unnecessary clears between chained hashing operations.
  */
-void sha_transform(__u32 *digest, const char *in, __u32 *W)
+void sha_transform(__u32 *digest, const char *data, __u32 *array)
 {
-	__u32 a, b, c, d, e, t, i;
-
-	for (i = 0; i < 16; i++)
-		W[i] = be32_to_cpu(((const __be32 *)in)[i]);
-
-	for (i = 0; i < 64; i++)
-		W[i+16] = rol32(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 1);
+	__u32 A, B, C, D, E;
 
-	a = digest[0];
-	b = digest[1];
-	c = digest[2];
-	d = digest[3];
-	e = digest[4];
-
-	for (i = 0; i < 20; i++) {
-		t = f1(b, c, d) + K1 + rol32(a, 5) + e + W[i];
-		e = d; d = c; c = rol32(b, 30); b = a; a = t;
-	}
-
-	for (; i < 40; i ++) {
-		t = f2(b, c, d) + K2 + rol32(a, 5) + e + W[i];
-		e = d; d = c; c = rol32(b, 30); b = a; a = t;
-	}
-
-	for (; i < 60; i ++) {
-		t = f3(b, c, d) + K3 + rol32(a, 5) + e + W[i];
-		e = d; d = c; c = rol32(b, 30); b = a; a = t;
-	}
-
-	for (; i < 80; i ++) {
-		t = f2(b, c, d) + K4 + rol32(a, 5) + e + W[i];
-		e = d; d = c; c = rol32(b, 30); b = a; a = t;
-	}
-
-	digest[0] += a;
-	digest[1] += b;
-	digest[2] += c;
-	digest[3] += d;
-	digest[4] += e;
+	A = digest[0];
+	B = digest[1];
+	C = digest[2];
+	D = digest[3];
+	E = digest[4];
+
+	/* Round 1 - iterations 0-16 take their input from 'data' */
+	T_0_15( 0, A, B, C, D, E);
+	T_0_15( 1, E, A, B, C, D);
+	T_0_15( 2, D, E, A, B, C);
+	T_0_15( 3, C, D, E, A, B);
+	T_0_15( 4, B, C, D, E, A);
+	T_0_15( 5, A, B, C, D, E);
+	T_0_15( 6, E, A, B, C, D);
+	T_0_15( 7, D, E, A, B, C);
+	T_0_15( 8, C, D, E, A, B);
+	T_0_15( 9, B, C, D, E, A);
+	T_0_15(10, A, B, C, D, E);
+	T_0_15(11, E, A, B, C, D);
+	T_0_15(12, D, E, A, B, C);
+	T_0_15(13, C, D, E, A, B);
+	T_0_15(14, B, C, D, E, A);
+	T_0_15(15, A, B, C, D, E);
+
+	/* Round 1 - tail. Input from 512-bit mixing array */
+	T_16_19(16, E, A, B, C, D);
+	T_16_19(17, D, E, A, B, C);
+	T_16_19(18, C, D, E, A, B);
+	T_16_19(19, B, C, D, E, A);
+
+	/* Round 2 */
+	T_20_39(20, A, B, C, D, E);
+	T_20_39(21, E, A, B, C, D);
+	T_20_39(22, D, E, A, B, C);
+	T_20_39(23, C, D, E, A, B);
+	T_20_39(24, B, C, D, E, A);
+	T_20_39(25, A, B, C, D, E);
+	T_20_39(26, E, A, B, C, D);
+	T_20_39(27, D, E, A, B, C);
+	T_20_39(28, C, D, E, A, B);
+	T_20_39(29, B, C, D, E, A);
+	T_20_39(30, A, B, C, D, E);
+	T_20_39(31, E, A, B, C, D);
+	T_20_39(32, D, E, A, B, C);
+	T_20_39(33, C, D, E, A, B);
+	T_20_39(34, B, C, D, E, A);
+	T_20_39(35, A, B, C, D, E);
+	T_20_39(36, E, A, B, C, D);
+	T_20_39(37, D, E, A, B, C);
+	T_20_39(38, C, D, E, A, B);
+	T_20_39(39, B, C, D, E, A);
+
+	/* Round 3 */
+	T_40_59(40, A, B, C, D, E);
+	T_40_59(41, E, A, B, C, D);
+	T_40_59(42, D, E, A, B, C);
+	T_40_59(43, C, D, E, A, B);
+	T_40_59(44, B, C, D, E, A);
+	T_40_59(45, A, B, C, D, E);
+	T_40_59(46, E, A, B, C, D);
+	T_40_59(47, D, E, A, B, C);
+	T_40_59(48, C, D, E, A, B);
+	T_40_59(49, B, C, D, E, A);
+	T_40_59(50, A, B, C, D, E);
+	T_40_59(51, E, A, B, C, D);
+	T_40_59(52, D, E, A, B, C);
+	T_40_59(53, C, D, E, A, B);
+	T_40_59(54, B, C, D, E, A);
+	T_40_59(55, A, B, C, D, E);
+	T_40_59(56, E, A, B, C, D);
+	T_40_59(57, D, E, A, B, C);
+	T_40_59(58, C, D, E, A, B);
+	T_40_59(59, B, C, D, E, A);
+
+	/* Round 4 */
+	T_60_79(60, A, B, C, D, E);
+	T_60_79(61, E, A, B, C, D);
+	T_60_79(62, D, E, A, B, C);
+	T_60_79(63, C, D, E, A, B);
+	T_60_79(64, B, C, D, E, A);
+	T_60_79(65, A, B, C, D, E);
+	T_60_79(66, E, A, B, C, D);
+	T_60_79(67, D, E, A, B, C);
+	T_60_79(68, C, D, E, A, B);
+	T_60_79(69, B, C, D, E, A);
+	T_60_79(70, A, B, C, D, E);
+	T_60_79(71, E, A, B, C, D);
+	T_60_79(72, D, E, A, B, C);
+	T_60_79(73, C, D, E, A, B);
+	T_60_79(74, B, C, D, E, A);
+	T_60_79(75, A, B, C, D, E);
+	T_60_79(76, E, A, B, C, D);
+	T_60_79(77, D, E, A, B, C);
+	T_60_79(78, C, D, E, A, B);
+	T_60_79(79, B, C, D, E, A);
+
+	digest[0] += A;
+	digest[1] += B;
+	digest[2] += C;
+	digest[3] += D;
+	digest[4] += E;
 }
 EXPORT_SYMBOL(sha_transform);
 
@@ -92,4 +198,18 @@ void sha_init(__u32 *buf)
 	buf[3] = 0x10325476;
 	buf[4] = 0xc3d2e1f0;
 }
+EXPORT_SYMBOL(sha_init);
+
+static void sha_batch_generic(__u32 *digest, const char *data, unsigned rounds)
+{
+	__u32 temp[SHA_WORKSPACE_WORDS];
+
+	while (rounds--) {
+		sha_transform(digest, data, temp);
+		data += SHA_MESSAGE_BYTES;
+	}
+}
 
+void __read_mostly (*sha_batch_transform)(__u32 *digest,
+		const char *data, unsigned rounds) = sha_batch_generic;
+EXPORT_SYMBOL(sha_batch_transform);
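
The rewritten transform never materializes the classic 80-word schedule: W(x) masks its index with & 15, so the 16-slot ring always holds the last 16 schedule words, and SHA_MIX(t) reads offsets t, t+2, t+8 and t+13, all still live, before setW() overwrites the oldest slot. A tiny runnable illustration of the ring indexing (not of the hash itself):

#include <assert.h>
#include <stdint.h>

#define W(x) (ring[(x) & 15])

int main(void)
{
	uint32_t ring[16] = {0};
	int t;

	for (t = 0; t < 16; t++)
		W(t) = (uint32_t)t + 1;
	/* step 16 reads W(16), W(18), W(24), W(29) = slots 0, 2, 8, 13,
	 * then stores the new word back into slot 0. */
	assert(&W(16) == &ring[0]);
	assert(&W(18) == &ring[2]);
	assert(&W(29) == &ring[13]);
	return 0;
}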
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/show_mem.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/show_mem.c
--- linux-2.6.32-504.3.3.el6.orig/lib/show_mem.c	2014-12-12 23:29:07.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/show_mem.c	2015-01-21 12:02:57.855837951 +0300
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/nmi.h>
 #include <linux/quicklist.h>
+#include <linux/module.h>
 
 void show_mem(unsigned int filter)
 {
@@ -61,3 +62,4 @@ void show_mem(unsigned int filter)
 		quicklist_total_size());
 #endif
 }
+EXPORT_SYMBOL(show_mem);
diff -upr linux-2.6.32-504.3.3.el6.orig/lib/textsearch.c linux-2.6.32-504.3.3.el6-042stab103_6/lib/textsearch.c
--- linux-2.6.32-504.3.3.el6.orig/lib/textsearch.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/lib/textsearch.c	2015-01-21 12:02:45.518165469 +0300
@@ -274,7 +274,7 @@ struct ts_config *textsearch_prepare(con
 	 * especially when the module is located on a NFS mount.
 	 */
 	if (ops == NULL && flags & TS_AUTOLOAD) {
-		request_module("ts_%s", algo);
+		ve0_request_module("ts_%s", algo);
 		ops = lookup_ts_algo(algo);
 	}
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/mm/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/mm/Kconfig	2014-12-12 23:29:27.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/Kconfig	2015-01-21 12:02:58.931809393 +0300
@@ -323,3 +323,30 @@ config TRANSPARENT_HUGEPAGE
 	  up the pagetable walking.
 
 	  If memory constrained on embedded, you may want to say N.
+
+config MEMORY_GANGS
+	bool
+
+config MEMORY_GANGS_MIGRATION
+	bool
+	depends on MEMORY_GANGS && MIGRATION
+	default MEMORY_GANGS
+
+config MEMORY_VSWAP
+	bool
+
+config PRAM
+	bool "Persistent over-kexec memory storage"
+	depends on X86_64
+	select CRYPTO_CRC32C
+	select CRYPTO_CRC32C_INTEL
+	select CRC32
+	select LIBCRC32C
+
+config PSWAP
+	bool "Persistent swap"
+	depends on SWAP && PRAM
+
+config KSTALED
+	bool "Idle page tracking"
+	depends on 64BIT
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/mm/Makefile
--- linux-2.6.32-504.3.3.el6.orig/mm/Makefile	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/Makefile	2015-01-21 12:02:58.931809393 +0300
@@ -12,8 +12,9 @@ obj-y			:= bootmem.o filemap.o mempool.o
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o mmu_context.o \
-			   $(mmu-y)
+			   iov-iter.o $(mmu-y)
 obj-y += init-mm.o
+obj-y += oom_group.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
@@ -51,3 +52,5 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoiso
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
+obj-$(CONFIG_PRAM) += pram.o
+obj-$(CONFIG_KSTALED) += kstaled.o
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/allocpercpu.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/allocpercpu.c
--- linux-2.6.32-504.3.3.el6.orig/mm/allocpercpu.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/allocpercpu.c	2015-01-21 12:02:58.029833335 +0300
@@ -132,7 +132,7 @@ void *__alloc_percpu(size_t size, size_t
 	kfree(pdata);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(__alloc_percpu);
+EXPORT_SYMBOL(__alloc_percpu);
 
 /**
  * free_percpu - final cleanup of per-cpu data
@@ -148,7 +148,7 @@ void free_percpu(void *__pdata)
 	__percpu_depopulate_mask(__pdata, cpu_possible_mask);
 	kfree(__percpu_disguise(__pdata));
 }
-EXPORT_SYMBOL_GPL(free_percpu);
+EXPORT_SYMBOL(free_percpu);
 
 /*
  * Generic percpu area setup.
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/backing-dev.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/backing-dev.c
--- linux-2.6.32-504.3.3.el6.orig/mm/backing-dev.c	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/backing-dev.c	2015-01-21 12:02:58.677816134 +0300
@@ -26,6 +26,12 @@ struct backing_dev_info default_backing_
 };
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
+struct backing_dev_info noop_backing_dev_info = {
+	.name		= "noop",
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
+
 static struct class *bdi_class;
 
 /*
@@ -177,42 +183,52 @@ static ssize_t name##_show(struct device
 
 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
 
-static ssize_t min_ratio_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t count)
+static inline ssize_t generic_uint_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count,
+		int (*set_func) (struct backing_dev_info *, unsigned int))
 {
 	struct backing_dev_info *bdi = dev_get_drvdata(dev);
 	char *end;
-	unsigned int ratio;
+	unsigned int val;
 	ssize_t ret = -EINVAL;
 
-	ratio = simple_strtoul(buf, &end, 10);
+	val = simple_strtoul(buf, &end, 10);
 	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
-		ret = bdi_set_min_ratio(bdi, ratio);
+		ret = set_func(bdi, val);
 		if (!ret)
 			ret = count;
 	}
 	return ret;
 }
+
+static ssize_t min_ratio_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_min_ratio);
+}
 BDI_SHOW(min_ratio, bdi->min_ratio)
 
 static ssize_t max_ratio_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
-	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	char *end;
-	unsigned int ratio;
-	ssize_t ret = -EINVAL;
-
-	ratio = simple_strtoul(buf, &end, 10);
-	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
-		ret = bdi_set_max_ratio(bdi, ratio);
-		if (!ret)
-			ret = count;
-	}
-	return ret;
+	return generic_uint_store(dev, attr, buf, count, bdi_set_max_ratio);
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)
 
+static ssize_t min_dirty_pages_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_min_dirty);
+}
+BDI_SHOW(min_dirty_pages, bdi->min_dirty_pages)
+
+static ssize_t max_dirty_pages_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_max_dirty);
+}
+BDI_SHOW(max_dirty_pages, bdi->max_dirty_pages)
+
 static ssize_t stable_pages_required_show(struct device *dev,
 					  struct device_attribute *attr,
 					  char *page)
@@ -227,6 +243,8 @@ static struct device_attribute bdi_dev_a
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+	__ATTR_RW(min_dirty_pages),
+	__ATTR_RW(max_dirty_pages),
 	__ATTR_RO(stable_pages_required),
 	__ATTR_NULL,
 };
@@ -254,6 +272,7 @@ static int __init default_bdi_init(void)
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
 		bdi_register(&default_backing_dev_info, NULL, "default");
+	err = bdi_init(&noop_backing_dev_info);
 
 	return err;
 }
@@ -288,6 +307,8 @@ static void bdi_task_init(struct backing
 	set_user_nice(tsk, 0);
 }
 
+extern struct io_context *current_io_context(gfp_t gfp_flags, int node);
+
 static int bdi_start_fn(void *ptr)
 {
 	struct bdi_writeback *wb = ptr;
@@ -303,6 +324,8 @@ static int bdi_start_fn(void *ptr)
 
 	bdi_task_init(bdi, wb);
 
+	(void)current_io_context(GFP_KERNEL, -1);
+
 	/*
 	 * Clear pending bit and wakeup anybody waiting to tear us down
 	 */
@@ -394,8 +417,8 @@ static int bdi_forker_task(void *ptr)
 	bdi_task_init(me->bdi, me);
 
 	for (;;) {
+		struct task_struct *task;
 		struct backing_dev_info *bdi, *tmp;
-		struct bdi_writeback *wb;
 
 		/*
 		 * Temporary measure, we want to make sure we don't see
@@ -446,28 +469,28 @@ static int bdi_forker_task(void *ptr)
 		list_del_init(&bdi->bdi_list);
 		spin_unlock_bh(&bdi_lock);
 
-		wb = &bdi->wb;
-		wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
-					dev_name(bdi->dev));
-		/*
-		 * If task creation fails, then readd the bdi to
-		 * the pending list and force writeout of the bdi
-		 * from this forker thread. That will free some memory
-		 * and we can try again.
-		 */
-		if (IS_ERR(wb->task)) {
-			wb->task = NULL;
-
+		task = kthread_create(bdi_start_fn, &bdi->wb, "flush-%s",
+				dev_name(bdi->dev));
+		if (IS_ERR(task)) {
 			/*
-			 * Add this 'bdi' to the back, so we get
-			 * a chance to flush other bdi's to free
-			 * memory.
+			 * If thread creation fails, then readd the bdi back to
+			 * the list and force writeout of the bdi from this
+			 * forker thread. That will free some memory and we can
+			 * try again. Add it to the tail so we get a chance to
+			 * flush other bdi's to free memory.
 			 */
 			spin_lock_bh(&bdi_lock);
 			list_add_tail(&bdi->bdi_list, &bdi_pending_list);
 			spin_unlock_bh(&bdi_lock);
 
 			bdi_flush_io(bdi);
+		} else {
+			bdi->wb.task = task;
+			/*
+			 * And as soon as the bdi thread is visible,
+			 * we can start it.
+			 */
+			wake_up_process(task);
 		}
 	}
 
@@ -543,13 +566,16 @@ int bdi_register(struct backing_dev_info
 	va_list args;
 	int ret = 0;
 	struct device *dev;
+	struct ve_struct *ve;
 
 	if (bdi->dev)	/* The driver needs to use separate queues per device */
 		goto exit;
 
+	ve = set_exec_env(&ve0);
 	va_start(args, fmt);
 	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
 	va_end(args);
+	set_exec_env(ve);
 	if (IS_ERR(dev)) {
 		ret = PTR_ERR(dev);
 		goto exit;
@@ -637,13 +663,15 @@ static void bdi_prune_sb(struct backing_
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (sb->s_bdi == bdi)
-			sb->s_bdi = NULL;
+			sb->s_bdi = &default_backing_dev_info;
 	}
 	spin_unlock(&sb_lock);
 }
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
+	struct ve_struct *ve;
+
 	if (bdi->dev) {
 		trace_writeback_bdi_unregister(bdi);
 		bdi_prune_sb(bdi);
@@ -651,7 +679,9 @@ void bdi_unregister(struct backing_dev_i
 		if (!bdi_cap_flush_forker(bdi))
 			bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
+		ve = set_exec_env(&ve0);
 		device_unregister(bdi->dev);
+		set_exec_env(ve);
 		bdi->dev = NULL;
 	}
 }
@@ -666,11 +696,14 @@ int bdi_init(struct backing_dev_info *bd
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
+	bdi->min_dirty_pages = 0;
+	bdi->max_dirty_pages = 0;
 	spin_lock_init(&bdi->wb_lock);
 	INIT_RCU_HEAD(&bdi->rcu_head);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->wb_list);
 	INIT_LIST_HEAD(&bdi->work_list);
+	init_waitqueue_head(&bdi->cong_waitq);
 
 	bdi_wb_init(&bdi->wb, bdi);
 
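
generic_uint_store() collapses four nearly identical sysfs store handlers into one strict parser plus a setter callback: the input must be a decimal number followed by nothing, or by a single trailing newline. A standalone version of that validation (parse_uint_strict() is a hypothetical name; note strtoul is more permissive about leading whitespace and signs than the kernel's simple_strtoul):

#include <stdio.h>
#include <stdlib.h>

static int parse_uint_strict(const char *buf, unsigned int *val)
{
	char *end;
	unsigned long v = strtoul(buf, &end, 10);

	if (!*buf)
		return -1;		/* empty input */
	if (end[0] != '\0' && !(end[0] == '\n' && end[1] == '\0'))
		return -1;		/* trailing garbage */
	*val = (unsigned int)v;
	return 0;
}

int main(void)
{
	unsigned int v;

	printf("%d\n", parse_uint_strict("42\n", &v));	/* 0, v == 42 */
	printf("%d\n", parse_uint_strict("42x", &v));	/* -1 */
	return 0;
}

With that helper in place, each new attribute (min_dirty_pages, max_dirty_pages) only has to supply its bdi_set_* setter, which is exactly how the hunk wires them up.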
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/bootmem.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/bootmem.c
--- linux-2.6.32-504.3.3.el6.orig/mm/bootmem.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/bootmem.c	2015-01-21 12:02:52.852970749 +0300
@@ -13,6 +13,7 @@
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/kmemleak.h>
+#include <linux/pram.h>
 
 #include <asm/bug.h>
 #include <asm/io.h>
@@ -288,6 +289,8 @@ static int __init __reserve(bootmem_data
 			bdebug("silent double reserve of PFN %lx\n",
 				idx + bdata->node_min_pfn);
 		}
+	pram_ban_region(sidx + bdata->node_min_pfn,
+			eidx + bdata->node_min_pfn - 1);
 	return 0;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/compaction.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/compaction.c
--- linux-2.6.32-504.3.3.el6.orig/mm/compaction.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/compaction.c	2015-01-21 12:02:58.905810082 +0300
@@ -418,6 +418,10 @@ static bool too_many_isolated(struct zon
 	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
 					zone_page_state(zone, NR_ISOLATED_ANON);
 
+	if (isolated > (inactive + active) / 2)
+		isolated = zone_page_state_snapshot(zone, NR_ISOLATED_FILE) +
+			   zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
+
 	return isolated > (inactive + active) / 2;
 }
 
@@ -433,8 +437,8 @@ static unsigned long isolate_migratepage
 	struct list_head *migratelist = &cc->migratepages;
 	isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
 	unsigned long flags;
-	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
+	struct lruvec *lruvec = NULL;
 
 	/* Do not scan outside zone boundaries */
 	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -462,14 +466,15 @@ static unsigned long isolate_migratepage
 
 	/* Time to isolate some pages for migration */
 	cond_resched();
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	locked = true;
+	local_irq_save(flags);
 	for (; low_pfn < end_pfn; low_pfn++) {
 		/* give a chance to irqs before checking need_resched() */
-		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
-			if (should_release_lock(&zone->lru_lock)) {
-				spin_unlock_irqrestore(&zone->lru_lock, flags);
-				locked = false;
+		if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+			if (lruvec && should_release_lock(&lruvec->lru_lock)) {
+				unlock_lruvec(lruvec);
+				local_irq_restore(flags);
+				lruvec = NULL;
+				local_irq_save(flags);
 			}
 		}
 
@@ -536,16 +541,20 @@ static unsigned long isolate_migratepage
 		 * page underneath us may return surprising results.
 		 */
 		if (PageTransHuge(page)) {
-			if (!locked)
+			if (!lruvec || page_lruvec(page) != lruvec)
 				goto next_pageblock;
 			low_pfn += (1 << compound_order(page)) - 1;
 			continue;
 		}
 
 		/* Check if it is ok to still hold the lock */
-		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
-								locked, cc);
-		if (!locked || fatal_signal_pending(current))
+		if (lruvec && !compact_checklock_irqsave(&lruvec->lru_lock,
+					&flags, true, cc)) {
+			lruvec = NULL;
+			local_irq_save(flags);
+			break;
+		}
+		if (fatal_signal_pending(current))
 			break;
 
 		/* Recheck PageLRU and PageTransHuge under lock */
@@ -560,14 +569,14 @@ static unsigned long isolate_migratepage
 			mode |= ISOLATE_ASYNC_MIGRATE;
 
 		/* Try isolate the page */
-		if (__isolate_lru_page(page, mode, 0) != 0)
+		if (__isolate_lru_page(page, mode, 0, &lruvec) != 0)
 			continue;
 
 		VM_BUG_ON(PageTransCompound(page));
 
 		/* Successfully isolated */
 		cc->finished_update_migrate = true;
-		del_page_from_lru_list(zone, page, page_lru(page));
+		del_page_from_lru_list(lruvec, page, page_lru(page));
 		list_add(&page->lru, migratelist);
 		cc->nr_migratepages++;
 
@@ -583,10 +592,10 @@ next_pageblock:
 		last_pageblock_nr = pageblock_nr;
 	}
 
-	acct_isolated(zone, locked, cc);
+	acct_isolated(zone, true, cc);
 
-	if (locked)
-		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	unlock_lruvec(lruvec);
+	local_irq_restore(flags);
 
 	/* Update the pageblock-skip if the whole pageblock was scanned */
 	if (low_pfn == end_pfn)
@@ -724,7 +733,8 @@ unsigned long compaction_suitable(struct
 	 * fragmentation index determines if allocation failures are due to
 	 * low memory or external fragmentation
 	 *
-	 * index of -1 implies allocations might succeed dependingon watermarks
+	 * index of -1000 implies allocations might succeed depending on
+	 * watermarks
 	 * index towards 0 implies failure is due to lack of memory
 	 * index towards 1000 implies failure is due to fragmentation
 	 *
@@ -734,7 +744,8 @@ unsigned long compaction_suitable(struct
 	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
 		return COMPACT_SKIPPED;
 
-	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+	if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+	    0, 0))
 		return COMPACT_PARTIAL;
 
 	return COMPACT_CONTINUE;
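
too_many_isolated() now does a two-tier read: the cheap zone_page_state() sums can lag reality because per-CPU vmstat deltas have not been folded in yet, so only when the cheap read already crosses the threshold does the code pay for zone_page_state_snapshot() before deciding to throttle. The shape of that check, as a self-contained sketch with hypothetical counter values:

#include <stdio.h>

/* cheap may be stale; trust it only while it stays under the limit */
static int over_limit(long cheap, long exact, long limit)
{
	long n = cheap;

	if (n > limit)
		n = exact;	/* zone_page_state_snapshot() analogue */
	return n > limit;
}

int main(void)
{
	printf("%d\n", over_limit(90, 90, 100));	/* 0: clearly under */
	printf("%d\n", over_limit(150, 80, 100));	/* 0: stale overshoot */
	printf("%d\n", over_limit(150, 140, 100));	/* 1: really over */
	return 0;
}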
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/fadvise.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/fadvise.c
--- linux-2.6.32-504.3.3.el6.orig/mm/fadvise.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/fadvise.c	2015-01-21 12:02:42.173254276 +0300
@@ -7,6 +7,7 @@
  *		Initial version.
  */
 
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
@@ -25,10 +26,9 @@
  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
  * deactivate the pages and clear PG_Referenced.
  */
-SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
+int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 {
-	struct file *file = fget(fd);
-	struct address_space *mapping;
+	struct address_space *mapping = file->f_mapping;
 	struct backing_dev_info *bdi;
 	loff_t endbyte;			/* inclusive */
 	pgoff_t start_index;
@@ -36,20 +36,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, lof
 	unsigned long nrpages;
 	int ret = 0;
 
-	if (!file)
-		return -EBADF;
-
-	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
-		ret = -ESPIPE;
-		goto out;
-	}
-
-	mapping = file->f_mapping;
-	if (!mapping || len < 0) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	if (mapping->a_ops->get_xip_mem) {
 		switch (advice) {
 		case POSIX_FADV_NORMAL:
@@ -118,7 +104,8 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, lof
 		break;
 	case POSIX_FADV_DONTNEED:
 		if (!bdi_write_congested(mapping->backing_dev_info))
-			filemap_flush(mapping);
+			__filemap_fdatawrite_range(mapping, offset, endbyte,
+						   WB_SYNC_NONE);
 
 		/* First and last FULL page! */
 		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
@@ -145,6 +132,33 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, lof
 		ret = -EINVAL;
 	}
 out:
+	return ret;
+}
+EXPORT_SYMBOL(generic_fadvise);
+
+SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
+{
+	struct file *file = fget(fd);
+	int (*fadvise)(struct file *, loff_t, loff_t, int) = generic_fadvise;
+	int ret = 0;
+
+	if (!file)
+		return -EBADF;
+
+	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
+		ret = -ESPIPE;
+		goto out;
+	}
+
+	if (!file->f_mapping || len < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (file->f_op && file->f_op->fadvise)
+		fadvise = file->f_op->fadvise;
+
+	ret = fadvise(file, offset, len, advice);
+out:
 	fput(file);
 	return ret;
 }
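
The syscall body now resolves its handler once: generic_fadvise() is the default, and a filesystem that defines f_op->fadvise overrides it for its own files. The default-plus-override dispatch, reduced to a self-contained sketch (struct ops, do_fadvise() and the stub handler are hypothetical):

typedef int (*fadvise_fn)(int fd, long offset, long len, int advice);

static int generic_handler(int fd, long offset, long len, int advice)
{
	(void)fd; (void)offset; (void)len; (void)advice;
	return 0;			/* stands in for generic_fadvise() */
}

struct ops {
	fadvise_fn fadvise;		/* may be NULL, like f_op->fadvise */
};

static int do_fadvise(const struct ops *ops, int fd, long offset,
		      long len, int advice)
{
	fadvise_fn fn = generic_handler;	/* the default */

	if (ops && ops->fadvise)
		fn = ops->fadvise;		/* per-file override */
	return fn(fd, offset, len, advice);
}

int main(void)
{
	struct ops o = { 0 };

	return do_fadvise(&o, 3, 0, 4096, 4);	/* falls back to the default */
}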
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/filemap.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/filemap.c
--- linux-2.6.32-504.3.3.el6.orig/mm/filemap.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/filemap.c	2015-01-21 12:02:58.678816107 +0300
@@ -34,9 +34,12 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/mmgang.h>
 #include <trace/events/kmem.h>
 #include "internal.h"
 
+#include <bc/io_acct.h>
+
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
@@ -121,6 +124,16 @@ void __remove_from_page_cache(struct pag
 	struct address_space *mapping = page->mapping;
 
 	radix_tree_delete(&mapping->page_tree, page->index);
+	if (mapping_cap_account_dirty(mapping) &&
+			radix_tree_prev_tag_get(&mapping->page_tree,
+				PAGECACHE_TAG_DIRTY))
+		ub_io_account_cancel(mapping);
+
+	if (mapping_cap_account_writeback(mapping) &&
+			radix_tree_prev_tag_get(&mapping->page_tree,
+				PAGECACHE_TAG_WRITEBACK))
+		ub_io_writeback_dec(mapping);
+
 	page->mapping = NULL;
 	/* Leave page->index set: truncation lookup relies upon it */
 	mapping->nrpages--;
@@ -430,6 +443,24 @@ int add_to_page_cache_locked(struct page
 {
 	int error;
 
+	error = gang_add_user_page(page, get_mapping_gang(mapping), gfp_mask);
+	if (error)
+		return error;
+
+	error = add_to_page_cache_nogang(page, mapping, offset, gfp_mask);
+	if (error)
+		gang_del_user_page(page);
+
+	return error;
+}
+EXPORT_SYMBOL(add_to_page_cache_locked);
+
+/* add_to_page_cache_nogang - add a locked page to pagecache without gang linking */
+int add_to_page_cache_nogang(struct page *page, struct address_space *mapping,
+		pgoff_t offset, gfp_t gfp_mask)
+{
+	int error;
+
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageSwapBacked(page));
 
@@ -463,7 +494,6 @@ int add_to_page_cache_locked(struct page
 out:
 	return error;
 }
-EXPORT_SYMBOL(add_to_page_cache_locked);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
@@ -492,6 +522,9 @@ struct page *__page_cache_alloc(gfp_t gf
 	int n;
 	struct page *page;
 
+	if (unlikely(ub_check_ram_limits(get_exec_ub(), gfp)))
+		return NULL;
+
 	if (cpuset_do_page_mem_spread()) {
 		get_mems_allowed();
 		n = cpuset_mem_spread_node();
@@ -779,6 +812,7 @@ struct page *find_or_create_page(struct 
 	struct page *page;
 	int err;
 repeat:
+	check_pagecache_limits(mapping, gfp_mask);
 	page = find_lock_page(mapping, index);
 	if (!page) {
 		page = __page_cache_alloc(gfp_mask);
@@ -1109,7 +1143,14 @@ static void do_generic_file_read(struct 
 
 		cond_resched();
 find_page:
+		check_pagecache_limits(mapping, mapping_gfp_mask(mapping));
 		page = find_get_page(mapping, index);
+		if (!page && inode->i_peer_file) {
+			page = pick_peer_page(inode, ra, index,
+					      last_index - index);
+			if (page)
+				goto page_ok;
+		}
 		if (!page) {
 			page_cache_sync_readahead(mapping,
 					ra, filp,
@@ -1202,6 +1243,8 @@ page_ok:
 		goto out;
 
 page_not_up_to_date:
+		virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 		/* Get exclusive access to the page ... */
 		error = lock_page_killable(page);
 		if (unlikely(error))
@@ -1269,6 +1312,8 @@ readpage_error:
 		goto out;
 
 no_cached_page:
+		virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 		/*
 		 * Ok, it wasn't cached, so we need to create a new
 		 * page..
@@ -1376,31 +1421,59 @@ int generic_segment_checks(const struct 
 }
 EXPORT_SYMBOL(generic_segment_checks);
 
+static ssize_t mapping_direct_IO(struct address_space *mapping, int rw,
+			         struct kiocb *iocb, struct iov_iter *iter,
+			         loff_t pos)
+{
+	if (iov_iter_has_iovec(iter))
+		return mapping->a_ops->direct_IO(rw, iocb, iov_iter_iovec(iter),
+						 pos, iter->nr_segs);
+	else if (iov_iter_has_bvec(iter))
+		return mapping->a_ops->direct_IO_bvec(rw, iocb,
+						      iov_iter_bvec(iter), pos,
+						      iter->nr_segs);
+	else if (iov_iter_has_page(iter))
+		return mapping->a_ops->direct_IO_page(rw, iocb,
+						      iov_iter_page(iter), pos);
+	else
+		BUG();
+}
+
+static int file_read_iter_actor(read_descriptor_t *desc, struct page *page,
+				unsigned long offset, unsigned long size)
+{
+	struct iov_iter *iter = desc->arg.data;
+	unsigned long copied = 0;
+
+	if (size > desc->count)
+		size = desc->count;
+
+	copied = iov_iter_copy_to_user(page, iter, offset, size);
+	if (copied < size)
+		desc->error = -EFAULT;
+
+	iov_iter_advance(iter, copied);
+	desc->count -= copied;
+	desc->written += copied;
+
+	return copied;
+}
+
 /**
- * generic_file_aio_read - generic filesystem read routine
+ * generic_file_read_iter - generic filesystem read routine
  * @iocb:	kernel I/O control block
- * @iov:	io vector request
- * @nr_segs:	number of segments in the iovec
+ * @iov_iter:	memory vector
  * @pos:	current file position
- *
- * This is the "read()" routine for all filesystems
- * that can use the page cache directly.
  */
 ssize_t
-generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
 {
 	struct file *filp = iocb->ki_filp;
-	ssize_t retval;
-	unsigned long seg = 0;
-	size_t count;
+	read_descriptor_t desc;
+	ssize_t retval = 0;
+	size_t count = iov_iter_count(iter);
 	loff_t *ppos = &iocb->ki_pos;
 
-	count = 0;
-	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-	if (retval)
-		return retval;
-
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
 		loff_t size;
@@ -1414,10 +1487,10 @@ generic_file_aio_read(struct kiocb *iocb
 		size = i_size_read(inode);
 		if (pos < size) {
 			retval = filemap_write_and_wait_range(mapping, pos,
-					pos + iov_length(iov, nr_segs) - 1);
+					pos + count - 1);
 			if (!retval) {
-				retval = mapping->a_ops->direct_IO(READ, iocb,
-							iov, pos, nr_segs);
+				retval = mapping_direct_IO(mapping, READ,
+							   iocb, iter, pos);
 			}
 			if (retval > 0) {
 				*ppos = pos + retval;
@@ -1439,42 +1512,49 @@ generic_file_aio_read(struct kiocb *iocb
 		}
 	}
 
-	count = retval;
-	for (seg = 0; seg < nr_segs; seg++) {
-		read_descriptor_t desc;
-		loff_t offset = 0;
+	iov_iter_advance(iter, retval);
 
-		/*
-		 * If we did a short DIO read we need to skip the section of the
-		 * iov that we've already read data into.
-		 */
-		if (count) {
-			if (count > iov[seg].iov_len) {
-				count -= iov[seg].iov_len;
-				continue;
-			}
-			offset = count;
-			count = 0;
-		}
-
-		desc.written = 0;
-		desc.arg.buf = iov[seg].iov_base + offset;
-		desc.count = iov[seg].iov_len - offset;
-		if (desc.count == 0)
-			continue;
-		desc.error = 0;
-		do_generic_file_read(filp, ppos, &desc, file_read_actor);
-		retval += desc.written;
-		if (desc.error) {
-			retval = retval ?: desc.error;
-			break;
-		}
-		if (desc.count > 0)
-			break;
-	}
+	desc.written = 0;
+	desc.arg.data = iter;
+	desc.count = count;
+	desc.error = 0;
+	do_generic_file_read(filp, ppos, &desc, file_read_iter_actor);
+
+	retval += desc.written;
+	if (desc.error && !retval)
+		retval = desc.error;
 out:
 	return retval;
 }
+EXPORT_SYMBOL(generic_file_read_iter);
+
+/**
+ * generic_file_aio_read - generic filesystem read routine
+ * @iocb:	kernel I/O control block
+ * @iov:	io vector request
+ * @nr_segs:	number of segments in the iovec
+ * @pos:	current file position
+ *
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
+{
+	struct iov_iter iter;
+	int ret;
+	size_t count;
+
+	count = 0;
+	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+	if (ret)
+		return ret;
+
+	iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+	return generic_file_read_iter(iocb, &iter, pos);
+}
 EXPORT_SYMBOL(generic_file_aio_read);
 
 static ssize_t
@@ -1530,6 +1610,8 @@ static int page_cache_read(struct file *
 	struct page *page; 
 	int ret;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	do {
 		page = page_cache_alloc_cold(mapping);
 		if (!page)
@@ -1617,6 +1699,46 @@ static void do_async_mmap_readahead(stru
 					   page, offset, ra->ra_pages);
 }
 
+struct page *pick_peer_page(struct inode *inode, struct file_ra_state *ra,
+			    pgoff_t index, unsigned ra_size)
+{
+	struct address_space *mapping;
+	struct page *page = NULL;
+	struct file *file;
+
+	rcu_read_lock();
+	file = rcu_dereference(inode->i_peer_file);
+	if (!file || !atomic_long_inc_not_zero(&file->f_count)) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	rcu_read_unlock();
+
+	mapping = file->f_mapping;
+	page = find_get_page(mapping, index);
+	if (!page) {
+		page_cache_sync_readahead(mapping, ra, file, index, ra_size);
+		page = find_get_page(mapping, index);
+		if (!page)
+			goto out;
+	}
+	if (PageReadahead(page))
+		page_cache_async_readahead(mapping, ra, file,
+				page, index, ra->ra_pages);
+	if (!PageUptodate(page)) {
+		if (!lock_page_killable(page)) {
+			unlock_page(page);
+			if (PageUptodate(page))
+				goto out;
+		}
+		put_page(page);
+		page = NULL;
+	}
+out:
+	peer_fput(file);
+	return page;
+}
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:	vma in which the fault was taken
@@ -1649,12 +1771,24 @@ int filemap_fault(struct vm_area_struct 
 	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
+	if (!page && inode->i_peer_file) {
+		page = pick_peer_page(inode, ra, offset, ra->ra_pages);
+		if (page) {
+			vmf->page = page;
+			return 0; /* unlocked page */
+		}
+	}
 	if (likely(page)) {
 		/*
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
 		do_async_mmap_readahead(vma, ra, file, page, offset);
+
+		if (unlikely(!PageUptodate(page)))
+			virtinfo_notifier_call(VITYPE_IO,
+					VIRTINFO_IO_PREPARE, NULL);
+
 	} else {
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vma, ra, file, offset);
@@ -1755,11 +1889,16 @@ EXPORT_SYMBOL(filemap_fault);
 int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct file *file = vma->vm_file;
+	struct inode *inode;
 	int ret = VM_FAULT_LOCKED;
 
+	if (file->f_op->get_host)
+		file = file->f_op->get_host(file);
+	inode = file->f_path.dentry->d_inode;
+
 	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	file_update_time(file);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping) {
 		unlock_page(page);
@@ -1830,6 +1969,7 @@ static struct page *__read_cache_page(st
 	struct page *page;
 	int err;
 repeat:
+	check_pagecache_limits(mapping, gfp);
 	page = find_get_page(mapping, index);
 	if (!page) {
 		page = __page_cache_alloc(gfp | __GFP_COLD);
@@ -2025,147 +2165,6 @@ int file_remove_suid(struct file *file)
 }
 EXPORT_SYMBOL(file_remove_suid);
 
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
-			const struct iovec *iov, size_t base, size_t bytes)
-{
-	size_t copied = 0, left = 0;
-
-	while (bytes) {
-		char __user *buf = iov->iov_base + base;
-		int copy = min(bytes, iov->iov_len - base);
-
-		base = 0;
-		left = __copy_from_user_inatomic(vaddr, buf, copy);
-		copied += copy;
-		bytes -= copy;
-		vaddr += copy;
-		iov++;
-
-		if (unlikely(left))
-			break;
-	}
-	return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then return the number of
- * bytes which were copied.
- */
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	BUG_ON(!in_atomic());
-	kaddr = kmap_atomic(page, KM_USER0);
-	if (likely(i->nr_segs == 1)) {
-		int left;
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-						i->iov, i->iov_offset, bytes);
-	}
-	kunmap_atomic(kaddr, KM_USER0);
-
-	return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-/*
- * This has the same sideeffects and return value as
- * iov_iter_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- * Page must not be locked.
- */
-size_t iov_iter_copy_from_user(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap(page);
-	if (likely(i->nr_segs == 1)) {
-		int left;
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		left = __copy_from_user(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-						i->iov, i->iov_offset, bytes);
-	}
-	kunmap(page);
-	return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user);
-
-void iov_iter_advance(struct iov_iter *i, size_t bytes)
-{
-	BUG_ON(i->count < bytes);
-
-	if (likely(i->nr_segs == 1)) {
-		i->iov_offset += bytes;
-		i->count -= bytes;
-	} else {
-		const struct iovec *iov = i->iov;
-		size_t base = i->iov_offset;
-
-		/*
-		 * The !iov->iov_len check ensures we skip over unlikely
-		 * zero-length segments (without overruning the iovec).
-		 */
-		while (bytes || unlikely(i->count && !iov->iov_len)) {
-			int copy;
-
-			copy = min(bytes, iov->iov_len - base);
-			BUG_ON(!i->count || i->count < copy);
-			i->count -= copy;
-			bytes -= copy;
-			base += copy;
-			if (iov->iov_len == base) {
-				iov++;
-				base = 0;
-			}
-		}
-		i->iov = iov;
-		i->iov_offset = base;
-	}
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
-	char __user *buf = i->iov->iov_base + i->iov_offset;
-	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
-	return fault_in_pages_readable(buf, bytes);
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
-	const struct iovec *iov = i->iov;
-	if (i->nr_segs == 1)
-		return i->count;
-	else
-		return min(i->count, iov->iov_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
 /*
  * Performs necessary checks before doing a write
  *
@@ -2271,9 +2270,8 @@ int pagecache_write_end(struct file *fil
 EXPORT_SYMBOL(pagecache_write_end);
 
 ssize_t
-generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, size_t ocount)
+generic_file_direct_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+		loff_t pos, loff_t *ppos, size_t count)
 {
 	struct file	*file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -2282,10 +2280,13 @@ generic_file_direct_write(struct kiocb *
 	size_t		write_len;
 	pgoff_t		end;
 
-	if (count != ocount)
-		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+	if (count != iov_iter_count(iter)) {
+		written = iov_iter_shorten(iter, count);
+		if (written)
+			goto out;
+	}
 
-	write_len = iov_length(iov, *nr_segs);
+	write_len = count;
 	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
 
 	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
@@ -2312,7 +2313,7 @@ generic_file_direct_write(struct kiocb *
 		}
 	}
 
-	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+	written = mapping_direct_IO(mapping, WRITE, iocb, iter, pos);
 
 	/*
 	 * Finally, try again to invalidate clean pages which might have been
@@ -2338,6 +2339,23 @@ generic_file_direct_write(struct kiocb *
 out:
 	return written;
 }
+EXPORT_SYMBOL(generic_file_direct_write_iter);
+
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, size_t ocount)
+{
+	struct iov_iter iter;
+	ssize_t ret;
+
+	iov_iter_init(&iter, iov, *nr_segs, ocount, 0);
+	ret = generic_file_direct_write_iter(iocb, &iter, pos, ppos, count);
+	/* generic_file_direct_write_iter() might have shortened the vec */
+	if (*nr_segs != iter.nr_segs)
+		*nr_segs = iter.nr_segs;
+	return ret;
+}
 EXPORT_SYMBOL(generic_file_direct_write);
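
When the size checks reduce `count`, the iter-based path trims the iterator
instead of rewriting the caller's iovec in place, and the legacy wrapper then
copies the possibly reduced segment count back out. The trim has the same
effect as the kernel's iov_shorten(); a simplified, runnable rendition:

#include <stdio.h>
#include <sys/uio.h>

/* Clip an iovec array to describe at most `to` bytes and return the new
 * segment count (simplified model of iov_shorten() in fs/read_write.c). */
static unsigned long shorten(struct iovec *iov, unsigned long nr, size_t to)
{
	unsigned long seg = 0;
	size_t len = 0;

	while (seg < nr) {
		seg++;
		if (len + iov->iov_len >= to) {
			iov->iov_len = to - len;	/* clip last segment */
			break;
		}
		len += iov->iov_len;
		iov++;
	}
	return seg;
}

int main(void)
{
	char a[8], b[8];
	struct iovec v[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	unsigned long nr = shorten(v, 2, 10);

	printf("segments: %lu, last segment now %zu bytes\n", nr, v[1].iov_len);
	return 0;
}
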
 
 /*
@@ -2353,6 +2371,7 @@ struct page *grab_cache_page_write_begin
 	if (flags & AOP_FLAG_NOFS)
 		gfp_notmask = __GFP_FS;
 repeat:
+	check_pagecache_limits(mapping, mapping_gfp_mask(mapping) & ~gfp_notmask);
 	page = find_lock_page(mapping, index);
 	if (likely(page))
 		goto found;
@@ -2468,17 +2487,16 @@ again:
 }
 
 ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, ssize_t written)
+generic_file_buffered_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+		loff_t pos, loff_t *ppos, ssize_t written)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	ssize_t status;
-	struct iov_iter i;
 
-	iov_iter_init(&i, iov, nr_segs, count, written);
-	status = generic_perform_write(file, &i, pos);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
+	status = generic_perform_write(file, iter, pos);
 
 	if (likely(status >= 0)) {
 		written += status;
@@ -2496,13 +2514,24 @@ generic_file_buffered_write(struct kiocb
 
 	return written ? written : status;
 }
+EXPORT_SYMBOL(generic_file_buffered_write_iter);
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct iov_iter iter;
+	iov_iter_init(&iter, iov, nr_segs, count, written);
+	return generic_file_buffered_write_iter(iocb, &iter, pos, ppos,
+						written);
+}
 EXPORT_SYMBOL(generic_file_buffered_write);
 
 /**
  * __generic_file_aio_write - write data to a file
  * @iocb:	IO state structure (file, offset, etc.)
- * @iov:	vector with data to write
- * @nr_segs:	number of segments in the vector
+ * @iter:	iov_iter specifying memory to write
  * @ppos:	position where to write
  *
  * This function does all the work needed for actually writing data to a
@@ -2517,24 +2546,18 @@ EXPORT_SYMBOL(generic_file_buffered_writ
  * A caller has to handle it. This is mainly due to the fact that we want to
  * avoid syncing under i_mutex.
  */
-ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-				 unsigned long nr_segs, loff_t *ppos)
+ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+				  loff_t *ppos)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space * mapping = file->f_mapping;
-	size_t ocount;		/* original count */
 	size_t count;		/* after file limit checks */
 	struct inode 	*inode = mapping->host;
 	loff_t		pos;
 	ssize_t		written;
 	ssize_t		err;
 
-	ocount = 0;
-	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-	if (err)
-		return err;
-
-	count = ocount;
+	count = iov_iter_count(iter);
 	pos = *ppos;
 
 	if (!sb_has_new_freeze(inode->i_sb))
@@ -2562,8 +2585,8 @@ ssize_t __generic_file_aio_write(struct 
 		loff_t endbyte;
 		ssize_t written_buffered;
 
-		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
-							ppos, count, ocount);
+		written = generic_file_direct_write_iter(iocb, iter, pos,
+							 ppos, count);
 		if (written < 0 || written == count)
 			goto out;
 		/*
@@ -2572,9 +2595,9 @@ ssize_t __generic_file_aio_write(struct 
 		 */
 		pos += written;
 		count -= written;
-		written_buffered = generic_file_buffered_write(iocb, iov,
-						nr_segs, pos, ppos, count,
-						written);
+		iov_iter_advance(iter, written);
+		written_buffered = generic_file_buffered_write_iter(iocb, iter,
+						pos, ppos, written);
 		/*
		 * If generic_file_buffered_write() returned a synchronous error
 		 * then we want to return the number of bytes which were
@@ -2609,13 +2632,57 @@ ssize_t __generic_file_aio_write(struct 
 			 */
 		}
 	} else {
-		written = generic_file_buffered_write(iocb, iov, nr_segs,
-				pos, ppos, count, written);
+		iter->count = count;
+		written = generic_file_buffered_write_iter(iocb, iter,
+				pos, ppos, written);
 	}
 out:
 	current->backing_dev_info = NULL;
 	return written ? written : err;
 }
+EXPORT_SYMBOL(__generic_file_write_iter);
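
For O_DIRECT writers the flow above matters: a short direct write no longer
forces the buffered retry to re-derive its position from the raw iovec; the
caller simply advances the shared iterator and hands the remainder to the
buffered path. A compact model of that fallback (the two writer stubs are
assumptions):

#include <stdio.h>
#include <stddef.h>

struct iter { size_t count; };

static void advance(struct iter *i, size_t n) { i->count -= n; }

/* Pretend the direct path only manages half of the request. */
static size_t direct_write(const struct iter *i) { return i->count / 2; }

static size_t buffered_write(struct iter *i)
{
	size_t n = i->count;

	advance(i, n);
	return n;
}

int main(void)
{
	struct iter it = { 100 };
	size_t written = direct_write(&it);

	advance(&it, written);		/* iov_iter_advance(iter, written) */
	if (it.count)			/* short write: finish via buffered path */
		written += buffered_write(&it);

	printf("total written: %zu\n", written);
	return 0;
}
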
+
+ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+			        loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = __generic_file_write_iter(iocb, iter, &iocb->ki_pos);
+	mutex_unlock(&inode->i_mutex);
+
+	if (ret > 0 || ret == -EIOCBQUEUED) {
+		ssize_t err;
+
+		err = generic_write_sync(file, pos, ret);
+		if (err < 0 && ret > 0)
+			ret = err;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(generic_file_write_iter);
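
Note the locking shape: i_mutex covers only the write itself, and the O_SYNC
flush happens after the lock is dropped, so syncing never serializes other
writers on the inode. The same ordering as a runnable pthread sketch (both
worker stubs are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

static long do_write(long n) { return n; }		/* __generic_file_write_iter() stand-in */
static int  do_sync(long n)  { (void)n; return 0; }	/* generic_write_sync() stand-in */

int main(void)
{
	long ret;

	pthread_mutex_lock(&i_mutex);
	ret = do_write(42);
	pthread_mutex_unlock(&i_mutex);	/* drop the lock before syncing */

	if (ret > 0) {
		int err = do_sync(ret);
		if (err < 0)
			ret = err;	/* report the sync failure instead */
	}
	printf("ret=%ld\n", ret);
	return 0;
}
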
+
+ssize_t
+__generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t *ppos)
+{
+	struct iov_iter iter;
+	size_t count;
+	ssize_t ret;
+
+	count = 0;
+	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+	if (ret)
+		goto out;
+
+	iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+	ret = __generic_file_write_iter(iocb, &iter, ppos);
+out:
+	return ret;
+}
 EXPORT_SYMBOL(__generic_file_aio_write);
 
 /**
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/filemap_xip.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/filemap_xip.c
--- linux-2.6.32-504.3.3.el6.orig/mm/filemap_xip.c	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/filemap_xip.c	2015-01-21 12:02:58.678816107 +0300
@@ -19,6 +19,7 @@
 #include <linux/mutex.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
+#include <bc/vmpages.h>
 
 /*
  * We do use our own empty page to avoid interference with other users
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/fremap.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/fremap.c
--- linux-2.6.32-504.3.3.el6.orig/mm/fremap.c	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/fremap.c	2015-01-21 12:02:58.678816107 +0300
@@ -21,6 +21,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <bc/vmpages.h>
+
 #include "internal.h"
 
 static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -36,7 +38,7 @@ static void zap_pte(struct mm_struct *mm
 		page = vm_normal_page(vma, addr, pte);
 		if (page) {
 			if (pte_dirty(pte))
-				set_page_dirty(page);
+				set_page_dirty_mm(page, mm);
 			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
@@ -64,7 +66,7 @@ static int install_file_pte(struct mm_st
 	if (!pte)
 		goto out;
 
 	if (!pte_none(*pte))
 		zap_pte(mm, vma, addr, pte);
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
@@ -224,8 +226,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsign
 		/*
 		 * drop PG_Mlocked flag for over-mapped range
 		 */
-		unsigned int saved_flags = vma->vm_flags;
-		munlock_vma_pages_range(vma, start, start + size);
+		vm_flags_t saved_flags = vma->vm_flags;
+		__munlock_vma_pages_range(vma, start, start + size, 0);
 		vma->vm_flags = saved_flags;
 	}
 
@@ -261,3 +263,4 @@ out:
 
 	return err;
 }
+EXPORT_SYMBOL(sys_remap_file_pages);
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/huge_memory.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/huge_memory.c
--- linux-2.6.32-504.3.3.el6.orig/mm/huge_memory.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/huge_memory.c	2015-01-21 12:02:58.679816081 +0300
@@ -6,6 +6,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/mmgang.h>
 #include <linux/sched.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
@@ -21,6 +22,8 @@
 #include <asm/pgalloc.h>
 #include "internal.h"
 
+#include <bc/kmem.h>
+
 /*
  * By default transparent hugepage support is enabled for all mappings
  * and khugepaged scans all mappings. Defrag is only invoked by
@@ -29,7 +32,6 @@
  * allocations.
  */
 unsigned long transparent_hugepage_flags __read_mostly =
-	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
 
@@ -606,11 +608,23 @@ static int __do_huge_pmd_anonymous_page(
 {
 	int ret = 0;
 	pgtable_t pgtable;
+	int one;
 
 	VM_BUG_ON(!PageCompound(page));
+
+	one = ub_page_table_get_one(mm);
+	if (unlikely(one < 0)) {
+		mem_cgroup_uncharge_page(page);
+		gang_del_user_page(page);
+		put_page(page);
+		return VM_FAULT_OOM;
+	}
+
 	pgtable = pte_alloc_one(mm, haddr);
 	if (unlikely(!pgtable)) {
+		ub_page_table_put_one(mm, one);
 		mem_cgroup_uncharge_page(page);
+		gang_del_user_page(page);
 		put_page(page);
 		return VM_FAULT_OOM;
 	}
@@ -620,10 +634,19 @@ static int __do_huge_pmd_anonymous_page(
 
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_none(*pmd))) {
+		ub_page_table_put_one(mm, one);
+		spin_unlock(&mm->page_table_lock);
+		mem_cgroup_uncharge_page(page);
+		gang_del_user_page(page);
+		put_page(page);
+		pte_free(mm, pgtable);
+	} else if (ub_page_table_charge(mm, one)) {
 		spin_unlock(&mm->page_table_lock);
 		mem_cgroup_uncharge_page(page);
+		gang_del_user_page(page);
 		put_page(page);
 		pte_free(mm, pgtable);
+		ret = VM_FAULT_OOM;
 	} else {
 		pmd_t entry;
 		entry = mk_pmd(page, vma->vm_page_prot);
@@ -681,6 +704,8 @@ int do_huge_pmd_anonymous_page(struct mm
 			return VM_FAULT_OOM;
 		if (unlikely(khugepaged_enter(vma)))
 			return VM_FAULT_OOM;
+		if (unlikely(ub_precharge_hpage(mm)))
+			goto out;
 		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
 					  vma, haddr, numa_node_id(), 0);
 		if (unlikely(!page)) {
@@ -688,7 +713,12 @@ int do_huge_pmd_anonymous_page(struct mm
 			goto out;
 		}
 		count_vm_event(THP_FAULT_ALLOC);
+		if (gang_add_user_page(page, get_mm_gang(mm), GFP_KERNEL)) {
+			put_page(page);
+			goto out;
+		}
 		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+			gang_del_user_page(page);
 			put_page(page);
 			goto out;
 		}
@@ -725,11 +755,18 @@ int copy_huge_pmd(struct mm_struct *dst_
 	pmd_t pmd;
 	pgtable_t pgtable;
 	int ret;
+	int one;
 
 	ret = -ENOMEM;
+	one = ub_page_table_get_one(dst_mm);
+	if (one < 0)
+		goto out;
+
 	pgtable = pte_alloc_one(dst_mm, addr);
-	if (unlikely(!pgtable))
+	if (unlikely(!pgtable)) {
+		ub_page_table_put_one(dst_mm, one);
 		goto out;
+	}
 
 	spin_lock(&dst_mm->page_table_lock);
 	spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
@@ -737,10 +774,12 @@ int copy_huge_pmd(struct mm_struct *dst_
 	ret = -EAGAIN;
 	pmd = *src_pmd;
 	if (unlikely(!pmd_trans_huge(pmd))) {
+		ub_page_table_put_one(dst_mm, one);
 		pte_free(dst_mm, pgtable);
 		goto out_unlock;
 	}
 	if (unlikely(pmd_trans_splitting(pmd))) {
+		ub_page_table_put_one(dst_mm, one);
 		/* split huge page running from under us */
 		spin_unlock(&src_mm->page_table_lock);
 		spin_unlock(&dst_mm->page_table_lock);
@@ -749,6 +788,11 @@ int copy_huge_pmd(struct mm_struct *dst_
 		wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
 		goto out;
 	}
+	ret = -ENOMEM;
+	if (ub_page_table_charge(dst_mm, one)) {
+		pte_free(dst_mm, pgtable);
+		goto out_unlock;
+	}
 	src_page = pmd_page(pmd);
 	VM_BUG_ON(!PageHead(src_page));
 	get_page(src_page);
@@ -812,12 +856,17 @@ static int do_huge_pmd_wp_page_fallback(
 					       __GFP_OTHER_NODE,
 					       vma, address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
+			     gang_add_user_page(pages[i], get_mm_gang(mm),
+							GFP_KERNEL) ||
 			     mem_cgroup_newpage_charge(pages[i], mm,
 						       GFP_KERNEL))) {
+			if (pages[i] && page_gang(pages[i]))
+				gang_del_user_page(pages[i]);
 			if (pages[i])
 				put_page(pages[i]);
 			while (--i >= 0) {
 				mem_cgroup_uncharge_page(pages[i]);
+				gang_del_user_page(pages[i]);
 				put_page(pages[i]);
 			}
 			kfree(pages);
@@ -871,6 +920,7 @@ out_free_pages:
 	spin_unlock(&mm->page_table_lock);
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
 		mem_cgroup_uncharge_page(pages[i]);
+		gang_del_user_page(pages[i]);
 		put_page(pages[i]);
 	}
 	kfree(pages);
@@ -904,6 +954,11 @@ int do_huge_pmd_wp_page(struct mm_struct
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
 
+	if (unlikely(ub_precharge_hpage(mm))) {
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -922,7 +977,15 @@ int do_huge_pmd_wp_page(struct mm_struct
 	}
 	count_vm_event(THP_FAULT_ALLOC);
 
+	if (gang_add_user_page(new_page, get_mm_gang(mm), GFP_KERNEL)) {
+		put_page(new_page);
+		put_page(page);
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+		gang_del_user_page(new_page);
 		put_page(new_page);
 		split_huge_page(page);
 		put_page(page);
@@ -937,6 +1000,7 @@ int do_huge_pmd_wp_page(struct mm_struct
 	put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		mem_cgroup_uncharge_page(new_page);
+		gang_del_user_page(new_page);
 		put_page(new_page);
 	} else {
 		pmd_t entry;
@@ -1015,6 +1079,7 @@ int zap_huge_pmd(struct mmu_gather *tlb,
 			VM_BUG_ON(page_mapcount(page) < 0);
 			add_mm_counter(tlb->mm, anon_rss, -HPAGE_PMD_NR);
 			VM_BUG_ON(!PageHead(page));
+			ub_page_table_uncharge(tlb->mm);
 			tlb->mm->nr_ptes--;
 			spin_unlock(&tlb->mm->page_table_lock);
 			tlb_remove_page(tlb, page);
@@ -1196,11 +1261,12 @@ static int __split_huge_page_splitting(s
 static void __split_huge_page_refcount(struct page *page)
 {
 	int i;
-	struct zone *zone = page_zone(page);
+	struct lruvec *lruvec;
 	int tail_count = 0;
 
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
-	spin_lock_irq(&zone->lru_lock);
+	local_irq_disable();
+	lruvec = lock_page_lru(page);
 	compound_lock(page);
 
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
@@ -1261,17 +1327,18 @@ static void __split_huge_page_refcount(s
 		BUG_ON(!PageSwapBacked(page_tail));
 
 		mem_cgroup_split_hugepage_commit(page_tail, page);
-		lru_add_page_tail(zone, page, page_tail);
+		set_page_lruvec(page_tail, lruvec);
+		lru_add_page_tail(lruvec, page, page_tail);
 	}
 	atomic_sub(tail_count, &page->_count);
 	BUG_ON(atomic_read(&page->_count) <= 0);
 
 	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+	__mod_zone_page_state(lruvec_zone(lruvec), NR_ANON_PAGES, HPAGE_PMD_NR);
 
 	ClearPageCompound(page);
 	compound_unlock(page);
-	spin_unlock_irq(&zone->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	for (i = 1; i < HPAGE_PMD_NR; i++) {
 		struct page *page_tail = page + i;
@@ -1440,7 +1507,8 @@ out:
 	return ret;
 }
 
-int hugepage_madvise(unsigned long *vm_flags, int advice)
+int hugepage_madvise(struct vm_area_struct *vma,
+		     unsigned long *vm_flags, int advice)
 {
 	switch (advice) {
 	case MADV_HUGEPAGE:
@@ -1455,6 +1523,13 @@ int hugepage_madvise(unsigned long *vm_f
 			return -EINVAL;
 		*vm_flags &= ~VM_NOHUGEPAGE;
 		*vm_flags |= VM_HUGEPAGE;
+		/*
+		 * If the vma becomes good for khugepaged to scan,
+		 * register it here without waiting for a page fault
+		 * that may not happen any time soon.
+		 */
+		if (unlikely(khugepaged_enter_vma_merge(vma)))
+			return -ENOMEM;
 		break;
 	case MADV_NOHUGEPAGE:
 		/*
@@ -1468,6 +1543,11 @@ int hugepage_madvise(unsigned long *vm_f
 			return -EINVAL;
 		*vm_flags &= ~VM_HUGEPAGE;
 		*vm_flags |= VM_NOHUGEPAGE;
+		/*
+		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+		 * this vma even if we leave the mm registered in khugepaged if
+		 * it got registered before VM_NOHUGEPAGE was set.
+		 */
 		break;
 	}
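
With this change the mm is registered with khugepaged as soon as the hint is
set, instead of waiting for the next page fault. From userspace the hint is
the ordinary madvise() call; a minimal, runnable example:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MiB: a couple of 2 MiB huge pages */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Ask khugepaged to collapse this range into huge pages. */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");	/* EINVAL if THP is off */

	munmap(p, len);
	return 0;
}
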
 
@@ -1501,6 +1581,7 @@ again:
 	if (unlikely(pmd_trans_huge(*pmd)))
 		goto again;
 }
+EXPORT_SYMBOL(__split_huge_page_pmd);
 
 static int __init khugepaged_slab_init(void)
 {
@@ -1849,7 +1930,17 @@ static void collapse_huge_page(struct mm
 	}
 	count_vm_event(THP_COLLAPSE_ALLOC);
 #endif
+
+	if (gang_add_user_page(new_page, get_mm_gang(mm), GFP_KERNEL)) {
+		up_read(&mm->mmap_sem);
+#ifdef CONFIG_NUMA
+		put_page(new_page);
+#endif
+		return;
+	}
+
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+		gang_del_user_page(new_page);
 		up_read(&mm->mmap_sem);
 #ifdef CONFIG_NUMA
 		put_page(new_page);
@@ -1875,7 +1966,8 @@ static void collapse_huge_page(struct mm
 	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
 		goto out;
 
-	if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+	    (vma->vm_flags & VM_NOHUGEPAGE))
 		goto out;
 
 	if (!vma->anon_vma || vma->vm_ops)
@@ -1971,6 +2063,7 @@ out_up_write:
 
 out:
 	mem_cgroup_uncharge_page(new_page);
+	gang_del_user_page(new_page);
 #ifdef CONFIG_NUMA
 	put_page(new_page);
 #endif
@@ -2110,8 +2203,9 @@ static unsigned int khugepaged_scan_mm_s
 			break;
 		}
 
-		if (!(vma->vm_flags & VM_HUGEPAGE) &&
-		    !khugepaged_always()) {
+		if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+		     !khugepaged_always()) ||
+		    (vma->vm_flags & VM_NOHUGEPAGE)) {
 		skip:
 			progress++;
 			continue;
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/hugetlb.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/hugetlb.c
--- linux-2.6.32-504.3.3.el6.orig/mm/hugetlb.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/hugetlb.c	2015-01-21 12:02:58.679816081 +0300
@@ -624,6 +624,7 @@ static void free_huge_page(struct page *
 		(struct hugepage_subpool *)page_private(page);
 	bool restore_reserve;
 
+	ub_hugetlb_uncharge(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
@@ -2382,7 +2383,7 @@ void __unmap_hugepage_range(struct vm_ar
 
 		page = pte_page(pte);
 		if (pte_dirty(pte))
-			set_page_dirty(page);
+			set_page_dirty_mm(page, mm);
 		list_add(&page->lru, &page_list);
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -2538,7 +2539,10 @@ retry_avoidcopy:
 	 * When the original hugepage is shared one, it does not have
 	 * anon_vma prepared.
 	 */
-	if (unlikely(anon_vma_prepare(vma))) {
+	if (unlikely(anon_vma_prepare(vma)) ||
+	    ub_hugetlb_charge(mm_ub(mm), new_page)) {
+		page_cache_release(new_page);
+		page_cache_release(old_page);
 		/* Caller expects lock to be held */
 		spin_lock(&mm->page_table_lock);
 		return VM_FAULT_OOM;
@@ -2655,12 +2659,20 @@ retry:
 		clear_huge_page(page, address, pages_per_huge_page(h));
 		__SetPageUptodate(page);
 
+		if (ub_hugetlb_charge(mm_ub(mm), page)) {
+			put_page(page);
+			ret = VM_FAULT_OOM;
+			goto out;
+		}
+
 		if (vma->vm_flags & VM_MAYSHARE) {
 			int err;
 			struct inode *inode = mapping->host;
 
-			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+			__set_page_locked(page);
+			err = add_to_page_cache_nogang(page, mapping, idx, GFP_KERNEL);
 			if (err) {
+				__clear_page_locked(page);
 				put_page(page);
 				if (err == -EEXIST)
 					goto retry;
@@ -2674,6 +2686,7 @@ retry:
 		} else {
 			lock_page(page);
 			if (unlikely(anon_vma_prepare(vma))) {
+				ub_hugetlb_uncharge(page);
 				ret = VM_FAULT_OOM;
 				goto backout_unlocked;
 			}
@@ -2992,7 +3005,7 @@ void hugetlb_change_protection(struct vm
 int hugetlb_reserve_pages(struct inode *inode,
 					long from, long to,
 					struct vm_area_struct *vma,
-					int acctflag)
+					vm_flags_t vm_flags)
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
@@ -3003,7 +3016,7 @@ int hugetlb_reserve_pages(struct inode *
 	 * attempt will be made for VM_NORESERVE to allocate a page
 	 * without using reserves
 	 */
-	if (acctflag & VM_NORESERVE)
+	if (vm_flags & VM_NORESERVE)
 		return 0;
 
 	/*
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/init-mm.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/init-mm.c
--- linux-2.6.32-504.3.3.el6.orig/mm/init-mm.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/init-mm.c	2015-01-21 12:02:58.011833811 +0300
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/cpumask.h>
+#include <linux/module.h>
 
 #include <asm/atomic.h>
 #include <asm/pgtable.h>
@@ -18,3 +19,4 @@ struct mm_struct init_mm = {
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.cpu_vm_mask	= CPU_MASK_ALL,
 };
+EXPORT_SYMBOL(init_mm);
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/internal.h linux-2.6.32-504.3.3.el6-042stab103_6/mm/internal.h
--- linux-2.6.32-504.3.3.el6.orig/mm/internal.h	2014-12-12 23:29:10.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/internal.h	2015-01-21 12:02:58.680816055 +0300
@@ -12,6 +12,7 @@
 #define __MM_INTERNAL_H
 
 #include <linux/mm.h>
+#include <linux/rcupdate.h>
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
@@ -122,8 +123,13 @@ static inline void unevictable_migrate_p
 #ifdef CONFIG_MMU
 extern long mlock_vma_pages_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end);
-extern void munlock_vma_pages_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long end);
+extern void __munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end, int acct);
+static inline void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	__munlock_vma_pages_range(vma, start, end, 1);
+}
 static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
 {
 	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
@@ -149,12 +155,6 @@ static inline int is_mlocked_vma(struct 
 }
 
 /*
- * must be called with vma's mmap_sem held for read or write, and page locked.
- */
-extern void mlock_vma_page(struct page *page);
-extern void munlock_vma_page(struct page *page);
-
-/*
  * Clear the page's PageMlocked().  This can be useful in a situation where
  * we want to unconditionally remove a page from the pagecache -- e.g.,
  * on truncation or freeing.
@@ -193,7 +193,7 @@ static inline int is_mlocked_vma(struct 
 	return 0;
 }
 static inline void clear_page_mlock(struct page *page) { }
-static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_vma_page(struct vm_area_struct *vma, struct page *page) { }
 static inline void mlock_migrate_page(struct page *new, struct page *old) { }
 
 #endif /* !CONFIG_MMU */
@@ -298,7 +298,6 @@ static inline void mminit_validate_memmo
 #define ZONE_RECLAIM_FULL	-1
 #define ZONE_RECLAIM_SOME	0
 #define ZONE_RECLAIM_SUCCESS	1
-#endif
 
 extern int hwpoison_filter(struct page *p);
 
@@ -308,3 +307,5 @@ extern u64 hwpoison_filter_flags_mask;
 extern u64 hwpoison_filter_flags_value;
 extern u64 hwpoison_filter_memcg;
 extern u32 hwpoison_filter_enable;
+
+#endif /* __MM_INTERNAL_H */
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/iov-iter.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/iov-iter.c
--- linux-2.6.32-504.3.3.el6.orig/mm/iov-iter.c	2015-01-21 12:02:58.158829910 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/iov-iter.c	2015-01-21 12:02:58.235827867 +0300
@@ -0,0 +1,474 @@
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/hardirq.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+static size_t __iovec_copy_to_user_inatomic(char *vaddr,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t copied = 0, left = 0;
+
+	while (bytes) {
+		char __user *buf = iov->iov_base + base;
+		int copy = min(bytes, iov->iov_len - base);
+
+		base = 0;
+		left = __copy_to_user_inatomic(buf, vaddr, copy);
+		copied += copy;
+		bytes -= copy;
+		vaddr += copy;
+		iov++;
+
+		if (unlikely(left))
+			break;
+	}
+	return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t ii_iovec_copy_to_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	BUG_ON(!in_atomic());
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = __copy_to_user_inatomic(buf, kaddr + offset, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr, KM_USER0);
+
+	return copied;
+}
+
+/*
+ * This has the same side effects and return value as
+ * ii_iovec_copy_to_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+static size_t ii_iovec_copy_to_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = copy_to_user(buf, kaddr + offset, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap(page);
+	return copied;
+}
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t copied = 0, left = 0;
+
+	while (bytes) {
+		char __user *buf = iov->iov_base + base;
+		int copy = min(bytes, iov->iov_len - base);
+
+		base = 0;
+		left = __copy_from_user_inatomic(vaddr, buf, copy);
+		copied += copy;
+		bytes -= copy;
+		vaddr += copy;
+		iov++;
+
+		if (unlikely(left))
+			break;
+	}
+	return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t ii_iovec_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	BUG_ON(!in_atomic());
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr, KM_USER0);
+
+	return copied;
+}
+
+/*
+ * This has the same side effects and return value as
+ * ii_iovec_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+static size_t ii_iovec_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = __copy_from_user(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap(page);
+	return copied;
+}
+
+static void ii_iovec_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+
+	if (likely(i->nr_segs == 1)) {
+		i->iov_offset += bytes;
+		i->count -= bytes;
+	} else {
+		struct iovec *iov = (struct iovec *)i->data;
+		size_t base = i->iov_offset;
+		unsigned long nr_segs = i->nr_segs;
+
+		/*
+		 * The !iov->iov_len check ensures we skip over unlikely
+		 * zero-length segments (without overrunning the iovec).
+		 */
+		while (bytes || unlikely(i->count && !iov->iov_len)) {
+			int copy;
+
+			copy = min(bytes, iov->iov_len - base);
+			BUG_ON(!i->count || i->count < copy);
+			i->count -= copy;
+			bytes -= copy;
+			base += copy;
+			if (iov->iov_len == base) {
+				iov++;
+				nr_segs--;
+				base = 0;
+			}
+		}
+		i->data = (unsigned long)iov;
+		i->iov_offset = base;
+		i->nr_segs = nr_segs;
+	}
+}
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (i.e. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+static int ii_iovec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char __user *buf = iov->iov_base + i->iov_offset;
+	bytes = min(bytes, iov->iov_len - i->iov_offset);
+	return fault_in_pages_readable(buf, bytes);
+}
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+static size_t ii_iovec_single_seg_count(const struct iov_iter *i)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	if (i->nr_segs == 1)
+		return i->count;
+	else
+		return min(i->count, iov->iov_len - i->iov_offset);
+}
+
+static int ii_iovec_shorten(struct iov_iter *i, size_t count)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	i->nr_segs = iov_shorten(iov, i->nr_segs, count);
+	return 0;
+}
+
+struct iov_iter_ops ii_iovec_ops = {
+	.ii_copy_to_user_atomic = ii_iovec_copy_to_user_atomic,
+	.ii_copy_to_user = ii_iovec_copy_to_user,
+	.ii_copy_from_user_atomic = ii_iovec_copy_from_user_atomic,
+	.ii_copy_from_user = ii_iovec_copy_from_user,
+	.ii_advance = ii_iovec_advance,
+	.ii_fault_in_readable = ii_iovec_fault_in_readable,
+	.ii_single_seg_count = ii_iovec_single_seg_count,
+	.ii_shorten = ii_iovec_shorten,
+};
+EXPORT_SYMBOL(ii_iovec_ops);
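
Every iov_iter_*() helper now dispatches through this ops table, so the same
copy loops can run over user iovecs, bio_vecs or a single page. A userspace
model of the dispatch plus the zero-length-segment handling in the advance
(field names loosely follow the patch; the wrapper itself is illustrative):

#include <stdio.h>
#include <sys/uio.h>

struct iter;

struct iter_ops {
	void (*advance)(struct iter *, size_t);
};

struct iter {
	const struct iter_ops *ops;
	struct iovec *iov;	/* the patch keeps this in i->data */
	size_t off, count;
};

/* Same walk as ii_iovec_advance(): consume bytes while stepping over
 * zero-length segments without running off the array. */
static void iovec_advance(struct iter *i, size_t bytes)
{
	while (bytes || (i->count && i->iov->iov_len == 0)) {
		size_t c = i->iov->iov_len - i->off;

		if (c > bytes)
			c = bytes;
		i->count -= c;
		bytes -= c;
		i->off += c;
		if (i->off == i->iov->iov_len) {
			i->iov++;
			i->off = 0;
		}
	}
}

static const struct iter_ops iovec_ops = { .advance = iovec_advance };

int main(void)
{
	char a[4], b[6];
	struct iovec v[3] = { { a, 4 }, { NULL, 0 }, { b, 6 } };
	struct iter it = { &iovec_ops, v, 0, 10 };

	it.ops->advance(&it, 5);	/* crosses the empty middle segment */
	printf("left=%zu, offset in segment=%zu\n", it.count, it.off);
	return 0;
}
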
+
+/*
+ * As an easily verifiable first pass, we implement all the methods that
+ * copy data to and from bvec pages with one function.  We implement it
+ * all with kmap_atomic().
+ */
+static size_t bvec_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+				    unsigned long page_offset, size_t bytes,
+				    int topage)
+{
+	struct bio_vec *bvec = (struct bio_vec *)iter->data;
+	size_t bvec_offset = iter->iov_offset;
+	size_t remaining = bytes;
+	void *bvec_map;
+	void *page_map;
+	size_t copy;
+
+	page_map = kmap_atomic(page, KM_USER0);
+
+	BUG_ON(bytes > iter->count);
+	while (remaining) {
+		BUG_ON(bvec->bv_len == 0);
+		BUG_ON(bvec_offset >= bvec->bv_len);
+		copy = min(remaining, bvec->bv_len - bvec_offset);
+		bvec_map = kmap_atomic(bvec->bv_page, KM_USER1);
+		if (topage)
+			memcpy(page_map + page_offset,
+			       bvec_map + bvec->bv_offset + bvec_offset,
+			       copy);
+		else
+			memcpy(bvec_map + bvec->bv_offset + bvec_offset,
+			       page_map + page_offset,
+			       copy);
+		kunmap_atomic(bvec_map, KM_USER1);
+		remaining -= copy;
+		bvec_offset += copy;
+		page_offset += copy;
+		if (bvec_offset == bvec->bv_len) {
+			bvec_offset = 0;
+			bvec++;
+		}
+	}
+
+	kunmap_atomic(page_map, KM_USER0);
+
+	return bytes;
+}
+
+size_t ii_bvec_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_to_user(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+				     unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_bvec_copy_from_user(struct page *page, struct iov_iter *i,
+			      unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+/*
+ * bio_vecs have a stricter structure than iovecs that might have
+ * come from userspace.  There are no zero length bio_vec elements.
+ */
+void ii_bvec_advance(struct iov_iter *i, size_t bytes)
+{
+	struct bio_vec *bvec = (struct bio_vec *)i->data;
+	size_t offset = i->iov_offset;
+	size_t delta;
+
+	BUG_ON(i->count < bytes);
+	while (bytes) {
+		BUG_ON(bvec->bv_len == 0);
+		BUG_ON(bvec->bv_len <= offset);
+		delta = min(bytes, bvec->bv_len - offset);
+		offset += delta;
+		i->count -= delta;
+		bytes -= delta;
+		if (offset == bvec->bv_len) {
+			bvec++;
+			offset = 0;
+		}
+	}
+
+	i->data = (unsigned long)bvec;
+	i->iov_offset = offset;
+}
+
+/*
+ * pages pointed to by bio_vecs are always pinned.
+ */
+int ii_bvec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	return 0;
+}
+
+size_t ii_bvec_single_seg_count(const struct iov_iter *i)
+{
+	const struct bio_vec *bvec = (struct bio_vec *)i->data;
+	if (i->nr_segs == 1)
+		return i->count;
+	else
+		return min(i->count, bvec->bv_len - i->iov_offset);
+}
+
+static int ii_bvec_shorten(struct iov_iter *i, size_t count)
+{
+	return -EINVAL;
+}
+
+struct iov_iter_ops ii_bvec_ops = {
+	.ii_copy_to_user_atomic = ii_bvec_copy_to_user_atomic,
+	.ii_copy_to_user = ii_bvec_copy_to_user,
+	.ii_copy_from_user_atomic = ii_bvec_copy_from_user_atomic,
+	.ii_copy_from_user = ii_bvec_copy_from_user,
+	.ii_advance = ii_bvec_advance,
+	.ii_fault_in_readable = ii_bvec_fault_in_readable,
+	.ii_single_seg_count = ii_bvec_single_seg_count,
+	.ii_shorten = ii_bvec_shorten,
+};
+EXPORT_SYMBOL(ii_bvec_ops);
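
All four bvec copy methods collapse into bvec_copy_tofrom_page() with a single
direction flag, walking the segment list against one target page. The same
shape over plain buffers (the segment struct is illustrative):

#include <stdio.h>
#include <string.h>

struct seg { char *buf; size_t len; };

/* Copy `bytes` between a flat "page" buffer and a segment list, in the
 * direction selected by topage -- same shape as bvec_copy_tofrom_page(). */
static size_t copy_tofrom(struct seg *s, size_t soff,
			  char *page, size_t poff, size_t bytes, int topage)
{
	size_t left = bytes;

	while (left) {
		size_t c = s->len - soff;

		if (c > left)
			c = left;
		if (topage)
			memcpy(page + poff, s->buf + soff, c);
		else
			memcpy(s->buf + soff, page + poff, c);
		left -= c;
		soff += c;
		poff += c;
		if (soff == s->len) {
			s++;
			soff = 0;
		}
	}
	return bytes;
}

int main(void)
{
	char p1[] = "abc", p2[] = "def", page[8] = { 0 };
	struct seg segs[2] = { { p1, 3 }, { p2, 3 } };

	copy_tofrom(segs, 0, page, 0, 6, 1);	/* gather into the "page" */
	printf("%s\n", page);			/* abcdef */
	return 0;
}
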
+
+/* Functions to get on with single page */
+
+static size_t page_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+				    unsigned long page_offset, size_t bytes,
+				    int topage)
+{
+	struct page *ipage = (struct page *)iter->data;
+	size_t ipage_offset = iter->iov_offset;
+	void *ipage_map;
+	void *page_map;
+
+	BUG_ON(bytes > iter->count);
+	BUG_ON(bytes > PAGE_SIZE - ipage_offset);
+	BUG_ON(ipage_offset >= PAGE_SIZE);
+
+	page_map = kmap_atomic(page, KM_USER0);
+	ipage_map = kmap_atomic(ipage, KM_USER1);
+
+	if (topage)
+		memcpy(page_map + page_offset,
+		       ipage_map + ipage_offset,
+		       bytes);
+	else
+		memcpy(ipage_map + ipage_offset,
+		       page_map + page_offset,
+		       bytes);
+
+	kunmap_atomic(ipage_map, KM_USER1);
+	kunmap_atomic(page_map, KM_USER0);
+
+	return bytes;
+}
+
+size_t ii_page_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_to_user(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+				     unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_page_copy_from_user(struct page *page, struct iov_iter *i,
+			      unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+void ii_page_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+	BUG_ON(i->iov_offset >= PAGE_SIZE);
+	BUG_ON(bytes > PAGE_SIZE - i->iov_offset);
+
+	i->iov_offset += bytes;
+	i->count      -= bytes;
+}
+
+/*
+ * the single page behind the iter is always pinned.
+ */
+int ii_page_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	return 0;
+}
+
+size_t ii_page_single_seg_count(const struct iov_iter *i)
+{
+	BUG_ON(i->nr_segs != 1);
+
+	return i->count;
+}
+
+static int ii_page_shorten(struct iov_iter *i, size_t count)
+{
+	return -EINVAL;
+}
+
+struct iov_iter_ops ii_page_ops = {
+	.ii_copy_to_user_atomic = ii_page_copy_to_user_atomic,
+	.ii_copy_to_user = ii_page_copy_to_user,
+	.ii_copy_from_user_atomic = ii_page_copy_from_user_atomic,
+	.ii_copy_from_user = ii_page_copy_from_user,
+	.ii_advance = ii_page_advance,
+	.ii_fault_in_readable = ii_page_fault_in_readable,
+	.ii_single_seg_count = ii_page_single_seg_count,
+	.ii_shorten = ii_page_shorten,
+};
+EXPORT_SYMBOL(ii_page_ops);
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/kmemleak.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/kmemleak.c
--- linux-2.6.32-504.3.3.el6.orig/mm/kmemleak.c	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/kmemleak.c	2015-01-21 12:02:58.449822187 +0300
@@ -93,6 +93,7 @@
 #include <linux/nodemask.h>
 #include <linux/mm.h>
 #include <linux/workqueue.h>
+#include <linux/crc32.h>
 
 #include <asm/sections.h>
 #include <asm/processor.h>
@@ -108,7 +109,6 @@
 #define MSECS_MIN_AGE		5000	/* minimum object age for reporting */
 #define SECS_FIRST_SCAN		60	/* delay before the first scan */
 #define SECS_SCAN_WAIT		600	/* subsequent auto scanning delay */
-#define GRAY_LIST_PASSES	25	/* maximum number of gray list scans */
 #define MAX_SCAN_SIZE		4096	/* maximum size of a scanned block */
 
 #define BYTES_PER_POINTER	sizeof(void *)
@@ -119,8 +119,8 @@
 /* scanning area inside a memory block */
 struct kmemleak_scan_area {
 	struct hlist_node node;
-	unsigned long offset;
-	size_t length;
+	unsigned long start;
+	size_t size;
 };
 
 #define KMEMLEAK_GREY	0
@@ -149,6 +149,8 @@ struct kmemleak_object {
 	int min_count;
 	/* the total number of pointers found pointing to this object */
 	int count;
+	/* checksum for detecting modified objects */
+	u32 checksum;
 	/* memory ranges to be scanned inside an object (empty for all) */
 	struct hlist_head area_list;
 	unsigned long trace[MAX_TRACE];
@@ -164,8 +166,6 @@ struct kmemleak_object {
 #define OBJECT_REPORTED		(1 << 1)
 /* flag set to not scan the object */
 #define OBJECT_NO_SCAN		(1 << 2)
-/* flag set on newly allocated objects */
-#define OBJECT_NEW		(1 << 3)
 
 /* number of bytes to print per line; must be 16 or 32 */
 #define HEX_ROW_SIZE		16
@@ -244,8 +244,6 @@ struct early_log {
 	const void *ptr;		/* allocated/freed memory block */
 	size_t size;			/* memory block size */
 	int min_count;			/* minimum reference count */
-	unsigned long offset;		/* scan area offset */
-	size_t length;			/* scan area length */
 	unsigned long trace[MAX_TRACE];	/* stack trace */
 	unsigned int trace_len;		/* stack trace length */
 };
@@ -326,11 +324,6 @@ static bool color_gray(const struct kmem
 		object->count >= object->min_count;
 }
 
-static bool color_black(const struct kmemleak_object *object)
-{
-	return object->min_count == KMEMLEAK_BLACK;
-}
-
 /*
  * Objects are considered unreferenced only if their color is white, they have
  * not be deleted and have a minimum age to avoid false positives caused by
@@ -338,7 +331,7 @@ static bool color_black(const struct kme
  */
 static bool unreferenced_object(struct kmemleak_object *object)
 {
-	return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
+	return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
 		time_before_eq(object->jiffies + jiffies_min_age,
 			       jiffies_last_scan);
 }
@@ -384,6 +377,7 @@ static void dump_object_info(struct kmem
 	pr_notice("  min_count = %d\n", object->min_count);
 	pr_notice("  count = %d\n", object->count);
 	pr_notice("  flags = 0x%lx\n", object->flags);
+	pr_notice("  checksum = %d\n", object->checksum);
 	pr_notice("  backtrace:\n");
 	print_stack_trace(&trace, 4);
 }
@@ -525,12 +519,13 @@ static struct kmemleak_object *create_ob
 	INIT_HLIST_HEAD(&object->area_list);
 	spin_lock_init(&object->lock);
 	atomic_set(&object->use_count, 1);
-	object->flags = OBJECT_ALLOCATED | OBJECT_NEW;
+	object->flags = OBJECT_ALLOCATED;
 	object->pointer = ptr;
 	object->size = size;
 	object->min_count = min_count;
-	object->count = -1;			/* no color initially */
+	object->count = 0;			/* white color initially */
 	object->jiffies = jiffies;
+	object->checksum = 0;
 
 	/* task information */
 	if (in_irq()) {
@@ -723,14 +718,13 @@ static void make_black_object(unsigned l
  * Add a scanning area to the object. If at least one such area is added,
  * kmemleak will only scan these ranges rather than the whole memory block.
  */
-static void add_scan_area(unsigned long ptr, unsigned long offset,
-			  size_t length, gfp_t gfp)
+static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 {
 	unsigned long flags;
 	struct kmemleak_object *object;
 	struct kmemleak_scan_area *area;
 
-	object = find_and_get_object(ptr, 0);
+	object = find_and_get_object(ptr, 1);
 	if (!object) {
 		kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
 			      ptr);
@@ -744,7 +738,7 @@ static void add_scan_area(unsigned long 
 	}
 
 	spin_lock_irqsave(&object->lock, flags);
-	if (offset + length > object->size) {
+	if (ptr + size > object->pointer + object->size) {
 		kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
 		dump_object_info(object);
 		kmem_cache_free(scan_area_cache, area);
@@ -752,8 +746,8 @@ static void add_scan_area(unsigned long 
 	}
 
 	INIT_HLIST_NODE(&area->node);
-	area->offset = offset;
-	area->length = length;
+	area->start = ptr;
+	area->size = size;
 
 	hlist_add_head(&area->node, &object->area_list);
 out_unlock:
@@ -789,7 +783,7 @@ static void object_no_scan(unsigned long
  * processed later once kmemleak is fully initialized.
  */
 static void __init log_early(int op_type, const void *ptr, size_t size,
-			     int min_count, unsigned long offset, size_t length)
+			     int min_count)
 {
 	unsigned long flags;
 	struct early_log *log;
@@ -811,8 +805,6 @@ static void __init log_early(int op_type
 	log->ptr = ptr;
 	log->size = size;
 	log->min_count = min_count;
-	log->offset = offset;
-	log->length = length;
 	if (op_type == KMEMLEAK_ALLOC)
 		log->trace_len = __save_stack_trace(log->trace);
 	crt_early_log++;
@@ -861,7 +853,7 @@ void __ref kmemleak_alloc(const void *pt
 	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
 		create_object((unsigned long)ptr, size, min_count, gfp);
 	else if (atomic_read(&kmemleak_early_log))
-		log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
+		log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
 }
 EXPORT_SYMBOL_GPL(kmemleak_alloc);
 
@@ -876,7 +868,7 @@ void __ref kmemleak_free(const void *ptr
 	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
 		delete_object_full((unsigned long)ptr);
 	else if (atomic_read(&kmemleak_early_log))
-		log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
+		log_early(KMEMLEAK_FREE, ptr, 0, 0);
 }
 EXPORT_SYMBOL_GPL(kmemleak_free);
 
@@ -891,7 +883,7 @@ void __ref kmemleak_free_part(const void
 	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
 		delete_object_part((unsigned long)ptr, size);
 	else if (atomic_read(&kmemleak_early_log))
-		log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0);
+		log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
 }
 EXPORT_SYMBOL_GPL(kmemleak_free_part);
 
@@ -906,7 +898,7 @@ void __ref kmemleak_not_leak(const void 
 	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
 		make_gray_object((unsigned long)ptr);
 	else if (atomic_read(&kmemleak_early_log))
-		log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
+		log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
 }
 EXPORT_SYMBOL(kmemleak_not_leak);
 
@@ -922,22 +914,21 @@ void __ref kmemleak_ignore(const void *p
 	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
 		make_black_object((unsigned long)ptr);
 	else if (atomic_read(&kmemleak_early_log))
-		log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
+		log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
 }
 EXPORT_SYMBOL(kmemleak_ignore);
 
 /*
  * Limit the range to be scanned in an allocated memory block.
  */
-void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
-			      size_t length, gfp_t gfp)
+void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
 {
 	pr_debug("%s(0x%p)\n", __func__, ptr);
 
 	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
-		add_scan_area((unsigned long)ptr, offset, length, gfp);
+		add_scan_area((unsigned long)ptr, size, gfp);
 	else if (atomic_read(&kmemleak_early_log))
-		log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
+		log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
 }
 EXPORT_SYMBOL(kmemleak_scan_area);
 
@@ -951,11 +942,25 @@ void __ref kmemleak_no_scan(const void *
 	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
 		object_no_scan((unsigned long)ptr);
 	else if (atomic_read(&kmemleak_early_log))
-		log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
+		log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
 }
 EXPORT_SYMBOL(kmemleak_no_scan);
 
 /*
+ * Update an object's checksum and return true if it was modified.
+ */
+static bool update_checksum(struct kmemleak_object *object)
+{
+	u32 old_csum = object->checksum;
+
+	if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
+		return false;
+
+	object->checksum = crc32(0, (void *)object->pointer, object->size);
+	return object->checksum != old_csum;
+}
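
The checksum gives the scanner a cheap signal that a still-white object was
modified since the last pass and is worth re-scanning. The same idea in
userspace with zlib's crc32() (assumes zlib is available; link with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

struct object {
	char data[16];
	unsigned long checksum;
};

/* Returns 1 when the object changed since the stored checksum was taken,
 * updating it in place -- mirroring kmemleak's update_checksum(). */
static int update_checksum(struct object *o)
{
	unsigned long old = o->checksum;

	o->checksum = crc32(0L, (const unsigned char *)o->data, sizeof(o->data));
	return o->checksum != old;
}

int main(void)
{
	struct object o = { "idle", 0 };

	update_checksum(&o);			/* take the baseline */
	strcpy(o.data, "touched");		/* someone stored a pointer */
	printf("modified: %d\n", update_checksum(&o));	/* 1 -> rescan */
	return 0;
}
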
+
+/*
  * Memory scanning is a long process and it needs to be interruptible. This
  * function checks whether such an interrupt condition occurred.
  */
@@ -1034,11 +1039,14 @@ static void scan_block(void *_start, voi
 		 * added to the gray_list.
 		 */
 		object->count++;
-		if (color_gray(object))
+		if (color_gray(object)) {
 			list_add_tail(&object->gray_list, &gray_list);
-		else
-			put_object(object);
+			spin_unlock_irqrestore(&object->lock, flags);
+			continue;
+		}
+
 		spin_unlock_irqrestore(&object->lock, flags);
+		put_object(object);
 	}
 }
 
@@ -1078,14 +1086,47 @@ static void scan_object(struct kmemleak_
 		}
 	} else
 		hlist_for_each_entry(area, elem, &object->area_list, node)
-			scan_block((void *)(object->pointer + area->offset),
-				   (void *)(object->pointer + area->offset
-					    + area->length), object, 0);
+			scan_block((void *)area->start,
+				   (void *)(area->start + area->size),
+				   object, 0);
 out:
 	spin_unlock_irqrestore(&object->lock, flags);
 }
 
 /*
+ * Scan the objects already referenced (gray objects). More objects will be
+ * referenced and, if there are no memory leaks, all the objects are scanned.
+ */
+static void scan_gray_list(void)
+{
+	struct kmemleak_object *object, *tmp;
+
+	/*
+	 * The list traversal is safe for both tail additions and removals
+	 * from inside the loop. The kmemleak objects cannot be freed from
+	 * outside the loop because their use_count was incremented.
+	 */
+	object = list_entry(gray_list.next, typeof(*object), gray_list);
+	while (&object->gray_list != &gray_list) {
+		cond_resched();
+
+		/* may add new objects to the list */
+		if (!scan_should_stop())
+			scan_object(object);
+
+		tmp = list_entry(object->gray_list.next, typeof(*object),
+				 gray_list);
+
+		/* remove the object from the list and release it */
+		list_del(&object->gray_list);
+		put_object(object);
+
+		object = tmp;
+	}
+	WARN_ON(!list_empty(&gray_list));
+}
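
scan_gray_list() is a worklist traversal: scanning one object may append newly
referenced objects to the tail of the very list being walked, and the loop
runs until the list drains. A compact model with an array-backed worklist
(the reference graph is illustrative):

#include <stdio.h>

/* Each object "references" the next; scanning it pushes that reference
 * onto the worklist, as scan_object() does via scan_block(). */
static const int refs[4] = { 1, 2, 3, -1 };

int main(void)
{
	int list[8], head = 0, tail = 0;

	list[tail++] = 0;		/* initially gray: the root object */
	while (head < tail) {		/* tail may grow during the walk */
		int obj = list[head++];

		printf("scanning object %d\n", obj);
		if (refs[obj] >= 0)
			list[tail++] = refs[obj];	/* found -> gray */
	}
	return 0;
}
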
+
+/*
  * Scan data sections and all the referenced memory blocks allocated via the
  * kernel's standard allocators. This function must be called with the
  * scan_mutex held.
@@ -1093,10 +1134,9 @@ out:
 static void kmemleak_scan(void)
 {
 	unsigned long flags;
-	struct kmemleak_object *object, *tmp;
+	struct kmemleak_object *object;
 	int i;
 	int new_leaks = 0;
-	int gray_list_pass = 0;
 
 	jiffies_last_scan = jiffies;
 
@@ -1117,7 +1157,6 @@ static void kmemleak_scan(void)
 #endif
 		/* reset the reference count (whiten the object) */
 		object->count = 0;
-		object->flags &= ~OBJECT_NEW;
 		if (color_gray(object) && get_object(object))
 			list_add_tail(&object->gray_list, &gray_list);
 
@@ -1166,71 +1205,45 @@ static void kmemleak_scan(void)
 		struct task_struct *p, *g;
 
 		read_lock(&tasklist_lock);
-		do_each_thread(g, p) {
+		do_each_thread_all(g, p) {
 			scan_block(task_stack_page(p), task_stack_page(p) +
 				   THREAD_SIZE, NULL, 0);
-		} while_each_thread(g, p);
+		} while_each_thread_all(g, p);
 		read_unlock(&tasklist_lock);
 	}
 
 	/*
 	 * Scan the objects already referenced from the sections scanned
-	 * above. More objects will be referenced and, if there are no memory
-	 * leaks, all the objects will be scanned. The list traversal is safe
-	 * for both tail additions and removals from inside the loop. The
-	 * kmemleak objects cannot be freed from outside the loop because their
-	 * use_count was increased.
+	 * above.
 	 */
-repeat:
-	object = list_entry(gray_list.next, typeof(*object), gray_list);
-	while (&object->gray_list != &gray_list) {
-		cond_resched();
-
-		/* may add new objects to the list */
-		if (!scan_should_stop())
-			scan_object(object);
-
-		tmp = list_entry(object->gray_list.next, typeof(*object),
-				 gray_list);
-
-		/* remove the object from the list and release it */
-		list_del(&object->gray_list);
-		put_object(object);
-
-		object = tmp;
-	}
-
-	if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
-		goto scan_end;
+	scan_gray_list();
 
 	/*
-	 * Check for new objects allocated during this scanning and add them
-	 * to the gray list.
+	 * Check for new or unreferenced objects modified since the previous
+	 * scan and color them gray until the next scan.
 	 */
 	rcu_read_lock();
 	list_for_each_entry_rcu(object, &object_list, object_list) {
 		spin_lock_irqsave(&object->lock, flags);
-		if ((object->flags & OBJECT_NEW) && !color_black(object) &&
-		    get_object(object)) {
-			object->flags &= ~OBJECT_NEW;
+		if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
+		    && update_checksum(object) && get_object(object)) {
+			/* color it gray temporarily */
+			object->count = object->min_count;
 			list_add_tail(&object->gray_list, &gray_list);
 		}
 		spin_unlock_irqrestore(&object->lock, flags);
 	}
 	rcu_read_unlock();
 
-	if (!list_empty(&gray_list))
-		goto repeat;
-
-scan_end:
-	WARN_ON(!list_empty(&gray_list));
+	/*
+	 * Re-scan the gray list for modified unreferenced objects.
+	 */
+	scan_gray_list();
 
 	/*
-	 * If scanning was stopped or new objects were being allocated at a
-	 * higher rate than gray list scanning, do not report any new
-	 * unreferenced objects.
+	 * If scanning was stopped, do not report any new unreferenced objects.
 	 */
-	if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES)
+	if (scan_should_stop())
 		return;
 
 	/*
@@ -1654,8 +1667,7 @@ void __init kmemleak_init(void)
 			kmemleak_ignore(log->ptr);
 			break;
 		case KMEMLEAK_SCAN_AREA:
-			kmemleak_scan_area(log->ptr, log->offset, log->length,
-					   GFP_KERNEL);
+			kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
 			break;
 		case KMEMLEAK_NO_SCAN:
 			kmemleak_no_scan(log->ptr);
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/ksm.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/ksm.c
--- linux-2.6.32-504.3.3.el6.orig/mm/ksm.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/ksm.c	2015-01-21 12:02:58.680816055 +0300
@@ -16,6 +16,7 @@
 
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/mmgang.h>
 #include <linux/fs.h>
 #include <linux/mman.h>
 #include <linux/sched.h>
@@ -1075,7 +1076,7 @@ static int try_to_merge_one_page(struct 
 		if (!PageMlocked(kpage)) {
 			unlock_page(page);
 			lock_page(kpage);
-			mlock_vma_page(kpage);
+			mlock_vma_page(vma, kpage);
 			page = kpage;		/* for final unlock */
 		}
 	}
@@ -1884,6 +1885,12 @@ struct page *ksm_might_need_to_copy(stru
 		return page;		/* let do_swap_page report the error */
 
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+
+	if (gang_add_user_page(new_page, get_mm_gang(vma->vm_mm), GFP_KERNEL)) {
+		put_page(new_page);
+		new_page = NULL;
+	}
+
 	if (new_page) {
 		copy_user_highpage(new_page, page, address, vma);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/kstaled.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/kstaled.c
--- linux-2.6.32-504.3.3.el6.orig/mm/kstaled.c	2015-01-21 12:02:58.931809393 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/kstaled.c	2015-01-21 12:02:58.931809393 +0300
@@ -0,0 +1,305 @@
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/mmgang.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/wait.h>
+#include <linux/seqlock.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/ioport.h>
+#include <linux/backing-dev.h>
+
+static unsigned int kstaled_scan_secs;
+static DECLARE_WAIT_QUEUE_HEAD(kstaled_wait);
+
+struct kstaled_page_info {
+	int referenced_ptes;
+	int dirty_ptes;
+	unsigned long vm_flags;
+};
+
+static int __kstaled_check_page(struct page *page,
+		struct vm_area_struct *vma, unsigned long address, void *data)
+{
+	struct kstaled_page_info *info = data;
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (unlikely(PageTransHuge(page))) {
+		pmd_t *pmd;
+
+		spin_lock(&mm->page_table_lock);
+		pmd = page_check_address_pmd(page, mm, address,
+					     PAGE_CHECK_ADDRESS_PMD_FLAG);
+		if (!pmd) {
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+
+		if (pmdp_test_and_clear_young(vma, address, pmd)) {
+			info->referenced_ptes++;
+			info->vm_flags |= vma->vm_flags;
+			SetPageYoung(page);
+		}
+		info->dirty_ptes++;
+		spin_unlock(&mm->page_table_lock);
+	} else {
+		pte_t *pte;
+		spinlock_t *ptl;
+
+		pte = page_check_address(page, mm, address, &ptl, 0);
+		if (!pte)
+			goto out;
+
+		if (ptep_test_and_clear_young(vma, address, pte)) {
+			info->referenced_ptes++;
+			info->vm_flags |= vma->vm_flags;
+			SetPageYoung(page);
+		}
+		if (pte_dirty(*pte))
+			info->dirty_ptes++;
+		pte_unmap_unlock(pte, ptl);
+	}
+
+out:
+	if (vma->vm_flags & VM_LOCKED)
+		info->vm_flags |= VM_LOCKED;
+
+	return SWAP_AGAIN;
+}
+
+static void kstaled_check_page(struct page *page,
+			       struct kstaled_page_info *info)
+{
+	info->referenced_ptes = 0;
+	info->dirty_ptes = 0;
+	info->vm_flags = 0;
+
+	rmap_walk(page, __kstaled_check_page, info);
+
+	if (page_test_and_clear_young(page))
+		info->referenced_ptes++;
+
+	if (info->referenced_ptes && !PageAnon(page)) {
+		/*
+		 * Promote shared file-mapped pages.
+		 */
+		if (info->referenced_ptes > 1)
+			SetPageReferenced(page);
+
+		/*
+		 * Stimulate activation of file-backed executable pages at
+		 * first reference.
+		 */
+		if (info->vm_flags & VM_EXEC)
+			SetPageReferenced(page);
+	}
+
+	if (!PageIdle(page)) {
+		SetPageIdle(page);
+		info->referenced_ptes++;
+	}
+}
+
+static int kstaled_scan_page(struct page *page)
+{
+	struct kstaled_page_info info;
+	struct address_space *mapping;
+	struct gang *gang;
+	struct idle_page_stats *stats;
+	int nr_pages = 1;
+	int swap_backed = 1;
+
+	if (!PageLRU(page))
+		goto out;
+
+	if (!PageCompound(page)) {
+		if (PageMlocked(page))
+			goto out;
+		if (!page->mapping && !PageSwapCache(page))
+			goto out;
+	}
+
+	if (!get_page_unless_zero(page))
+		goto out;
+
+	if (unlikely(!PageLRU(page)))
+		goto out_put_page;
+
+	nr_pages = 1 << compound_trans_order(page);
+
+	if (PageMlocked(page))
+		goto out_put_page;
+
+	if (!trylock_page(page))
+		goto out_put_page;
+
+	if (!PageAnon(page) && !PageSwapCache(page)) {
+		mapping = page->mapping;
+		if (!mapping)
+			goto out_unlock_page;
+
+		if (mapping_unevictable(mapping))
+			goto out_unlock_page;
+
+		swap_backed = mapping_cap_swap_backed(mapping);
+		if (!swap_backed &&
+		    !mapping_cap_writeback_dirty(mapping))
+			goto out_unlock_page;
+	}
+
+	kstaled_check_page(page, &info);
+
+	unlock_page(page);
+
+	if (info.referenced_ptes || (info.vm_flags & VM_LOCKED))
+		goto out_put_page;
+
+	rcu_read_lock();
+	gang = page_gang(page);
+	stats = &gang->idle_scan_stats;
+	if (info.dirty_ptes || PageDirty(page) || PageWriteback(page)) {
+		if (swap_backed)
+			stats->idle_dirty_swap += nr_pages;
+		else
+			stats->idle_dirty_file += nr_pages;
+	} else
+		stats->idle_clean += nr_pages;
+	rcu_read_unlock();
+
+out_put_page:
+	put_page(page);
+out:
+	return nr_pages;
+
+out_unlock_page:
+	unlock_page(page);
+	goto out_put_page;
+}
+
+static int kstaled_scan_pages_range(unsigned long start_pfn,
+				    unsigned long nr_pages, void *arg)
+{
+	unsigned long pfn = start_pfn;
+	unsigned long end_pfn = start_pfn + nr_pages;
+
+	while (pfn < end_pfn) {
+		if (!pfn_valid(pfn)) {
+			pfn++;
+			continue;
+		}
+		pfn += kstaled_scan_page(pfn_to_page(pfn));
+		cond_resched();
+	}
+	return 0;
+}
+
+static void kstaled_do_scan(void)
+{
+	walk_system_ram_range(0, 1UL << MAX_PHYSMEM_BITS, NULL,
+			      kstaled_scan_pages_range);
+}
+
+static void kstaled_update_stats(void)
+{
+	struct zone *zone;
+	struct gang *gang;
+
+	rcu_read_lock();
+	for_each_zone(zone) {
+		for_each_gang(gang, zone) {
+			write_seqcount_begin(&gang->idle_page_stats_lock);
+			gang->idle_page_stats = gang->idle_scan_stats;
+			write_seqcount_end(&gang->idle_page_stats_lock);
+			memset(&gang->idle_scan_stats, 0,
+			       sizeof(gang->idle_scan_stats));
+		}
+	}
+	rcu_read_unlock();
+}
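+
+/*
+ * Reader-side sketch (annotation, not part of the original patch): a
+ * consumer that wants a consistent snapshot of idle_page_stats pairs the
+ * seqcount writer above with the usual retry loop:
+ *
+ *	struct idle_page_stats snap;
+ *	unsigned seq;
+ *
+ *	do {
+ *		seq = read_seqcount_begin(&gang->idle_page_stats_lock);
+ *		snap = gang->idle_page_stats;
+ *	} while (read_seqcount_retry(&gang->idle_page_stats_lock, seq));
+ */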
+
+static inline int kstaled_should_run(void)
+{
+	return kstaled_scan_secs > 0;
+}
+
+static int kstaled_scan_thread(void *arg)
+{
+	set_freezable();
+	set_user_nice(current, 5);
+
+	while (!kthread_should_stop()) {
+		if (kstaled_should_run()) {
+			kstaled_do_scan();
+			kstaled_update_stats();
+		}
+
+		try_to_freeze();
+
+		if (kstaled_should_run()) {
+			schedule_timeout_interruptible(kstaled_scan_secs * HZ);
+		} else {
+			/* zero idle page stats */
+			kstaled_update_stats();
+			wait_event_freezable(kstaled_wait,
+				kstaled_should_run() || kthread_should_stop());
+		}
+	}
+
+	return 0;
+}
+
+static ssize_t kstaled_scan_secs_show(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", kstaled_scan_secs);
+}
+
+static ssize_t kstaled_scan_secs_store(struct kobject *kobj,
+				       struct kobj_attribute *attr,
+				       const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 10, &val))
+		return -EINVAL;
+	kstaled_scan_secs = val;
+	wake_up_interruptible(&kstaled_wait);
+	return count;
+}
+
+static struct kobj_attribute kstaled_scan_secs_attr = __ATTR(scan_secs, 0644,
+		kstaled_scan_secs_show, kstaled_scan_secs_store);
+
+static struct attribute *kstaled_attrs[] = {
+	&kstaled_scan_secs_attr.attr,
+	NULL,
+};
+
+static struct attribute_group kstaled_attr_group = {
+	.attrs = kstaled_attrs,
+	.name = "kstaled",
+};
+
+static __init int kstaled_init(void)
+{
+	int err;
+	struct task_struct *kstaled_thread;
+
+	kstaled_thread = kthread_run(kstaled_scan_thread, NULL, "kstaled");
+	if (IS_ERR(kstaled_thread)) {
+		printk(KERN_ERR "Failed to start kstaled\n");
+		return PTR_ERR(kstaled_thread);
+	}
+
+	err = sysfs_create_group(mm_kobj, &kstaled_attr_group);
+	if (err) {
+		printk(KERN_ERR "kstaled: register sysfs failed\n");
+		kthread_stop(kstaled_thread);
+		return err;
+	}
+
+	return 0;
+}
+module_init(kstaled_init);
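Editor's note: the scanner thread stays parked until scan_secs becomes non-zero. Since the attribute group is registered on mm_kobj under the name "kstaled", the knob lands at /sys/kernel/mm/kstaled/scan_secs. A userspace sketch enabling a 120-second scan period (error handling trimmed):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/mm/kstaled/scan_secs", "w");

	if (!f)
		return 1;
	fprintf(f, "120\n");	/* writing 0 parks the thread and zeroes the stats */
	return fclose(f) ? 1 : 0;
}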
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/madvise.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/madvise.c
--- linux-2.6.32-504.3.3.el6.orig/mm/madvise.c	2014-12-12 23:29:27.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/madvise.c	2015-01-21 12:02:41.739265799 +0300
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
+#include <linux/swap.h>
 #include <linux/file.h>
 
 /*
@@ -26,6 +27,7 @@ static int madvise_need_mmap_write(int b
 	case MADV_REMOVE:
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
+	case MADV_DEACTIVATE:
 		return 0;
 	default:
 		/* be safe, default to 1. list exceptions explicitly */
@@ -80,7 +82,7 @@ static long madvise_behavior(struct vm_a
 		break;
 	case MADV_HUGEPAGE:
 	case MADV_NOHUGEPAGE:
-		error = hugepage_madvise(&new_flags, behavior);
+		error = hugepage_madvise(vma, &new_flags, behavior);
 		if (error)
 			goto out;
 		break;
@@ -272,6 +274,26 @@ static int madvise_hwpoison(int bhv, uns
 }
 #endif
 
+static long madvise_deactivate(struct vm_area_struct *vma,
+			       struct vm_area_struct **prev,
+			       unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+	struct page *page;
+
+	*prev = vma;
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
+		page = follow_page(vma, addr, FOLL_GET);
+		if (!page)
+			continue;
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		deactivate_page(page);
+		put_page(page);
+	}
+	return 0;
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -283,6 +305,8 @@ madvise_vma(struct vm_area_struct *vma, 
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_DONTNEED:
 		return madvise_dontneed(vma, prev, start, end);
+	case MADV_DEACTIVATE:
+		return madvise_deactivate(vma, prev, start, end);
 	default:
 		return madvise_behavior(vma, prev, start, end, behavior);
 	}
@@ -300,6 +324,7 @@ madvise_behavior_valid(int behavior)
 	case MADV_REMOVE:
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
+	case MADV_DEACTIVATE:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
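Editor's note: madvise_deactivate() walks the range one page at a time and moves each mapped page to the inactive list without unmapping anything. A hedged userspace sketch, assuming the patched headers export the MADV_DEACTIVATE constant (its value is defined elsewhere in this patch, not in the hunks shown):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 << 20, i;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	for (i = 0; i < len; i += 4096)
		p[i] = 1;			/* fault the pages in */
	if (madvise(p, len, MADV_DEACTIVATE))	/* hint: these pages are cold */
		perror("madvise");
	munmap(p, len);
	return 0;
}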
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/memory-failure.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/memory-failure.c
--- linux-2.6.32-504.3.3.el6.orig/mm/memory-failure.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/memory-failure.c	2015-01-21 12:02:43.799211106 +0300
@@ -392,7 +392,7 @@ static struct task_struct *find_early_ki
 	do {
 		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
 			return t;
-	} while_each_thread(tsk, t);
+	} while_each_thread_all(tsk, t);
 	return NULL;
 }
 
@@ -432,7 +432,7 @@ static void collect_procs_anon(struct pa
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
 		goto out;
-	for_each_process (tsk) {
+	for_each_process_all (tsk) {
 		struct anon_vma_chain *vmac;
 		struct task_struct *t = task_early_kill(tsk, force_early);
 
@@ -473,7 +473,7 @@ static void collect_procs_file(struct pa
 
 	read_lock(&tasklist_lock);
 	spin_lock(&mapping->i_mmap_lock);
-	for_each_process(tsk) {
+	for_each_process_all(tsk) {
 		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 		struct task_struct *t = task_early_kill(tsk, force_early);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/memory.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/memory.c
--- linux-2.6.32-504.3.3.el6.orig/mm/memory.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/memory.c	2015-01-21 12:02:58.681816029 +0300
@@ -40,8 +40,10 @@
 
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
+#include <linux/mmgang.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
+#include <linux/virtinfo.h>
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
@@ -56,6 +58,12 @@
 #include <linux/kallsyms.h>
 #include <linux/swapops.h>
 #include <linux/elf.h>
+#include <linux/pram.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/kmem.h>
+#include <bc/vmpages.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -95,7 +103,7 @@ EXPORT_SYMBOL(high_memory);
  * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
  *   as ancient (libc5 based) binaries can segfault. )
  */
-int randomize_va_space __read_mostly =
+int _randomize_va_space __read_mostly =
 #ifdef CONFIG_COMPAT_BRK
 					1;
 #else
@@ -133,18 +141,21 @@ void pgd_clear_bad(pgd_t *pgd)
 	pgd_ERROR(*pgd);
 	pgd_clear(pgd);
 }
+EXPORT_SYMBOL(pgd_clear_bad);
 
 void pud_clear_bad(pud_t *pud)
 {
 	pud_ERROR(*pud);
 	pud_clear(pud);
 }
+EXPORT_SYMBOL(pud_clear_bad);
 
 void pmd_clear_bad(pmd_t *pmd)
 {
 	pmd_ERROR(*pmd);
 	pmd_clear(pmd);
 }
+EXPORT_SYMBOL(pmd_clear_bad);
 
 /*
  * Note: this doesn't free the actual pages themselves. That
@@ -157,6 +168,8 @@ static void free_pte_range(struct mmu_ga
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
 	tlb->mm->nr_ptes--;
+	tlb->ptes_freed++;
+	ub_page_table_uncharge(tlb->mm);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -190,6 +203,10 @@ static inline void free_pmd_range(struct
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
 	pmd_free_tlb(tlb, pmd, start);
+#ifndef __PAGETABLE_PMD_FOLDED
+	tlb->mm->nr_ptds--;
+	ub_page_table_uncharge(tlb->mm);
+#endif
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -223,6 +240,10 @@ static inline void free_pud_range(struct
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
 	pud_free_tlb(tlb, pud, start);
+#ifndef __PAGETABLE_PUD_FOLDED
+	tlb->mm->nr_ptds--;
+	ub_page_table_uncharge(tlb->mm);
+#endif
 }
 
 /*
@@ -323,15 +344,25 @@ void free_pgtables(struct mmu_gather *tl
 		}
 		vma = next;
 	}
+	ub_page_table_commit(tlb->mm);
 }
 
 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 		pmd_t *pmd, unsigned long address)
 {
-	pgtable_t new = pte_alloc_one(mm, address);
+	pgtable_t new;
 	int wait_split_huge_page;
-	if (!new)
+	int one;
+
+	one = ub_page_table_get_one(mm);
+	if (one < 0)
+		return -ENOMEM;
+
+	new = pte_alloc_one(mm, address);
+	if (!new) {
+		ub_page_table_put_one(mm, one);
 		return -ENOMEM;
+	}
 
 	/*
 	 * Ensure all pte setup (eg. pte page lock and page clearing) are
@@ -351,18 +382,26 @@ int __pte_alloc(struct mm_struct *mm, st
 	spin_lock(&mm->page_table_lock);
 	wait_split_huge_page = 0;
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
+		if (ub_page_table_charge(mm, one)) {
+			spin_unlock(&mm->page_table_lock);
+			pte_free(mm, new);
+			return -ENOMEM;
+		}
 		mm->nr_ptes++;
 		pmd_populate(mm, pmd, new);
 		new = NULL;
 	} else if (unlikely(pmd_trans_splitting(*pmd)))
 		wait_split_huge_page = 1;
 	spin_unlock(&mm->page_table_lock);
-	if (new)
+	if (new) {
+		ub_page_table_put_one(mm, one);
 		pte_free(mm, new);
+	}
 	if (wait_split_huge_page)
 		wait_split_huge_page(vma->anon_vma, pmd);
 	return 0;
 }
+EXPORT_SYMBOL(__pte_alloc);
 
 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 {
@@ -402,8 +441,8 @@ add_mm_rss(struct mm_struct *mm, int fil
  *
  * The calling function must still handle the error.
  */
-static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
-			  pte_t pte, struct page *page)
+void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+		   pte_t pte, struct page *page)
 {
 	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
 	pud_t *pud = pud_offset(pgd, addr);
@@ -462,8 +501,9 @@ static void print_bad_pte(struct vm_area
 	dump_stack();
 	add_taint(TAINT_BAD_PAGE);
 }
+EXPORT_SYMBOL(print_bad_pte);
 
-static inline int is_cow_mapping(unsigned int flags)
+static inline int is_cow_mapping(vm_flags_t flags)
 {
 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
@@ -576,6 +616,7 @@ check_pfn:
 out:
 	return pfn_to_page(pfn);
 }
+EXPORT_SYMBOL(vm_normal_page);
 
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
@@ -608,17 +649,35 @@ copy_one_pte(struct mm_struct *dst_mm, s
 						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			if (!is_migration_entry(entry))
+
+			if (!non_swap_entry(entry)) {
 				rss[2]++;
-			else if (is_write_migration_entry(entry) &&
-					is_cow_mapping(vm_flags)) {
-				/*
-				 * COW mappings require pages in both parent
-				 * and child to be set to read.
-				 */
-				make_migration_entry_read(&entry);
-				pte = swp_entry_to_pte(entry);
-				set_pte_at(src_mm, addr, src_pte, pte);
+			} else if (is_migration_entry(entry)) {
+				page = migration_entry_to_page(entry);
+
+				rss[PageAnon(page)]++;
+
+				if (is_write_migration_entry(entry) &&
+				    is_cow_mapping(vm_flags)) {
+					/*
+					 * COW mappings require pages in both
+					 * parent and child to be set to read.
+					 */
+					make_migration_entry_read(&entry);
+					pte = swp_entry_to_pte(entry);
+					set_pte_at(src_mm, addr, src_pte, pte);
+				}
+			} else if (is_vswap_entry(entry)) {
+				rss[2]++;
+				page = vswap_entry_to_page(entry);
+				get_page(page);
+				get_vswap_page(page);
+				if (is_write_vswap_entry(entry) &&
+				    is_cow_mapping(vm_flags)) {
+					entry = wprotect_vswap_entry(entry);
+					pte = swp_entry_to_pte(entry);
+					set_pte_at(src_mm, addr, src_pte, pte);
+				}
 			}
 		}
 		goto out_set_pte;
@@ -653,8 +712,17 @@ out_set_pte:
 	return 0;
 }
 
+#define pte_ptrs(a)	(PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1)))
+#ifdef CONFIG_BEANCOUNTERS
+#define same_ub(mm1, mm2)      ((mm1)->mm_ub == (mm2)->mm_ub)
+#else
+#define same_ub(mm1, mm2)      1
+#endif
+
 int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+		   pmd_t *dst_pmd, pmd_t *src_pmd,
+		   struct vm_area_struct *dst_vma,
+		   struct vm_area_struct *vma,
 		   unsigned long addr, unsigned long end)
 {
 	pte_t *orig_src_pte, *orig_dst_pte;
@@ -701,6 +769,7 @@ again:
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(orig_src_pte);
+
 	add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
@@ -716,7 +785,9 @@ again:
 }
 
 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
+		pud_t *dst_pud, pud_t *src_pud,
+		struct vm_area_struct *dst_vma,
+		struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end)
 {
 	pmd_t *src_pmd, *dst_pmd;
@@ -730,7 +801,10 @@ static inline int copy_pmd_range(struct 
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*src_pmd)) {
 			int err;
-			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+			if (next-addr != HPAGE_PMD_SIZE) {
+				split_huge_page_pmd(src_mm, src_pmd);
+				goto split_fallthrough;
+			}
 			err = copy_huge_pmd(dst_mm, src_mm,
 					    dst_pmd, src_pmd, addr, vma);
 			if (err == -ENOMEM)
@@ -739,17 +813,20 @@ static inline int copy_pmd_range(struct 
 				continue;
 			/* fall through */
 		}
+split_fallthrough:
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
 		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
-						vma, addr, next))
+						dst_vma, vma, addr, next))
 			return -ENOMEM;
 	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+		pgd_t *dst_pgd, pgd_t *src_pgd,
+		struct vm_area_struct *dst_vma,
+		struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end)
 {
 	pud_t *src_pud, *dst_pud;
@@ -764,19 +841,21 @@ static inline int copy_pud_range(struct 
 		if (pud_none_or_clear_bad(src_pud))
 			continue;
 		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
-						vma, addr, next))
+						dst_vma, vma, addr, next))
 			return -ENOMEM;
 	} while (dst_pud++, src_pud++, addr = next, addr != end);
 	return 0;
 }
 
-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		struct vm_area_struct *vma)
+int __copy_page_range(struct vm_area_struct *dst_vma,
+		      struct vm_area_struct *vma,
+		      unsigned long addr, size_t size)
 {
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
+	struct mm_struct *src_mm = vma->vm_mm;
 	pgd_t *src_pgd, *dst_pgd;
 	unsigned long next;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
+	unsigned long end = addr + size;
 	int ret;
 
 	/*
@@ -820,7 +899,7 @@ int copy_page_range(struct mm_struct *ds
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
 		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
-					    vma, addr, next))) {
+					    dst_vma, vma, addr, next))) {
 			ret = -ENOMEM;
 			break;
 		}
@@ -831,6 +910,17 @@ int copy_page_range(struct mm_struct *ds
 						  vma->vm_start, end);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(__copy_page_range);
+
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
+		    struct vm_area_struct *dst_vma, struct vm_area_struct *vma)
+{
+	BUG_ON(dst_vma->vm_mm != dst);
+	BUG_ON(vma->vm_mm != src);
+	return __copy_page_range(dst_vma, vma, vma->vm_start,
+				 vma->vm_end - vma->vm_start);
+}
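+
+/*
+ * Annotation (not in the original patch): the split above lets in-kernel
+ * users duplicate an arbitrary sub-range of a vma, e.g. a restore path
+ * copying a single page worth of ptes:
+ *
+ *	err = __copy_page_range(dst_vma, src_vma, addr, PAGE_SIZE);
+ *
+ * while fork() keeps using the whole-vma copy_page_range() wrapper above.
+ */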
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
@@ -892,7 +982,7 @@ static unsigned long zap_pte_range(struc
 				trace_mm_anon_userfree(mm, addr);
 			} else {
 				if (pte_dirty(ptent))
-					set_page_dirty(page);
+					set_page_dirty_mm(page, mm);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
@@ -917,8 +1007,23 @@ static unsigned long zap_pte_range(struc
 		} else {
 			swp_entry_t ent = pte_to_swp_entry(ptent);
 
-			if (!is_migration_entry(ent))
+			if (!non_swap_entry(ent)) {
+				swap_usage--;
+			} else if (is_migration_entry(ent)) {
+				struct page *page;
+
+				page = migration_entry_to_page(ent);
+
+				if (PageAnon(page))
+					anon_rss--;
+				else
+					file_rss--;
+			} else if (is_vswap_entry(ent)) {
+				struct page *page = vswap_entry_to_page(ent);
 				swap_usage--;
+				put_vswap_page(page);
+				put_page(page);
+			}
 			if (unlikely(!free_swap_and_cache(ent)))
 				print_bad_pte(vma, addr, ptent, NULL);
 		}
@@ -1296,7 +1401,7 @@ split_fallthrough:
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
+			set_page_dirty_mm(page, mm);
 		/*
 		 * pte_mkyoung() would be more correct here, but atomic care
 		 * is needed to avoid losing the dirty bit: it is easier to use
@@ -1599,6 +1704,8 @@ int get_user_pages(struct task_struct *t
 		flags |= FOLL_WRITE;
 	if (force)
 		flags |= FOLL_FORCE;
+	if (write < 0)
+		flags &= ~FOLL_TOUCH;
 
 	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
@@ -1677,7 +1784,7 @@ static int insert_page(struct vm_area_st
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
 	inc_mm_counter(mm, file_rss);
-	page_add_file_rmap(page);
+	page_add_file_rmap(page, mm);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
 	retval = 0;
@@ -2166,6 +2273,11 @@ static inline void cow_user_page(struct 
 		copy_user_highpage(dst, src, va, vma);
 }
 
+static inline int check_memory_limits(struct mm_struct *mm)
+{
+	return ub_check_ram_limits(mm_ub(mm), GFP_HIGHUSER);
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -2306,6 +2418,8 @@ reuse:
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		if (old_page)
+			ClearPageCheckpointed(old_page);
 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
 			update_mmu_cache(vma, address, entry);
 		ret |= VM_FAULT_WRITE;
@@ -2319,6 +2433,9 @@ reuse:
 gotten:
 	pte_unmap_unlock(page_table, ptl);
 
+	if (unlikely(check_memory_limits(mm)))
+		goto oom;
+
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
 
@@ -2344,8 +2461,13 @@ gotten:
 		unlock_page(old_page);
 	}
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
+	if (gang_add_user_page(new_page, get_mm_gang(mm), GFP_KERNEL))
+		goto oom_free_new;
+
+	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+		gang_del_user_page(new_page);
 		goto oom_free_new;
+	}
 
 	/*
 	 * Re-check the pte - we dropped the lock
@@ -2365,6 +2487,7 @@ gotten:
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		ClearPageCheckpointed(new_page);
 		/*
 		 * Clear the pte entry and flush it first, before updating the
 		 * pte with the new entry. This will avoid a race condition
@@ -2409,8 +2532,10 @@ gotten:
 		/* Free the old page.. */
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
-	} else
+	} else {
 		mem_cgroup_uncharge_page(new_page);
+		gang_del_user_page(new_page);
+	}
 
 	if (new_page)
 		page_cache_release(new_page);
@@ -2438,7 +2563,7 @@ unlock:
 		if (page_mkwrite) {
 			struct address_space *mapping = dirty_page->mapping;
 
-			set_page_dirty(dirty_page);
+			set_page_dirty_mm(dirty_page, mm);
 			unlock_page(dirty_page);
 			page_cache_release(dirty_page);
 			if (mapping)	{
@@ -2522,6 +2647,44 @@ static void reset_vma_truncate_counts(st
 		vma->vm_truncate_count = 0;
 }
 
+static int synchronize_mapping_faults_vma(struct address_space *mapping,
+					  struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (vma->vm_truncate_count)
+		return 0;
+
+	vma->vm_truncate_count = 1;
+	atomic_inc(&mm->mm_count);
+	spin_unlock(&mapping->i_mmap_lock);
+	down_write(&mm->mmap_sem);
+	up_write(&mm->mmap_sem);
+	mmdrop(mm);
+	spin_lock(&mapping->i_mmap_lock);
+
+	return 1;
+}
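+
+/*
+ * Annotation: the down_write()/up_write() pair above acts as a barrier,
+ * not a critical section -- page faults hold mmap_sem for reading, so once
+ * mmap_sem has been taken and released for writing, every fault that
+ * started before vm_truncate_count was set has drained out of this mm.
+ */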
+
+/* under mapping->host->i_mutex */
+void synchronize_mapping_faults(struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+
+	spin_lock(&mapping->i_mmap_lock);
+	mapping->truncate_count = 1;
+	reset_vma_truncate_counts(mapping);
+restart:
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		if (synchronize_mapping_faults_vma(mapping, vma))
+			goto restart;
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		if (synchronize_mapping_faults_vma(mapping, vma))
+			goto restart;
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
 static int unmap_mapping_range_vma(struct vm_area_struct *vma,
 		unsigned long start_addr, unsigned long end_addr,
 		struct zap_details *details)
@@ -2659,7 +2822,15 @@ void unmap_mapping_range(struct address_
 	details.last_index = hba + hlen - 1;
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
-	details.i_mmap_lock = &mapping->i_mmap_lock;
+
+	zap_mapping_range(mapping, &details);
+}
+EXPORT_SYMBOL(unmap_mapping_range);
+
+void zap_mapping_range(struct address_space *mapping,
+		       struct zap_details *details)
+{
+	details->i_mmap_lock = &mapping->i_mmap_lock;
 
 	spin_lock(&mapping->i_mmap_lock);
 
@@ -2670,15 +2841,96 @@ void unmap_mapping_range(struct address_
 			reset_vma_truncate_counts(mapping);
 		mapping->truncate_count++;
 	}
-	details.truncate_count = mapping->truncate_count;
+	details->truncate_count = mapping->truncate_count;
 
 	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
-		unmap_mapping_range_tree(&mapping->i_mmap, &details);
+		unmap_mapping_range_tree(&mapping->i_mmap, details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
-		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
+		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, details);
 	spin_unlock(&mapping->i_mmap_lock);
 }
-EXPORT_SYMBOL(unmap_mapping_range);
+
+static int do_vswap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned int flags, pte_t orig_pte)
+{
+	swp_entry_t entry;
+	struct page *page;
+	spinlock_t *ptl;
+	pte_t pte;
+	int ret = 0;
+
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (unlikely(!pte_same(*page_table, orig_pte))) {
+		pte_unmap_unlock(page_table, ptl);
+		return 0;
+	}
+
+	entry = pte_to_swp_entry(orig_pte);
+	page = vswap_entry_to_page(entry);
+	BUG_ON(!PageUptodate(page));
+	get_page(page);
+
+	pte_unmap_unlock(page_table, ptl);
+
+	if (!page_mapped(page) && !page_in_gang(page, get_mm_gang(mm))) {
+		ub_percpu_inc(mm_ub(mm), vswapin);
+		ub_reclaim_rate_limit(mm_ub(mm), 1, 1);
+		if (!isolate_lru_page(page)) {
+			if (gang_mod_user_page(page, get_mm_gang(mm), GFP_KERNEL))
+				ret = VM_FAULT_OOM;
+			putback_lru_page(page);
+			if (ret == VM_FAULT_OOM)
+				goto out;
+		}
+	}
+
+	if (!lock_page_or_retry(page, mm, flags)) {
+		ret |= VM_FAULT_RETRY;
+		goto out;
+	}
+
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (unlikely(!pte_same(*page_table, orig_pte)))
+		goto out_unmap;
+
+	__count_vm_event(VSWPIN);
+	inc_mm_counter(mm, anon_rss);
+	dec_mm_counter(mm, swap_usage);
+	pte = mk_pte(page, vma->vm_page_prot);
+	if ((flags & FAULT_FLAG_WRITE) && is_write_vswap_entry(entry)) {
+		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+#ifdef ClearPageCheckpointed
+		ClearPageCheckpointed(page);
+#endif
+		flags &= ~FAULT_FLAG_WRITE;
+	}
+	flush_icache_page(vma, page);
+	set_pte_at(mm, address, page_table, pte);
+	page_add_anon_rmap(page, vma, address);
+	put_vswap_page(page);
+	unlock_page(page);
+	put_page(page);
+
+	if (flags & FAULT_FLAG_WRITE) {
+		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+		if (ret & VM_FAULT_ERROR)
+			ret &= VM_FAULT_ERROR;
+		return ret;
+	}
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, address, pte);
+	pte_unmap_unlock(page_table, ptl);
+	return ret;
+
+out_unmap:
+	pte_unmap_unlock(page_table, ptl);
+	unlock_page(page);
+out:
+	put_page(page);
+	return ret;
+}
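+
+/*
+ * Annotation (editor's reading of the code above): a vswap entry is
+ * "virtual swap" -- the page itself never leaves RAM.  The pte is replaced
+ * by a swap-like entry that still references the page, so this fault path
+ * can reinstall it without any I/O; only the container's swap accounting
+ * (VSWPIN, vswapin) is touched.
+ */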
 
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
@@ -2697,14 +2949,24 @@ static int do_swap_page(struct mm_struct
 	struct mem_cgroup *ptr = NULL;
 	int exclusive = 0;
 	int ret = 0;
+	cycles_t start;
 
+	start = get_cycles();
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
 
+	if (unlikely(check_memory_limits(mm))) {
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	entry = pte_to_swp_entry(orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
 			migration_entry_wait(mm, pmd, address);
+		} else if (is_vswap_entry(entry)) {
+			return do_vswap_page(mm, vma, address, page_table, pmd,
+					     flags, orig_pte);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
 		} else {
@@ -2742,6 +3004,20 @@ static int do_swap_page(struct mm_struct
 		ret = VM_FAULT_HWPOISON;
 		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 		goto out_release;
+	} else if (!page_mapped(page) && !page_in_gang(page, get_mm_gang(mm))) {
+		ub_percpu_inc(mm_ub(mm), vswapin);
+		ub_reclaim_rate_limit(mm_ub(mm), 1, 1);
+		/*
+		 * move page into container after vswapin throttling
+		 * to protect against endless bouncing in vswap.
+		 */
+		if (!isolate_lru_page(page)) {
+			if (gang_mod_user_page(page, get_mm_gang(mm), GFP_KERNEL))
+				ret = VM_FAULT_OOM;
+			putback_lru_page(page);
+			if (ret == VM_FAULT_OOM)
+				goto out_release;
+		}
 	}
 
 	locked = lock_page_or_retry(page, mm, flags);
@@ -2807,6 +3083,7 @@ static int do_swap_page(struct mm_struct
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+		ClearPageCheckpointed(page);
 		flags &= ~FAULT_FLAG_WRITE;
 		exclusive = 1;
 	}
@@ -2817,7 +3094,8 @@ static int do_swap_page(struct mm_struct
 	mem_cgroup_commit_charge_swapin(page, ptr);
 
 	swap_free(entry);
-	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+	if (vm_swap_full() || ub_swap_full(mm->mm_ub) ||
+			(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
 		try_to_free_swap(page);
 	unlock_page(page);
 	if (swapcache) {
@@ -2845,6 +3123,9 @@ static int do_swap_page(struct mm_struct
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
+	spin_lock_irq(&kstat_glb_lock);
+	KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start);
+	spin_unlock_irq(&kstat_glb_lock);
 	trace_mm_anon_pgin(mm, address);
 	return ret;
 out_nomap:
@@ -2925,6 +3206,9 @@ static int do_anonymous_page(struct mm_s
 	}
 
 	/* Allocate our own private page. */
+	if (unlikely(check_memory_limits(mm)))
+		goto oom;
+
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
 	page = alloc_zeroed_user_highpage_movable(vma, address);
@@ -2933,8 +3217,14 @@ static int do_anonymous_page(struct mm_s
 	__SetPageUptodate(page);
 
 	trace_mm_anon_fault(mm, address);
-	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
+
+	if (gang_add_user_page(page, get_mm_gang(mm), GFP_KERNEL))
+		goto oom_free_page;
+
+	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+		gang_del_user_page(page);
 		goto oom_free_page;
+	}
 
 	entry = mk_pte(page, vma->vm_page_prot);
 	if (vma->vm_flags & VM_WRITE)
@@ -2956,6 +3246,7 @@ unlock:
 	return 0;
 release:
 	mem_cgroup_uncharge_page(page);
+	gang_del_user_page(page);
 	page_cache_release(page);
 	goto unlock;
 oom_free_page:
@@ -2991,12 +3282,17 @@ static int __do_fault(struct mm_struct *
 	struct vm_fault vmf;
 	int ret;
 	int page_mkwrite = 0;
+	cycles_t start;
+
+	if (unlikely(check_memory_limits(mm)))
+		return VM_FAULT_OOM;
 
 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.page = NULL;
 
+	start = get_cycles();
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
 			    VM_FAULT_RETRY)))
@@ -3017,6 +3313,11 @@ static int __do_fault(struct mm_struct *
 	else
 		VM_BUG_ON(!PageLocked(vmf.page));
 
+	local_irq_disable();
+	KSTAT_LAT_PCPU_ADD(&kstat_glob.page_in, smp_processor_id(),
+			get_cycles() - start);
+	local_irq_enable();
+
 	/*
 	 * Should we do an early C-O-W break?
 	 */
@@ -3034,8 +3335,14 @@ static int __do_fault(struct mm_struct *
 				ret = VM_FAULT_OOM;
 				goto out;
 			}
+			if (gang_add_user_page(page, get_mm_gang(mm), GFP_KERNEL)) {
+				ret = VM_FAULT_OOM;
+				page_cache_release(page);
+				goto out;
+			}
 			if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
 				ret = VM_FAULT_OOM;
+				gang_del_user_page(page);
 				page_cache_release(page);
 				goto out;
 			}
@@ -3096,14 +3403,16 @@ static int __do_fault(struct mm_struct *
 	if (likely(pte_same(*page_table, orig_pte))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
-		if (flags & FAULT_FLAG_WRITE)
+		if (flags & FAULT_FLAG_WRITE) {
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			ClearPageCheckpointed(page);
+		}
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
-			page_add_file_rmap(page);
+			page_add_file_rmap(page, mm);
 			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
 				get_page(dirty_page);
@@ -3116,9 +3425,10 @@ static int __do_fault(struct mm_struct *
 	} else {
 		if (charged)
 			mem_cgroup_uncharge_page(page);
-		if (anon)
+		if (anon) {
+			gang_del_user_page(page);
 			page_cache_release(page);
-		else
+		} else
 			anon = 1; /* no anon but release faulted_page */
 	}
 
@@ -3129,7 +3439,7 @@ out:
 		struct address_space *mapping = page->mapping;
 		int dirtied = 0;
 
-		if (set_page_dirty(dirty_page))
+		if (set_page_dirty_mm(dirty_page, mm))
 			dirtied = 1;
 		unlock_page(dirty_page);
 		put_page(dirty_page);
@@ -3348,6 +3658,149 @@ retry:
 
 	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
+EXPORT_SYMBOL(handle_mm_fault);
+
+static int __install_new_anon_page(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long addr,
+			pmd_t *pmd, struct page *page)
+{
+	spinlock_t *ptl;
+	pte_t *pte;
+	pte_t entry;
+	int err = -ENOMEM;
+
+	__SetPageUptodate(page);
+
+	if (unlikely(check_memory_limits(mm)))
+		goto out;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		goto out;
+
+	if (gang_add_user_page(page, get_mm_gang(mm), GFP_KERNEL))
+		goto out;
+
+	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
+		goto out_gang_del;
+
+	entry = mk_pte(page, vma->vm_page_prot);
+	if (vma->vm_flags & VM_WRITE)
+	entry = pte_mkwrite(pte_mkdirty(entry));
+
+	err = -EBUSY;
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!pte_none(*pte)) {
+		pte_unmap_unlock(pte, ptl);
+		goto out_uncharge;
+	}
+
+	inc_mm_counter(mm, anon_rss);
+	page_add_new_anon_rmap(page, vma, addr);
+	set_pte_at(mm, addr, pte, entry);
+	update_mmu_cache(vma, addr, entry);
+	pte_unmap_unlock(pte, ptl);
+
+	return 0;
+
+out_uncharge:
+	mem_cgroup_uncharge_page(page);
+out_gang_del:
+	gang_del_user_page(page);
+out:
+	return err;
+}
+
+static int __install_anon_page(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long addr,
+			pmd_t *pmd, struct page *page)
+{
+	spinlock_t *ptl;
+	pte_t *pte;
+	pte_t entry;
+
+	if (unlikely(check_memory_limits(mm)))
+		return -ENOMEM;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+
+	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
+		return -ENOMEM;
+
+	entry = mk_pte(page, vma->vm_page_prot);
+	if (vma->vm_flags & VM_WRITE)
+	entry = pte_mkwrite(pte_mkdirty(entry));
+
+	lock_page(page);
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!pte_none(*pte)) {
+		pte_unmap_unlock(pte, ptl);
+		unlock_page(page);
+		mem_cgroup_uncharge_page(page);
+		return -EBUSY;
+	}
+
+	inc_mm_counter(mm, anon_rss);
+	page_add_anon_rmap(page, vma, addr);
+	set_pte_at(mm, addr, pte, entry);
+	update_mmu_cache(vma, addr, entry);
+	pte_unmap_unlock(pte, ptl);
+	unlock_page(page);
+
+	return 0;
+}
+
+/*
+ * Called with mm->mmap_sem held for reading.
+ */
+int install_anon_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		      unsigned long addr, struct page *page)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		return -EFAULT;
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return -EFAULT;
+
+	if (PageCompound(page))
+		return -EFAULT;
+
+	pgd = pgd_offset(mm, addr);
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		return -ENOMEM;
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return -ENOMEM;
+
+	/* See comments in handle_mm_fault */
+	if (unlikely(pmd_none(*pmd)) &&
+	    unlikely(__pte_alloc(mm, vma, pmd, addr)))
+		return -ENOMEM;
+	if (unlikely(pmd_trans_huge(*pmd)))
+		return -EFAULT;
+
+	pte = pte_offset_map(pmd, addr);
+	if (!pte)
+		return -ENOMEM;
+	if (!pte_none(*pte)) {
+		pte_unmap(pte);
+		return -EBUSY;
+	}
+
+	pte_unmap(pte);
+	if (!pram_page_dirty(page))
+		return __install_new_anon_page(mm, vma, addr, pmd, page);
+
+	page->mapping = NULL;
+	return __install_anon_page(mm, vma, addr, pmd, page);
+}
+EXPORT_SYMBOL(install_anon_page);
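+
+/*
+ * A hypothetical caller sketch (not part of the patch), e.g. a restore
+ * path reinstating a saved anonymous page, with mmap_sem held for reading:
+ *
+ *	vma = find_vma(mm, addr);
+ *	if (vma && addr >= vma->vm_start)
+ *		err = install_anon_page(mm, vma, addr, page);
+ */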
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
@@ -3356,20 +3809,37 @@ retry:
  */
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-	pud_t *new = pud_alloc_one(mm, address);
-	if (!new)
+	pud_t *new;
+	int one;
+
+	one = ub_page_table_get_one(mm);
+	if (one < 0)
 		return -ENOMEM;
 
+	new = pud_alloc_one(mm, address);
+	if (!new) {
+		ub_page_table_put_one(mm, one);
+		return -ENOMEM;
+	}
+
 	smp_wmb(); /* See comment in __pte_alloc */
 
 	spin_lock(&mm->page_table_lock);
-	if (pgd_present(*pgd))		/* Another has populated it */
+	if (pgd_present(*pgd)) {		/* Another has populated it */
+		ub_page_table_put_one(mm, one);
 		pud_free(mm, new);
-	else
+	} else if (ub_page_table_charge(mm, one)) {
+		spin_unlock(&mm->page_table_lock);
+		pud_free(mm, new);
+		return -ENOMEM;
+	} else {
 		pgd_populate(mm, pgd, new);
+		mm->nr_ptds++;
+	}
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
+EXPORT_SYMBOL(__pud_alloc);
 #endif /* __PAGETABLE_PUD_FOLDED */
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -3379,27 +3849,51 @@ int __pud_alloc(struct mm_struct *mm, pg
  */
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-	pmd_t *new = pmd_alloc_one(mm, address);
-	if (!new)
+	pmd_t *new;
+	int one;
+
+	one = ub_page_table_get_one(mm);
+	if (one < 0)
 		return -ENOMEM;
 
+	new = pmd_alloc_one(mm, address);
+	if (!new) {
+		ub_page_table_put_one(mm, one);
+		return -ENOMEM;
+	}
+
 	smp_wmb(); /* See comment in __pte_alloc */
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
-	if (pud_present(*pud))		/* Another has populated it */
+	if (pud_present(*pud)) {	/* Another has populated it */
+		ub_page_table_put_one(mm, one);
 		pmd_free(mm, new);
-	else
+	} else if (ub_page_table_charge(mm, one)) {
+		spin_unlock(&mm->page_table_lock);
+		pmd_free(mm, new);
+		return -ENOMEM;
+	} else {
 		pud_populate(mm, pud, new);
+		mm->nr_ptds++;
+	}
 #else
-	if (pgd_present(*pud))		/* Another has populated it */
+	if (pgd_present(*pud)) {	/* Another has populated it */
+		ub_page_table_put_one(mm, one);
 		pmd_free(mm, new);
-	else
+	} else if (ub_page_table_charge(mm, one)) {
+		spin_unlock(&mm->page_table_lock);
+		pmd_free(mm, new);
+		return -ENOMEM;
+	} else {
 		pgd_populate(mm, pud, new);
+		mm->nr_ptds++;
+	}
 #endif /* __ARCH_HAS_4LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
+EXPORT_SYMBOL(__pmd_alloc);
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 int make_pages_present(unsigned long addr, unsigned long end)
@@ -3425,6 +3919,7 @@ int make_pages_present(unsigned long add
 		return ret;
 	return ret == len ? 0 : -EFAULT;
 }
+EXPORT_SYMBOL(make_pages_present);
 
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
@@ -3680,6 +4175,7 @@ int access_process_vm(struct task_struct
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(access_process_vm);
 
 /*
  * Print the name of a VMA.
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/memory_hotplug.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/memory_hotplug.c
--- linux-2.6.32-504.3.3.el6.orig/mm/memory_hotplug.c	2014-12-12 23:29:28.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/memory_hotplug.c	2015-01-21 12:02:58.681816029 +0300
@@ -508,7 +508,6 @@ int online_pages(unsigned long pfn, unsi
 	}
 	mutex_unlock(&zonelists_mutex);
 	setup_per_zone_wmarks();
-	calculate_zone_inactive_ratio(zone);
 
 	if (onlined_pages)
 		kswapd_run(zone_to_nid(zone));
@@ -951,7 +950,6 @@ repeat:
 	totalram_pages -= offlined_pages;
 
 	setup_per_zone_wmarks();
-	calculate_zone_inactive_ratio(zone);
 
 	vm_total_pages = nr_free_pagecache_pages();
 	writeback_set_ratelimit();
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/mempolicy.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/mempolicy.c
--- linux-2.6.32-504.3.3.el6.orig/mm/mempolicy.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/mempolicy.c	2015-01-21 12:02:49.540058697 +0300
@@ -609,27 +609,6 @@ check_range(struct mm_struct *mm, unsign
 	return first;
 }
 
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
-{
-	int err = 0;
-	struct mempolicy *old = vma->vm_policy;
-
-	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
-		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
-		 vma->vm_ops, vma->vm_file,
-		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
-
-	if (vma->vm_ops && vma->vm_ops->set_policy)
-		err = vma->vm_ops->set_policy(vma, new);
-	if (!err) {
-		mpol_get(new);
-		vma->vm_policy = new;
-		mpol_put(old);
-	}
-	return err;
-}
-
 /* Step 2: apply policy to a range and do splits. */
 static int mbind_range(struct mm_struct *mm, unsigned long start,
 		       unsigned long end, struct mempolicy *new_pol)
@@ -677,9 +656,23 @@ static int mbind_range(struct mm_struct 
 			if (err)
 				goto out;
 		}
-		err = policy_vma(vma, new_pol);
-		if (err)
-			goto out;
+
+		/*
+		 * Apply policy to a single VMA. The reference counting of
+		 * policy for vma_policy linkages has already been handled by
+		 * vma_merge and split_vma as necessary. If this is a shared
+		 * policy then ->set_policy will increment the reference count
+		 * for an sp node.
+		 */
+		pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+			vma->vm_start, vma->vm_end, vma->vm_pgoff,
+			vma->vm_ops, vma->vm_file,
+			vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+		if (vma->vm_ops && vma->vm_ops->set_policy) {
+			err = vma->vm_ops->set_policy(vma, new_pol);
+			if (err)
+				goto out;
+		}
 	}
 
  out:
@@ -1862,6 +1855,7 @@ alloc_pages_vma(gfp_t gfp, int order, st
 	put_mems_allowed();
 	return page;
 }
+EXPORT_SYMBOL(alloc_pages_vma);
 
 /**
  * 	alloc_pages_current - Allocate pages.
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/mempool.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/mempool.c
--- linux-2.6.32-504.3.3.el6.orig/mm/mempool.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/mempool.c	2015-01-21 12:02:58.682816003 +0300
@@ -77,6 +77,8 @@ mempool_t *mempool_create_node(int min_n
 	init_waitqueue_head(&pool->wait);
 	pool->alloc = alloc_fn;
 	pool->free = free_fn;
+	if (alloc_fn == mempool_alloc_slab)
+		kmem_mark_nocharge((struct kmem_cache *)pool_data);
 
 	/*
 	 * First pre-allocate the guaranteed number of buffers.
@@ -118,6 +120,7 @@ int mempool_resize(mempool_t *pool, int 
 	unsigned long flags;
 
 	BUG_ON(new_min_nr <= 0);
+	gfp_mask &= ~__GFP_UBC;
 
 	spin_lock_irqsave(&pool->lock, flags);
 	if (new_min_nr <= pool->min_nr) {
@@ -211,6 +214,7 @@ void * mempool_alloc(mempool_t *pool, gf
 	gfp_mask |= __GFP_NOMEMALLOC;	/* don't allocate emergency reserves */
 	gfp_mask |= __GFP_NORETRY;	/* don't loop in __alloc_pages */
 	gfp_mask |= __GFP_NOWARN;	/* failures are OK */
+	gfp_mask &= ~__GFP_UBC;
 
 	gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
 
@@ -224,28 +228,40 @@ repeat_alloc:
 	if (likely(pool->curr_nr)) {
 		element = remove_element(pool);
 		spin_unlock_irqrestore(&pool->lock, flags);
+		/* paired with rmb in mempool_free(), read comment there */
+		smp_wmb();
 		return element;
 	}
-	spin_unlock_irqrestore(&pool->lock, flags);
 
-	/* We must not sleep in the GFP_ATOMIC case */
-	if (!(gfp_mask & __GFP_WAIT))
+	/*
+	 * We use gfp mask w/o __GFP_WAIT or IO for the first round.  If
+	 * alloc failed with that and @pool was empty, retry immediately.
+	 */
+	if (gfp_temp != gfp_mask) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		gfp_temp = gfp_mask;
+		goto repeat_alloc;
+	}
+
+	/* We must not sleep if !__GFP_WAIT */
+	if (!(gfp_mask & __GFP_WAIT)) {
+		spin_unlock_irqrestore(&pool->lock, flags);
 		return NULL;
+	}
 
-	/* Now start performing page reclaim */
-	gfp_temp = gfp_mask;
+	/* Let's wait for someone else to return an element to @pool */
 	init_wait(&wait);
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
-	smp_mb();
-	if (!pool->curr_nr) {
-		/*
-		 * FIXME: this should be io_schedule().  The timeout is there
-		 * as a workaround for some DM problems in 2.6.18.
-		 */
-		io_schedule_timeout(5*HZ);
-	}
-	finish_wait(&pool->wait, &wait);
 
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	/*
+	 * FIXME: this should be io_schedule().  The timeout is there as a
+	 * workaround for some DM problems in 2.6.18.
+	 */
+	io_schedule_timeout(5*HZ);
+
+	finish_wait(&pool->wait, &wait);
 	goto repeat_alloc;
 }
 EXPORT_SYMBOL(mempool_alloc);
@@ -265,7 +281,39 @@ void mempool_free(void *element, mempool
 	if (unlikely(element == NULL))
 		return;
 
-	smp_mb();
+	/*
+	 * Paired with the wmb in mempool_alloc().  The preceding read is
+	 * for @element and the following @pool->curr_nr.  This ensures
+	 * that the visible value of @pool->curr_nr is from after the
+	 * allocation of @element.  This is necessary for fringe cases
+	 * where @element was passed to this task without going through
+	 * barriers.
+	 *
+	 * For example, assume @p is %NULL at the beginning and one task
+	 * performs "p = mempool_alloc(...);" while another task is doing
+	 * "while (!p) cpu_relax(); mempool_free(p, ...);".  This function
+	 * may end up using curr_nr value which is from before allocation
+	 * of @p without the following rmb.
+	 */
+	smp_rmb();
+
+	/*
+	 * For correctness, we need a test which is guaranteed to trigger
+	 * if curr_nr + #allocated == min_nr.  Testing curr_nr < min_nr
+	 * without locking achieves that and refilling as soon as possible
+	 * is desirable.
+	 *
+	 * Because curr_nr visible here is always a value after the
+	 * allocation of @element, any task which decremented curr_nr below
+	 * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
+	 * incremented to min_nr afterwards.  If curr_nr gets incremented
+	 * to min_nr after the allocation of @element, the elements
+	 * allocated after that are subject to the same guarantee.
+	 *
+	 * Waiters happen iff curr_nr is 0 and the above guarantee also
+	 * ensures that there will be frees which return elements to the
+	 * pool waking up the waiters.
+	 */
 	if (pool->curr_nr < pool->min_nr) {
 		spin_lock_irqsave(&pool->lock, flags);
 		if (pool->curr_nr < pool->min_nr) {
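Editor's note: the mempool hunks do two independent things: strip __GFP_UBC so pool refills are never charged to a beancounter, and mark slab-backed pools nocharge at creation time. A hedged creation sketch (cache name and sizes illustrative):

#include <linux/mempool.h>
#include <linux/slab.h>

static struct kmem_cache *req_cache;
static mempool_t *req_pool;

static int __init req_pool_init(void)
{
	req_cache = kmem_cache_create("req", 128, 0, 0, NULL);
	if (!req_cache)
		return -ENOMEM;
	/* alloc == mempool_alloc_slab, so the patch marks req_cache nocharge */
	req_pool = mempool_create(16, mempool_alloc_slab,
				  mempool_free_slab, req_cache);
	return req_pool ? 0 : -ENOMEM;
}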
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/migrate.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/migrate.c
--- linux-2.6.32-504.3.3.el6.orig/mm/migrate.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/migrate.c	2015-01-21 12:02:58.913809870 +0300
@@ -31,6 +31,7 @@
 #include <linux/vmalloc.h>
 #include <linux/security.h>
 #include <linux/memcontrol.h>
+#include <linux/mmgang.h>
 #include <linux/syscalls.h>
 #include <linux/hugetlb.h>
 
@@ -160,7 +161,7 @@ static int remove_migration_pte(struct p
 	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
-		page_add_file_rmap(new);
+		page_add_file_rmap(new, mm);
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, pte);
@@ -756,6 +757,11 @@ static int __unmap_and_move(struct page 
 	}
 	BUG_ON(charge);
 
+	if (gang_add_user_page(newpage, page_gang(page)->set, GFP_KERNEL)) {
+		rc = -ENOMEM;
+		goto uncharge;
+	}
+
 	if (PageWriteback(page)) {
 		/*
 		 * Only in the case of a full synchronous migration is it
@@ -780,6 +786,10 @@ static int __unmap_and_move(struct page 
 	 * just care Anon page here.
 	 */
 	if (PageAnon(page) && !PageKsm(page)) {
+
+		if (PageVSwap(page) && remove_from_vswap(page))
+			goto uncharge;
+
 		/*
 		 * Only page_lock_anon_vma() understands the subtleties of
 		 * getting a hold on an anon_vma from outside one of its mms.
@@ -895,6 +905,11 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
+
+	if (unlikely(!page_gang(newpage)))
+		gang_add_user_page(newpage, &init_gang_set,
+				GFP_KERNEL|__GFP_NOFAIL);
+
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/mincore.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/mincore.c
--- linux-2.6.32-504.3.3.el6.orig/mm/mincore.c	2014-12-12 23:29:17.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/mincore.c	2015-01-21 12:02:58.682816003 +0300
@@ -132,6 +132,11 @@ static void mincore_pte_range(struct vm_
 			if (is_migration_entry(entry)) {
 				/* migration entries are always uptodate */
 				*vec = 1;
+			} else if (is_vswap_entry(entry)) {
+				struct page *page = vswap_entry_to_page(entry);
+
+				/* vswap present if already mapped somewhere */
+				*vec = page_mapped(page);
 			} else {
 #ifdef CONFIG_SWAP
 				pgoff = entry.val;
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/mlock.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/mlock.c
--- linux-2.6.32-504.3.3.el6.orig/mm/mlock.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/mlock.c	2015-01-21 12:02:58.682816003 +0300
@@ -17,7 +17,9 @@
 #include <linux/module.h>
 #include <linux/rmap.h>
 #include <linux/mmzone.h>
+#include <linux/mmgang.h>
 #include <linux/hugetlb.h>
+#include <bc/vmpages.h>
 
 #include "internal.h"
 
@@ -76,7 +78,7 @@ void __clear_page_mlock(struct page *pag
  * Mark page as mlocked if not already.
  * If page on LRU, isolate and putback to move to unevictable list.
  */
-void mlock_vma_page(struct page *page)
+void mlock_vma_page(struct vm_area_struct *vma, struct page *page)
 {
 	/* Serialize with page migration */
 	BUG_ON(!PageLocked(page));
@@ -84,10 +86,18 @@ void mlock_vma_page(struct page *page)
 	if (!TestSetPageMlocked(page)) {
 		inc_zone_page_state(page, NR_MLOCK);
 		count_vm_event(UNEVICTABLE_PGMLOCKED);
-		if (!isolate_lru_page(page))
+		if (!isolate_lru_page(page)) {
+			struct gang_set *gs = get_mm_gang(vma->vm_mm);
+
+			if (!page_in_gang(page, gs))
+				gang_mod_user_page(page, gs,
+						GFP_ATOMIC|__GFP_NOFAIL);
+
 			putback_lru_page(page);
+		}
 	}
 }
+EXPORT_SYMBOL(mlock_vma_page);
 
 /*
  * called from munlock()/munmap() path with page supposedly on the LRU.
@@ -139,6 +149,7 @@ void munlock_vma_page(struct page *page)
 		}
 	}
 }
+EXPORT_SYMBOL(munlock_vma_page);
 
 static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
 {
@@ -243,7 +254,7 @@ static long __mlock_vma_pages_range(stru
 				 * only check for file-cache page truncation.
 				 */
 				if (page->mapping)
-					mlock_vma_page(page);
+					mlock_vma_page(vma, page);
 				unlock_page(page);
 			}
 			put_page(page);	/* ref from get_user_pages() */
@@ -337,12 +348,14 @@ no_mlock:
  * and re-mlocked by try_to_{munlock|unmap} before we unmap and
  * free them.  This will result in freeing mlocked pages.
  */
-void munlock_vma_pages_range(struct vm_area_struct *vma,
-			     unsigned long start, unsigned long end)
+void __munlock_vma_pages_range(struct vm_area_struct *vma,
+			     unsigned long start, unsigned long end, int acct)
 {
 	unsigned long addr;
 
 	lru_add_drain();
+	if (acct)
+		ub_locked_uncharge(vma->vm_mm, end - start);
 	vma->vm_flags &= ~VM_LOCKED;
 
 	for (addr = start; addr < end; addr += PAGE_SIZE) {
@@ -382,13 +395,14 @@ void munlock_vma_pages_range(struct vm_a
  * For vmas that pass the filters, merge/split as appropriate.
  */
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
-	unsigned long start, unsigned long end, unsigned int newflags)
+	unsigned long start, unsigned long end, vm_flags_t newflags,
+	bool convert_error)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgoff_t pgoff;
 	int nr_pages;
 	int ret = 0;
-	int lock = newflags & VM_LOCKED;
+	int lock = !!(newflags & VM_LOCKED);
 
 	if (newflags == vma->vm_flags ||
 			(vma->vm_flags & (VM_IO | VM_PFNMAP)))
@@ -402,6 +416,12 @@ static int mlock_fixup(struct vm_area_st
 		goto out;	/* don't set VM_LOCKED,  don't count */
 	}
 
+	if (newflags & VM_LOCKED) {
+		ret = ub_locked_charge(mm, end - start);
+		if (ret < 0)
+			goto out;
+	}
+
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
 			  vma->vm_file, pgoff, vma_policy(vma));
@@ -413,13 +433,13 @@ static int mlock_fixup(struct vm_area_st
 	if (start != vma->vm_start) {
 		ret = split_vma(mm, vma, start, 1);
 		if (ret)
-			goto out;
+			goto out_uncharge;
 	}
 
 	if (end != vma->vm_end) {
 		ret = split_vma(mm, vma, end, 0);
 		if (ret)
-			goto out;
+			goto out_uncharge;
 	}
 
 success:
@@ -440,7 +460,7 @@ success:
 	if (lock) {
 		vma->vm_flags = newflags;
 		ret = __mlock_vma_pages_range(vma, start, end);
-		if (ret < 0)
+		if (ret < 0 && convert_error)
 			ret = __mlock_posix_error_return(ret);
 	} else {
 		munlock_vma_pages_range(vma, start, end);
@@ -449,9 +469,14 @@ success:
 out:
 	*prev = vma;
 	return ret;
+
+out_uncharge:
+	if (newflags & VM_LOCKED)
+		ub_locked_uncharge(mm, end - start);
+	goto out;
 }
 
-static int do_mlock(unsigned long start, size_t len, int on)
+static int do_mlock(unsigned long start, size_t len, int on, bool convert_error)
 {
 	unsigned long nstart, end, tmp;
 	struct vm_area_struct * vma, * prev;
@@ -471,7 +496,7 @@ static int do_mlock(unsigned long start,
 		prev = vma;
 
 	for (nstart = start ; ; ) {
-		unsigned int newflags;
+		vm_flags_t newflags;
 
 		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
@@ -482,7 +507,7 @@ static int do_mlock(unsigned long start,
 		tmp = vma->vm_end;
 		if (tmp > end)
 			tmp = end;
-		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
+		error = mlock_fixup(vma, &prev, nstart, tmp, newflags, convert_error);
 		if (error)
 			break;
 		nstart = tmp;
@@ -500,7 +525,7 @@ static int do_mlock(unsigned long start,
 	return error;
 }
 
-SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+int __mlock(unsigned long start, size_t len, bool convert_error)
 {
 	unsigned long locked;
 	unsigned long lock_limit;
@@ -523,22 +548,34 @@ SYSCALL_DEFINE2(mlock, unsigned long, st
 
 	/* check against resource limits */
 	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
-		error = do_mlock(start, len, 1);
+		error = do_mlock(start, len, 1, convert_error);
 	up_write(&current->mm->mmap_sem);
 	return error;
 }
+EXPORT_SYMBOL_GPL(__mlock);
 
-SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
+SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+{
+	return __mlock(start, len, true);
+}
+
+int __munlock(unsigned long start, size_t len, bool convert_error)
 {
 	int ret;
 
 	down_write(&current->mm->mmap_sem);
 	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
 	start &= PAGE_MASK;
-	ret = do_mlock(start, len, 0);
+	ret = do_mlock(start, len, 0, convert_error);
 	up_write(&current->mm->mmap_sem);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(__munlock);
+
+SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
+{
+	return __munlock(start, len, true);
+}
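+
+/*
+ * Annotation: convert_error selects between the raw internal error and the
+ * POSIX-converted one.  A hypothetical in-kernel caller (e.g. restore code)
+ * that wants to distinguish the raw cause would use:
+ *
+ *	err = __mlock(start, len, false);
+ */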
 
 static int do_mlockall(int flags)
 {
@@ -552,14 +589,14 @@ static int do_mlockall(int flags)
 		goto out;
 
 	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
-		unsigned int newflags;
+		vm_flags_t newflags;
 
 		newflags = vma->vm_flags | VM_LOCKED;
 		if (!(flags & MCL_CURRENT))
 			newflags &= ~VM_LOCKED;
 
 		/* Ignore errors */
-		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags, true);
 	}
 out:
 	return 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/mmap.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/mmap.c
--- linux-2.6.32-504.3.3.el6.orig/mm/mmap.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/mmap.c	2015-01-21 12:02:58.727814807 +0300
@@ -28,6 +28,7 @@
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
+#include <linux/virtinfo.h>
 #include <linux/random.h>
 #include <linux/khugepaged.h>
 
@@ -42,6 +43,9 @@
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif
 
+#include <bc/vmpages.h>
+#include <bc/kmem.h>
+
 #ifndef arch_rebalance_pgtables
 #define arch_rebalance_pgtables(addr, len)		(addr)
 #endif
@@ -58,6 +62,7 @@
 #endif
 
 
+static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft);
 static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
@@ -138,6 +143,12 @@ int __vm_enough_memory(struct mm_struct 
 
 	vm_acct_memory(pages);
 
+#ifdef CONFIG_BEANCOUNTERS
+	if (mm && mm->mm_ub->ub_parms[UB_PRIVVMPAGES].held <=
+			mm->mm_ub->ub_parms[UB_VMGUARPAGES].barrier)
+		return 0;
+#endif
+
 	/*
 	 * Sometimes we want to use more memory than we have
 	 */
@@ -248,6 +259,7 @@ void unlink_file_vma(struct vm_area_stru
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 }
+EXPORT_SYMBOL_GPL(unlink_file_vma);
 
 /*
  * Close a vm structure and free it, returning the next.
@@ -257,6 +269,9 @@ static struct vm_area_struct *remove_vma
 	struct vm_area_struct *next = vma->vm_next;
 
 	might_sleep();
+
+	ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start,
+			vma->vm_flags, vma->vm_file);
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file) {
@@ -265,7 +280,7 @@ static struct vm_area_struct *remove_vma
 			removed_exe_file_vma(vma->vm_mm);
 	}
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma->vm_mm, vma);
 	return next;
 }
 
@@ -314,7 +329,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 		goto out;
 
 	/* Ok, looks good - let it rip. */
-	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
+	if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk)
 		goto out;
 set_brk:
 	mm->brk = brk;
@@ -444,7 +459,7 @@ void __vma_link_rb(struct mm_struct *mm,
 	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
 }
 
-static void __vma_link_file(struct vm_area_struct *vma)
+void __vma_link_file(struct vm_area_struct *vma)
 {
 	struct file *file;
 
@@ -465,6 +480,7 @@ static void __vma_link_file(struct vm_ar
 		flush_dcache_mmap_unlock(mapping);
 	}
 }
+EXPORT_SYMBOL_GPL(__vma_link_file);
 
 static void
 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -690,7 +706,7 @@ again:			remove_next = 1 + (end > next->
 			anon_vma_merge(vma, next);
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(mm, next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -1018,7 +1034,7 @@ unsigned long do_mmap_pgoff(struct file 
 {
 	struct mm_struct * mm = current->mm;
 	struct inode *inode;
-	unsigned int vm_flags;
+	vm_flags_t vm_flags;
 	int error;
 	unsigned long reqprot = prot;
 
@@ -1110,7 +1126,8 @@ unsigned long do_mmap_pgoff(struct file 
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
-			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+			if ((file->f_path.mnt->mnt_flags & MNT_NOEXEC) &&
+					!(flags & MAP_CPT)) {
 				if (vm_flags & VM_EXEC)
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
@@ -1159,7 +1176,7 @@ EXPORT_SYMBOL(do_mmap_pgoff);
  */
 int vma_wants_writenotify(struct vm_area_struct *vma)
 {
-	unsigned int vm_flags = vma->vm_flags;
+	vm_flags_t vm_flags = vma->vm_flags;
 
 	/* If it was private or non-writable, the write bit is already clear */
 	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1180,14 +1197,15 @@ int vma_wants_writenotify(struct vm_area
 
 	/* Can the mapping track the dirty pages? */
 	return vma->vm_file && vma->vm_file->f_mapping &&
-		mapping_cap_account_dirty(vma->vm_file->f_mapping);
+		(mapping_cap_account_dirty(vma->vm_file->f_mapping) ||
+		 test_bit(AS_CHECKPOINT, &vma->vm_file->f_mapping->flags));
 }
 
 /*
  * We account for memory if it's a private writeable mapping,
  * not hugepages and VM_NORESERVE wasn't set.
  */
-static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 {
 	/*
 	 * hugetlb has its own accounting separate from the core VM
@@ -1201,7 +1219,7 @@ static inline int accountable_mapping(st
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
 			  unsigned long len, unsigned long flags,
-			  unsigned int vm_flags, unsigned long pgoff)
+			  vm_flags_t vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -1210,6 +1228,7 @@ unsigned long mmap_region(struct file *f
 	struct rb_node **rb_link, *rb_parent;
 	unsigned long charged = 0;
 	struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL;
+	unsigned long ub_charged = 0;
 
 	/* Clear old maps */
 	error = -ENOMEM;
@@ -1249,6 +1268,11 @@ munmap_back:
 		vm_flags |= VM_ACCOUNT;
 	}
 
+	if (ub_memory_charge(mm, len, vm_flags, file,
+				(flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD)))
+		goto charge_error;
+	ub_charged = 1;
+
 	/*
 	 * Can we just expand an old mapping?
 	 */
@@ -1261,7 +1285,8 @@ munmap_back:
 	 * specific mapper. the address has already been validated, but
 	 * not unmapped, but the maps are removed from the list.
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO |
+			(flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0));
 	if (!vma) {
 		error = -ENOMEM;
 		goto unacct_error;
@@ -1292,6 +1317,19 @@ munmap_back:
 			goto unmap_and_free_vma;
 		if (vm_flags & VM_EXECUTABLE)
 			added_exe_file_vma(mm);
+		if (vm_flags != vma->vm_flags) {
+			/*
+			 * ->vm_flags has been changed by the f_op->mmap method,
+			 * so we have to recharge the ub memory accounting.
+			 */
+			ub_memory_uncharge(mm, len, vm_flags, file);
+			if (ub_memory_charge(mm, len, vma->vm_flags, file,
+				(flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) {
+				ub_charged = 0;
+				error = -ENOMEM;
+				goto unmap_and_free_vma;
+			}
+		}
 
 		/* Can addr have changed??
 		 *
@@ -1328,6 +1366,8 @@ out:
 		long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
 		if (nr_pages < 0)
 			return nr_pages;	/* vma gone! */
+		if (nr_pages)
+			ub_locked_uncharge(mm, nr_pages << PAGE_SHIFT);
 		mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
 	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
 		make_pages_present(addr, addr + len);
@@ -1343,8 +1383,11 @@ unmap_and_free_vma:
 	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
 	charged = 0;
 free_vma:
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(mm, vma);
 unacct_error:
+	if (ub_charged)
+		ub_memory_uncharge(mm, len, vm_flags, file);
+charge_error:
 	if (charged)
 		vm_unacct_memory(charged);
 	return error;
@@ -1846,12 +1889,16 @@ static int acct_stack_growth(struct vm_a
 	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
 		return -EFAULT;
 
+	if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags,
+				vma->vm_file, UB_SOFT))
+		goto fail_charge;
+
 	/*
 	 * Overcommit..  This must be the final test, as it will
 	 * update security statistics.
 	 */
 	if (security_vm_enough_memory_mm(mm, grow))
-		return -ENOMEM;
+		goto fail_sec;
 
 	/* Ok, everything looks good - let it rip */
 	mm->total_vm += grow;
@@ -1859,6 +1906,11 @@ static int acct_stack_growth(struct vm_a
 		mm->locked_vm += grow;
 	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
 	return 0;
+
+fail_sec:
+	ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file);
+fail_charge:
+	return -ENOMEM;
 }
 
 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
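
acct_stack_growth() above now charges the beancounter before the final overcommit test and unwinds the charge through goto labels if that test fails, the usual kernel charge-first/uncharge-on-failure shape. A runnable toy version of the pattern; every name here is a placeholder, not part of the patch:

#include <errno.h>
#include <stdio.h>

static unsigned long charged;

static int charge_fn(unsigned long b) { charged += b; return 0; }
static void uncharge_fn(unsigned long b) { charged -= b; }
static int check_fn(unsigned long b) { return b > 4096; }	/* toy limit */

/* Charge first, run the remaining checks, undo the charge on any
 * later failure so the accounting never leaks. */
static int grow_with_accounting(unsigned long bytes)
{
	if (charge_fn(bytes))
		goto fail_charge;	/* nothing to undo yet */
	if (check_fn(bytes))
		goto fail_check;	/* undo the earlier charge */
	return 0;

fail_check:
	uncharge_fn(bytes);
fail_charge:
	return -ENOMEM;
}

int main(void)
{
	printf("grow 4096: %d (charged=%lu)\n",
	       grow_with_accounting(4096), charged);
	printf("grow 8192: %d (charged=%lu)\n",
	       grow_with_accounting(8192), charged);
	return 0;
}
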
@@ -2118,7 +2170,7 @@ static int __split_vma(struct mm_struct 
 					~(huge_page_mask(hstate_vma(vma)))))
 		return -EINVAL;
 
-	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	new = allocate_vma(mm, GFP_KERNEL);
 	if (!new)
 		goto out_err;
 
@@ -2180,7 +2232,7 @@ static int __split_vma(struct mm_struct 
  out_free_mpol:
 	mpol_put(pol);
  out_free_vma:
-	kmem_cache_free(vm_area_cachep, new);
+	free_vma(mm, new);
  out_err:
 	return err;
 }
@@ -2197,6 +2249,7 @@ int split_vma(struct mm_struct *mm, stru
 
 	return __split_vma(mm, vma, addr, new_below);
 }
+EXPORT_SYMBOL(split_vma);
 
 /* Munmap is split into 2 main parts -- this part which finds
  * what needs doing, and the areas themselves, which do the
@@ -2314,7 +2367,7 @@ static inline void verify_mm_writelocked
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-unsigned long do_brk(unsigned long addr, unsigned long len)
+static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft)
 {
 	struct mm_struct * mm = current->mm;
 	struct vm_area_struct * vma, * prev;
@@ -2374,8 +2427,11 @@ unsigned long do_brk(unsigned long addr,
 	if (mm->map_count > sysctl_max_map_count)
 		return -ENOMEM;
 
+	if (ub_memory_charge(mm, len, flags, NULL, soft))
+		goto fail_charge;
+
 	if (security_vm_enough_memory(len >> PAGE_SHIFT))
-		return -ENOMEM;
+		goto fail_sec;
 
 	/* Can we just expand an old private anonymous mapping? */
 	vma = vma_merge(mm, prev, addr, addr + len, flags,
@@ -2386,11 +2442,10 @@ unsigned long do_brk(unsigned long addr,
 	/*
 	 * create a vma struct for an anonymous mapping
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
-	if (!vma) {
-		vm_unacct_memory(len >> PAGE_SHIFT);
-		return -ENOMEM;
-	}
+	vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO |
+			(soft == UB_SOFT ? __GFP_SOFT_UBC : 0));
+	if (!vma)
+		goto fail_alloc;
 
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma->vm_mm = mm;
@@ -2408,8 +2463,19 @@ out:
 			mm->locked_vm += (len >> PAGE_SHIFT);
 	}
 	return addr;
+
+fail_alloc:
+	vm_unacct_memory(len >> PAGE_SHIFT);
+fail_sec:
+	ub_memory_uncharge(mm, len, flags, NULL);
+fail_charge:
+	return -ENOMEM;
 }
 
+unsigned long do_brk(unsigned long addr, unsigned long len)
+{
+	return __do_brk(addr, len, UB_SOFT);
+}
 EXPORT_SYMBOL(do_brk);
 
 /* Release all mmaps. */
@@ -2419,6 +2485,7 @@ void exit_mmap(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 	unsigned long nr_accounted = 0;
 	unsigned long end;
+	unsigned int ptes_before;
 
 	/* mm's last user has gone, and it's about to be pulled down */
 	mmu_notifier_release(mm);
@@ -2445,6 +2512,8 @@ void exit_mmap(struct mm_struct *mm)
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL, 1);
 	vm_unacct_memory(nr_accounted);
 
+	ptes_before = mm->nr_ptes;
+	tlb->ptes_freed = 0;
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 	arch_flush_exec_range(mm);
@@ -2456,7 +2525,11 @@ void exit_mmap(struct mm_struct *mm)
 	while (vma)
 		vma = remove_vma(vma);
 
-	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+	if (mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT) {
+		printk(KERN_ERR "PTE pages leak: had %u freed %u has %u\n",
+				ptes_before, tlb->ptes_freed, (unsigned int)mm->nr_ptes);
+		dump_stack();
+	}
 }
 
 /* Insert vm structure into process list sorted by address
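
Replacing the BUG_ON() in exit_mmap() with a printk() plus dump_stack() trades a guaranteed host panic for a diagnostic: the new ptes_before and tlb->ptes_freed counters let the message show how many PTE pages were present, how many the unmap freed, and how many remain. A small standalone sketch of the invariant the message encodes (values illustrative):

#include <stdio.h>

/* After all user mappings are torn down, only the PTE pages below
 * FIRST_USER_ADDRESS may legitimately remain, so a leak shows up as
 * left > allowed. */
static void check_pte_accounting(unsigned int had, unsigned int freed,
				 unsigned int left, unsigned int allowed)
{
	if (left > allowed)
		fprintf(stderr, "PTE pages leak: had %u freed %u has %u\n",
			had, freed, left);
}

int main(void)
{
	check_pte_accounting(12, 12, 0, 1);	/* clean exit: silent  */
	check_pte_accounting(12, 10, 2, 1);	/* leaked: complains   */
	return 0;
}
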
@@ -2526,7 +2599,7 @@ struct vm_area_struct *copy_vma(struct v
 		    vma_start < new_vma->vm_end)
 			*vmap = new_vma;
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		new_vma = allocate_vma(mm, GFP_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
 			pol = mpol_dup(vma_policy(vma));
@@ -2554,7 +2627,7 @@ struct vm_area_struct *copy_vma(struct v
  out_free_mempol:
 	mpol_put(pol);
  out_free_vma:
-	kmem_cache_free(vm_area_cachep, new_vma);
+	free_vma(mm, new_vma);
 	return NULL;
 }
 
@@ -2609,10 +2682,11 @@ static void special_mapping_close(struct
 {
 }
 
-static const struct vm_operations_struct special_mapping_vmops = {
+const struct vm_operations_struct special_mapping_vmops = {
 	.close = special_mapping_close,
 	.fault = special_mapping_fault,
 };
+EXPORT_SYMBOL(special_mapping_vmops);
 
 /*
  * Called with mm->mmap_sem held for writing.
@@ -2630,7 +2704,7 @@ int install_special_mapping(struct mm_st
 	int ret;
 	struct vm_area_struct *vma;
 
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
 	if (unlikely(vma == NULL))
 		return -ENOMEM;
 
@@ -2660,7 +2734,7 @@ int install_special_mapping(struct mm_st
 	return 0;
 
 out:
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(mm, vma);
 	return ret;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/mmzone.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/mmzone.c
--- linux-2.6.32-504.3.3.el6.orig/mm/mmzone.c	2014-12-12 23:29:08.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/mmzone.c	2015-01-21 12:02:58.940809155 +0300
@@ -8,12 +8,26 @@
 #include <linux/stddef.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
+#include <linux/mmgang.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
 #include <linux/module.h>
+#include <linux/mm_inline.h>
+#include <linux/migrate.h>
+
+#include "internal.h"
+
+static DEFINE_MUTEX(gs_lock);
+
+unsigned long total_committed_pages;
+
+unsigned long commitment_for_unlimited_containers = 1ul << (30 - PAGE_SHIFT); /* 1 GB */
 
 struct pglist_data *first_online_pgdat(void)
 {
 	return NODE_DATA(first_online_node);
 }
+EXPORT_SYMBOL(first_online_pgdat);
 
 struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 {
@@ -23,6 +37,7 @@ struct pglist_data *next_online_pgdat(st
 		return NULL;
 	return NODE_DATA(nid);
 }
+EXPORT_SYMBOL(next_online_pgdat);
 
 /*
  * next_zone - helper magic for for_each_zone()
@@ -87,3 +102,1013 @@ int memmap_valid_within(unsigned long pf
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+void lruvec_init(struct lruvec *lruvec)
+{
+	enum lru_list lru;
+
+	memset(lruvec, 0, sizeof(struct lruvec));
+
+	spin_lock_init(&lruvec->lru_lock);
+	for_each_lru(lru)
+		INIT_LIST_HEAD(&lruvec->lru_list[lru]);
+}
+
+void setup_zone_gang(struct gang_set *gs, struct zone *zone, struct gang *gang)
+{
+	enum lru_list lru;
+	int __maybe_unused i;
+
+	lruvec_init(&gang->lruvec);
+	gang->lruvec.zone = zone;
+	gang->set = gs;
+
+#ifdef CONFIG_MEMORY_GANGS
+	gang->last_milestone = 0;
+	for_each_evictable_lru(lru) {
+		gang->timestamp[lru] = jiffies;
+		for (i = 0; i < NR_LRU_MILESTONES; i++)
+			INIT_LIST_HEAD(&gang->milestones[i].lru[lru]);
+	}
+	gang->lruvec.priority = DEF_PRIORITY;
+#endif
+}
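
The milestones set up above are NR_LRU_MILESTONES sentinel entries threaded through each evictable LRU list; each records the jiffies value at insertion, so walking an LRU from its tail past a milestone bounds the age of everything behind it without timestamping individual pages. A simplified userspace model of the idea (all names illustrative, not the kernel's):

#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *prev, *next;
	int is_milestone;
	unsigned long timestamp;	/* valid when is_milestone */
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_front(struct node *head, struct node *n)
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

/* Age bound for the tail of the list: the timestamp of the milestone
 * closest to the tail, or 0 ("unknown") if none was inserted. */
static unsigned long tail_age(struct node *head)
{
	struct node *n;

	for (n = head->prev; n != head; n = n->prev)
		if (n->is_milestone)
			return n->timestamp;
	return 0;
}

int main(void)
{
	struct node head, pages[3] = {{0}},
		    ms = { .is_milestone = 1, .timestamp = 1000 };
	int i;

	list_init(&head);
	list_add_front(&head, &pages[0]);	/* old page          */
	list_add_front(&head, &ms);		/* milestone, t=1000 */
	for (i = 1; i < 3; i++)
		list_add_front(&head, &pages[i]);	/* newer pages */

	printf("tail pages are older than t=%lu\n", tail_age(&head));
	return 0;
}
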
+
+#ifdef CONFIG_MEMORY_GANGS
+
+void remove_lru_milestone(struct lruvec *lruvec, enum lru_list lru)
+{
+	struct gang *gang = lruvec_gang(lruvec);
+	struct lru_milestone *ms;
+
+	ms = container_of(lruvec->lru_list[lru].prev,
+			struct lru_milestone, lru[lru]);
+	list_del_init(&ms->lru[lru]);
+	gang->timestamp[lru] = ms->timestamp;
+
+	set_bit(GANG_NEED_RESCHED, &gang->flags);
+}
+
+bool insert_lru_milestone(struct gang *gang, unsigned long now,
+			  unsigned long *eldest_milestone)
+{
+	bool reused = false;
+	struct lru_milestone *ms;
+	enum lru_list lru;
+
+	*eldest_milestone = now;
+	gang->last_milestone = (gang->last_milestone + 1) % NR_LRU_MILESTONES;
+	ms = gang->milestones + gang->last_milestone;
+	for_each_evictable_lru(lru) {
+		if (!list_empty(&ms->lru[lru])) {
+			list_del(&ms->lru[lru]);
+			if (is_file_lru(lru) || nr_swap_pages > 0) {
+				reused = true;
+				if (time_before(ms->timestamp, *eldest_milestone))
+					*eldest_milestone = ms->timestamp;
+			}
+		} else {
+			if ((is_file_lru(lru) || nr_swap_pages > 0) &&
+			    time_before(gang->timestamp[lru], *eldest_milestone))
+				*eldest_milestone = gang->timestamp[lru];
+		}
+		list_add(&ms->lru[lru], &gang->lruvec.lru_list[lru]);
+	}
+	ms->timestamp = now;
+	return reused;
+}
+
+static void splice_timed_pages(struct gang *gang, enum lru_list lru,
+		struct list_head *pages, unsigned long timestamp)
+{
+	struct list_head *head = &gang->lruvec.lru_list[lru];
+	struct lru_milestone *ms;
+	int i;
+
+	if (is_unevictable_lru(lru)) {
+		list_splice_tail(pages, head);
+		return;
+	}
+
+	for (i = 0; i < NR_LRU_MILESTONES; i++) {
+		ms = gang->milestones + (gang->last_milestone +
+				NR_LRU_MILESTONES - i) % NR_LRU_MILESTONES;
+		if (list_empty(ms->lru + lru)) {
+			list_add_tail(ms->lru + lru,
+					&gang->lruvec.lru_list[lru]);
+			gang->timestamp[lru] = ms->timestamp;
+		}
+		if (time_after_eq(timestamp, ms->timestamp)) {
+			head = ms->lru + lru;
+			break;
+		}
+	}
+
+	list_splice_tail(pages, head);
+}
+
+void add_zone_gang(struct zone *zone, struct gang *gang)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&zone->gangs_lock, flags);
+	list_add_tail_rcu(&gang->list, &zone->gangs);
+	zone->nr_gangs++;
+	list_add_rcu(&gang->vmscan_list, zone->vmscan_prio + gang->lruvec.priority);
+	__set_bit(gang->lruvec.priority, zone->vmscan_mask);
+	spin_unlock_irqrestore(&zone->gangs_lock, flags);
+}
+
+static void del_zone_gang(struct zone *zone, struct gang *gang)
+{
+	struct lruvec *lruvec = &gang->lruvec;
+	unsigned long flags;
+	enum lru_list lru;
+	int i;
+
+	spin_lock_irqsave(&zone->gangs_lock, flags);
+	set_bit(GANG_UNHASHED, &gang->flags);
+	list_del_rcu(&gang->list);
+	list_del_rcu(&gang->vmscan_list);
+	if (list_empty(zone->vmscan_prio + lruvec->priority))
+		__clear_bit(lruvec->priority, zone->vmscan_mask);
+	for (i = 0; i < NR_VMSCAN_PRIORITIES; i++)
+		(void)cmpxchg(zone->vmscan_iter + i, &gang->vmscan_list,
+			      gang->vmscan_list.next);
+	zone->nr_gangs--;
+	spin_unlock_irqrestore(&zone->gangs_lock, flags);
+
+	BUG_ON(gang->committed);
+
+	spin_lock_irqsave(&lruvec->lru_lock, flags);
+	for_each_evictable_lru(lru) {
+		while (is_lru_milestone(lruvec, lruvec->lru_list[lru].prev))
+			remove_lru_milestone(lruvec, lru);
+	}
+
+	for_each_lru(lru) {
+		if (lruvec->nr_pages[lru] ||
+		    !list_empty(&lruvec->lru_list[lru])) {
+			printk(KERN_EMERG "gang leak:%ld lru:%d gang:%p\n",
+					lruvec->nr_pages[lru], lru, gang);
+			add_taint(TAINT_CRAP);
+		}
+	}
+	spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+}
+
+void set_gang_priority(struct gang *gang, int priority)
+{
+	struct lruvec *lruvec = &gang->lruvec;
+	struct zone *zone = gang_zone(gang);
+	int i;
+
+	VM_BUG_ON(priority < 0 || priority > MAX_VMSCAN_PRIORITY);
+
+	spin_lock_irq(&zone->gangs_lock);
+	if (lruvec->priority == priority ||
+	    test_bit(GANG_UNHASHED, &gang->flags))
+		goto out;
+	list_del_rcu(&gang->vmscan_list);
+	if (list_empty(zone->vmscan_prio + lruvec->priority))
+		__clear_bit(lruvec->priority, zone->vmscan_mask);
+	for (i = 0; i <= lruvec->priority; i++)
+		(void)cmpxchg(zone->vmscan_iter + i, &gang->vmscan_list,
+				gang->vmscan_list.next);
+	lruvec->priority = priority;
+	list_add_rcu(&gang->vmscan_list, zone->vmscan_prio + priority);
+	__set_bit(priority, zone->vmscan_mask);
+out:
+	spin_unlock_irq(&zone->gangs_lock);
+}
+
+void set_gang_limits(struct gang_set *gs,
+		     unsigned long *newlimit, nodemask_t *newmask)
+{
+	unsigned long limit, available, committed, portion;
+	unsigned long max_committed, zone_committed, gang_committed;
+	nodemask_t nodemask;
+	struct zone *zone;
+	struct gang *gang;
+	int nid;
+
+	mutex_lock(&gs_lock);
+
+	if (gs->memory_limit > totalram_pages) {
+		for_each_zone(zone)
+			if (node_isset(zone_to_nid(zone), gs->nodemask))
+				zone->nr_unlimited_gangs--;
+	}
+
+	if (newlimit)
+		gs->memory_limit = *newlimit;
+	if (newmask)
+		gs->nodemask = *newmask;
+	limit = gs->memory_limit;
+	nodemask = gs->nodemask;
+
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+	/* include migration source nodes into coverage */
+	nodes_or(nodemask, nodemask, gs->migration_work.src_nodes);
+#endif
+
+	available = 0;
+	for_each_zone(zone) {
+		if (node_isset(zone_to_nid(zone), nodemask)) {
+			available += zone->present_pages;
+			if (limit > totalram_pages)
+				zone->nr_unlimited_gangs++;
+		}
+	}
+	gs->memory_available = available;
+
+	committed = min(limit, available);
+
+	/* limit commitment for unlimited containers */
+	if (limit > totalram_pages)
+		committed = min(committed, commitment_for_unlimited_containers);
+
+	total_committed_pages += committed - gs->memory_committed;
+	gs->memory_committed = committed;
+
+	for_each_zone(zone) {
+		nid = zone_to_nid(zone);
+		gang = mem_zone_gang(gs, zone);
+		if (!gang->committed && !node_isset(nid, nodemask))
+			continue;
+		spin_lock_irq(&zone->gangs_lock);
+		zone->committed -= gang->committed;
+		if (node_isset(nid, nodemask) && available)
+			gang->committed = committed * zone->present_pages
+						    / available;
+		else
+			gang->committed = 0;
+		zone->committed += gang->committed;
+
+		/* get maximum memory commitment among limited containers */
+		max_committed = 0;
+		for_each_gang(gang, zone) {
+			if (gang->set->memory_limit <= totalram_pages &&
+			    gang->committed > max_committed)
+				max_committed = gang->committed;
+		}
+
+		zone_committed = zone->committed +
+			max_committed * zone->nr_unlimited_gangs;
+
+		for_each_gang(gang, zone) {
+			gang_committed = gang->committed;
+
+			/*
+			 * increase commitment of unlimited containers by
+			 * maximum commitment among limited containers
+			 */
+			if (gang_committed &&
+			    gang->set->memory_limit > totalram_pages)
+				gang_committed += max_committed;
+
+			if (zone_committed > zone->present_pages) {
+				portion = zone->present_pages
+						* gang_committed
+						/ zone_committed;
+			} else {
+				portion = gang_committed;
+				/* divide the remainder between unlimited containers */
+				if (gang_committed &&
+				    gang->set->memory_limit > totalram_pages)
+					portion += (zone->present_pages -
+							zone_committed) /
+							zone->nr_unlimited_gangs;
+			}
+			gang->set->memory_portion += portion - gang->portion;
+			gang->portion = portion;
+		}
+		spin_unlock_irq(&zone->gangs_lock);
+	}
+
+	mutex_unlock(&gs_lock);
+}
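
set_gang_limits() spreads a container's commitment, min(limit, available), across the eligible zones in proportion to zone->present_pages, then scales each gang's portion down again when the zone as a whole is overcommitted. A worked example of that arithmetic with illustrative page counts:

#include <stdio.h>

int main(void)
{
	unsigned long long committed = 262144;		/* container: 1 GB      */
	unsigned long long available = 1048576;		/* eligible zones: 4 GB */
	unsigned long long zone_present = 524288;	/* this zone: 2 GB      */
	unsigned long long zone_committed = 786432;	/* zone-wide total      */
	unsigned long long gang_committed, portion;

	/* per-zone share proportional to zone size */
	gang_committed = committed * zone_present / available;
	portion = gang_committed;

	/* scale down if the zone is overcommitted overall */
	if (zone_committed > zone_present)
		portion = zone_present * gang_committed / zone_committed;

	printf("gang committed %llu pages, portion %llu pages\n",
	       gang_committed, portion);
	return 0;
}
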
+
+int commitment_for_unlimited_containers_handler(struct ctl_table *table,
+		int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct user_beancounter *ub;
+	int err;
+
+	err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+	if (!err && write) {
+		rcu_read_lock();
+		for_each_beancounter(ub) {
+			if (get_beancounter_rcu(ub)) {
+				rcu_read_unlock();
+				set_gang_limits(get_ub_gs(ub), NULL, NULL);
+				rcu_read_lock();
+				put_beancounter(ub);
+			}
+		}
+		rcu_read_unlock();
+	}
+	return err;
+}
+
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+static void init_gangs_migration_work(struct gang_set *gs);
+#else
+static inline void init_gangs_migration_work(struct gang_set *gs) { }
+#endif
+
+int alloc_mem_gangs(struct gang_set *gs)
+{
+	struct zone *zone;
+	struct gang *gang;
+	int node, zid;
+
+	memset(gs, 0, sizeof(struct gang_set));
+
+	gs->gangs = kzalloc(nr_node_ids * sizeof(struct gang *), GFP_KERNEL);
+	if (!gs->gangs)
+		goto noarr;
+
+	/* decrease NR_LRU_MILESTONES if it doesn't fit */
+	BUILD_BUG_ON(sizeof(struct gang) * MAX_NR_ZONES > (PAGE_SIZE << 2));
+
+	for_each_node(node) {
+		gs->gangs[node] = kzalloc_node(sizeof(struct gang)
+				* MAX_NR_ZONES, GFP_KERNEL, node);
+		if (!gs->gangs[node])
+			goto nomem;
+		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+			gs->gangs[node][zid].shadow =
+				kzalloc_node(sizeof(struct gang),
+						GFP_KERNEL, node);
+			if (!gs->gangs[node][zid].shadow)
+				goto nomem;
+		}
+	}
+
+	for_each_populated_zone(zone) {
+		gang = mem_zone_gang(gs, zone);
+		setup_zone_gang(gs, zone, gang);
+		gang = gang_to_shadow_gang(gang);
+		setup_zone_gang(gs, zone, gang);
+		__set_bit(GANG_IN_SHADOW, &gang->flags);
+	}
+
+	init_gangs_migration_work(gs);
+
+	return 0;
+
+nomem:
+	free_mem_gangs(gs);
+noarr:
+	return -ENOMEM;
+}
+
+void free_mem_gangs(struct gang_set *gs)
+{
+	int node, zid;
+
+	for_each_node(node) {
+		for (zid = 0; zid < MAX_NR_ZONES; zid++)
+			kfree(gs->gangs[node][zid].shadow);
+		kfree(gs->gangs[node]);
+	}
+	kfree(gs->gangs);
+}
+
+void add_mem_gangs(struct gang_set *gs)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone) {
+		struct gang *gang = mem_zone_gang(gs, zone);
+
+		add_zone_gang(zone, gang_to_shadow_gang(gang));
+		add_zone_gang(zone, gang);
+	}
+}
+
+#define MAX_MOVE_BATCH	256
+
+static void move_gang_pages(struct gang *gang, struct gang *dst_gang)
+{
+	enum lru_list lru;
+	int restart;
+	struct user_beancounter *src_ub = get_gang_ub(gang);
+	struct user_beancounter *dst_ub = get_gang_ub(dst_gang);
+	LIST_HEAD(pages_to_wait);
+	LIST_HEAD(pages_to_free);
+	struct lruvec *lruvec;
+
+again:
+	restart = 0;
+	for_each_lru(lru) {
+		struct page *page, *next;
+		LIST_HEAD(list);
+		unsigned long nr_pages = 0;
+		unsigned long uninitialized_var(timestamp);
+		unsigned batch = 0;
+
+		lruvec = &gang->lruvec;
+		spin_lock_irq(&lruvec->lru_lock);
+		list_for_each_entry_safe_reverse(page, next,
+				&lruvec->lru_list[lru], lru) {
+			int numpages;
+
+			if (is_lru_milestone(lruvec, &page->lru)) {
+				remove_lru_milestone(lruvec, lru);
+				continue;
+			}
+
+			numpages = hpage_nr_pages(page);
+
+			if (batch >= MAX_MOVE_BATCH) {
+				restart = 1;
+				break;
+			}
+			if (!get_page_unless_zero(page)) {
+				list_move(&page->lru, &pages_to_wait);
+				continue;
+			}
+			batch++;
+			nr_pages += numpages;
+			ClearPageLRU(page);
+			set_page_gang(page, dst_gang);
+			list_move(&page->lru, &list);
+		}
+		list_splice_init(&pages_to_wait, &lruvec->lru_list[lru]);
+		lruvec->nr_pages[lru] -= nr_pages;
+		if (!is_unevictable_lru(lru))
+			timestamp = gang->timestamp[lru];
+		spin_unlock_irq(&lruvec->lru_lock);
+
+		if (!nr_pages)
+			continue;
+
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+		if (!is_file_lru(lru) && !is_unevictable_lru(lru)) {
+			list_for_each_entry(page, &list, lru) {
+				if (PageSwapCache(page)) {
+					lock_page(page);
+					ub_unuse_swap_page(page);
+					unlock_page(page);
+				}
+			}
+		}
+#endif
+
+		if (!gang_in_shadow(gang)) {
+			uncharge_beancounter_fast(src_ub,
+					UB_PHYSPAGES, nr_pages);
+		} else {
+			uncharge_beancounter_fast(src_ub,
+					UB_SHADOWPAGES, nr_pages);
+			if (!is_file_lru(lru) && !is_unevictable_lru(lru))
+				uncharge_beancounter_fast(src_ub, UB_SWAPPAGES,
+							  nr_pages);
+		}
+
+		if (!gang_in_shadow(dst_gang)) {
+			charge_beancounter_fast(dst_ub,
+					UB_PHYSPAGES, nr_pages, UB_FORCE);
+		} else {
+			charge_beancounter_fast(dst_ub,
+					UB_SHADOWPAGES, nr_pages, UB_FORCE);
+			if (!is_file_lru(lru) && !is_unevictable_lru(lru))
+				charge_beancounter_fast(dst_ub, UB_SWAPPAGES,
+							nr_pages, UB_FORCE);
+		}
+
+		lruvec = &dst_gang->lruvec;
+		spin_lock_irq(&lruvec->lru_lock);
+		lruvec->nr_pages[lru] += nr_pages;
+		list_for_each_entry_safe(page, next, &list, lru) {
+			SetPageLRU(page);
+			if (unlikely(put_page_testzero(page))) {
+				__ClearPageLRU(page);
+				del_page_from_lru(lruvec, page);
+				gang_del_user_page(page);
+				list_add(&page->lru, &pages_to_free);
+			}
+		}
+		splice_timed_pages(dst_gang, lru, &list, timestamp);
+		spin_unlock_irq(&lruvec->lru_lock);
+
+		list_for_each_entry_safe(page, next, &pages_to_free, lru) {
+			list_del(&page->lru);
+			VM_BUG_ON(PageTail(page));
+			if (PageCompound(page))
+				get_compound_page_dtor(page)(page);
+			else
+				free_hot_page(page);
+		}
+	}
+	update_vmscan_priority(gang);
+	update_vmscan_priority(dst_gang);
+	cond_resched();
+	if (restart)
+		goto again;
+}
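
move_gang_pages() drains each LRU in batches of at most MAX_MOVE_BATCH per lru_lock hold, parking pages with a transiently-zero refcount on pages_to_wait and restarting the scan until the list is empty. A toy version of that drain-in-batches-with-restart control flow (the lock itself is elided; items are just an array cursor):

#include <stdio.h>

#define BATCH 4

static int drain(int *src, int n)
{
	int moved = 0, restart;

	do {
		int batch = 0;

		restart = 0;
		/* "critical section": claim at most BATCH items */
		while (moved < n && batch < BATCH) {
			src[moved++] = -1;	/* claimed */
			batch++;
		}
		if (moved < n)
			restart = 1;	/* more left: go around again */
		/* lock dropped here: per-batch work happens outside it */
	} while (restart);

	return moved;
}

int main(void)
{
	int items[10] = {0};

	printf("moved %d items\n", drain(items, 10));
	return 0;
}
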
+
+void junk_mem_gangs(struct gang_set *gs)
+{
+	struct zone *zone;
+
+	cancel_gangs_migration(gs);
+
+	lru_add_drain_all();
+
+	for_each_populated_zone(zone) {
+		struct gang *src, *dst;
+
+		/* push both normal and shadow gangs into the zone's junk gang */
+		src = mem_zone_gang(gs, zone);
+		dst = zone_junk_gang(zone);
+		move_gang_pages(src, dst);
+		move_gang_pages(gang_to_shadow_gang(src), dst);
+	}
+}
+
+void del_mem_gangs(struct gang_set *gs)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone) {
+		struct gang *gang = mem_zone_gang(gs, zone);
+		del_zone_gang(zone, gang);
+		del_zone_gang(zone, gang_to_shadow_gang(gang));
+	}
+}
+
+void gang_page_stat(struct gang_set *gs, nodemask_t *nodemask,
+		    unsigned long *stat, unsigned long *shadow)
+{
+	struct zoneref *z;
+	struct zone *zone;
+	struct gang *gang;
+	enum lru_list lru;
+
+	memset(stat, 0, sizeof(unsigned long) * NR_LRU_LISTS);
+	if (shadow)
+		memset(shadow, 0, sizeof(unsigned long) * NR_LRU_LISTS);
+	for_each_zone_zonelist_nodemask(zone, z,
+			node_zonelist(numa_node_id(), GFP_KERNEL),
+			MAX_NR_ZONES - 1, nodemask) {
+		gang = mem_zone_gang(gs, zone);
+		for_each_lru(lru)
+			stat[lru] += gang->lruvec.nr_pages[lru];
+		if (shadow) {
+			gang = gang_to_shadow_gang(gang);
+			for_each_lru(lru)
+				shadow[lru] += gang->lruvec.nr_pages[lru];
+			if (gs == &init_gang_set) {
+				gang = zone_junk_gang(zone);
+				for_each_lru(lru)
+					shadow[lru] += gang->lruvec.nr_pages[lru];
+			}
+		}
+	}
+}
+
+static void show_one_gang(struct zone *zone, struct gang *gang)
+{
+	unsigned long now = jiffies;
+
+	printk("Node %d %s%s prio:%u portion:%ld scan:%lu"
+	       " a_anon:%lu %dms i_anon:%lu %dms"
+	       " a_file:%lu %dms i_file:%lu %dms"
+	       " unevictable:%lu"
+	       " reclaim_stat: %lu %lu %lu %lu\n",
+	       zone_to_nid(zone), zone->name,
+	       gang_of_junk(gang) ? "/junk" :
+	       gang_in_shadow(gang) ? "/shadow" : "",
+	       gang->lruvec.priority, gang->portion,
+	       atomic_long_read(&gang->lruvec.pages_scanned),
+	       gang->lruvec.nr_pages[LRU_ACTIVE_ANON],
+	       jiffies_to_msecs(now - gang->timestamp[LRU_ACTIVE_ANON]),
+	       gang->lruvec.nr_pages[LRU_INACTIVE_ANON],
+	       jiffies_to_msecs(now - gang->timestamp[LRU_INACTIVE_ANON]),
+	       gang->lruvec.nr_pages[LRU_ACTIVE_FILE],
+	       jiffies_to_msecs(now - gang->timestamp[LRU_ACTIVE_FILE]),
+	       gang->lruvec.nr_pages[LRU_INACTIVE_FILE],
+	       jiffies_to_msecs(now - gang->timestamp[LRU_INACTIVE_FILE]),
+	       gang->lruvec.nr_pages[LRU_UNEVICTABLE],
+	       gang->lruvec.recent_scanned[0],
+	       gang->lruvec.recent_rotated[0],
+	       gang->lruvec.recent_scanned[1],
+	       gang->lruvec.recent_rotated[1]);
+}
+
+void gang_show_state(struct gang_set *gs)
+{
+	struct zone *zone;
+	struct gang *gang;
+	unsigned long stat[NR_LRU_LISTS];
+
+	for_each_populated_zone(zone) {
+		gang = mem_zone_gang(gs, zone);
+		show_one_gang(zone, gang);
+		show_one_gang(zone, gang_to_shadow_gang(gang));
+		if (gs == &init_gang_set)
+			show_one_gang(zone, zone_junk_gang(zone));
+	}
+
+	gang_page_stat(gs, NULL, stat, stat);
+
+	printk("Total %lu anon:%lu file:%lu"
+			" a_anon:%lu i_anon:%lu"
+			" a_file:%lu i_file:%lu"
+			" unevictable:%lu\n",
+			stat[LRU_ACTIVE_ANON] + stat[LRU_INACTIVE_ANON] +
+			stat[LRU_ACTIVE_FILE] + stat[LRU_INACTIVE_FILE] +
+			stat[LRU_UNEVICTABLE],
+			stat[LRU_ACTIVE_ANON] + stat[LRU_INACTIVE_ANON],
+			stat[LRU_ACTIVE_FILE] + stat[LRU_INACTIVE_FILE],
+			stat[LRU_ACTIVE_ANON],
+			stat[LRU_INACTIVE_ANON],
+			stat[LRU_ACTIVE_FILE],
+			stat[LRU_INACTIVE_FILE],
+			stat[LRU_UNEVICTABLE]);
+}
+
+#else /* CONFIG_MEMORY_GANGS */
+
+void gang_page_stat(struct gang_set *gs, nodemask_t *nodemask,
+		    unsigned long *stat, unsigned long *shadow)
+{
+	enum lru_list lru;
+
+	if (shadow)
+		memset(shadow, 0, sizeof(unsigned long) * NR_LRU_LISTS);
+	for_each_lru(lru)
+		stat[lru] = global_page_state(NR_LRU_BASE + lru);
+}
+
+void gang_show_state(struct gang_set *gs) { }
+
+#endif /* CONFIG_MEMORY_GANGS */
+
+#ifdef CONFIG_MEMORY_GANGS_MIGRATION
+static struct workqueue_struct **gangs_migration_wq;
+
+unsigned int gangs_migration_max_isolate = 50;
+unsigned int gangs_migration_min_batch = 100;
+unsigned int gangs_migration_max_batch = 12800;
+unsigned int gangs_migration_interval = 500;
+
+static unsigned long isolate_gang_pages(struct gang *gang, enum lru_list lru,
+		unsigned long nr_to_scan, struct list_head *pagelist)
+{
+	struct lruvec *lruvec = &gang->lruvec;
+	struct list_head *lru_list = &lruvec->lru_list[lru];
+	unsigned long nr_isolated = 0;
+	struct page *page, *next;
+	int restart;
+	LIST_HEAD(busy_pages);
+
+again:
+	restart = 0;
+	spin_lock_irq(&lruvec->lru_lock);
+	list_for_each_entry_safe_reverse(page, next, lru_list, lru) {
+
+		if (is_lru_milestone(lruvec, &page->lru)) {
+			remove_lru_milestone(lruvec, lru);
+			continue;
+		}
+
+		if (nr_to_scan-- == 0)
+			break;
+
+		if (!get_page_unless_zero(page)) {
+			list_move(&page->lru, &busy_pages);
+			continue;
+		}
+
+		if (unlikely(PageTransHuge(page))) {
+			spin_unlock_irq(&lruvec->lru_lock);
+			split_huge_page(page);
+			put_page(page);
+			restart = 1;
+			spin_lock_irq(&lruvec->lru_lock);
+			break;
+		}
+
+		ClearPageLRU(page);
+		del_page_from_lru_list(lruvec, page, lru);
+		inc_zone_page_state(page, NR_ISOLATED_ANON +
+				    page_is_file_cache(page));
+
+		nr_isolated++;
+		list_add(&page->lru, pagelist);
+	}
+	list_splice_init(&busy_pages, lru_list);
+	spin_unlock_irq(&lruvec->lru_lock);
+
+	if (restart)
+		goto again;
+
+	return nr_isolated;
+}
+
+static struct page *gangs_migration_new_page(struct page *page,
+					     unsigned long private, int **x)
+{
+	struct gangs_migration_work *w = (void *)private;
+	gfp_t gfp_mask = GFP_HIGHUSER_MOVABLE |
+			__GFP_NORETRY | __GFP_OTHER_NODE;
+
+	return __alloc_pages_nodemask(gfp_mask, 0,
+			node_zonelist(w->preferred_node, gfp_mask),
+			&w->dest_nodes);
+}
+
+static int __migrate_gangs(struct gang_set *gs, struct gangs_migration_work *w)
+{
+	struct zoneref *z;
+	struct zone *zone;
+	enum lru_list lru;
+	nodemask_t cur_nodemask;
+	LIST_HEAD(pagelist);
+	unsigned long nr_to_scan, nr_isolated, nr_moved;
+	int rc;
+
+	nr_moved = 0;
+	cur_nodemask = nodemask_of_node(w->cur_node);
+	for_each_zone_zonelist_nodemask(zone, z,
+			node_zonelist(w->cur_node, GFP_KERNEL),
+			MAX_NR_ZONES - 1, &cur_nodemask) {
+		struct gang *gang = mem_zone_gang(gs, zone);
+		unsigned long left = gang->nr_migratepages;
+
+		if (!left)
+			continue;
+		while (nr_moved < w->batch && left) {
+			int empty = 1;
+
+			for_each_lru(lru) {
+				if (!gang->lruvec.nr_pages[lru])
+					continue;
+				empty = 0;
+
+				nr_to_scan = min_t(unsigned long,
+					left, gangs_migration_max_isolate);
+				left -= nr_to_scan;
+
+				nr_isolated = isolate_gang_pages(gang, lru,
+						nr_to_scan, &pagelist);
+				if (!nr_isolated)
+					continue;
+				rc = migrate_pages(&pagelist,
+						gangs_migration_new_page,
+						(unsigned long)w, false, true);
+				if (rc < 0)
+					return -1;
+				nr_moved += nr_isolated - rc;
+			}
+			if (empty)
+				left = 0;
+		}
+		gang->nr_migratepages = left;
+		if (nr_moved >= w->batch)
+			return 1;
+	}
+	return 0;
+}
+
+static void migrate_gangs(struct work_struct *work)
+{
+	struct delayed_work *dwork;
+	struct gangs_migration_work *w;
+	struct gang_set *gs;
+	const struct cpumask *cpumask;
+	int cpu, rc;
+	unsigned long delay = 0;
+
+	dwork = to_delayed_work(work);
+	w = container_of(dwork, struct gangs_migration_work, dwork);
+	gs = container_of(w, struct gang_set, migration_work);
+
+	if (!node_online(w->cur_node)) {
+		node_clear(w->cur_node, w->src_nodes);
+		set_gang_limits(gs, NULL, NULL);
+		goto next;
+	}
+
+	cpu = task_cpu(current);
+	cpumask = cpumask_of_node(w->cur_node);
+	if (!cpumask_test_cpu(cpu, cpumask))
+		set_cpus_allowed_ptr(current, cpumask);
+
+	rc = __migrate_gangs(gs, w);
+	if (rc < 0) {
+		nodes_clear(w->src_nodes);
+		set_gang_limits(gs, NULL, NULL);
+		return;
+	}
+	if (!rc) {
+		node_clear(w->cur_node, w->src_nodes);
+		set_gang_limits(gs, NULL, NULL);
+	}
+next:
+	if (!nodes_empty(w->src_nodes)) {
+		w->cur_node = next_node(w->cur_node, w->src_nodes);
+		if (w->cur_node >= MAX_NUMNODES) {
+			w->cur_node = first_node(w->src_nodes);
+			w->batch *= 2;
+			if (w->batch > gangs_migration_max_batch)
+				w->batch = gangs_migration_max_batch;
+			delay = msecs_to_jiffies(gangs_migration_interval);
+		}
+		w->preferred_node = next_node(w->preferred_node, w->dest_nodes);
+		if (w->preferred_node >= MAX_NUMNODES)
+			w->preferred_node = first_node(w->dest_nodes);
+		queue_delayed_work(gangs_migration_wq[w->cur_node],
+				   dwork, delay);
+	}
+}
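
The migration worker ramps up gently: after every full pass over the source nodes it doubles w->batch up to gangs_migration_max_batch and re-queues itself after gangs_migration_interval milliseconds. A quick sketch of the resulting schedule, using the sysctl defaults declared above:

#include <stdio.h>

int main(void)
{
	unsigned int batch = 100;	/* gangs_migration_min_batch */
	const unsigned int max = 12800;	/* gangs_migration_max_batch */
	int pass;

	for (pass = 1; batch < max; pass++) {
		printf("pass %d: batch %u pages\n", pass, batch);
		batch *= 2;
		if (batch > max)
			batch = max;
	}
	printf("pass %d: batch %u pages (capped)\n", pass, batch);
	return 0;
}
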
+
+static void __schedule_gangs_migration(struct gang_set *gs,
+				       struct gangs_migration_work *w)
+{
+	struct zoneref *z;
+	struct zone *zone;
+	enum lru_list lru;
+
+	for_each_zone_zonelist_nodemask(zone, z,
+			node_zonelist(numa_node_id(), GFP_KERNEL),
+			MAX_NR_ZONES - 1, &w->src_nodes) {
+		struct gang *gang = mem_zone_gang(gs, zone);
+
+		gang->nr_migratepages = 0;
+		for_each_lru(lru)
+			gang->nr_migratepages += gang->lruvec.nr_pages[lru];
+		gang->nr_migratepages *= NR_LRU_LISTS;
+	}
+	w->cur_node = first_node(w->src_nodes);
+	w->preferred_node = first_node(w->dest_nodes);
+	w->batch = gangs_migration_min_batch;
+	queue_delayed_work(gangs_migration_wq[w->cur_node], &w->dwork, 0);
+}
+
+/* Returns 0 if migration was already scheduled, non-zero otherwise */
+int schedule_gangs_migration(struct gang_set *gs,
+		const nodemask_t *src_nodes, const nodemask_t *dest_nodes)
+{
+	struct gangs_migration_work *w = &gs->migration_work;
+	nodemask_t tmp;
+	int ret = 0;
+
+	mutex_lock(&w->lock);
+	if (!nodes_empty(w->src_nodes))
+		goto out;
+	cancel_delayed_work_sync(&w->dwork);
+	nodes_and(w->dest_nodes, *dest_nodes, node_online_map);
+	if (!nodes_empty(w->dest_nodes)) {
+		nodes_andnot(tmp, *src_nodes, *dest_nodes);
+		nodes_and(w->src_nodes, tmp, node_online_map);
+		if (!nodes_empty(w->src_nodes)) {
+			set_gang_limits(gs, NULL, NULL);
+			__schedule_gangs_migration(gs, w);
+		}
+	}
+	ret = 1;
+out:
+	mutex_unlock(&w->lock);
+	return ret;
+}
+
+/* Returns 0 if migration was not pending, non-zero otherwise. */
+int cancel_gangs_migration(struct gang_set *gs)
+{
+	struct gangs_migration_work *w = &gs->migration_work;
+	int ret = 0;
+
+	mutex_lock(&w->lock);
+	if (nodes_empty(w->src_nodes))
+		goto out;
+	cancel_delayed_work_sync(&w->dwork);
+	nodes_clear(w->src_nodes);
+	set_gang_limits(gs, NULL, NULL);
+	ret = 1;
+out:
+	mutex_unlock(&w->lock);
+	return ret;
+}
+
+int gangs_migration_pending(struct gang_set *gs, nodemask_t *pending)
+{
+	struct gangs_migration_work *w = &gs->migration_work;
+	int ret;
+
+	mutex_lock(&w->lock);
+	if (pending)
+		*pending = w->src_nodes;
+	ret = !nodes_empty(w->src_nodes);
+	mutex_unlock(&w->lock);
+	return ret;
+}
+
+static void init_gangs_migration_work(struct gang_set *gs)
+{
+	struct gangs_migration_work *w = &gs->migration_work;
+
+	INIT_DELAYED_WORK(&w->dwork, migrate_gangs);
+	nodes_clear(w->src_nodes);
+	mutex_init(&w->lock);
+}
+
+static __init int init_gangs_migration_wq(void)
+{
+	int node;
+	char name[32];
+
+	init_gangs_migration_work(&init_gang_set);
+
+	if (nr_node_ids == 1)
+		return 0;
+
+	gangs_migration_wq = kcalloc(nr_node_ids,
+			sizeof(struct workqueue_struct *), GFP_KERNEL);
+	BUG_ON(!gangs_migration_wq);
+
+	for_each_node(node) {
+		snprintf(name, sizeof(name), "gsmigration/%d", node);
+		gangs_migration_wq[node] = create_singlethread_workqueue(name);
+		BUG_ON(!gangs_migration_wq[node]);
+	}
+
+	return 0;
+}
+late_initcall(init_gangs_migration_wq);
+
+static int gangs_migration_batch_constraints(void)
+{
+	if (gangs_migration_min_batch <= 0 ||
+	    gangs_migration_min_batch > gangs_migration_max_batch)
+		return -EINVAL;
+	return 0;
+}
+
+int gangs_migration_batch_sysctl_handler(struct ctl_table *table,
+		int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(lock);
+	unsigned int old_min, old_max;
+	int err;
+
+	mutex_lock(&lock);
+
+	old_min = gangs_migration_min_batch;
+	old_max = gangs_migration_max_batch;
+
+	err = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (err || !write)
+		goto out;
+
+	err = gangs_migration_batch_constraints();
+	if (err) {
+		gangs_migration_min_batch = old_min;
+		gangs_migration_max_batch = old_max;
+	}
+
+out:
+	mutex_unlock(&lock);
+	return err;
+}
+#endif /* CONFIG_MEMORY_GANGS_MIGRATION */
+
+#ifdef CONFIG_KSTALED
+void gang_idle_page_stat(struct gang_set *gs, nodemask_t *nodemask,
+			 struct idle_page_stats *stats)
+{
+	struct zoneref *z;
+	struct zone *zone;
+	struct gang *gang;
+	struct idle_page_stats *gang_stats;
+	unsigned seq;
+
+	memset(stats, 0, sizeof(*stats));
+	for_each_zone_zonelist_nodemask(zone, z,
+			node_zonelist(numa_node_id(), GFP_KERNEL),
+			MAX_NR_ZONES - 1, nodemask) {
+		gang = mem_zone_gang(gs, zone);
+		gang_stats = &gang->idle_page_stats;
+		do {
+			seq = read_seqcount_begin(&gang->idle_page_stats_lock);
+			stats->idle_clean += gang_stats->idle_clean;
+			stats->idle_dirty_file += gang_stats->idle_dirty_file;
+			stats->idle_dirty_swap += gang_stats->idle_dirty_swap;
+		} while (read_seqcount_retry(&gang->idle_page_stats_lock, seq));
+	}
+}
+#endif /* CONFIG_KSTALED */
+
+struct gang *init_gang_array[MAX_NUMNODES];
+
+#ifndef CONFIG_BC_RSS_ACCOUNTING
+struct gang_set init_gang_set = {
+#ifdef CONFIG_MEMORY_GANGS
+	.gangs = init_gang_array,
+#endif
+};
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/mprotect.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/mprotect.c
--- linux-2.6.32-504.3.3.el6.orig/mm/mprotect.c	2014-12-12 23:29:11.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/mprotect.c	2015-01-21 12:02:58.684815949 +0300
@@ -30,6 +30,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <bc/vmpages.h>
+
 #ifndef arch_remove_exec_range
 #define arch_remove_exec_range(mm, limit)      do { ; } while (0)
 #endif
@@ -176,6 +178,12 @@ mprotect_fixup(struct vm_area_struct *vm
 		return 0;
 	}
 
+	error = -ENOMEM;
+	if (!VM_UB_PRIVATE(oldflags, vma->vm_file) &&
+	    VM_UB_PRIVATE(newflags, vma->vm_file) &&
+	    charge_beancounter_fast(mm->mm_ub, UB_PRIVVMPAGES, nrpages, UB_SOFT))
+		goto fail_ch;
+
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
@@ -187,7 +195,7 @@ mprotect_fixup(struct vm_area_struct *vm
 						VM_SHARED|VM_NORESERVE))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
-				return -ENOMEM;
+				goto fail_sec;
 			newflags |= VM_ACCOUNT;
 		}
 	}
@@ -228,7 +236,9 @@ success:
 
 	if (vma_wants_writenotify(vma)) {
 		vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED);
-		dirty_accountable = 1;
+		if (!vma->vm_file ||
+		    !test_bit(AS_CHECKPOINT, &vma->vm_file->f_mapping->flags))
+			dirty_accountable = 1;
 	}
 
 	if (oldflags & VM_EXEC)
@@ -242,11 +252,21 @@ success:
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+
+	if (VM_UB_PRIVATE(oldflags, vma->vm_file) &&
+	    !VM_UB_PRIVATE(newflags, vma->vm_file))
+		uncharge_beancounter_fast(mm->mm_ub, UB_PRIVVMPAGES, nrpages);
+
 	perf_event_mmap(vma);
 	return 0;
 
 fail:
 	vm_unacct_memory(charged);
+fail_sec:
+	if (!VM_UB_PRIVATE(oldflags, vma->vm_file) &&
+	    VM_UB_PRIVATE(newflags, vma->vm_file))
+		uncharge_beancounter_fast(mm->mm_ub, UB_PRIVVMPAGES, nrpages);
+fail_ch:
 	return error;
 }
 
@@ -348,3 +368,4 @@ out:
 	up_write(&current->mm->mmap_sem);
 	return error;
 }
+EXPORT_SYMBOL(sys_mprotect);
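
In the mprotect.c hunks above, mprotect_fixup() now charges UB_PRIVVMPAGES when a protection change turns a mapping private in the beancounter sense and uncharges on the reverse transition or on failure; from userspace this only surfaces as an additional way for mprotect() to return ENOMEM inside a container at its limit. A small standalone program exercising exactly that transition:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, page, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* Making a private mapping writable is the transition charged
	 * against UB_PRIVVMPAGES; inside a container at its limit this
	 * call can now fail with ENOMEM. */
	if (mprotect(p, page, PROT_READ | PROT_WRITE))
		fprintf(stderr, "mprotect: %s\n", strerror(errno));
	else
		p[0] = 1;	/* commit a private page */

	munmap(p, page);
	return 0;
}
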
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/mremap.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/mremap.c
--- linux-2.6.32-504.3.3.el6.orig/mm/mremap.c	2014-12-12 23:29:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/mremap.c	2015-01-21 12:02:58.684815949 +0300
@@ -27,6 +27,8 @@
 
 #include "internal.h"
 
+#include <bc/vmpages.h>
+
 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -195,12 +197,16 @@ static unsigned long move_vma(struct vm_
 	int split = 0;
 	int err;
 
+	if (ub_memory_charge(mm, new_len, vm_flags,
+				vma->vm_file, UB_HARD))
+		goto err;
+
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
 	 * which may split one vma into three before unmapping.
 	 */
 	if (mm->map_count >= sysctl_max_map_count - 3)
-		return -ENOMEM;
+		goto err_nomem;
 
 	/*
 	 * Advise KSM to break any KSM pages in the area to be moved:
@@ -212,12 +218,12 @@ static unsigned long move_vma(struct vm_
 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
 						MADV_UNMERGEABLE, &vm_flags);
 	if (err)
-		return err;
+		goto err_nomem;
 
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
 	if (!new_vma)
-		return -ENOMEM;
+		goto err_nomem;
 
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
 	if (moved_len < old_len) {
@@ -276,7 +282,13 @@ static unsigned long move_vma(struct vm_
 						       new_addr + new_len);
 	}
 
-	return new_addr;
+	if (new_addr != -ENOMEM)
+		return new_addr;
+
+err_nomem:
+	ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file);
+err:
+	return -ENOMEM;
 }
 
 static struct vm_area_struct *vma_to_resize(unsigned long addr,
@@ -483,10 +495,18 @@ unsigned long do_mremap(unsigned long ad
 	if (old_len == vma->vm_end - addr) {
 		/* can we just expand the current mapping? */
 		if (vma_expandable(vma, new_len - old_len)) {
-			int pages = (new_len - old_len) >> PAGE_SHIFT;
+			unsigned long len = (new_len - old_len);
+			int pages = len >> PAGE_SHIFT;
+
+			ret = -ENOMEM;
+			if (ub_memory_charge(mm, len, vma->vm_flags,
+						vma->vm_file, UB_HARD))
+				goto out;
 
 			if (vma_adjust(vma, vma->vm_start, addr + new_len,
 				       vma->vm_pgoff, NULL)) {
+				ub_memory_uncharge(mm, len,
+						vma->vm_flags, vma->vm_file);
 				ret = -ENOMEM;
 				goto out;
 			}
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/msync.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/msync.c
--- linux-2.6.32-504.3.3.el6.orig/mm/msync.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/msync.c	2015-01-21 12:02:47.720107011 +0300
@@ -48,6 +48,8 @@ SYSCALL_DEFINE3(msync, unsigned long, st
 	if (end < start)
 		goto out;
 	error = 0;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		goto out;
 	if (end == start)
 		goto out;
 	/*
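
The msync.c change makes msync() return success immediately when the container's fsync behaviour is FSYNC_NEVER, so durable-write guarantees are silently dropped inside such a VE. A minimal program showing the call that becomes a no-op there (on an ordinary host it still writes the range back):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("msync_demo.dat", O_RDWR | O_CREAT, 0644);
	char *p;

	if (fd < 0 || ftruncate(fd, page))
		return 1;
	p = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	memcpy(p, "dirty", 5);
	/* With ve_fsync_behavior() == FSYNC_NEVER this returns 0
	 * without starting writeback; otherwise it syncs the range. */
	if (msync(p, page, MS_SYNC))
		fprintf(stderr, "msync: %s\n", strerror(errno));

	munmap(p, page);
	close(fd);
	unlink("msync_demo.dat");
	return 0;
}
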
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/nommu.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/nommu.c
--- linux-2.6.32-504.3.3.el6.orig/mm/nommu.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/nommu.c	2015-01-21 12:02:57.977834713 +0300
@@ -1529,6 +1529,7 @@ int split_vma(struct mm_struct *mm, stru
 	add_vma_to_mm(mm, new);
 	return 0;
 }
+EXPORT_SYMBOL(split_vma);
 
 /*
  * shrink a VMA by removing the specified chunk from either the beginning or
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/oom_group.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/oom_group.c
--- linux-2.6.32-504.3.3.el6.orig/mm/oom_group.c	2015-01-21 12:02:43.441220612 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/oom_group.c	2015-01-21 12:02:43.450220373 +0300
@@ -0,0 +1,215 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/ctype.h>
+#include <linux/oom.h>
+
+#include <bc/beancounter.h>
+
+static LIST_HEAD(oom_group_list_head);
+static DEFINE_RWLOCK(oom_group_lock);
+
+struct oom_group_pattern {
+	char comm[TASK_COMM_LEN], pcomm[TASK_COMM_LEN];
+	int oom_uid;
+	int oom_score_adj;
+	struct list_head group_list;
+};
+
+static void oom_groups_append(struct list_head *list)
+{
+	write_lock_irq(&oom_group_lock);
+	list_splice_tail(list, &oom_group_list_head);
+	write_unlock_irq(&oom_group_lock);
+}
+
+static void oom_groups_reset(void)
+{
+	struct list_head list;
+	struct oom_group_pattern *gp, *tmp;
+
+	write_lock_irq(&oom_group_lock);
+	list_replace_init(&oom_group_list_head, &list);
+	write_unlock_irq(&oom_group_lock);
+
+	list_for_each_entry_safe(gp, tmp, &list, group_list)
+		kfree(gp);
+}
+
+/*
+ * If the mask ends with an asterisk, it matches any comm suffix:
+ * "foo" matches only "foo", "foo*" matches "foo" and "foobar",
+ * and "*" matches any string.
+ */
+static bool oom_match_comm(const char *comm, const char *mask)
+{
+	while (*comm && *mask != '*' && *comm == *mask) {
+		comm++;
+		mask++;
+	}
+	return (!*mask && !*comm) || (*mask == '*');
+}
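
oom_match_comm() is a plain prefix comparison that treats a trailing '*' in the mask as a wildcard for any suffix. The same logic compiles standalone; a copy with a small test harness:

#include <stdbool.h>
#include <stdio.h>

static bool oom_match_comm(const char *comm, const char *mask)
{
	while (*comm && *mask != '*' && *comm == *mask) {
		comm++;
		mask++;
	}
	return (!*mask && !*comm) || (*mask == '*');
}

int main(void)
{
	printf("%d\n", oom_match_comm("foo", "foo"));		/* 1 */
	printf("%d\n", oom_match_comm("foobar", "foo*"));	/* 1 */
	printf("%d\n", oom_match_comm("foo", "foo*"));		/* 1 */
	printf("%d\n", oom_match_comm("bar", "foo*"));		/* 0 */
	printf("%d\n", oom_match_comm("anything", "*"));	/* 1 */
	return 0;
}
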
+
+int get_task_oom_score_adj(struct task_struct *t)
+{
+	struct oom_group_pattern *gp;
+	unsigned long flags;
+	int adj = t->signal->oom_score_adj;
+
+	if (adj != OOM_SCORE_ADJ_UNSET)
+		return adj;
+
+	adj = 0;
+	read_lock_irqsave(&oom_group_lock, flags);
+	list_for_each_entry(gp, &oom_group_list_head, group_list) {
+		if (gp->oom_uid >= 0 && task_uid(t) != gp->oom_uid)
+			continue;
+		if (gp->oom_uid < -1 && task_uid(t) >= -gp->oom_uid)
+			continue;
+		if (!oom_match_comm(t->comm, gp->comm))
+			continue;
+		if (!oom_match_comm(t->parent->comm, gp->pcomm))
+			continue;
+		adj = gp->oom_score_adj;
+		break;
+	}
+	read_unlock_irqrestore(&oom_group_lock, flags);
+	return adj;
+}
+
+static int oom_group_parse_line(struct list_head *list, char *line)
+{
+	struct oom_group_pattern *gp;
+	char dummy;
+	int ret;
+
+	gp = kmalloc(sizeof(struct oom_group_pattern), GFP_KERNEL);
+	if (gp == NULL)
+		return -ENOMEM;
+
+	BUILD_BUG_ON(TASK_COMM_LEN != 16);
+	ret = sscanf(line, "%15s %15s %d %d %c",
+			gp->comm, gp->pcomm, &gp->oom_uid,
+			&gp->oom_score_adj, &dummy);
+
+	if (ret != 4 || gp->oom_score_adj < OOM_SCORE_ADJ_MIN ||
+			gp->oom_score_adj > OOM_SCORE_ADJ_MAX) {
+		kfree(gp);
+		return -EINVAL;
+	}
+
+	list_add_tail(&gp->group_list, list);
+
+	return 0;
+}
+
+static ssize_t oom_group_write(struct file * file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char *line, *next, *page;
+	int ret, len;
+	LIST_HEAD(groups);
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	len = min(count, PAGE_SIZE - 1);
+	ret = copy_from_user(page, buf, len);
+	if (ret)
+		goto err;
+
+	page[len] = '\0';
+
+	next = page;
+	while (1) {
+		line = skip_spaces(next);
+		next = strchr(line, '\n');
+		if (next) {
+			*next++ = '\0';
+		} else if (len < count) {
+			ret = line != page ? line - page : -EINVAL;
+			break;
+		}
+		if (*line && *line != '#') {
+			ret = oom_group_parse_line(&groups, line);
+			if (ret)
+				break;
+		}
+		if (!next) {
+			ret = len;
+			break;
+		}
+	}
+
+	oom_groups_append(&groups);
+err:
+	free_page((unsigned long)page);
+	return ret;
+}
+
+static void *oom_group_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock_irq(&oom_group_lock);
+	return seq_list_start(&oom_group_list_head, *pos);
+}
+
+static void oom_group_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_irq(&oom_group_lock);
+}
+
+static int oom_group_seq_show(struct seq_file *s, void *v)
+{
+	struct list_head *entry = v;
+	struct oom_group_pattern *p;
+
+	p = list_entry(entry, struct oom_group_pattern, group_list);
+	seq_printf(s, "%s %s %d %d\n", p->comm, p->pcomm,
+			p->oom_uid, p->oom_score_adj);
+	return 0;
+}
+
+static void *oom_group_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &oom_group_list_head, pos);
+}
+
+static struct seq_operations oom_group_seq_ops = {
+	.start = oom_group_seq_start,
+	.next  = oom_group_seq_next,
+	.stop  = oom_group_seq_stop,
+	.show  = oom_group_seq_show,
+};
+
+static int oom_group_seq_open(struct inode *inode, struct file *file)
+{
+	if (file->f_flags & O_TRUNC)
+		oom_groups_reset();
+	return seq_open(file, &oom_group_seq_ops);
+}
+
+static struct file_operations proc_oom_group_ops = {
+	.owner   = THIS_MODULE,
+	.open    = oom_group_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = oom_group_write,
+};
+
+static int __init oom_group_init(void)
+{
+	struct proc_dir_entry *proc;
+	LIST_HEAD(groups);
+
+	proc = proc_create("oom_score_adj", 0660,
+			   proc_vz_dir, &proc_oom_group_ops);
+	if (!proc)
+		return -ENOMEM;
+	oom_group_parse_line(&groups, "init init 0 -900");
+	oom_groups_append(&groups);
+	return 0;
+}
+
+module_init(oom_group_init);
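
Each line written to the new /proc/vz/oom_score_adj file has the form "comm pcomm uid oom_score_adj": comm and pcomm accept the trailing-asterisk wildcard, a uid of -1 appears to match any user (and values below -1 match uids under the absolute value, per get_task_oom_score_adj() above), lines starting with '#' are skipped, and opening the file with O_TRUNC clears the table. The module seeds one default entry, "init init 0 -900". Illustrative entries (example values, not recommendations):

	# comm    pcomm   uid  oom_score_adj
	sshd      *       0    -500
	mysqld*   *       -1   300
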
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/oom_kill.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/oom_kill.c
--- linux-2.6.32-504.3.3.el6.orig/mm/oom_kill.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/oom_kill.c	2015-01-21 12:02:43.805210947 +0300
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/virtinfo.h>
 #include <linux/swap.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
@@ -32,6 +33,10 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 
+#include <bc/beancounter.h>
+#include <bc/oom_kill.h>
+#include <bc/vmpages.h>
+
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
@@ -53,17 +58,12 @@ int test_set_oom_score_adj(int new_val)
 
 	spin_lock_irq(&sighand->siglock);
 	old_val = current->signal->oom_score_adj;
-	if (new_val != old_val) {
-		if (new_val == OOM_SCORE_ADJ_MIN)
-			atomic_inc(&current->mm->oom_disable_count);
-		else if (old_val == OOM_SCORE_ADJ_MIN)
-			atomic_dec(&current->mm->oom_disable_count);
-		current->signal->oom_score_adj = new_val;
-	}
+	current->signal->oom_score_adj = new_val;
 	spin_unlock_irq(&sighand->siglock);
 
 	return old_val;
 }
+EXPORT_SYMBOL(test_set_oom_score_adj);
 
 #ifdef CONFIG_NUMA
 /**
@@ -98,7 +98,7 @@ static bool has_intersects_mems_allowed(
 			if (cpuset_mems_allowed_intersects(current, tsk))
 				return true;
 		}
-	} while_each_thread(start, tsk);
+	} while_each_thread_ve(start, tsk);
 
 	return false;
 }
@@ -125,23 +125,41 @@ struct task_struct *find_lock_task_mm(st
 		if (likely(t->mm))
 			return t;
 		task_unlock(t);
-	} while_each_thread(p, t);
+	} while_each_thread_ve(p, t);
 
 	return NULL;
 }
 
 /* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p,
+static bool oom_unkillable_task(struct task_struct *p, struct user_beancounter *ub,
 		const struct mem_cgroup *mem, const nodemask_t *nodemask)
 {
 	if (is_global_init(p))
 		return true;
 	if (p->flags & PF_KTHREAD)
 		return true;
+	if (p->flags & PF_FROZEN)
+		return true; /* FIXME - this is wrong */
+
+	/*
+	 * This task already has access to memory reserves and is
+	 * being killed. Don't allow any other task access to the
+	 * memory reserve.
+	 *
+	 * Note: this may have a chance of deadlock if it gets
+	 * blocked waiting for another task which itself is waiting
+	 * for memory. Is there a better alternative?
+	 *
+	 * Deadlock fixed: we wait at most UB_OOM_TIMEOUT in ub_oom_lock().
+	 */
+	if (test_tsk_thread_flag(p, TIF_MEMDIE))
+		return true;
 
 	/* When mem_cgroup_out_of_memory() and p is not member of the group */
 	if (mem && !task_in_mem_cgroup(p, mem))
 		return true;
+	if (ub_oom_task_skip(ub, p))
+		return true;
 
 	/* p may not have freeable memory in nodemask */
 	if (!has_intersects_mems_allowed(p, nodemask))
@@ -159,27 +177,17 @@ static bool oom_unkillable_task(struct t
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
-		      const nodemask_t *nodemask, unsigned long totalpages)
+int oom_badness(struct task_struct *p, unsigned long totalpages,
+		long *overdraft)
 {
 	long points;
 
-	if (oom_unkillable_task(p, mem, nodemask))
-		return 0;
-
 	p = find_lock_task_mm(p);
 	if (!p)
 		return 0;
 
-	/*
-	 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
-	 * so the entire heuristic doesn't need to be executed for something
-	 * that cannot be killed.
-	 */
-	if (atomic_read(&p->mm->oom_disable_count)) {
-		task_unlock(p);
-		return 0;
-	}
+	if (overdraft)
+		*overdraft = ub_current_overdraft(p->mm->mm_ub);
 
 	/*
 	 * The memory controller may have a limit of 0 bytes, so avoid a divide
@@ -211,16 +219,9 @@ unsigned int oom_badness(struct task_str
 	 * either completely disable oom killing or always prefer a certain
 	 * task.
 	 */
-	points += p->signal->oom_score_adj;
+	points += get_task_oom_score_adj(p);
 
-	/*
-	 * Never return 0 for an eligible task that may be killed since it's
-	 * possible that no single user task uses more than 0.1% of memory and
-	 * no single admin tasks uses more than 3.0%.
-	 */
-	if (points <= 0)
-		return 1;
-	return (points < 1000) ? points : 1000;
+	return clamp(points, -1000l, 1000l);
 }
 
 /*
@@ -292,33 +293,22 @@ static enum oom_constraint constrained_a
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned int *ppoints,
-		unsigned long totalpages, struct mem_cgroup *mem,
-		const nodemask_t *nodemask)
+struct task_struct *select_bad_process(int *ppoints,
+		unsigned long totalpages, struct user_beancounter *ub,
+		struct mem_cgroup *mem, const nodemask_t *nodemask)
 {
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
 	*ppoints = 0;
 
-	do_each_thread(g, p) {
-		unsigned int points;
+	do_each_thread_all(g, p) {
+		int points;
 
 		if (p->exit_state)
 			continue;
-		if (oom_unkillable_task(p, mem, nodemask))
+		if (oom_unkillable_task(p, ub, mem, nodemask))
 			continue;
 
-		/*
-		 * This task already has access to memory reserves and is
-		 * being killed. Don't allow any other task access to the
-		 * memory reserve.
-		 *
-		 * Note: this may have a chance of deadlock if it gets
-		 * blocked waiting for another task which itself is waiting
-		 * for memory. Is there a better alternative?
-		 */
-		if (test_tsk_thread_flag(p, TIF_MEMDIE))
-			return ERR_PTR(-1UL);
 		if (!p->mm)
 			continue;
 
@@ -336,23 +326,16 @@ static struct task_struct *select_bad_pr
 				chosen = p;
 				*ppoints = 1000;
 			} else {
-				/*
-				 * If this task is not being ptraced on exit,
-				 * then wait for it to finish before killing
-				 * some other task unnecessarily.
-				 */
-				if (!(task_ptrace(p->group_leader) &
-							PT_TRACE_EXIT))
-					return ERR_PTR(-1UL);
+				return p;
 			}
 		}
 
-		points = oom_badness(p, mem, nodemask, totalpages);
-		if (points > *ppoints) {
+		points = oom_badness(p, totalpages, NULL);
+		if (!chosen || points > *ppoints) {
 			chosen = p;
 			*ppoints = points;
 		}
-	} while_each_thread(g, p);
+	} while_each_thread_all(g, p);
 
 	return chosen;
 }
@@ -376,8 +359,8 @@ static void dump_tasks(const struct mem_
 	struct task_struct *task;
 
 	pr_info("[ pid ]   uid  tgid total_vm      rss cpu oom_adj oom_score_adj name\n");
-	for_each_process(p) {
-		if (oom_unkillable_task(p, mem, nodemask))
+	for_each_process_all(p) {
+		if (oom_unkillable_task(p, NULL, mem, nodemask))
 			continue;
 
 		task = find_lock_task_mm(p);
@@ -416,34 +399,177 @@ static void dump_header(struct task_stru
 		dump_tasks(mem, nodemask);
 }
 
+static void __oom_kill_thread(struct task_struct *p, struct oom_control *oom_ctrl)
+{
+	/*
+	 * We give our sacrificial lamb high priority and access to
+	 * all the memory it needs. That way it should be able to
+	 * exit() and clear out its resources quickly...
+	 */
+	p->rt.time_slice = HZ;
+	set_tsk_thread_flag(p, TIF_MEMDIE);
+	force_sig(SIGKILL, p);
+	wake_up_process(p);
+
+	if (current->task_bc.oom_generation == oom_ctrl->generation)
+		oom_ctrl->kill_counter++;
+}
+
+static void __oom_kill_task(struct task_struct *tsk, struct oom_control *oom_ctrl)
+{
+	struct task_struct *p = tsk;
+
+	do {
+		__oom_kill_thread(p, oom_ctrl);
+		p = next_thread(p);
+	} while (p != tsk);
+}
+
+int sysctl_oom_relaxation = HZ;
+
+#define OOM_BASE_RAGE	-10
+
+#define OOM_MAX_RAGE	20
+
+static void oom_berserker(struct task_struct *victim,
+		int points, unsigned long totalpages,
+		struct oom_control *oom_ctrl, struct user_beancounter *ub,
+		struct mem_cgroup *mem, nodemask_t *nodemask)
+{
+	unsigned long now = jiffies;
+	unsigned killed = 0;
+	struct task_struct *tsk;
+	long victim_overdraft, overdraft;
+
+	/* Update oom rage on each oom-killer invocation. */
+	if (time_after_eq(now, oom_ctrl->last_kill + sysctl_oom_relaxation) ||
+			time_before(now, oom_ctrl->last_kill))
+		oom_ctrl->oom_rage = OOM_BASE_RAGE;
+	else if (oom_ctrl->oom_rage < OOM_MAX_RAGE)
+		oom_ctrl->oom_rage++;
+	oom_ctrl->last_kill = now;
+
+	if (oom_ctrl->oom_rage < 0)
+		return;
+
+	oom_badness(victim, totalpages, &victim_overdraft);
+
+	/*
+	 * Kill some of the youngest tasks: new tasks sit at the end of this
+	 * list. Skip unkillable tasks and tasks scoring below the victim's.
+	 */
+	list_for_each_entry_reverse(tsk, &init_task.tasks, tasks) {
+		int score = oom_badness(tsk, totalpages, &overdraft);
+		struct task_struct *p;
+
+		if (oom_unkillable_task(tsk, ub, mem, nodemask) ||
+		    score < points || (victim_overdraft > 0 && overdraft < 0))
+			continue;
+
+#ifdef CONFIG_BEANCOUNTERS
+		if (tsk != victim) {
+			p = find_lock_task_mm(tsk);
+			if (p) {
+				mm_ub(p->mm)->ub_parms[UB_OOMGUARPAGES].failcnt++;
+				task_unlock(p);
+			} else {
+				if (printk_ratelimit())
+					pr_warn("OOM berserker: no mm for process %d (%s).\n",
+						task_pid_nr(tsk), tsk->comm);
+			}
+		}
+#endif
+
+		__oom_kill_task(tsk, oom_ctrl);
+
+		if (printk_ratelimit()) {
+			task_lock(tsk);
+			pr_warning("OOM kill in rage task %d (%s) score %d in ub %d\n",
+				   task_pid_nr(tsk), tsk->comm, score,
+				   ub ? ub->ub_uid : -1);
+			task_unlock(tsk);
+		}
+
+		if (++killed >= (1 << oom_ctrl->oom_rage))
+			break;
+	}
+
+	pr_err("OOM killer in rage, %u tasks killed in ub %d\n",
+			killed, ub ? ub->ub_uid : -1);
+
+#ifdef CONFIG_VE
+	if (ub) {
+		struct ve_struct *ve;
+
+		ve = set_exec_env(VE_TASK_INFO(victim)->owner_env);
+		ve_printk(VE_LOG, KERN_ERR "OOM killer in rage, "
+				"%u tasks killed\n", killed);
+		set_exec_env(ve);
+	}
+#endif
+}
+
 #define K(x) ((x) << (PAGE_SHIFT-10))
-static int oom_kill_task(struct task_struct *p)
+static int oom_kill_task(struct task_struct *tsk, struct oom_control *oom_ctrl,
+			 int points, const char *message)
 {
-	struct task_struct *q;
+	unsigned long total_vm, total_rss, total_swap;
+	struct task_struct *p, *q;
 	struct mm_struct *mm;
-
-	p = find_lock_task_mm(p);
-	if (!p)
-		return 1;
+	struct ve_struct *ve;
 
 	if (sysctl_would_have_oomkilled == 1) {
 		printk(KERN_ERR "Would have killed process %d (%s). But continuing instead.\n",
-				task_pid_nr(p), p->comm);
-		task_unlock(p);
-		return 0;
+				task_pid_nr(tsk), tsk->comm);
+		return -EAGAIN;
+	}
+
+	if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OOMKILL, tsk)
+			& NOTIFY_FAIL) {
+		printk(KERN_WARNING "OOM: disabled for process %d (%s) by virtinfo.\n",
+				task_pid_nr(tsk), tsk->comm);
+		return -EAGAIN;
+	}
+
+	p = find_lock_task_mm(tsk);
+	if (!p) {
+		printk(KERN_WARNING "OOM: no mm for process %d (%s).\n",
+				task_pid_nr(tsk), tsk->comm);
+		return -EAGAIN;
 	}
 
 	/* mm cannot be safely dereferenced after task_unlock(p) */
 	mm = p->mm;
-
-	pr_err("Killed process %d, UID %d, (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
-		task_pid_nr(p), task_uid(p), p->comm, K(p->mm->total_vm),
-		K(get_mm_counter(p->mm, anon_rss)),
-		K(get_mm_counter(p->mm, file_rss)));
+	total_vm = mm->total_vm;
+	total_rss = get_mm_rss(mm);
+	total_swap = get_mm_counter(mm, swap_usage);
+	ub_oom_mark_mm(mm, oom_ctrl);
 	task_unlock(p);
 
+	__oom_kill_task(p, oom_ctrl);
+
+	printk(KERN_ERR "%s: OOM killed process %d (%s) score %d "
+			"vm:%lukB, rss:%lukB, swap:%lukB\n",
+			message, task_pid_nr(p), p->comm, points,
+			K(total_vm),
+			K(total_rss),
+			K(total_swap));
+#ifdef CONFIG_VE
+	ve = VE_TASK_INFO(p)->owner_env;
+	if (!ve_is_super(ve)) {
+		ve = set_exec_env(ve);
+		ve_printk(VE_LOG, KERN_ERR "%s: OOM killed process %d (%s) score %d "
+				"vm:%lukB, rss:%lukB, swap:%lukB\n",
+				message, task_pid_vnr(p), p->comm, points,
+				K(total_vm),
+				K(total_rss),
+				K(total_swap));
+		set_exec_env(ve);
+	}
+#endif
+
 	/*
-	 * Kill all processes sharing p->mm in other thread groups, if any.
+	 * Kill all user processes sharing p->mm in other thread groups, if any.
 	 * They don't get access to memory reserves or a higher scheduler
 	 * priority, though, to avoid depletion of all memory or task
 	 * starvation.  This prevents mm->mmap_sem livelock when an oom killed
@@ -452,34 +578,33 @@ static int oom_kill_task(struct task_str
 	 * now get access to memory reserves since it has a pending fatal
 	 * signal.
 	 */
-	for_each_process(q)
-		if (q->mm == mm && !same_thread_group(q, p)) {
+	for_each_process_ve(q)
+		if (q->mm == mm && !same_thread_group(q, p) &&
+		    !(q->flags & PF_KTHREAD)) {
+			if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+				continue;
+
 			task_lock(q);	/* Protect ->comm from prctl() */
 			pr_err("Kill process %d (%s) sharing same memory\n",
 				task_pid_nr(q), q->comm);
 			task_unlock(q);
 			force_sig(SIGKILL, q);
 		}
- 
-	set_tsk_thread_flag(p, TIF_MEMDIE);
-	force_sig(SIGKILL, p);
 
 	return 0;
 }
 #undef K
 
-static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-			    unsigned int points, unsigned long totalpages,
-			    struct mem_cgroup *mem, nodemask_t *nodemask,
-			    const char *message)
+int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+			    int points, unsigned long totalpages,
+			    struct user_beancounter *ub, struct mem_cgroup *mem,
+			    nodemask_t *nodemask, const char *message)
 {
 	struct task_struct *victim = p;
 	struct task_struct *child;
 	struct task_struct *t = p;
-	unsigned int victim_points = 0;
-
-	if (printk_ratelimit())
-		dump_header(p, gfp_mask, order, mem, nodemask);
+	int victim_points = 0;
+	struct oom_control *oom_ctrl = ub ? &ub->oom_ctrl : &global_oom_ctrl;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -490,11 +615,6 @@ static int oom_kill_process(struct task_
 		return 0;
 	}
 
-	task_lock(p);
-	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
-		message, task_pid_nr(p), p->comm, points);
-	task_unlock(p);
-
 	/*
 	 * If any of p's children has a different mm and is eligible for kill,
 	 * the one with the highest badness() score is sacrificed for its
@@ -503,23 +623,30 @@ static int oom_kill_process(struct task_
 	 */
 	do {
 		list_for_each_entry(child, &t->children, sibling) {
-			unsigned int child_points;
+			int child_points;
 
 			if (child->mm == p->mm)
 				continue;
-			/*
-			 * oom_badness() returns 0 if the thread is unkillable
-			 */
-			child_points = oom_badness(child, mem, nodemask,
-								totalpages);
+			if (oom_unkillable_task(child, ub, mem, nodemask))
+				continue;
+			child_points = oom_badness(child, totalpages, NULL);
 			if (child_points > victim_points) {
 				victim = child;
 				victim_points = child_points;
 			}
 		}
-	} while_each_thread(p, t);
+	} while_each_thread_ve(p, t);
 
-	return oom_kill_task(victim);
+	if (victim != p) {
+		task_lock(p);
+		printk(KERN_ERR "%s: Kill child of process %d (%s) score %d\n",
+				message, task_pid_nr(p), p->comm, points);
+		task_unlock(p);
+	}
+
+	oom_berserker(victim, points, totalpages, oom_ctrl, ub, mem, nodemask);
+
+	return oom_kill_task(victim, oom_ctrl, victim_points, message);
 }
 
 /*
@@ -550,18 +677,18 @@ static void check_panic_on_oom(enum oom_
 void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 {
 	unsigned long limit;
-	unsigned int points = 0;
+	int points = 0;
 	struct task_struct *p;
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
 	limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, limit, mem, NULL);
+	p = select_bad_process(&points, limit, NULL, mem, NULL);
 	if (!p || PTR_ERR(p) == -1UL)
 		goto out;
 
-	if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
+	if (oom_kill_process(p, gfp_mask, 0, points, limit, NULL, mem, NULL,
 				"Memory cgroup out of memory"))
 		goto retry;
 out:
@@ -689,9 +816,10 @@ void out_of_memory(struct zonelist *zone
 	struct task_struct *p;
 	unsigned long totalpages;
 	unsigned long freed = 0;
-	unsigned int points;
+	int points;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 	int killed = 0;
+	struct user_beancounter *ub = NULL;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
@@ -717,40 +845,50 @@ void out_of_memory(struct zonelist *zone
 	mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
 	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
 
+	if (ub_oom_lock(&global_oom_ctrl, gfp_mask))
+		goto skip;
+
 	read_lock(&tasklist_lock);
 	if (sysctl_oom_kill_allocating_task &&
-	    !oom_unkillable_task(current, NULL, nodemask) &&
-	    current->mm && !atomic_read(&current->mm->oom_disable_count)) {
+	    !oom_unkillable_task(current, NULL, NULL, nodemask) &&
+	    current->mm) {
 		/*
 		 * oom_kill_process() needs tasklist_lock held.  If it returns
 		 * non-zero, current could not be killed so we must fallback to
 		 * the tasklist scan.
 		 */
 		if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
-				NULL, nodemask,
+				NULL, NULL, nodemask,
 				"Out of memory (oom_kill_allocating_task)"))
 			goto out;
 	}
 
 retry:
-	p = select_bad_process(&points, totalpages, NULL, mpol_mask);
+	put_beancounter(ub);
+	ub = ub_oom_select_worst();
+	p = select_bad_process(&points, totalpages, ub, NULL, mpol_mask);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
+		if (ub != NULL)
+			goto retry;
+
 		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
 		read_unlock(&tasklist_lock);
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+	if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, NULL,
 				nodemask, "Out of memory"))
 		goto retry;
 	killed = 1;
 out:
+	put_beancounter(ub);
 	read_unlock(&tasklist_lock);
-
+	ub_oom_unlock(&global_oom_ctrl);
+skip:
 	/*
 	 * Give "p" a good chance of killing itself before we
 	 * retry to allocate memory unless "p" is current
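[Editor's note: the oom_kill.c changes above replace the upstream
TIF_MEMDIE/ptrace shortcuts with an "OOM berserker": every kill that lands
within sysctl_oom_relaxation jiffies of the previous one raises oom_rage
(starting from OOM_BASE_RAGE = -10, capped at OOM_MAX_RAGE = 20), and once
the rage is non-negative the kernel reaps up to 2^oom_rage additional young
tasks per invocation. The standalone sketch below models only that
escalation policy; rage_tick and RELAXATION are illustrative names, not
kernel symbols.]

#include <stdio.h>

#define OOM_BASE_RAGE	-10
#define OOM_MAX_RAGE	20
#define RELAXATION	100	/* stand-in for sysctl_oom_relaxation (HZ) */

static int rage = OOM_BASE_RAGE;
static unsigned long last_kill;

/* How many extra tasks the berserker may reap at this invocation. */
static unsigned long rage_tick(unsigned long now)
{
	if (now - last_kill >= RELAXATION)
		rage = OOM_BASE_RAGE;		/* quiet period: calm down */
	else if (rage < OOM_MAX_RAGE)
		rage++;				/* OOMs cluster: escalate */
	last_kill = now;
	return rage < 0 ? 0 : 1UL << rage;	/* 0, then 1, 2, 4, ... */
}

int main(void)
{
	unsigned long t;

	for (t = 0; t <= 750; t += 50)		/* an OOM storm every 50 ticks */
		printf("t=%4lu extra victims allowed: %lu\n", t, rage_tick(t));
	return 0;
}
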
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/page-writeback.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/page-writeback.c
--- linux-2.6.32-504.3.3.el6.orig/mm/page-writeback.c	2014-12-12 23:29:27.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/page-writeback.c	2015-01-21 12:02:58.943809074 +0300
@@ -376,6 +376,41 @@ int bdi_set_max_ratio(struct backing_dev
 }
 EXPORT_SYMBOL(bdi_set_max_ratio);
 
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned min_dirty)
+{
+	int ret = 0;
+
+	spin_lock_bh(&bdi_lock);
+	if (min_dirty > bdi->max_dirty_pages) {
+		ret = -EINVAL;
+	} else {
+		bdi->min_dirty_pages = min_dirty;
+	}
+	spin_unlock_bh(&bdi_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_min_dirty);
+
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned max_dirty)
+{
+	int ret = 0;
+
+	if (max_dirty > num_physpages)
+		return -EINVAL;
+
+	spin_lock_bh(&bdi_lock);
+	if (bdi->min_dirty_pages > max_dirty) {
+		ret = -EINVAL;
+	} else {
+		bdi->max_dirty_pages = max_dirty;
+	}
+	spin_unlock_bh(&bdi_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_dirty);
+
 /*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
@@ -491,6 +526,15 @@ get_dirty_limits(unsigned long *pbackgro
 		*pbdi_dirty = bdi_dirty;
 		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
 		task_dirty_limit(current, pbdi_dirty);
+
+		if (bdi->min_dirty_pages &&
+		    *pbdi_dirty < bdi->min_dirty_pages)
+			*pbdi_dirty = min((unsigned long)bdi->min_dirty_pages,
+					  dirty);
+
+		if (bdi->max_dirty_pages &&
+		    *pbdi_dirty > bdi->max_dirty_pages)
+			*pbdi_dirty = bdi->max_dirty_pages;
 	}
 }
 
@@ -506,11 +550,14 @@ static void balance_dirty_pages(struct a
 {
 	long nr_reclaimable, bdi_nr_reclaimable;
 	long nr_writeback, bdi_nr_writeback;
+	long ub_dirty, ub_writeback;
+	long ub_thresh, ub_background_thresh;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long pause = 1;
+	struct user_beancounter *ub = get_io_ub();
 
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 
@@ -525,6 +572,14 @@ static void balance_dirty_pages(struct a
 		get_dirty_limits(&background_thresh, &dirty_thresh,
 				&bdi_thresh, bdi);
 
+		if (ub_dirty_limits(&ub_background_thresh, &ub_thresh, ub)) {
+			ub_dirty = ub_stat_get(ub, dirty_pages);
+			ub_writeback = ub_stat_get(ub, writeback_pages);
+		} else {
+			ub_dirty = ub_writeback = 0;
+			ub_thresh = ub_background_thresh = LONG_MAX / 2;
+		}
+
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 					global_page_state(NR_UNSTABLE_NFS);
 		nr_writeback = global_page_state(NR_WRITEBACK);
@@ -532,7 +587,21 @@ static void balance_dirty_pages(struct a
 		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
 		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 
-		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+		/*
+		 * Check thresholds, set dirty_exceeded flags and
+		 * start background writeback before throttling.
+		 */
+		if (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) {
+			if (!bdi->dirty_exceeded)
+				bdi->dirty_exceeded = 1;
+			if (!writeback_in_progress(bdi))
+				bdi_start_background_writeback(bdi, NULL);
+		} else if (ub_dirty + ub_writeback > ub_thresh) {
+			if (!test_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags))
+				set_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags);
+			if (!writeback_in_progress(bdi))
+				bdi_start_background_writeback(bdi, ub);
+		} else
 			break;
 
 		/*
@@ -540,13 +609,13 @@ static void balance_dirty_pages(struct a
 		 * catch-up. This avoids (excessively) small writeouts
 		 * when the bdi limits are ramping up.
 		 */
-		if (nr_reclaimable + nr_writeback <
-				(background_thresh + dirty_thresh) / 2)
+		if (bdi_cap_account_writeback(bdi) &&
+		    nr_reclaimable + nr_writeback <
+				(background_thresh + dirty_thresh) / 2 &&
+		    ub_dirty + ub_writeback <
+				(ub_background_thresh + ub_thresh) / 2)
 			break;
 
-		if (!bdi->dirty_exceeded)
-			bdi->dirty_exceeded = 1;
-
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
 		 * filesystems (i.e. NFS) in which data may have been
@@ -563,6 +632,14 @@ static void balance_dirty_pages(struct a
 			trace_wbc_balance_dirty_written(&wbc, bdi);
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
+		} else if (ub_dirty > ub_thresh) {
+			wbc.wb_ub = ub;
+			writeback_inodes_wb(&bdi->wb, &wbc);
+			pages_written += write_chunk - wbc.nr_to_write;
+			trace_wbc_balance_dirty_written(&wbc, bdi);
+			ub_dirty = ub_stat_get(ub, dirty_pages);
+			ub_writeback = ub_stat_get(ub, writeback_pages);
+			wbc.wb_ub = NULL;
 		}
 
 		/*
@@ -583,8 +660,18 @@ static void balance_dirty_pages(struct a
 			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
 
-		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+		/* fix up per-cpu drift in ub stats to avoid false positives */
+		if (ub_dirty + ub_writeback > ub_thresh &&
+		    ub_dirty + ub_writeback - ub_thresh <
+				    UB_STAT_BATCH * num_possible_cpus()) {
+			ub_dirty = ub_stat_get_exact(ub, dirty_pages);
+			ub_writeback = ub_stat_get_exact(ub, writeback_pages);
+		}
+
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh &&
+		    ub_dirty + ub_writeback <= ub_thresh)
 			break;
+
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
@@ -609,6 +696,17 @@ static void balance_dirty_pages(struct a
 			bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
+	if (ub_dirty + ub_writeback < ub_thresh &&
+	    test_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags))
+		clear_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags);
+
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_BALANCE_DIRTY,
+			       (void*)write_chunk);
+
+	/*
+	 * Even if the writeback in progress is filtered for another ub, it
+	 * will still write inodes of this ub, since UB_DIRTY_EXCEEDED is set.
+	 */
 	if (writeback_in_progress(bdi))
 		return;
 
@@ -624,7 +722,10 @@ static void balance_dirty_pages(struct a
 	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
 			       + global_page_state(NR_UNSTABLE_NFS))
 					  > background_thresh)))
-		bdi_start_background_writeback(bdi);
+		bdi_start_background_writeback(bdi, NULL);
+	else if ((laptop_mode && pages_written) ||
+		 (!laptop_mode && ub_dirty > ub_background_thresh))
+		bdi_start_background_writeback(bdi, ub);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -660,7 +761,8 @@ void balance_dirty_pages_ratelimited_nr(
 	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
-	if (mapping->backing_dev_info->dirty_exceeded)
+	if (mapping->backing_dev_info->dirty_exceeded ||
+	    test_bit(UB_DIRTY_EXCEEDED, &get_io_ub()->ub_flags))
 		ratelimit = 8;
 
 	/*
@@ -727,7 +829,7 @@ int dirty_writeback_centisecs_handler(ct
 
 static void do_laptop_sync(struct work_struct *work)
 {
-	wakeup_flusher_threads(0);
+	wakeup_flusher_threads(NULL, 0);
 	kfree(work);
 }
 
@@ -957,6 +1059,8 @@ retry:
 
 			done_index = page->index;
 
+			virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 			lock_page(page);
 
 			/*
@@ -1149,7 +1253,7 @@ void account_page_dirtied(struct page *p
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 		task_dirty_inc(current);
-		task_io_account_write(PAGE_CACHE_SIZE);
+		task_io_account_dirty(PAGE_CACHE_SIZE);
 	}
 }
 
@@ -1185,6 +1289,11 @@ int __set_page_dirty_nobuffers(struct pa
 			account_page_dirtied(page, mapping);
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
+			if (mapping_cap_account_dirty(mapping) &&
+					!radix_tree_prev_tag_get(
+						&mapping->page_tree,
+						PAGECACHE_TAG_DIRTY))
+				ub_io_account_dirty(mapping);
 		}
 		spin_unlock_irq(&mapping->tree_lock);
 		if (mapping->host) {
@@ -1224,6 +1333,8 @@ int set_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
+	ClearPageCheckpointed(page);
+
 	if (likely(mapping)) {
 		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
 		/*
@@ -1251,6 +1362,18 @@ int set_page_dirty(struct page *page)
 }
 EXPORT_SYMBOL(set_page_dirty);
 
+int set_page_dirty_mm(struct page *page, struct mm_struct *mm)
+{
+	struct user_beancounter *old_ub;
+	int ret;
+
+	old_ub = set_exec_ub(mm_ub(mm));
+	ret = set_page_dirty(page);
+	(void)set_exec_ub(old_ub);
+	return ret;
+}
+EXPORT_SYMBOL(set_page_dirty_mm);
+
 /*
  * set_page_dirty() is racy if the caller has no reference against
  * page->mapping->host, and if the page is unlocked.  This is because another
@@ -1358,6 +1481,9 @@ int test_clear_page_writeback(struct pag
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
 			if (bdi_cap_account_writeback(bdi)) {
+				if (radix_tree_prev_tag_get(&mapping->page_tree,
+							PAGECACHE_TAG_WRITEBACK))
+					ub_io_writeback_dec(mapping);
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
 				__bdi_writeout_inc(bdi);
 			}
@@ -1386,13 +1512,23 @@ int test_set_page_writeback(struct page 
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_account_writeback(bdi))
+			if (bdi_cap_account_writeback(bdi)) {
+				if (!radix_tree_prev_tag_get(&mapping->page_tree,
+							PAGECACHE_TAG_WRITEBACK))
+					ub_io_writeback_inc(mapping);
 				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+			}
 		}
-		if (!PageDirty(page))
+		if (!PageDirty(page)) {
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+			if (mapping_cap_account_dirty(mapping) &&
+					radix_tree_prev_tag_get(
+						&mapping->page_tree,
+						PAGECACHE_TAG_DIRTY))
+				ub_io_account_clean(mapping);
+		}
 		radix_tree_tag_clear(&mapping->page_tree,
 				     page_index(page),
 				     PAGECACHE_TAG_TOWRITE);
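[Editor's note: the page-writeback.c hunks above make balance_dirty_pages()
throttle against two limits at once: the per-bdi threshold and a
per-beancounter (container) threshold obtained from ub_dirty_limits(); a
beancounter without dirty limits is given LONG_MAX / 2 so its check can
never fire. A minimal model of the combined exit condition, with
illustrative names:]

#include <limits.h>
#include <stdbool.h>

struct dirty_state {
	long bdi_dirty, bdi_thresh;	/* per backing device */
	long ub_dirty, ub_thresh;	/* per beancounter (container) */
};

/* A beancounter without dirty limits gets a threshold it can never hit. */
static void ub_unlimited(struct dirty_state *s)
{
	s->ub_dirty = 0;
	s->ub_thresh = LONG_MAX / 2;
}

/* The writer may leave the throttling loop only when BOTH limits hold. */
static bool may_stop_throttling(const struct dirty_state *s)
{
	return s->bdi_dirty <= s->bdi_thresh &&
	       s->ub_dirty <= s->ub_thresh;
}
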
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/page_alloc.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/page_alloc.c
--- linux-2.6.32-504.3.3.el6.orig/mm/page_alloc.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/page_alloc.c	2015-01-21 12:02:58.901810190 +0300
@@ -16,6 +16,7 @@
 
 #include <linux/stddef.h>
 #include <linux/mm.h>
+#include <linux/mmgang.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
@@ -57,6 +58,10 @@
 #include <asm/div64.h>
 #include "internal.h"
 
+#include <bc/kmem.h>
+#include <bc/io_acct.h>
+#include <bc/oom_kill.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/kmem.h>
 
@@ -414,6 +419,20 @@ static inline void prep_zero_page(struct
 		clear_highpage(page + i);
 }
 
+#define PAGE_PCP_MAPCOUNT_VALUE		(-256)
+
+static inline void set_page_pcp(struct page *page)
+{
+	VM_BUG_ON(atomic_read(&page->_mapcount) != -1);
+	atomic_set(&page->_mapcount, PAGE_PCP_MAPCOUNT_VALUE);
+}
+
+static inline void rmv_page_pcp(struct page *page)
+{
+	VM_BUG_ON(atomic_read(&page->_mapcount) != PAGE_PCP_MAPCOUNT_VALUE);
+	atomic_set(&page->_mapcount, -1);
+}
+
 static inline void set_page_order(struct page *page, int order)
 {
 	set_page_private(page, order);
@@ -544,6 +563,7 @@ static inline void __free_one_page(struc
 		order++;
 	}
 	set_page_order(page, order);
+	gang_add_free_page(page);
 
 	/*
 	 * If this is not the largest possible page, check if the buddy
@@ -591,6 +611,12 @@ static inline int free_pages_check(struc
 		bad_page(page);
 		return 1;
 	}
+#ifdef CONFIG_BEANCOUNTERS
+	if (unlikely(page->kmem_ub)) {
+		if (WARN_ON(page->kmem_ub->ub_magic != UB_MAGIC))
+			return 1;
+	}
+#endif
 	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
 		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return 0;
@@ -616,7 +642,7 @@ static void free_pcppages_bulk(struct zo
 
 	spin_lock(&zone->lock);
 	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
-	zone->pages_scanned = 0;
+	atomic_long_set(&zone->pages_scanned, 0);
 
 	while (count) {
 		struct page *page;
@@ -640,6 +666,7 @@ static void free_pcppages_bulk(struct zo
 			page = list_entry(list->prev, struct page, lru);
 			/* must delete as __free_one_page list manipulates */
 			list_del(&page->lru);
+			rmv_page_pcp(page);
 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
 			__free_one_page(page, zone, 0, page_private(page));
 			trace_mm_page_pcpu_drain(page, 0, page_private(page));
@@ -656,7 +683,7 @@ static void free_one_page(struct zone *z
 {
 	spin_lock(&zone->lock);
 	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
-	zone->pages_scanned = 0;
+	atomic_long_set(&zone->pages_scanned, 0);
 
 	__free_one_page(page, zone, order, migratetype);
 	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
@@ -689,6 +716,7 @@ static void __free_pages_ok(struct page 
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
+	ub_page_uncharge(page, order);
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
@@ -752,6 +780,7 @@ static inline void expand(struct zone *z
 		high--;
 		size >>= 1;
 		VM_BUG_ON(bad_range(zone, &page[size]));
+		gang_add_free_page(&page[size]);
 		list_add(&page[size].lru, &area->free_list[migratetype]);
 		area->nr_free++;
 		set_page_order(&page[size], high);
@@ -1052,6 +1081,7 @@ static int rmqueue_bulk(struct zone *zon
 			list_add(&page->lru, list);
 		else
 			list_add_tail(&page->lru, list);
+		set_page_pcp(page);
 		set_page_private(page, migratetype);
 		list = &page->lru;
 	}
@@ -1193,6 +1223,7 @@ static void free_hot_cold_page(struct pa
 	pcp = &zone_pcp(zone, get_cpu())->pcp;
 	migratetype = get_pageblock_migratetype(page);
 	set_page_private(page, migratetype);
+	ub_page_uncharge(page, 0);
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
@@ -1213,10 +1244,12 @@ static void free_hot_cold_page(struct pa
 		migratetype = MIGRATE_MOVABLE;
 	}
 
+	gang_add_free_page(page);
 	if (cold)
 		list_add_tail(&page->lru, &pcp->lists[migratetype]);
 	else
 		list_add(&page->lru, &pcp->lists[migratetype]);
+	set_page_pcp(page);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
 		free_pcppages_bulk(zone, pcp->batch, pcp);
@@ -1235,6 +1268,19 @@ void free_hot_page(struct page *page)
 }
 	
 /*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, list, lru) {
+		trace_mm_pagevec_free(page, cold);
+		free_hot_cold_page(page, cold);
+	}
+}
+
+/*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
  * Each sub-page must be freed individually.
@@ -1346,6 +1392,7 @@ again:
 			page = list_entry(list->next, struct page, lru);
 
 		list_del(&page->lru);
+		rmv_page_pcp(page);
 		pcp->count--;
 	} else {
 		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
@@ -1992,6 +2039,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_m
 	struct task_struct *p = current;
 	bool drained = false;
 
+	ub_oom_start(&global_oom_ctrl);
+
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
@@ -2113,6 +2162,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	return alloc_flags;
 }
 
+int alloc_fail_warn;
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2161,6 +2212,8 @@ restart:
 	 * to how we want to proceed.
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
+	if (!sysctl_strict_mem_cpuset)
+		alloc_flags &= ~ALLOC_CPUSET;
 
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
@@ -2280,7 +2333,7 @@ rebalance:
 	}
 
 nopage:
-	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+	if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
 		unsigned int filter = SHOW_MEM_FILTER_NODES;
 
 		/*
@@ -2309,6 +2362,36 @@ got_pg:
 
 }
 
+static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
+		struct page *page, u64 time)
+{
+#ifdef CONFIG_VE
+	unsigned long flags;
+	int ind, cpu;
+
+	time = jiffies_to_usecs(jiffies - time) * 1000;
+	if (!(gfp_mask & __GFP_WAIT))
+		ind = KSTAT_ALLOCSTAT_ATOMIC;
+	else if (!(gfp_mask & __GFP_HIGHMEM))
+		if (order > 0)
+			ind = KSTAT_ALLOCSTAT_LOW_MP;
+		else
+			ind = KSTAT_ALLOCSTAT_LOW;
+	else
+		if (order > 0)
+			ind = KSTAT_ALLOCSTAT_HIGH_MP;
+		else
+			ind = KSTAT_ALLOCSTAT_HIGH;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	KSTAT_LAT_PCPU_ADD(&kstat_glob.alloc_lat[ind], cpu, time);
+	if (!page)
+		kstat_glob.alloc_fails[cpu][ind]++;
+	local_irq_restore(flags);
+#endif
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -2320,12 +2403,14 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
 	struct zone *preferred_zone;
 	struct page *page;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
+	cycles_t start;
 
 	gfp_mask &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(gfp_mask);
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
+	WARN_ON((gfp_mask & __GFP_FS) && current->journal_info);
 
 	if (should_fail_alloc_page(gfp_mask, order))
 		return NULL;
@@ -2346,6 +2431,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
 		return NULL;
 	}
 
+	start = jiffies;
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
@@ -2356,6 +2442,14 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
 				preferred_zone, migratetype);
 	put_mems_allowed();
 
+	__alloc_collect_stats(gfp_mask, order, page, start);
+
+	if (page && (gfp_mask & __GFP_UBC) &&
+		ub_page_charge(page, order, get_exec_ub(), gfp_mask)) {
+		__free_pages(page, order);
+		page = NULL;
+	}
+
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 	return page;
 }
@@ -2627,7 +2721,7 @@ void __show_free_areas(unsigned int filt
 	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
 		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
 		" unevictable:%lu"
-		" dirty:%lu writeback:%lu unstable:%lu\n"
+		" dirty:%lu writeback:%lu wbtmp:%lu unstable:%lu\n"
 		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
 		global_page_state(NR_ACTIVE_ANON),
@@ -2639,6 +2733,7 @@ void __show_free_areas(unsigned int filt
 		global_page_state(NR_UNEVICTABLE),
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
+		global_page_state(NR_WRITEBACK_TEMP),
 		global_page_state(NR_UNSTABLE_NFS),
 		global_page_state(NR_FREE_PAGES),
 		global_page_state(NR_SLAB_RECLAIMABLE),
@@ -2648,6 +2743,34 @@ void __show_free_areas(unsigned int filt
 		global_page_state(NR_PAGETABLE),
 		global_page_state(NR_BOUNCE));
 
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	printk(
+#ifdef CONFIG_ZONE_DMA
+		"pgscan_dma: %lu %lu "
+#endif
+#ifdef CONFIG_ZONE_DMA32
+		"pgscan_dma32: %lu %lu "
+#endif
+		"pgscan_normal: %lu %lu "
+#ifdef CONFIG_HIGHMEM
+		"pgscan_high: %lu %lu "
+#endif
+		"pgscan_movable: %lu %lu "
+		"slabs_scanned: %lu\n",
+#ifdef CONFIG_ZONE_DMA
+		vm_events(PGSCAN_DIRECT_DMA), vm_events(PGSCAN_KSWAPD_DMA),
+#endif
+#ifdef CONFIG_ZONE_DMA32
+		vm_events(PGSCAN_DIRECT_DMA32), vm_events(PGSCAN_KSWAPD_DMA32),
+#endif
+		vm_events(PGSCAN_DIRECT_NORMAL), vm_events(PGSCAN_KSWAPD_NORMAL),
+#ifdef CONFIG_HIGHMEM
+		vm_events(PGSCAN_DIRECT_HIGH), vm_events(PGSCAN_KSWAPD_HIGH),
+#endif
+		vm_events(PGSCAN_DIRECT_MOVABLE), vm_events(PGSCAN_KSWAPD_MOVABLE),
+		vm_events(SLABS_SCANNED));
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
 	for_each_populated_zone(zone) {
 		int i;
 
@@ -2708,7 +2831,7 @@ void __show_free_areas(unsigned int filt
 			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
 			K(zone_page_state(zone, NR_BOUNCE)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
-			zone->pages_scanned,
+			atomic_long_read(&zone->pages_scanned),
 			(zone_is_all_unreclaimable(zone) ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
@@ -4316,11 +4439,16 @@ static void __paginginit free_area_init_
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	pgdat->kswapd_max_order = 0;
 	pgdat_page_cgroup_init(pgdat);
-	
+
+#ifdef CONFIG_MEMORY_GANGS
+	init_gang_set.gangs[nid] = pgdat->init_gangs;
+#endif
+
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
+		struct gang *gang;
 		unsigned long size, realsize, memmap_pages;
-		enum lru_list l;
+		int __maybe_unused i;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
 		realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4365,21 +4493,38 @@ static void __paginginit free_area_init_
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
-		spin_lock_init(&zone->lru_lock);
 		zone_seqlock_init(zone);
 		zone->zone_pgdat = pgdat;
 
-		zone->prev_priority = DEF_PRIORITY;
+		gang = zone_init_gang(zone);
+		setup_zone_gang(&init_gang_set, zone, gang);
+
+#ifdef CONFIG_MEMORY_GANGS
+		zone->eldest_timestamp = jiffies;
+		spin_lock_init(&zone->gangs_lock);
+		zone->nr_gangs = 0;
+		INIT_LIST_HEAD(&zone->gangs);
+		for (i = 0; i < NR_VMSCAN_PRIORITIES; i++) {
+			INIT_LIST_HEAD(zone->vmscan_prio + i);
+			zone->vmscan_iter[i] = zone->vmscan_prio + i;
+		}
+
+		add_zone_gang(zone, gang);
+
+		gang->shadow = pgdat->init_shadow_gangs + j;
+		gang = gang_to_shadow_gang(gang);
+		setup_zone_gang(&init_gang_set, zone, gang);
+		__set_bit(GANG_IN_SHADOW, &gang->flags);
+		add_zone_gang(zone, gang);
+
+		gang = zone_junk_gang(zone);
+		setup_zone_gang(&init_gang_set, zone, gang);
+		__set_bit(GANG_IN_SHADOW, &gang->flags);
+		__set_bit(GANG_OF_JUNK, &gang->flags);
+		add_zone_gang(zone, gang);
+#endif /* CONFIG_MEMORY_GANGS */
 
 		zone_pcp_init(zone);
-		for_each_lru(l) {
-			INIT_LIST_HEAD(&zone->lruvec.lists[l]);
-			zone->reclaim_stat.nr_saved_scan[l] = 0;
-		}
-		zone->reclaim_stat.recent_rotated[0] = 0;
-		zone->reclaim_stat.recent_rotated[1] = 0;
-		zone->reclaim_stat.recent_scanned[0] = 0;
-		zone->reclaim_stat.recent_scanned[1] = 0;
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
 		if (!size)
@@ -5171,49 +5316,6 @@ void setup_per_zone_wmarks(void)
 }
 
 /*
- * The inactive anon list should be small enough that the VM never has to
- * do too much work, but large enough that each inactive page has a chance
- * to be referenced again before it is swapped out.
- *
- * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
- * INACTIVE_ANON pages on this zone's LRU, maintained by the
- * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
- * the anonymous pages are kept on the inactive list.
- *
- * total     target    max
- * memory    ratio     inactive anon
- * -------------------------------------
- *   10MB       1         5MB
- *  100MB       1        50MB
- *    1GB       3       250MB
- *   10GB      10       0.9GB
- *  100GB      31         3GB
- *    1TB     101        10GB
- *   10TB     320        32GB
- */
-void calculate_zone_inactive_ratio(struct zone *zone)
-{
-	unsigned int gb, ratio;
-
-	/* Zone size in gigabytes */
-	gb = zone->present_pages >> (30 - PAGE_SHIFT);
-	if (gb)
-		ratio = int_sqrt(10 * gb);
-	else
-		ratio = 1;
-
-	zone->inactive_ratio = ratio;
-}
-
-static void __init setup_per_zone_inactive_ratio(void)
-{
-	struct zone *zone;
-
-	for_each_zone(zone)
-		calculate_zone_inactive_ratio(zone);
-}
-
-/*
  * Initialise min_free_kbytes.
  *
  * For small machines we want it small (128k min).  For large machines
@@ -5251,7 +5353,6 @@ static int __init init_per_zone_wmark_mi
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();
-	setup_per_zone_inactive_ratio();
 	return 0;
 }
 module_init(init_per_zone_wmark_min)
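[Editor's note: among the page_alloc.c changes above, pages parked on
per-cpu (pcp) free lists are tagged by storing PAGE_PCP_MAPCOUNT_VALUE
(-256) in page->_mapcount, which holds -1 for an ordinary free page; the
VM_BUG_ON checks assert both transitions. A userspace model of the tagging
protocol, with a plain int standing in for the kernel's atomic _mapcount:]

#include <assert.h>

#define MAPCOUNT_FREE	(-1)	/* a page owned by the buddy allocator */
#define MAPCOUNT_PCP	(-256)	/* cf. PAGE_PCP_MAPCOUNT_VALUE above */

struct fake_page { int mapcount; };

static void set_page_pcp(struct fake_page *page)
{
	assert(page->mapcount == MAPCOUNT_FREE); /* only free pages go to pcp */
	page->mapcount = MAPCOUNT_PCP;
}

static void rmv_page_pcp(struct fake_page *page)
{
	assert(page->mapcount == MAPCOUNT_PCP);  /* must come off a pcp list */
	page->mapcount = MAPCOUNT_FREE;
}
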
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/page_io.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/page_io.c
--- linux-2.6.32-504.3.3.el6.orig/mm/page_io.c	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/page_io.c	2015-01-21 12:02:58.686815895 +0300
@@ -11,6 +11,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/mmgang.h>
 #include <linux/kernel_stat.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
@@ -19,12 +20,19 @@
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
+static struct bio_set *swap_bio_set;
+
+static void swap_bio_destructor(struct bio *bio)
+{
+	bio_free(bio, swap_bio_set);
+}
+
 static struct bio *get_swap_bio(gfp_t gfp_flags,
 				struct page *page, bio_end_io_t end_io)
 {
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_flags, 1);
+	bio = bio_alloc_bioset(gfp_flags, 1, swap_bio_set);
 	if (bio) {
 		swp_entry_t entry;
 		entry.val = page_private(page);
@@ -37,6 +45,7 @@ static struct bio *get_swap_bio(gfp_t gf
 		bio->bi_idx = 0;
 		bio->bi_size = PAGE_SIZE;
 		bio->bi_end_io = end_io;
+		bio->bi_destructor = swap_bio_destructor;
 	}
 	return bio;
 }
@@ -109,12 +118,14 @@ int swap_writepage(struct page *page, st
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 	count_vm_event(PSWPOUT);
+	ub_percpu_inc(get_gang_ub(page_gang(page)), swapout);
 	set_page_writeback(page);
 	unlock_page(page);
 	submit_bio(rw, bio);
 out:
 	return ret;
 }
+EXPORT_SYMBOL(swap_writepage);
 
 int swap_readpage(struct page *page)
 {
@@ -130,7 +141,17 @@ int swap_readpage(struct page *page)
 		goto out;
 	}
 	count_vm_event(PSWPIN);
+	ub_percpu_inc(get_gang_ub(page_gang(page)), swapin);
 	submit_bio(READ, bio);
 out:
 	return ret;
 }
+
+static int __init swap_init(void)
+{
+	swap_bio_set = bioset_create(SWAP_CLUSTER_MAX, 0);
+	if (!swap_bio_set)
+		panic("can't allocate swap_bio_set\n");
+	return 0;
+}
+late_initcall(swap_init);
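[Editor's note: page_io.c above switches swap bios to a private bio_set
created with bioset_create(SWAP_CLUSTER_MAX, 0), presumably so that
swap-out, which runs exactly when memory is scarce, never has to compete
with ordinary filesystem I/O for bios from the shared pool. The sketch
below illustrates only the general reserved-pool idea; it is not the
kernel's bioset/mempool API:]

#define POOL_MIN 32	/* cf. SWAP_CLUSTER_MAX used for swap_bio_set */

struct pool {
	void *reserve[POOL_MIN];	/* objects set aside at init time */
	int nfree;
};

static void *pool_alloc(struct pool *p, void *(*slow_alloc)(void))
{
	void *obj = slow_alloc();	/* try the regular allocator first */

	if (!obj && p->nfree > 0)
		obj = p->reserve[--p->nfree];	/* guaranteed fallback */
	return obj;
}
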
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/percpu.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/percpu.c
--- linux-2.6.32-504.3.3.el6.orig/mm/percpu.c	2014-12-12 23:29:32.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/percpu.c	2015-01-21 12:02:58.030833308 +0300
@@ -833,7 +833,7 @@ void __percpu *__alloc_percpu(size_t siz
 {
 	return pcpu_alloc(size, align, false);
 }
-EXPORT_SYMBOL_GPL(__alloc_percpu);
+EXPORT_SYMBOL(__alloc_percpu);
 
 /**
  * __alloc_reserved_percpu - allocate reserved percpu area
@@ -934,7 +934,7 @@ void free_percpu(void __percpu *ptr)
 
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 }
-EXPORT_SYMBOL_GPL(free_percpu);
+EXPORT_SYMBOL(free_percpu);
 
 /**
  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
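[Editor's note: the percpu.c hunks only relax the exports of
__alloc_percpu and free_percpu from EXPORT_SYMBOL_GPL to EXPORT_SYMBOL, so
that modules not declaring a GPL-compatible license can link against them.
A minimal hypothetical out-of-tree module relying on the relaxed export:]

#include <linux/module.h>
#include <linux/percpu.h>

static int __percpu *counter;

static int __init percpu_demo_init(void)
{
	counter = alloc_percpu(int);	/* now reachable from non-GPL code */
	return counter ? 0 : -ENOMEM;
}

static void __exit percpu_demo_exit(void)
{
	free_percpu(counter);
}

module_init(percpu_demo_init);
module_exit(percpu_demo_exit);
MODULE_LICENSE("Proprietary");	/* could not use _GPL-only exports */
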
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/pram.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/pram.c
--- linux-2.6.32-504.3.3.el6.orig/mm/pram.c	2015-01-21 12:02:52.618976961 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/pram.c	2015-01-21 12:02:58.686815895 +0300
@@ -0,0 +1,1774 @@
+#include <linux/bootmem.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/crc32.h>
+#include <linux/crc32c.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/mmgang.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/page-flags.h>
+#include <linux/percpu.h>
+#include <linux/pfn.h>
+#include <linux/pram.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <asm/cpufeature.h>
+
+#define PRAM_MAGIC		0x7072616D
+#define PRAM_MAGIC_INCOMPLETE	(PRAM_MAGIC+1)
+#define PRAM_MAGIC_V2		(PRAM_MAGIC+2)
+
+#define PRAM_MAGIC_OK(magic) \
+	((magic) == PRAM_MAGIC || \
+	 (magic) == PRAM_MAGIC_V2)
+
+#define PRAM_NAME_MAX		512	/* including nul */
+
+/*
+ * Since we currently support only x86_64, don't worry
+ * about endianness or highmem.
+ */
+struct pram_chain {
+	__u32			magic;
+	__u32			csum;
+	__u64			chain_pfn;
+	__u64			link_pfn;
+	__u32			last_link_sz;
+	__u32			last_page_sz;
+	__u8			name[PRAM_NAME_MAX];
+
+	/* v2 fields */
+	__u32			csum_mode;
+};
+
+typedef __u32 (*csum_func_t)(const void *);
+
+struct pram_csum_mode {
+	int id;
+	char *name;
+	csum_func_t func;
+};
+
+#define PRAM_CSUM_NAME_MAX	16
+
+static const struct pram_csum_mode csum_mode_none;
+static const struct pram_csum_mode csum_mode_crc32c;
+static const struct pram_csum_mode csum_mode_xor;
+
+static const struct pram_csum_mode * const csum_mode_list[] = {
+#define PRAM_CSUM_NONE		0
+	&csum_mode_none,
+#define PRAM_CSUM_CRC32C	1
+	&csum_mode_crc32c,
+#define PRAM_CSUM_XOR		2
+	&csum_mode_xor,
+#define NR_PRAM_CSUM_MODES	3
+	NULL,
+};
+
+static const struct pram_csum_mode *csum_mode;
+
+#define PRAM_STATE_SAVE		1
+#define PRAM_STATE_LOAD		2
+
+#define PRAM_CHAIN_STATE(chain) \
+	page_private(virt_to_page(chain))
+#define PRAM_SET_CHAIN_STATE(chain, state) \
+	set_page_private(virt_to_page(chain), state)
+#define PRAM_CHAIN_BUSY(chain) \
+	(PRAM_CHAIN_STATE(chain) != 0)
+
+struct pram_page {
+	__u32			csum;
+	__u64			pfn;
+};
+
+struct pram_link {
+	__u32			magic;
+	__u32			csum;
+	__u64			link_pfn;
+	struct pram_page	page[0];
+};
+
+#define PRAM_LINK_CAPACITY \
+	((PAGE_SIZE - sizeof(struct pram_link)) / sizeof(struct pram_page))
+
+static int pram_reservation;
+unsigned long pram_reserved_pages;
+
+#define DEFAULT_PRAM_LOW	(16UL << 20) /* 16Mb */
+unsigned long long pram_low = DEFAULT_PRAM_LOW;
+
+struct banned_region {
+	unsigned long start, end;
+};
+
+#define MAX_NR_BANNED		(32 + MAX_NUMNODES * 2)
+
+static int nr_banned = 1;
+static struct banned_region banned[MAX_NR_BANNED] = {
+	{ .start = 0, .end = PFN_UP(DEFAULT_PRAM_LOW) - 1 },
+};
+
+static unsigned long total_banned;
+
+/* list of allocated pages that can't be used as pram;
+ * shrunk when memory is low */
+static unsigned long nr_banned_pages;
+static LIST_HEAD(banned_pages);
+static DEFINE_SPINLOCK(banned_pages_lock);
+
+/* pool of free pages available for pram;
+ * allocated by sysctl pram_prealloc */
+static unsigned long page_pool_size;
+static LIST_HEAD(page_pool);
+static DEFINE_SPINLOCK(page_pool_lock);
+
+struct pram_prealloc_struct {
+	int nr_pages;
+	struct list_head pages;
+};
+static DEFINE_PER_CPU(struct pram_prealloc_struct, pram_preallocs);
+
+#define MAX_PREALLOC_SIZE	4
+
+static unsigned long pram_pfn;	/* points to first pram chain */
+static LIST_HEAD(pram_list);	/* list of chains linked through page->lru */
+
+static DEFINE_MUTEX(pram_mutex);
+
+static int __init parse_pram(char *arg)
+{
+	if (!arg)
+		return 0;
+	return strict_strtoul(arg, 16, &pram_pfn);
+}
+early_param("pram", parse_pram);
+
+static int __init parse_pram_low(char *arg)
+{
+	char *endptr;
+	unsigned long long val;
+
+	if (!arg)
+		return 0;
+	val = memparse(arg, &endptr);
+	if (*endptr != '\0')
+		return -EINVAL;
+	if (val > 0) {
+		pram_low = val;
+		banned[0].end = PFN_UP(val) - 1;
+	}
+	return 0;
+}
+early_param("pram_low", parse_pram_low);
+
+static __u32 csum_none_func(const void *p)
+{
+	return 0;
+}
+
+static const struct pram_csum_mode csum_mode_none = {
+	.id = PRAM_CSUM_NONE,
+	.name = "none",
+	.func = csum_none_func,
+};
+
+static __u32 csum_crc32c_func(const void *p)
+{
+	return crc32c(~0, p, PAGE_SIZE);
+}
+
+static const struct pram_csum_mode csum_mode_crc32c = {
+	.id = PRAM_CSUM_CRC32C,
+	.name = "crc32c",
+	.func = csum_crc32c_func,
+};
+
+static __u32 csum_xor_func(const void *p)
+{
+	int idx = PAGE_SIZE / 4;
+	const __u32 *cur = p;
+	__u32 sum = 0;
+
+	while (idx--)
+		sum ^= *cur++;
+	return sum;
+}
+
+static const struct pram_csum_mode csum_mode_xor = {
+	.id = PRAM_CSUM_XOR,
+	.name = "xor",
+	.func = csum_xor_func,
+};
+
+static inline const struct pram_csum_mode *pram_get_csum_mode(void)
+{
+	return csum_mode;
+}
+
+static void pram_set_csum_mode(const struct pram_csum_mode *m)
+{
+	if (csum_mode != m) {
+		csum_mode = m;
+		printk(KERN_INFO "PRAM: selected csum mode: %s\n", m->name);
+	}
+}
+
+static void __init pram_select_csum_mode(void)
+{
+	const struct pram_csum_mode *m;
+
+	if (cpu_has_xmm4_2)
+		m = &csum_mode_crc32c;
+	else
+		m = &csum_mode_xor;
+	pram_set_csum_mode(m);
+}
+
+/* SSE-4.2 crc32c is faster than crc32, but not available at early boot */
+static inline __u32 pram_meta_csum(const void *p)
+{
+	/* skip magic and csum fields */
+	return crc32(~0, (char *)p + 8, PAGE_SIZE - 8);
+}
+
+static void pram_list_add(struct pram_chain *chain)
+{
+	struct page *page, *head_page;
+	struct pram_chain *head;
+
+	BUG_ON(!pram_pfn);
+	head_page = pfn_to_page(pram_pfn);
+	head = page_address(head_page);
+
+	page = virt_to_page(chain);
+	BUG_ON(page == head_page);
+
+	chain->chain_pfn = head->chain_pfn;
+	head->chain_pfn = page_to_pfn(page);
+	head->csum = pram_meta_csum(head);
+	list_add(&page->lru, &pram_list);
+}
+
+static void pram_list_del(struct pram_chain *chain)
+{
+	struct page *page, *prev_page;
+	struct pram_chain *prev;
+
+	BUG_ON(!pram_pfn);
+
+	page = virt_to_page(chain);
+	BUG_ON(pram_pfn == page_to_pfn(page));
+
+	prev_page = page->lru.prev == &pram_list ?
+		pfn_to_page(pram_pfn) :
+		list_entry(page->lru.prev, struct page, lru);
+	prev = page_address(prev_page);
+
+	BUG_ON(prev->chain_pfn != page_to_pfn(page));
+	prev->chain_pfn = chain->chain_pfn;
+	if (PRAM_CHAIN_STATE(prev) != PRAM_STATE_SAVE)
+		prev->csum = pram_meta_csum(prev);
+	list_del_init(&page->lru);
+}
+
+static void pram_init_list_head(void)
+{
+	struct pram_chain *head;
+
+	BUG_ON(!pram_pfn);
+	head = pfn_to_kaddr(pram_pfn);
+
+	memset(head, 0, PAGE_SIZE);
+	head->magic = PRAM_MAGIC_V2;
+	head->csum = pram_meta_csum(head);
+}
+
+static struct page *pram_alloc_page(gfp_t gfpmask);
+static void __banned_pages_shrink(int nr_to_scan);
+
+static __init int pram_build_list(void)
+{
+	unsigned long pfn;
+	struct page *page;
+	struct pram_chain *chain;
+
+	if (!pram_pfn) {
+		/* allocate pram list head */
+		page = pram_alloc_page(GFP_KERNEL);
+		if (!page) {
+			__banned_pages_shrink(INT_MAX);
+			return -ENOMEM;
+		}
+		pram_pfn = page_to_pfn(page);
+		pram_init_list_head();
+	}
+
+	for (pfn = pram_pfn; pfn; pfn = chain->chain_pfn) {
+		page = pfn_to_page(pfn);
+		chain = page_address(page);
+		if (pfn != pram_pfn)
+			list_add_tail(&page->lru, &pram_list);
+	}
+
+	return 0;
+}
+
+static struct pram_chain *pram_find_chain(const char *name)
+{
+	struct page *page;
+	struct pram_chain *chain;
+
+	if (strlen(name) >= PRAM_NAME_MAX)
+		return NULL;
+
+	list_for_each_entry(page, &pram_list, lru) {
+		chain = page_address(page);
+		if (strcmp(chain->name, name) == 0)
+			return chain;
+	}
+	return NULL;
+}
+
+static __init int pram_check_reserve(unsigned long pfn)
+{
+	if (pfn > max_pfn) {
+		printk(KERN_ERR "  pfn:%lx invalid\n", pfn);
+		return 0;
+	}
+	if (reserve_bootmem(PFN_PHYS(pfn), PAGE_SIZE, BOOTMEM_EXCLUSIVE) != 0) {
+		printk(KERN_ERR "  pfn:%lx busy\n", pfn);
+		return 0;
+	}
+	return 1;
+}
+
+static __init void pram_free_reserved(unsigned long pfn)
+{
+	free_bootmem(PFN_PHYS(pfn), PAGE_SIZE);
+}
+
+static __init int pram_check_meta(unsigned long pfn)
+{
+	__u32 *map = pfn_to_kaddr(pfn);
+
+	if (!PRAM_MAGIC_OK(map[0])) {
+		printk(KERN_ERR "  pfn:%lx corrupted: wrong magic%s\n", pfn,
+		       map[0] == PRAM_MAGIC_INCOMPLETE ?
+		       " (stream was not closed?)" : "");
+		return 0;
+	}
+	if (map[1] != pram_meta_csum(map)) {
+		printk(KERN_ERR "  pfn:%lx corrupted: wrong checksum\n", pfn);
+		return 0;
+	}
+	return 1;
+}
+
+static __init void pram_version_fixup(struct pram_chain *chain)
+{
+	if (chain->magic == PRAM_MAGIC)
+		chain->csum_mode = PRAM_CSUM_CRC32C;
+}
+
+void __init pram_reserve(void)
+{
+	int i;
+	unsigned long chain_pfn, link_pfn;
+	__u64 first_chain_pfn, *chain_ppfn, *link_ppfn;
+	struct pram_chain *chain;
+	struct pram_link *link;
+	long nr_bad, nr_reserved;
+
+	if (!pram_pfn)
+		return;
+
+	printk(KERN_INFO "PRAM: examine persistent memory...\n");
+
+	pram_reservation = 1;
+	nr_bad = nr_reserved = 0;
+	first_chain_pfn = pram_pfn;
+	for (chain_ppfn = &first_chain_pfn;
+	     *chain_ppfn; chain_ppfn = &chain->chain_pfn) {
+		chain_pfn = *chain_ppfn;
+		chain = pfn_to_kaddr(chain_pfn);
+
+		if (!pram_check_reserve(chain_pfn))
+			goto bad_chain;
+		if (!pram_check_meta(chain_pfn)) {
+			pram_free_reserved(chain_pfn);
+			goto bad_chain;
+		}
+		nr_reserved++;
+
+		pram_version_fixup(chain);
+
+		for (link_ppfn = &chain->link_pfn;
+		     *link_ppfn; link_ppfn = &link->link_pfn) {
+			link_pfn = *link_ppfn;
+			if (!pram_check_reserve(link_pfn))
+				goto bad_link;
+			if (!pram_check_meta(link_pfn)) {
+				pram_free_reserved(link_pfn);
+				goto bad_link;
+			}
+			nr_reserved++;
+
+			link = pfn_to_kaddr(link_pfn);
+			for (i = 0; i < PRAM_LINK_CAPACITY &&
+			     link->page[i].pfn; i++) {
+				if (!pram_check_reserve(link->page[i].pfn)) {
+					link->page[i].pfn = 0;
+					nr_bad++;
+					continue;
+				}
+				nr_reserved++;
+			}
+			continue;
+bad_link:
+			*link_ppfn = 0;
+			nr_bad++;
+			break;
+		}
+		continue;
+bad_chain:
+		*chain_ppfn = 0;
+		nr_bad++;
+		printk("  chain \"%.64s\" corrupted\n", chain->name);
+		break;
+	}
+	pram_pfn = first_chain_pfn;
+	pram_reservation = 0;
+
+	if (!nr_bad) {
+		printk(KERN_INFO "PRAM: %ld pages reserved\n", nr_reserved);
+		pram_reserved_pages = nr_reserved;
+		return;
+	}
+
+	printk(KERN_ERR "PRAM: reservation FAILED: %ld pages corrupted\n",
+	       nr_bad);
+
+	for (chain_pfn = pram_pfn; chain_pfn; chain_pfn = chain->chain_pfn) {
+		chain = pfn_to_kaddr(chain_pfn);
+		for (link_pfn = chain->link_pfn;
+		     link_pfn; link_pfn = link->link_pfn) {
+			link = pfn_to_kaddr(link_pfn);
+			for (i = 0; i < PRAM_LINK_CAPACITY; i++) {
+				if (link->page[i].pfn)
+					pram_free_reserved(link->page[i].pfn);
+			}
+			pram_free_reserved(link_pfn);
+		}
+		pram_free_reserved(chain_pfn);
+	}
+	pram_pfn = 0;
+}
+
+void __init pram_ban_region(unsigned long start, unsigned long end)
+{
+	int i;
+
+	if (pram_reservation)
+		return;
+
+	for (i = nr_banned - 1; i >= 0 && start <= banned[i].end + 1; i--) {
+		if (end + 1 >= banned[i].start) {
+			banned[i].start = min(banned[i].start, start);
+			banned[i].end = max(banned[i].end, end);
+			return;
+		}
+	}
+
+	if (nr_banned == MAX_NR_BANNED) {
+		printk(KERN_WARNING "PRAM: too many banned regions!\n");
+		return;
+	}
+
+	i++;
+	memmove(banned + i + 1, banned + i,
+		sizeof(struct banned_region) * (nr_banned - i));
+	banned[i].start = start;
+	banned[i].end = end;
+	nr_banned++;
+}
+
+void __init pram_show_banned(void)
+{
+	int i;
+	unsigned long n;
+
+	printk("PRAM: banned regions:\n");
+	for (i = 0; i < nr_banned; i++) {
+		n = banned[i].end - banned[i].start + 1;
+		printk("%4d: [%08lx - %08lx] %ld pages\n",
+		       i, banned[i].start, banned[i].end, n);
+		total_banned += n;
+	}
+	printk("Total banned: %ld pages in %d regions\n",
+	       total_banned, nr_banned);
+}
+
+static int page_banned(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	int l = 0, r = nr_banned - 1, m;
+
+	while (l <= r) {
+		m = (l + r) / 2;
+		if (pfn < banned[m].start)
+			r = m - 1;
+		else if (pfn > banned[m].end)
+			l = m + 1;
+		else
+			return 1;
+	}
+	return 0;
+}
+
+static struct page *__pram_alloc_new_page(gfp_t gfpmask)
+{
+	struct page *page;
+	int page_list_len = 0;
+	LIST_HEAD(page_list);
+
+	/*
+	 * For the subsequent boot to be successful, we should not use pages
+	 * that have ever been reserved. So just put them on the banned list to
+	 * be freed later.
+	 */
+
+	page = alloc_page(gfpmask);
+	while (page && page_banned(page)) {
+		page_list_len++;
+		list_add(&page->lru, &page_list);
+		page = alloc_page(gfpmask | __GFP_COLD);
+	}
+
+	if (page_list_len > 0) {
+		spin_lock(&banned_pages_lock);
+		nr_banned_pages += page_list_len;
+		list_splice(&page_list, &banned_pages);
+		spin_unlock(&banned_pages_lock);
+	}
+
+	return page;
+}
+
+static struct page *__pram_alloc_page(gfp_t gfpmask)
+{
+	struct page *page = NULL;
+
+	if (page_pool_size) {
+		spin_lock(&page_pool_lock);
+		if (page_pool_size) {
+			BUG_ON(list_empty(&page_pool));
+			page = list_entry(page_pool.next, struct page, lru);
+			list_del_init(&page->lru);
+			page_pool_size--;
+		}
+		spin_unlock(&page_pool_lock);
+
+		if (page && (gfpmask & __GFP_ZERO))
+			clear_highpage(page);
+	}
+
+	if (!page)
+		page = __pram_alloc_new_page(gfpmask);
+
+	return page;
+}
+
+static struct page *pram_alloc_page(gfp_t gfpmask)
+{
+	struct page *page = NULL;
+
+	if (!(gfpmask & __GFP_WAIT)) {
+		struct pram_prealloc_struct *p;
+
+		p = &get_cpu_var(pram_preallocs);
+		if (p->nr_pages > 0) {
+			BUG_ON(list_empty(&p->pages));
+			page = list_entry(p->pages.next, struct page, lru);
+			list_del_init(&page->lru);
+			p->nr_pages--;
+		}
+		put_cpu_var(pram_preallocs);
+
+		if (page && (gfpmask & __GFP_ZERO))
+			clear_highpage(page);
+	}
+
+	if (!page)
+		page = __pram_alloc_page(gfpmask);
+
+	return page;
+}
+
+static void __init pram_init_preallocs(void)
+{
+	int cpu;
+	struct pram_prealloc_struct *p;
+
+	for_each_possible_cpu(cpu) {
+		p = &per_cpu(pram_preallocs, cpu);
+		p->nr_pages = 0;
+		INIT_LIST_HEAD(&p->pages);
+	}
+}
+
+static inline int pram_prealloc_size(size_t size)
+{
+	int nr_pages;
+
+	if (!size)
+		return 0;
+
+	nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);
+	nr_pages += DIV_ROUND_UP(nr_pages, PRAM_LINK_CAPACITY);
+
+	return nr_pages;
+}
+
+/**
+ * __pram_prealloc - preallocate pages to ensure that subsequent writes to
+ * persistent memory will not fail due to lack of memory
+ * @gfp_mask: GFP flags to use for page allocations
+ * @n: number of streams that will be written to
+ * @...: @n constants of type size_t containing number of bytes that will be
+ * written to each of @n streams
+ *
+ * On success, returns 0 with preemption disabled. On failure, returns -ENOMEM
+ * with preemption left enabled.
+ *
+ * To make use of this facility, persistent memory streams must be opened for
+ * writing without __GFP_WAIT being passed to __pram_open().
+ */
+int __pram_prealloc(gfp_t gfp_mask, int n, ...)
+{
+	int nr_pages = 0;
+	struct page *page;
+	struct pram_prealloc_struct *p;
+	LIST_HEAD(pages);
+	va_list ap;
+
+	va_start(ap, n);
+	while (n--)
+		nr_pages += pram_prealloc_size(va_arg(ap, size_t));
+	va_end(ap);
+
+	preempt_disable();
+	p = &__get_cpu_var(pram_preallocs);
+
+	if (p->nr_pages >= nr_pages)
+		return 0;
+
+	preempt_enable();
+
+	for (n = 0; n < nr_pages; n++) {
+		page = __pram_alloc_page(gfp_mask);
+		if (!page)
+			break;
+		list_add(&page->lru, &pages);
+	}
+
+	if (n < nr_pages) {
+		while (!list_empty(&pages)) {
+			page = list_entry(pages.next, struct page, lru);
+			list_del_init(&page->lru);
+			__free_page(page);
+		}
+		return -ENOMEM;
+	}
+
+	preempt_disable();
+	p = &__get_cpu_var(pram_preallocs);
+
+	p->nr_pages += nr_pages;
+	list_splice(&pages, &p->pages);
+
+	return 0;
+}
+
+void pram_prealloc_end(void)
+{
+	struct page *page;
+	struct pram_prealloc_struct *p;
+
+	p = &__get_cpu_var(pram_preallocs);
+	while (p->nr_pages > MAX_PREALLOC_SIZE) {
+		BUG_ON(list_empty(&p->pages));
+		page = list_entry(p->pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		__free_page(page);
+		p->nr_pages--;
+	}
+	preempt_enable();
+}
+
+static int __pram_del_page(struct pram_chain *chain, struct page *page);
+
+static unsigned long pram_drain(struct pram_chain *chain)
+{
+	int i;
+	unsigned long link_pfn;
+	struct pram_link *link;
+	struct page *page;
+	unsigned long freed = 0;
+
+	link_pfn = chain->link_pfn;
+	while (link_pfn) {
+		link = pfn_to_kaddr(link_pfn);
+		for (i = 0; i < PRAM_LINK_CAPACITY; i++) {
+			if (!link->page[i].pfn)
+				continue;
+			page = pfn_to_page(link->page[i].pfn);
+
+			if (PRAM_CHAIN_STATE(chain) == PRAM_STATE_LOAD) {
+				if (__pram_del_page(chain, page))
+					/* already removed */
+					continue;
+			}
+
+			ClearPageReserved(page);
+			put_page(page);
+			freed++;
+		}
+		page = pfn_to_page(link_pfn);
+		link_pfn = link->link_pfn;
+		ClearPageReserved(page);
+		put_page(page);
+		freed++;
+	}
+
+	return freed;
+}
+
+static unsigned long __pram_destroy(struct pram_chain *chain)
+{
+	struct page *page;
+	unsigned long freed;
+
+	freed = pram_drain(chain);
+	page = virt_to_page(chain);
+	ClearPageReserved(page);
+	ClearPageDirty(page);
+	put_page(page);
+	freed++;
+	return freed;
+}
+
+static void pram_destroy_all(void)
+{
+	struct page *page, *tmp;
+	struct pram_chain *chain;
+	LIST_HEAD(dispose);
+	int nodes_discarded = 0;
+	unsigned long pages_freed = 0;
+
+	mutex_lock(&pram_mutex);
+	list_for_each_entry_safe(page, tmp, &pram_list, lru) {
+		chain = page_address(page);
+		if (PRAM_CHAIN_BUSY(chain))
+			continue;
+		pram_list_del(chain);
+		list_add(&page->lru, &dispose);
+	}
+	mutex_unlock(&pram_mutex);
+
+	while (!list_empty(&dispose)) {
+		page = list_entry(dispose.next, struct page, lru);
+		list_del_init(&page->lru);
+		chain = page_address(page);
+		pages_freed += __pram_destroy(chain);
+		nodes_discarded++;
+	}
+
+	if (nodes_discarded)
+		printk(KERN_INFO "PRAM: %d nodes discarded (%lu pages freed)\n",
+		       nodes_discarded, pages_freed);
+}
+
+static void pram_stream_init(struct pram_stream *stream,
+			     struct pram_chain *chain, gfp_t gfp_mask)
+{
+	stream->chain = chain;
+	stream->link = chain->link_pfn ?
+		pfn_to_kaddr(chain->link_pfn) : NULL;
+	stream->offset = 0;
+	stream->data_page = NULL;
+	stream->data_offset = 0;
+	stream->gfp_mask = gfp_mask;
+}
+
+static int pram_create(const char *name, gfp_t gfp_mask,
+		       struct pram_stream *stream)
+{
+	struct page *page;
+	struct pram_chain *chain;
+	int ret = 0;
+
+	if (strlen(name) >= PRAM_NAME_MAX)
+		return -EINVAL;
+
+	mutex_lock(&pram_mutex);
+	if (pram_find_chain(name)) {
+		ret = -EEXIST;
+		goto unlock;
+	}
+
+	page = pram_alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	chain = page_address(page);
+	strcpy(chain->name, name);
+	PRAM_SET_CHAIN_STATE(chain, PRAM_STATE_SAVE);
+
+	chain->magic = PRAM_MAGIC_INCOMPLETE;
+
+	pram_list_add(chain);
+unlock:
+	mutex_unlock(&pram_mutex);
+
+	if (!ret)
+		pram_stream_init(stream, chain, gfp_mask);
+	return ret;
+}
+
+static int __pram_push_page(struct pram_stream *stream, struct page *page)
+{
+	struct pram_link *link = stream->link;
+	unsigned long offset = stream->offset;
+
+	if (!link || stream->offset >= PRAM_LINK_CAPACITY) {
+		unsigned long link_pfn;
+		struct page *link_page;
+
+		link_page = pram_alloc_page(stream->gfp_mask | __GFP_ZERO);
+		if (!link_page)
+			return -ENOMEM;
+
+		link_pfn = page_to_pfn(link_page);
+		if (link)
+			link->link_pfn = link_pfn;
+		else
+			stream->chain->link_pfn = link_pfn;
+
+		stream->link = link = page_address(link_page);
+		offset = 0;
+
+		link->magic = PRAM_MAGIC_INCOMPLETE;
+	}
+
+	get_page(page);
+
+	link->page[offset].pfn = page_to_pfn(page);
+	offset++;
+
+	stream->offset = offset;
+	SetPageDirty(virt_to_page(stream->chain));
+	return 0;
+}
+
+/**
+ * pram_push_page - save page to persistent memory storage
+ * @stream: storage stream
+ * @page: page to save
+ * @ppfn: if not NULL, saved page pfn is stored there
+ *
+ * Saving a page to persistent memory storage is usually equivalent to
+ * taking a reference on the page and requires no data copying, unless the
+ * page resides in a banned region. In the latter case, a new page is
+ * allocated and the content of the original is copied into it.
+ *
+ * The function may block iff __GFP_WAIT was passed to __pram_open().
+ *
+ * Returns 0 on success, -errno on failure.
+ *
+ * Error values:
+ *    %-EINVAL: stream is not opened for writing
+ *    %-EFAULT: page is compound
+ *    %-ENOMEM: insufficient amount of memory available
+ */
+int pram_push_page(struct pram_stream *stream, struct page *page,
+		   unsigned long *ppfn)
+{
+	int ret;
+	struct page *new = NULL;
+
+	if (PRAM_CHAIN_STATE(stream->chain) != PRAM_STATE_SAVE)
+		return -EINVAL;
+
+	if (PageCompound(page))
+		return -EFAULT;
+
+	if (page_banned(page)) {
+		new = pram_alloc_page(stream->gfp_mask);
+		if (!new)
+			return -ENOMEM;
+		copy_highpage(new, page);
+		page = new;
+	}
+
+	ret = __pram_push_page(stream, page);
+	if (!ret) {
+		stream->data_page = NULL;
+		if (ppfn)
+			*ppfn = page_to_pfn(page);
+		if (new)
+			/* mark it clean (see __pram_pop_page) */
+			SetPageReserved(new);
+	}
+	if (new)
+		put_page(new);
+	return ret;
+}
+EXPORT_SYMBOL(pram_push_page);
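
For illustration, a minimal sketch of the save side of this call (not part
of the patch; the helper name and the error policy are assumptions, the
page and the opened write stream come from the caller):

	#include <linux/pram.h>
	#include <linux/mm.h>

	/* Sketch: push one page into an already opened write stream. */
	static int example_push(struct pram_stream *stream, struct page *page)
	{
		unsigned long pfn;
		int err;

		err = pram_push_page(stream, page, &pfn);
		if (err)
			return err;	/* -EINVAL, -EFAULT or -ENOMEM, see above */
		/*
		 * pfn now names the page actually stored: the original page,
		 * or a copy if the original resided in a banned region.
		 */
		return 0;
	}
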
+
+/**
+ * pram_write - write data to persistent memory storage
+ * @stream: storage stream
+ * @buf: data to write
+ * @count: data length
+ *
+ * The function may block iff __GFP_WAIT was passed to __pram_open().
+ *
+ * Returns the number of bytes written on success, -errno on failure.
+ *
+ * Error values:
+ *    %-EINVAL: storage is not opened for writing
+ *    %-ENOMEM: insufficient amount of memory available
+ */
+ssize_t pram_write(struct pram_stream *stream, const void *buf, size_t count)
+{
+	size_t copy_count, write_count = 0;
+	char *data;
+
+	if (PRAM_CHAIN_STATE(stream->chain) != PRAM_STATE_SAVE)
+		return -EINVAL;
+
+	while (count > 0) {
+		if (!stream->data_page) {
+			struct page *page;
+			int ret = -ENOMEM;
+
+			page = pram_alloc_page(stream->gfp_mask | __GFP_ZERO);
+			if (page) {
+				ret = __pram_push_page(stream, page);
+				put_page(page);
+			}
+			if (ret)
+				return ret;
+
+			stream->data_page = page;
+			stream->data_offset = 0;
+		}
+
+		copy_count = min_t(size_t, count,
+				   PAGE_SIZE - stream->data_offset);
+		data = page_address(stream->data_page);
+		memcpy(data + stream->data_offset, buf, copy_count);
+
+		buf = (char *)buf + copy_count;
+		stream->data_offset += copy_count;
+		if (stream->data_offset >= PAGE_SIZE)
+			stream->data_page = NULL;
+
+		write_count += copy_count;
+		count -= copy_count;
+	}
+	return write_count;
+}
+EXPORT_SYMBOL(pram_write);
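
A hedged sketch of how the write-side calls compose end to end (not part of
the patch; the storage name "example" and the error policy are illustrative):

	#include <linux/pram.h>
	#include <linux/gfp.h>

	static int example_save(const void *buf, size_t len)
	{
		struct pram_stream stream;
		ssize_t n;
		int err;

		err = __pram_open("example", PRAM_WRITE, GFP_KERNEL, &stream);
		if (err)
			return err;

		n = pram_write(&stream, buf, len);
		/* pram_close(stream, how): how < 0 discards, how >= 0 saves */
		if (n < 0) {
			pram_close(&stream, -1);
			return n;
		}
		pram_close(&stream, 0);
		return 0;
	}
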
+
+static inline void pram_csum_data(struct pram_page *p, csum_func_t csum_func)
+{
+	void *datap = pfn_to_kaddr(p->pfn);
+
+	p->csum = csum_func(datap);
+}
+
+static inline int pram_check_data_csum(struct pram_chain *chain,
+				       struct pram_page *p)
+{
+	void *datap = pfn_to_kaddr(p->pfn);
+	__u32 csum;
+
+	if (chain->csum_mode < NR_PRAM_CSUM_MODES)
+		csum = csum_mode_list[chain->csum_mode]->func(datap);
+	else
+		csum = p->csum + 1;	/* unknown mode: force a mismatch */
+
+	if (p->csum != csum) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "PRAM: pfn:%lx corrupted\n",
+			       (unsigned long)p->pfn);
+		return 0;
+	}
+
+	return 1;
+}
+
+static void pram_update_csum(struct pram_chain *chain)
+{
+	int i;
+	unsigned long link_pfn;
+	struct pram_link *link;
+	struct pram_page *p;
+	const struct pram_csum_mode *cur_csum_mode = pram_get_csum_mode();
+
+	chain->csum_mode = cur_csum_mode->id;
+	for (link_pfn = chain->link_pfn; link_pfn; link_pfn = link->link_pfn) {
+		link = pfn_to_kaddr(link_pfn);
+		for (i = 0; i < PRAM_LINK_CAPACITY; i++) {
+			p = &link->page[i];
+			if (!p->pfn)
+				break;
+			pram_csum_data(p, cur_csum_mode->func);
+		}
+		link->magic = PRAM_MAGIC_V2;
+		link->csum = pram_meta_csum(link);
+	}
+}
+
+static void pram_save(struct pram_stream *stream)
+{
+	struct pram_chain *chain = stream->chain;
+
+	chain->last_link_sz = stream->offset;
+	chain->last_page_sz =
+		stream->data_page ? stream->data_offset : PAGE_SIZE;
+
+	pram_update_csum(chain);
+
+	mutex_lock(&pram_mutex);
+	chain->magic = PRAM_MAGIC_V2;
+	chain->csum = pram_meta_csum(chain);
+	PRAM_SET_CHAIN_STATE(chain, 0);
+	mutex_unlock(&pram_mutex);
+}
+
+static void pram_discard(struct pram_stream *stream)
+{
+	struct pram_chain *chain = stream->chain;
+
+	mutex_lock(&pram_mutex);
+	pram_list_del(chain);
+	mutex_unlock(&pram_mutex);
+
+	PRAM_SET_CHAIN_STATE(chain, 0);
+	__pram_destroy(chain);
+}
+
+static void pram_prepare_data_load(struct pram_chain *chain)
+{
+	int i;
+	unsigned long link_pfn;
+	struct pram_link *link;
+	struct pram_page *p;
+	struct page *page;
+
+	for (link_pfn = chain->link_pfn; link_pfn; link_pfn = link->link_pfn) {
+		link = pfn_to_kaddr(link_pfn);
+		for (i = 0; i < PRAM_LINK_CAPACITY; i++) {
+			p = &link->page[i];
+			if (!p->pfn)
+				continue;
+			page = pfn_to_page(p->pfn);
+			if (!pram_check_data_csum(chain, p)) {
+				ClearPageReserved(page);
+				put_page(page);
+				p->pfn = 0;
+				continue;
+			}
+
+			VM_BUG_ON(page_mapped(page));
+			VM_BUG_ON(!PageAnon(page) && page->mapping);
+			page->mapping = (void *)chain + PAGE_MAPPING_ANON;
+		}
+		cond_resched();
+	}
+}
+
+static int pram_load(const char *name, struct pram_stream *stream)
+{
+	struct pram_chain *chain;
+	int ret = 0;
+
+	mutex_lock(&pram_mutex);
+	chain = pram_find_chain(name);
+	if (!chain) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	if (PRAM_CHAIN_BUSY(chain)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+	pram_list_del(chain);
+unlock:
+	mutex_unlock(&pram_mutex);
+
+	if (!ret) {
+		PRAM_SET_CHAIN_STATE(chain, PRAM_STATE_LOAD);
+		pram_prepare_data_load(chain);
+		pram_stream_init(stream, chain, 0);
+	}
+	return ret;
+}
+
+static int __pram_del_page(struct pram_chain *chain, struct page *page)
+{
+	void *mapping = (void *)page->mapping - PAGE_MAPPING_ANON;
+
+	if (mapping != chain)
+		return -EINVAL;
+
+	if (PageReserved(page)) {
+		page->mapping = NULL;
+		ClearPageReserved(page);
+	} else {
+		/* dirty mark; see pram_page_dirty */
+		page->mapping = (void *)PAGE_MAPPING_ANON;
+	}
+	return 0;
+}
+
+/**
+ * pram_del_page - mark page as not belonging to persistent memory storage
+ * @stream: storage stream opened for reading
+ * @page: page to remove
+ *
+ * On success, returns 0 and marks the page as not belonging to the storage, so
+ * that it will not be touched by pram code any more. On failure, returns
+ * -errno.
+ *
+ * The function never blocks.
+ *
+ * Error values:
+ *    %-EINVAL: stream is not opened for reading or page does not belong to it
+ */
+int pram_del_page(struct pram_stream *stream, struct page *page)
+{
+	return __pram_del_page(stream->chain, page);
+}
+EXPORT_SYMBOL(pram_del_page);
+
+static struct page *__pram_pop_page(struct pram_stream *stream)
+{
+	struct pram_link *link = stream->link;
+	unsigned long offset = stream->offset;
+	struct pram_page *p;
+	struct page *page;
+
+next:
+	if (!link)
+		return NULL;
+
+	p = &link->page[offset];
+	if (p->pfn) {
+		page = pfn_to_page(p->pfn);
+		if (__pram_del_page(stream->chain, page))
+			/* already removed */
+			page = NULL;
+	} else
+		page = ERR_PTR(-EIO);
+
+	p->pfn = 0;
+	offset++;
+
+	if (offset >= (link->link_pfn ? PRAM_LINK_CAPACITY :
+		       stream->chain->last_link_sz)) {
+		unsigned long link_pfn = link->link_pfn;
+		struct page *link_page;
+
+		link_page = virt_to_page(link);
+		ClearPageReserved(link_page);
+		put_page(link_page);
+
+		stream->chain->link_pfn = link_pfn;
+		stream->link = link = link_pfn ? pfn_to_kaddr(link_pfn) : NULL;
+		offset = 0;
+	}
+
+	stream->offset = offset;
+	if (!page)
+		goto next;
+
+	return page;
+}
+
+/**
+ * pram_pop_page - load page from persistent memory storage
+ * @stream: storage stream
+ *
+ * On success, returns the page loaded or NULL if the storage is empty. On
+ * failure, returns ERR_PTR(-errno).
+ *
+ * The function never blocks.
+ *
+ * Error values:
+ *    %-EINVAL: stream is not opened for reading
+ *    %-EIO: page has been corrupted
+ */
+struct page *pram_pop_page(struct pram_stream *stream)
+{
+	struct page *page;
+
+	if (PRAM_CHAIN_STATE(stream->chain) != PRAM_STATE_LOAD)
+		return ERR_PTR(-EINVAL);
+
+	page = __pram_pop_page(stream);
+	if (!IS_ERR(page) && stream->data_page) {
+		put_page(stream->data_page);
+		stream->data_page = NULL;
+	}
+	return page;
+}
+EXPORT_SYMBOL(pram_pop_page);
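
For illustration, a sketch of draining a load stream page by page (not part
of the patch; what the caller does with each page, and whether it drops the
reference taken at save time, are assumptions):

	static long example_drain(struct pram_stream *stream)
	{
		struct page *page;
		long nr = 0;

		while ((page = pram_pop_page(stream)) != NULL) {
			if (IS_ERR(page))
				return PTR_ERR(page);	/* -EIO: corrupted page */
			/* ... reuse the page here ... */
			put_page(page);	/* assumed: caller owns the reference */
			nr++;
		}
		return nr;	/* NULL meant the storage was empty */
	}
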
+
+/**
+ * pram_read - read data from persistent memory storage
+ * @stream: storage stream
+ * @buf: buffer to write data to
+ * @count: buffer length
+ *
+ * On success, the number of bytes read is returned (zero indicates end of
+ * stream), and the stream position is advanced by this number. On failure,
+ * -errno is returned. In this case it is left unspecified whether the stream
+ * position changes.
+ *
+ * The function never blocks.
+ *
+ * Error values:
+ *    %-EINVAL: storage is not opened for reading
+ *    %-EIO: data have been corrupted
+ */
+ssize_t pram_read(struct pram_stream *stream, void *buf, size_t count)
+{
+	size_t copy_count, read_count = 0;
+	unsigned int data_size;
+	char *data;
+
+	if (PRAM_CHAIN_STATE(stream->chain) != PRAM_STATE_LOAD)
+		return -EINVAL;
+
+	while (count > 0) {
+		if (!stream->data_page) {
+			struct page *page;
+
+			page = __pram_pop_page(stream);
+			if (IS_ERR(page))
+				return PTR_ERR(page);
+			if (!page)
+				break;
+
+			stream->data_page = page;
+			stream->data_offset = 0;
+		}
+
+		data_size = stream->link ? PAGE_SIZE :
+			stream->chain->last_page_sz;
+
+		copy_count = min_t(size_t, count,
+				   data_size - stream->data_offset);
+		data = page_address(stream->data_page);
+		memcpy(buf, data + stream->data_offset, copy_count);
+
+		buf = (char *)buf + copy_count;
+		stream->data_offset += copy_count;
+		if (stream->data_offset >= data_size) {
+			put_page(stream->data_page);
+			stream->data_page = NULL;
+		}
+
+		read_count += copy_count;
+		count -= copy_count;
+	}
+	return read_count;
+}
+EXPORT_SYMBOL(pram_read);
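
And the matching read side, mirroring example_save() above (again a sketch,
not part of the patch; treating a short read as -EIO is an illustrative
policy, not something the API mandates):

	static int example_restore(void *buf, size_t len)
	{
		struct pram_stream stream;
		ssize_t n;
		int err;

		err = __pram_open("example", PRAM_READ, 0, &stream);
		if (err)
			return err;	/* -ENOENT if example_save() never ran */

		n = pram_read(&stream, buf, len);
		pram_close(&stream, 0);	/* how is ignored for read streams */
		if (n < 0)
			return n;
		return (size_t)n == len ? 0 : -EIO;
	}
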
+
+static void pram_release(struct pram_stream *stream)
+{
+	if (stream->data_page)
+		put_page(stream->data_page);
+	__pram_destroy(stream->chain);
+}
+
+/**
+ * pram_destroy - destroy persistent memory storage
+ * @name: storage name
+ *
+ * Returns 0 on success, -errno on failure.
+ *
+ * Error values:
+ *    %-ENOENT: storage does not exist
+ *    %-EBUSY: storage is currently being written to
+ */
+int pram_destroy(const char *name)
+{
+	struct pram_chain *chain;
+	int ret = 0;
+
+	mutex_lock(&pram_mutex);
+	chain = pram_find_chain(name);
+	if (!chain) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	if (PRAM_CHAIN_BUSY(chain)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+	pram_list_del(chain);
+unlock:
+	mutex_unlock(&pram_mutex);
+
+	if (!ret)
+		__pram_destroy(chain);
+	return ret;
+}
+EXPORT_SYMBOL(pram_destroy);
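
Sketch (not part of the patch): dropping a storage without ever reading it,
e.g. when the saved state is known to be stale; the name is illustrative:

	static void example_cleanup(void)
	{
		int err;

		err = pram_destroy("example");
		if (err && err != -ENOENT)
			printk(KERN_WARNING "PRAM: destroy failed: %d\n", err);
	}
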
+
+/**
+ * __pram_open - open or create persistent memory storage
+ * @name: storage name
+ * @mode: specifies if storage is created or opened
+ * @gfp_mask: GFP flags to use for page allocations when writing to storage
+ * @stream: stream to be used for operating on storage
+ *
+ * Depending on the value of @mode, the function creates or opens a persistent
+ * memory storage with the given name and associates @stream with it.
+ *
+ * Possible values for @mode:
+ *    %PRAM_WRITE - create new storage and initialize stream for writing
+ *    %PRAM_READ - open existing storage and initialize stream for reading
+ *
+ * Returns 0 on success, -errno on failure.
+ *
+ * Error values:
+ *    %-EINVAL: storage name is too long or mode is invalid
+ *    %-EEXIST: create failed because storage with given name already exists
+ *    %-ENOENT: open failed because storage with given name does not exist
+ *    %-EBUSY: open failed because storage is currently being written to
+ *    %-ENOMEM: insufficient amount of memory available
+ */
+int __pram_open(const char *name, int mode, gfp_t gfp_mask,
+		struct pram_stream *stream)
+{
+	int ret;
+
+	if (!pram_pfn)
+		return -ENODEV;
+
+	switch (mode) {
+	case PRAM_WRITE:
+		ret = pram_create(name, gfp_mask, stream);
+		break;
+	case PRAM_READ:
+		ret = pram_load(name, stream);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__pram_open);
+
+/**
+ * pram_close - close stream and save or destroy persistent memory storage
+ * @stream: storage stream
+ * @how: if < 0, destroy storage, else save storage
+ *
+ * If @stream is opened for writing, depending on @how, the function saves or
+ * destroys the storage @stream is associated with. If @stream is opened for
+ * reading, @how is ignored, and the function frees all data left unread in the
+ * storage and releases all resources associated with it.
+ */
+void pram_close(struct pram_stream *stream, int how)
+{
+	switch (PRAM_CHAIN_STATE(stream->chain)) {
+	case PRAM_STATE_SAVE:
+		if (how < 0)
+			pram_discard(stream);
+		else
+			pram_save(stream);
+		break;
+	case PRAM_STATE_LOAD:
+		pram_release(stream);
+		break;
+	default:
+		BUG();
+	}
+}
+EXPORT_SYMBOL(pram_close);
+
+int pram_for_each_page(struct pram_stream *stream,
+		int (*fn)(struct page *page, void *data), void *data)
+{
+	struct pram_chain *chain;
+	struct pram_link *link;
+	unsigned long link_pfn, pfn;
+	int i, err = 0;
+
+	chain = stream->chain;
+	for (link_pfn = chain->link_pfn; link_pfn; link_pfn = link->link_pfn) {
+		link = pfn_to_kaddr(link_pfn);
+		for (i = 0; i < PRAM_LINK_CAPACITY; i++) {
+			pfn = link->page[i].pfn;
+			if (!pfn)
+				continue;
+			err = fn(pfn_to_page(pfn), data);
+			if (err)
+				goto out;
+		}
+	}
+out:
+	return err;
+}
+EXPORT_SYMBOL(pram_for_each_page);
+
+#define LRU_DEL_ATTEMPTS	3000
+
+struct lru_del_state {
+	int attempt;
+	unsigned long nr_busy;
+	struct lruvec *lruvec;
+};
+
+static int __pram_del_from_lru(struct page *page, void *data)
+{
+	struct lru_del_state *st = data;
+
+	if (!page_gang(page))
+		/* the page does not belong to any gang,
+		 * so it is definitely not on an LRU list */
+		goto out;
+
+	if (page_count(page) != 1)
+		/* we are not the only owner of the page,
+		 * so it is unsafe to remove it from the LRU now */
+		goto out_busy;
+
+	st->lruvec = relock_lruvec(st->lruvec, page_lruvec(page));
+	if (unlikely(st->lruvec != __page_lruvec(page) ||
+		     page_count(page) != 1))
+		goto out_busy;
+	if (PageLRU(page)) {
+		ClearPageLRU(page);
+		del_page_from_lru(st->lruvec, page);
+		gang_del_user_page(page);
+	}
+	goto out;
+
+out_busy:
+	st->nr_busy++;
+	if (st->attempt >= LRU_DEL_ATTEMPTS && printk_ratelimit()) {
+		printk(KERN_WARNING "PRAM: failed to del page from lru: "
+		       "page:%p flags:%p count:%d "
+		       "mapcount:%d mapping:%p index:%lx\n",
+		       page, (void *)page->flags, page_count(page),
+		       page_mapcount(page), page->mapping, page->index);
+	}
+out:
+	return 0;
+}
+
+int pram_del_from_lru(struct pram_stream *stream, int wait)
+{
+	unsigned long flags;
+	struct lru_del_state st;
+
+	memset(&st, 0, sizeof(st));
+again:
+	st.attempt++;
+	st.nr_busy = 0;
+	st.lruvec = NULL;
+
+	local_irq_save(flags);
+	pram_for_each_page(stream, __pram_del_from_lru, &st);
+	unlock_lruvec(st.lruvec);
+	local_irq_restore(flags);
+
+	if (st.nr_busy && st.attempt < LRU_DEL_ATTEMPTS && wait) {
+		schedule_timeout_uninterruptible(1);
+		goto again;
+	}
+
+	if (st.nr_busy && !wait)
+		return -EAGAIN;
+	if (st.nr_busy) {
+		printk(KERN_WARNING "PRAM: %s failed: %lu pages busy\n",
+		       __func__, st.nr_busy);
+		return -EBUSY;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(pram_del_from_lru);
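
A short usage sketch (not part of the patch): after pushing mapped pages, a
caller would typically pull them off the LRU so that reclaim cannot touch
them; the retry-with-wait policy shown here is illustrative:

	static int example_detach(struct pram_stream *stream)
	{
		int err;

		/* wait != 0: busy pages are retried up to LRU_DEL_ATTEMPTS times */
		err = pram_del_from_lru(stream, 1);
		if (err)	/* -EBUSY: some pages still have extra references */
			printk(KERN_WARNING "PRAM: detach failed: %d\n", err);
		return err;
	}
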
+
+int pram_dirty(struct pram_stream *stream)
+{
+	return PageDirty(virt_to_page(stream->chain));
+}
+EXPORT_SYMBOL(pram_dirty);
+
+static void __banned_pages_shrink(int nr_to_scan)
+{
+	struct page *page;
+
+	if (nr_to_scan <= 0)
+		return;
+
+	while (!list_empty(&banned_pages)) {
+		page = list_entry(banned_pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		__free_page(page);
+		BUG_ON(!nr_banned_pages);
+		nr_banned_pages--;
+		nr_to_scan--;
+		if (!nr_to_scan)
+			break;
+	}
+}
+
+static int banned_pages_shrink(struct shrinker *shrink,
+			       int nr_to_scan, gfp_t gfp_mask)
+{
+	int nr_left = nr_banned_pages;
+
+	if (!nr_to_scan || !nr_left)
+		return nr_left;
+
+	spin_lock(&banned_pages_lock);
+	__banned_pages_shrink(nr_to_scan);
+	nr_left = nr_banned_pages;
+	spin_unlock(&banned_pages_lock);
+
+	return nr_left;
+}
+
+static struct shrinker banned_pages_shrinker = {
+	.shrink = banned_pages_shrink,
+	.seeks = DEFAULT_SEEKS,
+};
+
+static int pram_callback(struct notifier_block *nfb,
+			 unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+	struct page *page;
+	struct pram_prealloc_struct *p;
+
+	/* Free per-cpu pool of preallocated pages */
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		p = &per_cpu(pram_preallocs, cpu);
+		p->nr_pages = 0;
+		while (!list_empty(&p->pages)) {
+			page = list_entry(p->pages.next, struct page, lru);
+			list_del_init(&page->lru);
+			__free_page(page);
+		}
+	}
+	return NOTIFY_OK;
+}
+
+static ssize_t pram_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	return sprintf(buf, "%lx\n", pram_pfn);
+}
+
+static ssize_t pram_store(struct kobject *kobj, struct kobj_attribute *attr,
+			  const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 16, &val) || val)
+		return -EINVAL;	/* only "0" is accepted */
+	pram_destroy_all();
+	return count;
+}
+
+static struct kobj_attribute pram_attr =
+	__ATTR(pram, 0644, pram_show, pram_store);
+
+static ssize_t pram_low_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	return sprintf(buf, "%llu\n", pram_low);
+}
+
+static struct kobj_attribute pram_low_attr = __ATTR_RO(pram_low);
+
+static ssize_t pram_banned_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", total_banned);
+}
+
+static struct kobj_attribute pram_banned_attr = __ATTR_RO(pram_banned);
+
+static int page_pool_grow(unsigned long target_size)
+{
+	struct page *page;
+	LIST_HEAD(allocated);
+	unsigned long nr_allocated = 0;
+	int err = 0;
+
+	while (nr_allocated + page_pool_size < target_size) {
+		page = __pram_alloc_new_page(GFP_KERNEL);
+		if (!page) {
+			err = -ENOMEM;
+			break;
+		}
+		list_add(&page->lru, &allocated);
+		nr_allocated++;
+	}
+
+	spin_lock(&page_pool_lock);
+	list_splice(&allocated, &page_pool);
+	page_pool_size += nr_allocated;
+	spin_unlock(&page_pool_lock);
+
+	return err;
+}
+
+static void page_pool_shrink(unsigned long target_size)
+{
+	struct page *page, *tmp;
+	LIST_HEAD(throw_away);
+
+	spin_lock(&page_pool_lock);
+	if (page_pool_size <= target_size) {
+		spin_unlock(&page_pool_lock);
+		return;
+	}
+	list_for_each_entry(page, &page_pool, lru)
+		if (--page_pool_size <= target_size)
+			break;
+	list_cut_position(&throw_away, &page_pool, &page->lru);
+	spin_unlock(&page_pool_lock);
+
+	list_for_each_entry_safe(page, tmp, &throw_away, lru)
+		__free_page(page);
+}
+
+static ssize_t pram_prealloc_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", page_pool_size);
+}
+
+static ssize_t pram_prealloc_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	static DEFINE_MUTEX(mutex);
+	unsigned long target_size;
+	int err = 0;
+
+	if (strict_strtoul(buf, 10, &target_size))
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+	if (page_pool_size > target_size)
+		page_pool_shrink(target_size);
+	else if (page_pool_size < target_size)
+		err = page_pool_grow(target_size);
+	mutex_unlock(&mutex);
+
+	return err ? err : count;
+}
+
+static struct kobj_attribute pram_prealloc_attr =
+	__ATTR(pram_prealloc, 0644, pram_prealloc_show, pram_prealloc_store);
+
+static ssize_t pram_csum_mode_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	const struct pram_csum_mode *cur_csum_mode = pram_get_csum_mode();
+	const struct pram_csum_mode * const *p, *m;
+	int len = 0;
+
+	for (p = csum_mode_list; (m = *p); p++) {
+		if (!strcmp(cur_csum_mode->name, m->name))
+			len += sprintf(buf + len, "[%s] ", m->name);
+		else
+			len += sprintf(buf + len, "%s ", m->name);
+	}
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
+static ssize_t pram_csum_mode_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char raw_name[PRAM_CSUM_NAME_MAX], *name;
+	const struct pram_csum_mode * const *p, *m;
+
+	strlcpy(raw_name, buf, sizeof(raw_name));
+	name = strstrip(raw_name);
+
+	for (p = csum_mode_list; (m = *p); p++) {
+		if (!strcmp(name, m->name))
+			break;
+	}
+	if (!m)
+		return -EINVAL;
+	pram_set_csum_mode(m);
+	return count;
+}
+
+static struct kobj_attribute pram_csum_mode_attr = __ATTR(pram_csum_mode,
+		0644, pram_csum_mode_show, pram_csum_mode_store);
+
+static struct attribute *pram_attrs[] = {
+	&pram_attr.attr,
+	&pram_low_attr.attr,
+	&pram_banned_attr.attr,
+	&pram_prealloc_attr.attr,
+	&pram_csum_mode_attr.attr,
+	NULL,
+};
+
+static struct attribute_group pram_attr_group = {
+	.attrs = pram_attrs,
+};
+
+void __init pram_init(void)
+{
+	int ret;
+
+	pram_select_csum_mode();
+	pram_init_preallocs();
+	ret = pram_build_list();
+	if (ret)
+		printk(KERN_ERR "PRAM: failed to build list: %d\n", ret);
+}
+
+static int __init pram_init_late(void)
+{
+	hotcpu_notifier(pram_callback, 0);
+	register_shrinker(&banned_pages_shrinker);
+	sysfs_update_group(kernel_kobj, &pram_attr_group);
+	return 0;
+}
+module_init(pram_init_late);
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/prio_tree.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/prio_tree.c
--- linux-2.6.32-504.3.3.el6.orig/mm/prio_tree.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/prio_tree.c	2015-01-21 12:02:48.695081128 +0300
@@ -205,3 +205,6 @@ struct vm_area_struct *vma_prio_tree_nex
 	} else
 		return NULL;
 }
+
+#include <linux/module.h>
+EXPORT_SYMBOL(vma_prio_tree_next);
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/readahead.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/readahead.c
--- linux-2.6.32-504.3.3.el6.orig/mm/readahead.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/readahead.c	2015-01-21 12:02:58.686815895 +0300
@@ -111,6 +111,8 @@ static int read_pages(struct address_spa
 	unsigned page_idx;
 	int ret;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 		/* Clean up the remaining pages */
@@ -507,6 +509,10 @@ void page_cache_sync_readahead(struct ad
 		return;
 	}
 
+	if (virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_READAHEAD,
+				NULL) & NOTIFY_FAIL)
+		return;
+
 	/* do read-ahead */
 	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
 }
@@ -551,6 +557,10 @@ page_cache_async_readahead(struct addres
 	if (bdi_read_congested(mapping->backing_dev_info))
 		return;
 
+	if (virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_READAHEAD,
+				NULL) & NOTIFY_FAIL)
+		return;
+
 	/* do read-ahead */
 	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/rmap.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/rmap.c
--- linux-2.6.32-504.3.3.el6.orig/mm/rmap.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/rmap.c	2015-01-21 12:02:58.931809393 +0300
@@ -54,12 +54,17 @@
 #include <linux/rcupdate.h>
 #include <linux/module.h>
 #include <linux/memcontrol.h>
+#include <linux/mmgang.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
 #include <trace/events/kmem.h>
 
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+#include <bc/kmem.h>
+
 #include <asm/tlbflush.h>
 
 #include "internal.h"
@@ -67,25 +72,30 @@
 static struct kmem_cache *anon_vma_cachep;
 static struct kmem_cache *anon_vma_chain_cachep;
 
-static inline struct anon_vma *anon_vma_alloc(void)
+static inline struct anon_vma *anon_vma_alloc(struct mm_struct *mm)
 {
-	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+	struct user_beancounter *ub = mm->mm_ub;
+	struct anon_vma *anon_vma;
+
+	anon_vma = ub_kmem_alloc(ub, anon_vma_cachep, GFP_KERNEL);
+	if (anon_vma)
+		anon_vma->anon_vma_ub = get_beancounter(ub);
+
+	return anon_vma;
 }
 
 void anon_vma_free(struct anon_vma *anon_vma)
 {
-	kmem_cache_free(anon_vma_cachep, anon_vma);
-}
+	struct user_beancounter *ub = anon_vma->anon_vma_ub;
 
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
-{
-	return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+	ub_kmem_free(ub, anon_vma_cachep, anon_vma);
+	put_beancounter(ub);
 }
 
-void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
-{
-	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
-}
+#define anon_vma_chain_alloc(mm) \
+	ub_kmem_alloc(mm->mm_ub, anon_vma_chain_cachep, GFP_KERNEL)
+#define anon_vma_chain_free(mm, avc) \
+	ub_kmem_free(mm->mm_ub, anon_vma_chain_cachep, avc)
 
 /**
  * anon_vma_prepare - attach an anon_vma to a memory region
@@ -124,14 +134,14 @@ int anon_vma_prepare(struct vm_area_stru
 		struct mm_struct *mm = vma->vm_mm;
 		struct anon_vma *allocated;
 
-		avc = anon_vma_chain_alloc();
+		avc = anon_vma_chain_alloc(mm);
 		if (!avc)
 			goto out_enomem;
 
 		anon_vma = find_mergeable_anon_vma(vma);
 		allocated = NULL;
 		if (!anon_vma) {
-			anon_vma = anon_vma_alloc();
+			anon_vma = anon_vma_alloc(mm);
 			if (unlikely(!anon_vma))
 				goto out_enomem_free_avc;
 			allocated = anon_vma;
@@ -160,15 +170,16 @@ int anon_vma_prepare(struct vm_area_stru
 		if (unlikely(allocated))
 			anon_vma_free(allocated);
 		if (unlikely(avc))
-			anon_vma_chain_free(avc);
+			anon_vma_chain_free(mm, avc);
 	}
 	return 0;
 
  out_enomem_free_avc:
-	anon_vma_chain_free(avc);
+	anon_vma_chain_free(vma->vm_mm, avc);
  out_enomem:
 	return -ENOMEM;
 }
+EXPORT_SYMBOL(anon_vma_prepare);
 
 static void anon_vma_chain_link(struct vm_area_struct *vma,
 				struct anon_vma_chain *avc,
@@ -196,7 +207,7 @@ int anon_vma_clone(struct vm_area_struct
 	struct anon_vma_chain *avc, *pavc;
 
 	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
-		avc = anon_vma_chain_alloc();
+		avc = anon_vma_chain_alloc(dst->vm_mm);
 		if (!avc)
 			goto enomem_failure;
 		anon_vma_chain_link(dst, avc, pavc->anon_vma);
@@ -230,10 +241,10 @@ int anon_vma_fork(struct vm_area_struct 
 		return -ENOMEM;
 
 	/* Then add our own anon_vma. */
-	anon_vma = anon_vma_alloc();
+	anon_vma = anon_vma_alloc(vma->vm_mm);
 	if (!anon_vma)
 		goto out_error;
-	avc = anon_vma_chain_alloc();
+	avc = anon_vma_chain_alloc(vma->vm_mm);
 	if (!avc)
 		goto out_error_free_anon_vma;
 
@@ -257,6 +268,22 @@ int anon_vma_fork(struct vm_area_struct 
 	return -ENOMEM;
 }
 
+int anon_vma_link(struct vm_area_struct *vma)
+{
+	struct anon_vma_chain *avc;
+
+	avc = anon_vma_chain_alloc(vma->vm_mm);
+	if (!avc)
+		goto enomem_failure;
+
+	anon_vma_chain_link(vma, avc, vma->anon_vma);
+	return 0;
+
+enomem_failure:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(anon_vma_link);
+
 static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 {
 	struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
@@ -289,7 +316,7 @@ void unlink_anon_vmas(struct vm_area_str
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 		anon_vma_unlink(avc);
 		list_del(&avc->same_vma);
-		anon_vma_chain_free(avc);
+		anon_vma_chain_free(vma->vm_mm, avc);
 	}
 }
 
@@ -345,12 +372,14 @@ out:
 	rcu_read_unlock();
 	return NULL;
 }
+EXPORT_SYMBOL(page_lock_anon_vma);
 
 void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
 	anon_vma_unlock(anon_vma);
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL(page_unlock_anon_vma);
 
 /*
  * At what user virtual address is page expected in @vma?
@@ -372,6 +401,7 @@ vma_address(struct page *page, struct vm
 	}
 	return address;
 }
+EXPORT_SYMBOL(vma_address);
 
 /*
  * At what user virtual address is page expected in vma?
@@ -513,8 +543,10 @@ int page_referenced_one(struct page *pag
 		}
 
 		/* go ahead even if the pmd is pmd_trans_splitting() */
-		if (pmdp_clear_flush_young_notify(vma, address, pmd))
+		if (pmdp_clear_flush_young_notify(vma, address, pmd)) {
 			referenced++;
+			ClearPageIdle(page);
+		}
 		spin_unlock(&mm->page_table_lock);
 	} else {
 		pte_t *pte;
@@ -545,6 +577,7 @@ int page_referenced_one(struct page *pag
 			 */
 			if (likely(!VM_SequentialReadHint(vma)))
 				referenced++;
+			ClearPageIdle(page);
 		}
 		pte_unmap_unlock(pte, ptl);
 	}
@@ -617,7 +650,7 @@ static int page_referenced_file(struct p
 				unsigned long *vm_flags)
 {
 	unsigned int mapcount;
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page->mapping, *peer;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma;
 	struct prio_tree_iter iter;
@@ -638,7 +671,7 @@ static int page_referenced_file(struct p
 	 */
 	BUG_ON(!PageLocked(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	spin_lock_nested(&mapping->i_mmap_lock, SINGLE_DEPTH_NESTING);
 
 	/*
 	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
@@ -646,6 +679,9 @@ static int page_referenced_file(struct p
 	 */
 	mapcount = page_mapcount(page);
 
+	if (!mapcount)
+		goto out;
+
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
@@ -660,9 +696,33 @@ static int page_referenced_file(struct p
 		referenced += page_referenced_one(page, vma, address,
 						  &mapcount, vm_flags);
 		if (!mapcount)
-			break;
+			goto out;
 	}
 
+	list_for_each_entry(peer, &mapping->i_peer_list, i_peer_list) {
+		if (!mapping_mapped(peer))
+			continue;
+
+		spin_lock(&peer->i_mmap_lock);
+
+		vma_prio_tree_foreach(vma, &iter, &peer->i_mmap, pgoff, pgoff) {
+			unsigned long address = vma_address(page, vma);
+			if (address == -EFAULT)
+				continue;
+			if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
+				continue;
+			referenced += page_referenced_one(page, vma, address,
+							&mapcount, vm_flags);
+			if (!mapcount)
+				break;
+		}
+
+		spin_unlock(&peer->i_mmap_lock);
+
+		if (!mapcount)
+			break;
+	}
+out:
 	spin_unlock(&mapping->i_mmap_lock);
 	return referenced;
 }
@@ -710,6 +770,11 @@ int page_referenced(struct page *page,
 			referenced++;
 	}
 out:
+	if (PageYoung(page)) {
+		ClearPageYoung(page);
+		if (!referenced)
+			referenced++;
+	}
 
 	return referenced;
 }
@@ -888,6 +953,15 @@ void do_page_add_anon_rmap(struct page *
 {
 	int first = atomic_inc_and_test(&page->_mapcount);
 	if (first) {
+		struct gang_set *gs = get_mm_gang(vma->vm_mm);
+
+		if (PageLRU(page) && !page_in_gang(page, gs) &&
+				!isolate_lru_page(page)) {
+			gang_mod_user_page(page, gs,
+					GFP_ATOMIC|__GFP_NOFAIL);
+			putback_lru_page(page);
+		}
+
 		if (!PageTransHuge(page))
 			__inc_zone_page_state(page, NR_ANON_PAGES);
 		else
@@ -899,9 +973,9 @@ void do_page_add_anon_rmap(struct page *
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-	if (first)
+	if (first) {
 		__page_set_anon_rmap(page, vma, address, exclusive);
-	else
+	} else
 		__page_check_anon_rmap(page, vma, address);
 }
 
@@ -931,6 +1005,7 @@ void page_add_new_anon_rmap(struct page 
 	else
 		add_page_to_unevictable_list(page);
 }
+EXPORT_SYMBOL(page_add_new_anon_rmap);
 
 /**
  * page_add_file_rmap - add pte mapping to a file page
@@ -938,9 +1013,18 @@ void page_add_new_anon_rmap(struct page 
  *
  * The caller needs to hold the pte lock.
  */
-void page_add_file_rmap(struct page *page)
+void page_add_file_rmap(struct page *page, struct mm_struct *mm)
 {
 	if (atomic_inc_and_test(&page->_mapcount)) {
+		struct gang_set *gs = get_mm_gang(mm);
+
+		if (PageLRU(page) && !page_in_gang(page, gs) &&
+				!isolate_lru_page(page)) {
+			gang_mod_user_page(page, gs,
+					GFP_ATOMIC|__GFP_NOFAIL);
+			putback_lru_page(page);
+		}
+
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
 		mem_cgroup_update_file_mapped(page, 1);
 	}
@@ -989,6 +1073,12 @@ void page_remove_rmap(struct page *page)
 	 */
 	if (unlikely(PageHuge(page)))
 		return;
+	/*
+	 * When a page is unmapped, we cannot keep the PG_checkpointed
+	 * flag: the page is no longer accessible via process VM, so we
+	 * would have no way to reset its state.
+	 */
+	ClearPageCheckpointed(page);
 	if (PageAnon(page)) {
 		mem_cgroup_uncharge_page(page);
 		if (!PageTransHuge(page))
@@ -1054,7 +1144,7 @@ int try_to_unmap_one(struct page *page, 
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
-		set_page_dirty(page);
+		set_page_dirty_mm(page, mm);
 
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
@@ -1087,14 +1177,23 @@ int try_to_unmap_one(struct page *page, 
 			}
 			dec_mm_counter(mm, anon_rss);
 			inc_mm_counter(mm, swap_usage);
-		} else if (PAGE_MIGRATION) {
+		} else if (PAGE_MIGRATION &&
+			   (TTU_ACTION(flags) == TTU_MIGRATION)) {
 			/*
 			 * Store the pfn of the page in a special migration
 			 * pte. do_swap_page() will wait until the migration
 			 * pte is removed and then restart fault handling.
 			 */
-			BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
 			entry = make_migration_entry(page, pte_write(pteval));
+		} else if (SWP_VSWAP_NUM && (TTU_ACTION(flags) == TTU_VSWAP)) {
+			entry = make_vswap_entry(page, pte_write(pteval));
+			__count_vm_event(VSWPOUT);
+			dec_mm_counter(mm, anon_rss);
+			inc_mm_counter(mm, swap_usage);
+			get_vswap_page(page);
+			set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+			page_remove_rmap(page);
+			goto out_unmap;
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
@@ -1116,7 +1215,7 @@ out_unmap:
 		ret = SWAP_AGAIN;
 		if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
 			if (vma->vm_flags & VM_LOCKED) {
-				mlock_vma_page(page);
+				mlock_vma_page(vma, page);
 				ret = SWAP_MLOCK;
 			}
 			up_read(&vma->vm_mm->mmap_sem);
@@ -1213,7 +1312,7 @@ static int try_to_unmap_cluster(unsigned
 		if (locked_vma) {
 			if (page == check_page) {
 				/* we know we have check_page locked */
-				mlock_vma_page(page);
+				mlock_vma_page(vma, page);
 				ret = SWAP_MLOCK;
 			} else if (trylock_page(page)) {
 				/*
@@ -1221,7 +1320,7 @@ static int try_to_unmap_cluster(unsigned
 				 * Otherwise leave the page alone, it will be
 				 * eventually encountered again later.
 				 */
-				mlock_vma_page(page);
+				mlock_vma_page(vma, page);
 				unlock_page(page);
 			}
 			continue;	/* don't unmap */
@@ -1240,7 +1339,7 @@ static int try_to_unmap_cluster(unsigned
 
 		/* Move the dirty bit to the physical page now the pte is gone. */
 		if (pte_dirty(pteval))
-			set_page_dirty(page);
+			set_page_dirty_mm(page, mm);
 
 		page_remove_rmap(page);
 		page_cache_release(page);
@@ -1323,9 +1422,10 @@ static int try_to_unmap_anon(struct page
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+static int try_to_unmap_mapping(struct page *page,
+				struct address_space *mapping,
+				enum ttu_flags flags)
 {
-	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma;
 	struct prio_tree_iter iter;
@@ -1420,6 +1520,35 @@ out:
 	return ret;
 }
 
+static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+{
+	struct address_space *mapping = page->mapping, *peer;
+	int ret;
+
+	ret = try_to_unmap_mapping(page, mapping, flags);
+
+	if (ret != SWAP_AGAIN || !page_mapped(page) ||
+	    list_empty(&mapping->i_peer_list) ||
+	    TTU_ACTION(flags) == TTU_MUNLOCK)
+		return ret;
+
+	/*
+	 * Ignore TTU_MUNLOCK, reclaimer can handle it.
+	 * Handle TTU_MIGRATION like TTU_UNMAP, without migration ptes.
+	 */
+	flags = TTU_UNMAP | (flags & ~TTU_ACTION_MASK);
+
+	spin_lock_nested(&mapping->i_mmap_lock, SINGLE_DEPTH_NESTING);
+	list_for_each_entry(peer, &mapping->i_peer_list, i_peer_list) {
+		ret = try_to_unmap_mapping(page, peer, flags);
+		if (ret != SWAP_AGAIN || !page_mapped(page))
+			break;
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+
+	return ret;
+}
+
 /**
  * try_to_unmap - try to remove all page table mappings to a page
  * @page: the page to get unmapped
@@ -1515,7 +1644,7 @@ void drop_anon_vma(struct anon_vma *anon
 }
 #endif
 
-#ifdef CONFIG_MIGRATION
+#if defined(CONFIG_MIGRATION) || defined(CONFIG_MEMORY_VSWAP)
 /*
  * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
  * Called by migrate.c to remove migration ptes, but might be used more later.
@@ -1591,7 +1720,7 @@ int rmap_walk(struct page *page, int (*r
 	else
 		return rmap_walk_file(page, rmap_one, arg);
 }
-#endif /* CONFIG_MIGRATION */
+#endif /* defined(CONFIG_MIGRATION) || defined(CONFIG_MEMORY_VSWAP) */
 
 #ifdef CONFIG_HUGETLB_PAGE
 /*
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/shmem.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/shmem.c
--- linux-2.6.32-504.3.3.el6.orig/mm/shmem.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/shmem.c	2015-01-21 12:02:58.908810004 +0300
@@ -30,8 +30,13 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/mmgang.h>
 
+#ifdef CONFIG_VE
+#define shm_mnt		(get_exec_env()->shmem_mnt)
+#else
 static struct vfsmount *shm_mnt;
+#endif
 
 #ifdef CONFIG_SHMEM
 /*
@@ -64,6 +69,10 @@ static struct vfsmount *shm_mnt;
 #include <linux/highmem.h>
 #include <linux/seq_file.h>
 #include <linux/magic.h>
+#include <linux/pram.h>
+
+#include <bc/vmpages.h>
+#include <bc/kmem.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -83,14 +92,39 @@ enum sgp_type {
 };
 
 #ifdef CONFIG_TMPFS
+
+#include <linux/virtinfo.h>
+
+static unsigned long tmpfs_ram_pages(void)
+{
+	unsigned long ub_rampages = ULONG_MAX;
+	struct user_beancounter *ub;
+
+	/*
+	 * tmpfs can be mounted by a kthread
+	 * (e.g. by init during devtmpfs initialization)
+	 */
+	if (unlikely(!current->mm))
+		goto out;
+
+	ub = current->mm->mm_ub;
+	if (ub != get_ub0()) {
+		ub_rampages = ub->ub_parms[UB_PHYSPAGES].limit;
+		if (ub_rampages == UB_MAXVALUE)
+			ub_rampages = ub->ub_parms[UB_PRIVVMPAGES].limit;
+	}
+out:
+	return min(totalram_pages, ub_rampages);
+}
+
 static unsigned long shmem_default_max_blocks(void)
 {
-	return totalram_pages / 2;
+	return tmpfs_ram_pages() / 2;
 }
 
 static unsigned long shmem_default_max_inodes(void)
 {
-	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+	return min(totalram_pages - totalhigh_pages, tmpfs_ram_pages() / 2);
 }
 #endif
 
@@ -115,16 +149,46 @@ static inline struct shmem_sb_info *SHME
  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
  * consistent with the pre-accounting of private mappings ...
  */
-static inline int shmem_acct_size(unsigned long flags, loff_t size)
+static inline int shmem_acct_size(unsigned long flags, loff_t size,
+				  struct user_beancounter *ub)
 {
-	return (flags & VM_NORESERVE) ?
-		0 : security_vm_enough_memory_kern(VM_ACCT(size));
+	long pages = VM_ACCT(size);
+	int ret;
+
+	if (flags & VM_NORESERVE)
+		return 0;
+
+	ret = charge_beancounter(ub, UB_SHMPAGES, pages, UB_HARD);
+	if (ret)
+		goto no_shm;
+
+	ret = charge_beancounter_fast(ub, UB_PRIVVMPAGES, pages, UB_HARD);
+	if (ret)
+		goto no_privvm;
+
+	ret = security_vm_enough_memory_kern(pages);
+	if (ret)
+		goto no_vm;
+	return 0;
+
+no_vm:
+	uncharge_beancounter_fast(ub, UB_PRIVVMPAGES, pages);
+no_privvm:
+	uncharge_beancounter(ub, UB_SHMPAGES, pages);
+no_shm:
+	return ret;
 }
 
-static inline void shmem_unacct_size(unsigned long flags, loff_t size)
+static inline void shmem_unacct_size(unsigned long flags, loff_t size,
+				     struct user_beancounter *ub)
 {
-	if (!(flags & VM_NORESERVE))
-		vm_unacct_memory(VM_ACCT(size));
+	long pages = VM_ACCT(size);
+
+	if (!(flags & VM_NORESERVE)) {
+		vm_unacct_memory(pages);
+		uncharge_beancounter_fast(ub, UB_PRIVVMPAGES, pages);
+		uncharge_beancounter(ub, UB_SHMPAGES, pages);
+	}
 }
 
 /*
@@ -133,21 +197,21 @@ static inline void shmem_unacct_size(uns
  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
  */
-static inline int shmem_acct_block(unsigned long flags)
+static inline int shmem_acct_block(struct shmem_inode_info *info)
 {
-	return (flags & VM_NORESERVE) ?
-		security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0;
+	return shmem_acct_size(info->flags ^ VM_NORESERVE,
+			       PAGE_CACHE_SIZE, info->shmi_ub);
 }
 
-static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+static inline void shmem_unacct_blocks(struct shmem_inode_info *info, long pages)
 {
-	if (flags & VM_NORESERVE)
-		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+	shmem_unacct_size(info->flags ^ VM_NORESERVE,
+			  pages << PAGE_SHIFT, info->shmi_ub);
 }
 
 static const struct super_operations shmem_ops;
 static const struct address_space_operations shmem_aops;
-static const struct file_operations shmem_file_operations;
+const struct file_operations shmem_file_operations;
 static const struct inode_operations shmem_inode_operations;
 static const struct inode_operations shmem_dir_inode_operations;
 static const struct inode_operations shmem_special_inode_operations;
@@ -211,7 +275,7 @@ static void shmem_recalc_inode(struct in
 			percpu_counter_add(&sbinfo->used_blocks, -freed);
 		info->alloced -= freed;
 		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
-		shmem_unacct_blocks(info->flags, freed);
+		shmem_unacct_blocks(info, freed);
 	}
 }
 
@@ -544,7 +608,7 @@ static void shmem_delete_inode(struct in
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (inode->i_mapping->a_ops == &shmem_aops) {
-		shmem_unacct_size(info->flags, inode->i_size);
+		shmem_unacct_size(info->flags, inode->i_size, info->shmi_ub);
 		inode->i_size = 0;
 		shmem_truncate_range(inode, 0, (loff_t)-1);
 		if (!list_empty(&info->swaplist)) {
@@ -686,7 +750,7 @@ static int shmem_writepage(struct page *
 		goto redirty;
 	}
 
-	swap = get_swap_page();
+	swap = get_swap_page(info->shmi_ub);
 	if (!swap.val)
 		goto redirty;
 
@@ -727,6 +791,58 @@ redirty:
 	return 0;
 }
 
+/* Insert a swap entry into a shmem inode's address space. */
+int shmem_insertpage(struct inode * inode, unsigned long index,
+		     swp_entry_t swap)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct address_space *mapping = inode->i_mapping;
+	int err;
+
+	if (sbinfo->max_blocks && percpu_counter_compare(&sbinfo->used_blocks,
+				sbinfo->max_blocks) >= 0)
+		return -ENOSPC;
+
+	err = shmem_acct_block(info);
+	if (err)
+		return err;
+
+	err = radix_tree_preload(GFP_KERNEL);
+	if (err)
+		goto out;
+
+	spin_lock_irq(&mapping->tree_lock);
+	err = radix_tree_insert(&mapping->page_tree, index,
+				swp_to_radix_entry(swap));
+	spin_unlock_irq(&mapping->tree_lock);
+	radix_tree_preload_end();
+
+	if (err == -EEXIST)
+		err = -EBUSY;
+	if (err)
+		goto out;
+
+	spin_lock(&info->lock);
+	info->alloced++;
+	inode->i_blocks += BLOCKS_PER_PAGE;
+	info->swapped++;
+	spin_unlock(&info->lock);
+
+	if (list_empty(&info->swaplist)) {
+		mutex_lock(&shmem_swaplist_mutex);
+		/* move instead of add in case we're racing */
+		list_move_tail(&info->swaplist, &shmem_swaplist);
+		mutex_unlock(&shmem_swaplist_mutex);
+	}
+out:
+	if (err)
+		shmem_unacct_blocks(info, 1);
+	return err;
+}
+EXPORT_SYMBOL(shmem_insertpage);
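
A hedged sketch of a caller (not part of the patch; the inode, index and
swap entry are assumptions — swp_entry() is the usual constructor from
<linux/swapops.h>):

	#include <linux/swapops.h>

	/* Sketch: re-attach a previously saved swap entry at page index idx. */
	static int example_reattach(struct inode *inode, unsigned long idx,
				    unsigned long type, pgoff_t off)
	{
		/* -EBUSY: index already occupied; -ENOSPC: block limit hit */
		return shmem_insertpage(inode, idx, swp_entry(type, off));
	}
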
+
+
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_TMPFS
 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -760,6 +876,9 @@ static struct page *shmem_swapin(swp_ent
 	struct mempolicy mpol, *spol;
 	struct vm_area_struct pvma;
 
+	if (ub_check_ram_limits(get_exec_ub(), gfp))
+		return NULL;
+
 	spol = mpol_cond_copy(&mpol,
 			mpol_shared_policy_lookup(&info->policy, index));
 
@@ -777,6 +896,9 @@ static struct page *shmem_alloc_page(gfp
 {
 	struct vm_area_struct pvma;
 
+	if (ub_check_ram_limits(get_exec_ub(), gfp))
+		return NULL;
+
 	/* Create a pseudo vma that just contains the policy */
 	pvma.vm_start = 0;
 	/* Bias interleave by inode number to distribute better across nodes */
@@ -799,12 +921,16 @@ static inline void shmem_show_mpol(struc
 static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 			struct shmem_inode_info *info, pgoff_t index)
 {
+	if (ub_check_ram_limits(get_exec_ub(), gfp))
+		return NULL;
 	return swapin_readahead(swap, gfp, NULL, 0);
 }
 
 static inline struct page *shmem_alloc_page(gfp_t gfp,
 			struct shmem_inode_info *info, pgoff_t index)
 {
+	if (ub_check_ram_limits(get_exec_ub(), gfp))
+		return NULL;
 	return alloc_page(gfp);
 }
 #endif /* CONFIG_NUMA */
@@ -920,7 +1046,7 @@ repeat:
 		swap_free(swap);
 
 	} else {
-		if (shmem_acct_block(info->flags)) {
+		if (shmem_acct_block(info)) {
 			error = -ENOSPC;
 			goto failed;
 		}
@@ -941,13 +1067,20 @@ repeat:
 
 		SetPageSwapBacked(page);
 		__set_page_locked(page);
+		error = gang_add_user_page(page, get_mapping_gang(mapping), gfp);
+		if (error)
+			goto decused;
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
+		if (error)
+			gang_del_user_page(page);
+		else
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, NULL);
-		if (error)
+		if (error) {
+			gang_del_user_page(page);
 			goto decused;
+		}
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
@@ -987,7 +1120,7 @@ decused:
 	if (sbinfo->max_blocks)
 		percpu_counter_add(&sbinfo->used_blocks, -1);
 unacct:
-	shmem_unacct_blocks(info->flags, 1);
+	shmem_unacct_blocks(info, 1);
 failed:
 	if (swap.val && error != -EINVAL) {
 		struct page *test = find_get_page(mapping, index);
@@ -1026,6 +1159,63 @@ static int shmem_fault(struct vm_area_st
 	return ret;
 }
 
+int install_shmem_page(struct vm_area_struct *vma,
+		       unsigned long addr, struct page *page)
+{
+	unsigned long index = (((addr & PAGE_MASK)
+			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	bool in_lru = pram_page_dirty(page);
+	int err;
+
+	if (sbinfo->max_blocks && percpu_counter_compare(&sbinfo->used_blocks,
+				sbinfo->max_blocks) >= 0)
+		return -ENOSPC;
+
+	if (!in_lru) {
+		SetPageSwapBacked(page);
+		err = gang_add_user_page(page, get_mapping_gang(mapping),
+					 GFP_KERNEL);
+		if (err)
+			return err;
+		lru_cache_add_anon(page);
+	}
+
+	err = shmem_acct_block(info);
+	if (err)
+		return err;
+
+	lock_page(page);
+
+	err = shmem_add_to_page_cache(page, mapping, index,
+			mapping_gfp_mask(mapping), NULL);
+	if (err) {
+		shmem_unacct_blocks(info, 1);
+		unlock_page(page);
+		return err;
+	}
+
+	if (sbinfo->max_blocks)
+		percpu_counter_inc(&sbinfo->used_blocks);
+
+	spin_lock(&info->lock);
+	info->alloced++;
+	inode->i_blocks += BLOCKS_PER_PAGE;
+	spin_unlock(&info->lock);
+
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	set_page_dirty(page);
+
+	unlock_page(page);
+
+	return 0;
+}
+EXPORT_SYMBOL(install_shmem_page);
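
For illustration, a sketch tying this back to the PRAM stream API from
mm/pram.c (not part of the patch; the one-page-per-address layout and the
reference handling after install_shmem_page() are assumptions):

	static int example_populate(struct vm_area_struct *vma,
				    struct pram_stream *stream)
	{
		unsigned long addr;
		struct page *page;
		int err;

		for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
			page = pram_pop_page(stream);
			if (!page)
				break;	/* stream exhausted */
			if (IS_ERR(page))
				return PTR_ERR(page);
			err = install_shmem_page(vma, addr, page);
			put_page(page);	/* assumed: install takes its own ref */
			if (err)
				return err;
		}
		return 0;
	}
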
+
 #ifdef CONFIG_NUMA
 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
 {
@@ -1052,20 +1242,27 @@ int shmem_lock(struct file *file, int lo
 
 	spin_lock(&info->lock);
 	if (lock && !(info->flags & VM_LOCKED)) {
+		if (ub_lockedshm_charge(info, inode->i_size) < 0)
+			goto out_ch;
+
 		if (!user_shm_lock(inode->i_size, user))
 			goto out_nomem;
 		info->flags |= VM_LOCKED;
 		mapping_set_unevictable(file->f_mapping);
 	}
 	if (!lock && (info->flags & VM_LOCKED) && user) {
+		ub_lockedshm_uncharge(info, inode->i_size);
 		user_shm_unlock(inode->i_size, user);
 		info->flags &= ~VM_LOCKED;
 		mapping_clear_unevictable(file->f_mapping);
 		scan_mapping_unevictable_pages(file->f_mapping);
 	}
-	retval = 0;
+	spin_unlock(&info->lock);
+	return 0;
 
 out_nomem:
+	ub_lockedshm_uncharge(info, inode->i_size);
+out_ch:
 	spin_unlock(&info->lock);
 	return retval;
 }
@@ -1749,8 +1946,10 @@ static int shmem_encode_fh(struct dentry
 {
 	struct inode *inode = dentry->d_inode;
 
-	if (*len < 3)
+	if (*len < 3) {
+		*len = 3;
 		return 255;
+	}
 
 	if (hlist_unhashed(&inode->i_hash)) {
 		/* Unfortunately insert_inode_hash is not idempotent,
@@ -1804,6 +2003,8 @@ static int shmem_parse_options(char *opt
 		}
 		if (!*this_char)
 			continue;
+		if (!strcmp(this_char, "relatime"))
+			continue;
 		if ((value = strchr(this_char,'=')) != NULL) {
 			*value++ = 0;
 		} else {
@@ -1818,7 +2019,7 @@ static int shmem_parse_options(char *opt
 			size = memparse(value,&rest);
 			if (*rest == '%') {
 				size <<= PAGE_SHIFT;
-				size *= totalram_pages;
+				size *= tmpfs_ram_pages();
 				do_div(size, 100);
 				rest++;
 			}
@@ -2018,20 +2219,25 @@ static struct kmem_cache *shmem_inode_ca
 
 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
+	struct user_beancounter *ub = get_exec_ub();
 	struct shmem_inode_info *info;
-	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+	info = ub_kmem_alloc(ub, shmem_inode_cachep, GFP_KERNEL);
 	if (!info)
 		return NULL;
+	info->shmi_ub = get_beancounter(ub);
 	return &info->vfs_inode;
 }
 
 static void shmem_destroy_inode(struct inode *inode)
 {
+	struct user_beancounter *ub = SHMEM_I(inode)->shmi_ub;
+
 	if ((inode->i_mode & S_IFMT) == S_IFREG) {
 		/* only struct inode is valid if it's an inline symlink */
 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
 	}
-	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+	ub_kmem_free(ub, shmem_inode_cachep, SHMEM_I(inode));
+	put_beancounter(ub);
 }
 
 static void shmem_init_inode(void *foo)
@@ -2064,7 +2270,7 @@ static const struct address_space_operat
 	.error_remove_page = generic_error_remove_page,
 };
 
-static const struct file_operations shmem_file_operations = {
+const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 #ifdef CONFIG_TMPFS
 	.llseek		= generic_file_llseek,
@@ -2077,6 +2283,7 @@ static const struct file_operations shme
 	.splice_write	= generic_file_splice_write,
 #endif
 };
+EXPORT_SYMBOL(shmem_file_operations);
 
 static const struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
@@ -2146,6 +2353,17 @@ static const struct vm_operations_struct
 #endif
 };
 
+static int is_shmem_mapping(struct address_space *map)
+{
+	return (map != NULL && map->a_ops == &shmem_aops);
+}
+
+int is_shmem_vma(struct vm_area_struct *vma)
+{
+	return (vma->vm_file && is_shmem_mapping(
+			vma->vm_file->f_path.dentry->d_inode->i_mapping));
+}
+EXPORT_SYMBOL(is_shmem_vma);
 
 static int shmem_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
@@ -2153,12 +2371,13 @@ static int shmem_get_sb(struct file_syst
 	return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
 }
 
-static struct file_system_type shmem_fs_type = {
+struct file_system_type shmem_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "tmpfs",
 	.get_sb		= shmem_get_sb,
 	.kill_sb	= kill_litter_super,
 };
+EXPORT_SYMBOL(shmem_fs_type);
 
 int __init shmem_init(void)
 {
@@ -2246,8 +2465,8 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
 #define shmem_vm_ops				generic_file_vm_ops
 #define shmem_file_operations			ramfs_file_operations
 #define shmem_get_inode(sb, mode, dev, flags)	ramfs_get_inode(sb, mode, dev)
-#define shmem_acct_size(flags, size)		0
-#define shmem_unacct_size(flags, size)		do {} while (0)
+#define shmem_acct_size(flags, size, ub)	0
+#define shmem_unacct_size(flags, size, ub)	do {} while (0)
 
 #endif /* CONFIG_SHMEM */
 
@@ -2274,7 +2493,7 @@ struct file *shmem_file_setup(const char
 	if (size < 0 || size > MAX_LFS_FILESIZE)
 		return ERR_PTR(-EINVAL);
 
-	if (shmem_acct_size(flags, size))
+	if (shmem_acct_size(flags, size, get_exec_ub()))
 		return ERR_PTR(-ENOMEM);
 
 	error = -ENOMEM;
@@ -2282,15 +2501,15 @@ struct file *shmem_file_setup(const char
 	this.len = strlen(name);
 	this.hash = 0; /* will go */
 	root = shm_mnt->mnt_root;
+	path.mnt = mntget(shm_mnt);
 	path.dentry = d_alloc(root, &this);
 	if (!path.dentry)
 		goto put_memory;
-	path.mnt = mntget(shm_mnt);
 
 	error = -ENOSPC;
 	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
 	if (!inode)
-		goto put_dentry;
+		goto put_memory;
 
 	d_instantiate(path.dentry, inode);
 	inode->i_size = size;
@@ -2309,10 +2528,10 @@ struct file *shmem_file_setup(const char
 
 	return file;
 
+put_memory:
+	shmem_unacct_size(flags, size, get_exec_ub());
 put_dentry:
 	path_put(&path);
-put_memory:
-	shmem_unacct_size(flags, size);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(shmem_file_setup);
@@ -2332,6 +2551,10 @@ int shmem_zero_setup(struct vm_area_stru
 
 	if (vma->vm_file)
 		fput(vma->vm_file);
+	else if (vma->vm_flags & VM_WRITE)
+		uncharge_beancounter_fast(vma->vm_mm->mm_ub, UB_PRIVVMPAGES,
+					  size >> PAGE_SHIFT);
+
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
 	vma->vm_flags |= VM_CAN_NONLINEAR;
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/slab.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/slab.c
--- linux-2.6.32-504.3.3.el6.orig/mm/slab.c	2014-12-12 23:29:22.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/slab.c	2015-01-21 12:02:58.902810163 +0300
@@ -116,32 +116,16 @@
 #include	<linux/debugobjects.h>
 #include	<linux/kmemcheck.h>
 #include	<linux/memory.h>
+#include	<linux/nmi.h>
+#include	<linux/vzstat.h>
 
 #include	<asm/cacheflush.h>
 #include	<asm/tlbflush.h>
 #include	<asm/page.h>
 
-#include <linux/kmemtrace.h>
-
-/*
- * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
- *		  0 for faster, smaller code (especially in the critical paths).
- *
- * STATS	- 1 to collect stats for /proc/slabinfo.
- *		  0 for faster, smaller code (especially in the critical paths).
- *
- * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
- */
+#include	<bc/kmem.h>
 
-#ifdef CONFIG_DEBUG_SLAB
-#define	DEBUG		1
-#define	STATS		1
-#define	FORCED_DEBUG	1
-#else
-#define	DEBUG		0
-#define	STATS		0
-#define	FORCED_DEBUG	0
-#endif
+#include <linux/kmemtrace.h>
 
 /* Shouldn't this be in a header file somewhere? */
 #define	BYTES_PER_WORD		sizeof(void *)
@@ -176,19 +160,21 @@
 #endif
 
 /* Legal flag mask for kmem_cache_create(). */
-#if DEBUG
+#if SLAB_DEBUG
 # define CREATE_MASK	(SLAB_RED_ZONE | \
 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 			 SLAB_CACHE_DMA | \
 			 SLAB_STORE_USER | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+			 SLAB_UBC | SLAB_NO_CHARGE | \
 			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
 #else
 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
 			 SLAB_CACHE_DMA | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+			 SLAB_UBC | SLAB_NO_CHARGE | \
 			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
 #endif
 
@@ -392,12 +378,14 @@ static void kmem_list3_init(struct kmem_
 #define REAPTIMEOUT_CPUC	(2*HZ)
 #define REAPTIMEOUT_LIST3	(4*HZ)
 
-#if STATS
+#if SLAB_STATS
+#define	STATS_INC_GROWN(x)	((x)->grown++)
+#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
+#define	STATS_INC_SHRUNK(x)	((x)->shrunk++)
+
 #define	STATS_INC_ACTIVE(x)	((x)->num_active++)
 #define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
 #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
-#define	STATS_INC_GROWN(x)	((x)->grown++)
-#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
 #define	STATS_SET_HIGH(x)						\
 	do {								\
 		if ((x)->num_active > (x)->high_mark)			\
@@ -417,11 +405,12 @@ static void kmem_list3_init(struct kmem_
 #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
 #define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
 #else
+#define	STATS_INC_GROWN(x)	do { } while (0)
+#define	STATS_ADD_REAPED(x,y)	do { } while (0)
+#define	STATS_INC_SHRUNK(x)	do { } while (0)
 #define	STATS_INC_ACTIVE(x)	do { } while (0)
 #define	STATS_DEC_ACTIVE(x)	do { } while (0)
 #define	STATS_INC_ALLOCED(x)	do { } while (0)
-#define	STATS_INC_GROWN(x)	do { } while (0)
-#define	STATS_ADD_REAPED(x,y)	do { } while (0)
 #define	STATS_SET_HIGH(x)	do { } while (0)
 #define	STATS_INC_ERR(x)	do { } while (0)
 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
@@ -434,7 +423,7 @@ static void kmem_list3_init(struct kmem_
 #define STATS_INC_FREEMISS(x)	do { } while (0)
 #endif
 
-#if DEBUG
+#if SLAB_DEBUG
 
 /*
  * memory layout of objects:
@@ -574,6 +563,8 @@ struct cache_sizes malloc_sizes[] = {
 #define CACHE(x) { .cs_size = (x) },
 #include <linux/kmalloc_sizes.h>
 	CACHE(ULONG_MAX)
+#include <linux/kmalloc_sizes.h>
+	CACHE(ULONG_MAX)
 #undef CACHE
 };
 EXPORT_SYMBOL(malloc_sizes);
@@ -587,17 +578,26 @@ struct cache_names {
 static struct cache_names __initdata cache_names[] = {
 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
 #include <linux/kmalloc_sizes.h>
+	{NULL,},
+#undef CACHE
+#define CACHE(x) { .name = "size-" #x "(UBC)", .name_dma = "size-" #x "(DMA,UBC)" },
+#include <linux/kmalloc_sizes.h>
 	{NULL,}
 #undef CACHE
 };
 
+int malloc_cache_num;
+EXPORT_SYMBOL(malloc_cache_num);
+
 static struct arraycache_init initarray_cache __initdata =
     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 static struct arraycache_init initarray_generic =
     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 
 /* internal cache of cache description objs */
+static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
 static struct kmem_cache cache_cache = {
+	.nodelists = cache_cache_nodelists,
 	.batchcount = 1,
 	.limit = BOOT_CPUCACHE_ENTRIES,
 	.shared = 1,
@@ -666,7 +666,8 @@ static inline void init_lock_keys(void)
  * Guard access to the cache-chain.
  */
 static DEFINE_MUTEX(cache_chain_mutex);
-static struct list_head cache_chain;
+static LIST_HEAD(cache_chain);
+static DEFINE_SPINLOCK(cache_chain_lock);
 
 /*
  * chicken and egg problem: delay the per-cpu array allocation
@@ -700,7 +701,9 @@ static inline struct kmem_cache *__find_
 {
 	struct cache_sizes *csizep = malloc_sizes;
 
-#if DEBUG
+	if (gfpflags & __GFP_UBC)
+		csizep += malloc_cache_num;
+#if SLAB_DEBUG
 	/* This happens if someone tries to call
 	 * kmem_cache_create(), or __kmalloc(), before
 	 * the generic caches are initialized.
@@ -730,9 +733,102 @@ static struct kmem_cache *kmem_find_gene
 	return __find_general_cachep(size, gfpflags);
 }
 
-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+{
+	return (kmem_bufctl_t *) (slabp + 1);
+}
+
+#ifdef CONFIG_BEANCOUNTERS
+#define init_slab_ubps(cachep, slabp)	do {				\
+		if (!((cachep)->flags & SLAB_UBC))			\
+			break;						\
+		memset(slab_ubcs(cachep, slabp), 0,			\
+				(cachep)->num * sizeof(void *));	\
+	} while (0)
+
+#define UB_ALIGN(flags)		((flags) & SLAB_UBC ? sizeof(void *) : 1)
+#define UB_EXTRA(flags)		((flags) & SLAB_UBC ? sizeof(void *) : 0)
+#define set_cache_objuse(cachep)	do {				\
+		(cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) +	\
+				(cachep)->num - 1) / (cachep)->num;	\
+		if (!OFF_SLAB(cachep))					\
+			break;						\
+		(cachep)->objuse += ((cachep)->slabp_cache->objuse +	\
+				(cachep)->num - 1) / (cachep)->num;	\
+	} while (0)
+
+void kmem_mark_nocharge(struct kmem_cache *cachep)
+{
+	cachep->flags |= SLAB_NO_CHARGE;
+}
+
+int kmem_cache_objuse(struct kmem_cache *cachep)
+{
+	return cachep->objuse;
+}
+
+EXPORT_SYMBOL(kmem_cache_objuse);
+
+int kmem_obj_objuse(void *obj)
+{
+	return virt_to_cache(obj)->objuse;
+}
+
+int kmem_dname_objuse(void *obj)
+{
+	return virt_to_cache(obj)->objuse;
+}
+
+unsigned long ub_cache_growth(struct kmem_cache *cachep)
+{
+#if SLAB_STATS
+	return (cachep->grown - cachep->reaped - cachep->shrunk)
+		<< cachep->gfporder;
+#else
+	return 0;
+#endif
+}
+
+#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\
+		(ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\
+		       sizeof(void *))))
+
+struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj)
+{
+	struct slab *slabp;
+	int objnr;
+
+	BUG_ON(!(cachep->flags & SLAB_UBC));
+	slabp = virt_to_slab(obj);
+	objnr = (obj - slabp->s_mem) / cachep->buffer_size;
+	return slab_ubcs(cachep, slabp) + objnr;
+}
+
+struct user_beancounter *slab_ub(void *obj)
 {
-	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
+	return *ub_slab_ptr(virt_to_cache(obj), obj);
+}
+
+EXPORT_SYMBOL(slab_ub);
+
+#else
+#define UB_ALIGN(flags)		1
+#define UB_EXTRA(flags)		0
+#define set_cache_objuse(c)	do { } while (0)
+#define init_slab_ubps(c, s)	do { } while (0)
+#endif
+
+static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags)
+{
+	size_t size_noub;
+
+	size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t);
+	return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags);
+}
+
+static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags)
+{
+	return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align);
 }
 
 /*
@@ -777,20 +873,23 @@ static void cache_estimate(unsigned long
 		 * into account.
 		 */
 		nr_objs = (slab_size - sizeof(struct slab)) /
-			  (buffer_size + sizeof(kmem_bufctl_t));
+			  (buffer_size + sizeof(kmem_bufctl_t) +
+			   	UB_EXTRA(flags));
 
 		/*
 		 * This calculated number will be either the right
 		 * amount, or one greater than what we want.
 		 */
-		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
-		       > slab_size)
+		if (slab_mgmt_size(nr_objs, align, flags) +
+				nr_objs * buffer_size > slab_size)
 			nr_objs--;
+		BUG_ON(slab_mgmt_size(nr_objs, align, flags) +
+				nr_objs * buffer_size > slab_size);
 
 		if (nr_objs > SLAB_LIMIT)
 			nr_objs = SLAB_LIMIT;
 
-		mgmt_size = slab_mgmt_size(nr_objs, align);
+		mgmt_size = slab_mgmt_size(nr_objs, align, flags);
 	}
 	*num = nr_objs;
 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
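cache_estimate() and slab_mgmt_size_noalign() above encode the new on-slab management layout for SLAB_UBC caches: after the kmem_bufctl_t array, a pointer-aligned array holds one user_beancounter pointer per object — this is what slab_ubcs() later points at. A small userspace model of that arithmetic, with illustrative sizes (64-bit pointers, sizeof(struct slab) taken as 56, kmem_bufctl_t as unsigned int):

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	size_t slab_hdr = 56;			/* sizeof(struct slab) */
	size_t bufctl = sizeof(unsigned int);	/* kmem_bufctl_t */
	size_t ub_ptr = sizeof(void *);		/* UB_EXTRA for SLAB_UBC */
	size_t nr_objs = 51;

	size_t size_noub = slab_hdr + nr_objs * bufctl;
	/* ub pointers start at the next pointer-aligned offset */
	size_t mgmt = ALIGN(size_noub, ub_ptr) + nr_objs * ub_ptr;

	printf("bufctl array ends at %zu, ub array at [%zu..%zu)\n",
	       size_noub, ALIGN(size_noub, ub_ptr), mgmt);
	return 0;
}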
@@ -1425,6 +1524,7 @@ static void __init init_list(struct kmem
 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
 	cachep->nodelists[nodeid] = ptr;
 }
+static int offslab_limit;
 
 /*
  * For setting up all the kmem_list3s for cache whose buffer_size is same as
@@ -1495,19 +1595,17 @@ void __init kmem_cache_init(void)
 	node = numa_node_id();
 
 	/* 1) create the cache_cache */
-	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
 	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
 
 	/*
-	 * struct kmem_cache size depends on nr_node_ids, which
-	 * can be less than MAX_NUMNODES.
+	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
 	 */
-	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
-				 nr_node_ids * sizeof(struct kmem_list3 *);
-#if DEBUG
+	cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+				  nr_node_ids * sizeof(struct kmem_list3 *);
+#if SLAB_DEBUG
 	cache_cache.obj_size = cache_cache.buffer_size;
 #endif
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
@@ -1554,6 +1652,7 @@ void __init kmem_cache_init(void)
 
 	slab_early_init = 0;
 
+	for (i = 0; i < 2; i++) {
 	while (sizes->cs_size != ULONG_MAX) {
 		/*
 		 * For performance, all the general caches are L1 aligned.
@@ -1566,21 +1665,30 @@ void __init kmem_cache_init(void)
 			sizes->cs_cachep = kmem_cache_create(names->name,
 					sizes->cs_size,
 					ARCH_KMALLOC_MINALIGN,
-					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+					ARCH_KMALLOC_FLAGS|SLAB_PANIC|
+					(i ? SLAB_UBC : 0)|SLAB_NO_CHARGE,
 					NULL);
 		}
+		if (!(OFF_SLAB(sizes->cs_cachep)))
+			offslab_limit = sizes->cs_size;
 #ifdef CONFIG_ZONE_DMA
-		sizes->cs_dmacachep = kmem_cache_create(
-					names->name_dma,
+		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
 					sizes->cs_size,
 					ARCH_KMALLOC_MINALIGN,
 					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
+					(i ? SLAB_UBC : 0) | SLAB_NO_CHARGE|
 						SLAB_PANIC,
 					NULL);
 #endif
 		sizes++;
 		names++;
 	}
+
+	sizes++;
+	names++;
+	if (!i)
+		malloc_cache_num = sizes - malloc_sizes;
+	}
 	/* 4) Replace the bootstrap head arrays */
 	{
 		struct array_cache *ptr;
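The hunks around kmem_cache_init() double the generic cache tables: <linux/kmalloc_sizes.h> is expanded twice into malloc_sizes[] and cache_names[], producing a plain bank followed by a "(UBC)" bank, and malloc_cache_num records where the second bank starts so __find_general_cachep() can add it as an offset when __GFP_UBC is set. A compact userspace model of that lookup (sizes and names are illustrative):

#include <stdio.h>

struct cache { size_t cs_size; const char *name; };

static struct cache caches[] = {
	{ 32, "size-32" }, { 64, "size-64" }, { (size_t)-1, NULL },
	{ 32, "size-32(UBC)" }, { 64, "size-64(UBC)" }, { (size_t)-1, NULL },
};
static int malloc_cache_num = 3;	/* offset of the UBC bank */

static const char *find_cache(size_t size, int ubc)
{
	struct cache *c = caches + (ubc ? malloc_cache_num : 0);

	while (size > c->cs_size)	/* first bank entry that fits */
		c++;
	return c->name;
}

int main(void)
{
	printf("%s\n", find_cache(40, 0));	/* size-64 */
	printf("%s\n", find_cache(40, 1));	/* size-64(UBC) */
	return 0;
}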
@@ -1769,7 +1877,7 @@ static void kmem_rcu_free(struct rcu_hea
 		kmem_cache_free(cachep->slabp_cache, slab_rcu);
 }
 
-#if DEBUG
+#if SLAB_DEBUG
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
@@ -1846,7 +1954,7 @@ static void dump_line(char *data, int of
 }
 #endif
 
-#if DEBUG
+#if SLAB_DEBUG
 
 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
 {
@@ -1939,7 +2047,7 @@ static void check_poison_obj(struct kmem
 }
 #endif
 
-#if DEBUG
+#if SLAB_DEBUG
 static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
 {
 	int i;
@@ -2039,7 +2147,6 @@ static void __kmem_cache_destroy(struct 
 static size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
-	unsigned long offslab_limit;
 	size_t left_over = 0;
 	int gfporder;
 
@@ -2052,15 +2159,10 @@ static size_t calculate_slab_order(struc
 			continue;
 
 		if (flags & CFLGS_OFF_SLAB) {
-			/*
-			 * Max number of objs-per-slab for caches which
-			 * use off-slab slabs. Needed to avoid a possible
-			 * looping condition in cache_grow().
-			 */
-			offslab_limit = size - sizeof(struct slab);
-			offslab_limit /= sizeof(kmem_bufctl_t);
+			int slab_size;
 
- 			if (num > offslab_limit)
+			slab_size = slab_mgmt_size_noalign(num, flags);
+			if (slab_size > offslab_limit)
 				break;
 		}
 
@@ -2182,6 +2284,7 @@ kmem_cache_create (const char *name, siz
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
+	unsigned long irq_flags;
 	gfp_t gfp;
 
 	/*
@@ -2228,9 +2331,9 @@ kmem_cache_create (const char *name, siz
 		}
 	}
 
-#if DEBUG
+#if SLAB_DEBUG
 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
-#if FORCED_DEBUG
+#if SLAB_FORCED_DEBUG
 	/*
 	 * Enable redzoning and last user accounting, except for caches with
 	 * large objects, if the increased size would increase the object size
@@ -2316,11 +2419,12 @@ kmem_cache_create (const char *name, siz
 		gfp = GFP_NOWAIT;
 
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, gfp);
+	cachep = kmem_cache_zalloc(&cache_cache, gfp | __GFP_REPEAT);
 	if (!cachep)
 		goto oops;
 
-#if DEBUG
+	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
+#if SLAB_DEBUG
 	cachep->obj_size = size;
 
 	/*
@@ -2342,7 +2446,7 @@ kmem_cache_create (const char *name, siz
 		else
 			size += BYTES_PER_WORD;
 	}
-#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
+#if SLAB_FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
 	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
 		cachep->obj_offset += PAGE_SIZE - size;
@@ -2376,8 +2480,7 @@ kmem_cache_create (const char *name, siz
 		cachep = NULL;
 		goto oops;
 	}
-	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
-			  + sizeof(struct slab), align);
+	slab_size = slab_mgmt_size(cachep->num, align, flags);
 
 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
@@ -2390,8 +2493,7 @@ kmem_cache_create (const char *name, siz
 
 	if (flags & CFLGS_OFF_SLAB) {
 		/* really off slab. No need for manual alignment */
-		slab_size =
-		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
+		slab_size = slab_mgmt_size_noalign(cachep->num, flags);
 
 #ifdef CONFIG_PAGE_POISONING
 		/* If we're going to use the generic kernel_map_pages()
@@ -2437,7 +2539,10 @@ kmem_cache_create (const char *name, siz
 	}
 
 	/* cache setup completed, link it into the list */
+	spin_lock_irqsave(&cache_chain_lock, irq_flags);
 	list_add(&cachep->next, &cache_chain);
+	spin_unlock_irqrestore(&cache_chain_lock, irq_flags);
+	set_cache_objuse(cachep);
 oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
@@ -2450,7 +2555,7 @@ oops:
 }
 EXPORT_SYMBOL(kmem_cache_create);
 
-#if DEBUG
+#if SLAB_DEBUG
 static void check_irq_off(void)
 {
 	BUG_ON(!irqs_disabled());
@@ -2546,10 +2651,11 @@ static int drain_freelist(struct kmem_ca
 		}
 
 		slabp = list_entry(p, struct slab, list);
-#if DEBUG
+#if SLAB_DEBUG
 		BUG_ON(slabp->inuse);
 #endif
 		list_del(&slabp->list);
+		STATS_INC_SHRUNK(cache);
 		/*
 		 * Safe to drop the lock. The slab is no longer linked
 		 * to the cache.
@@ -2632,10 +2738,14 @@ void kmem_cache_destroy(struct kmem_cach
 	/*
 	 * the chain is never empty, cache_cache is never destroyed
 	 */
+	spin_lock_irq(&cache_chain_lock);
 	list_del(&cachep->next);
+	spin_unlock_irq(&cache_chain_lock);
 	if (__cache_shrink(cachep)) {
 		slab_error(cachep, "Can't free all objects");
+		spin_lock_irq(&cache_chain_lock);
 		list_add(&cachep->next, &cache_chain);
+		spin_unlock_irq(&cache_chain_lock);
 		mutex_unlock(&cache_chain_mutex);
 		put_online_cpus();
 		return;
@@ -2644,6 +2754,8 @@ void kmem_cache_destroy(struct kmem_cach
 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
 		rcu_barrier();
 
+	ub_kmemcache_free(cachep);
 	__kmem_cache_destroy(cachep);
 	mutex_unlock(&cache_chain_mutex);
 	put_online_cpus();
@@ -2670,15 +2782,15 @@ static struct slab *alloc_slabmgmt(struc
 	if (OFF_SLAB(cachep)) {
 		/* Slab management obj is off-slab. */
 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
-					      local_flags, nodeid);
+					(local_flags & ~__GFP_UBC), nodeid);
 		/*
 		 * If the first object in the slab is leaked (it's allocated
 		 * but no one has a reference to it), we want to make sure
 		 * kmemleak does not treat the ->s_mem pointer as a reference
 		 * to the object. Otherwise we will not report the leak.
 		 */
-		kmemleak_scan_area(slabp, offsetof(struct slab, list),
-				   sizeof(struct list_head), local_flags);
+		kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
+				   local_flags);
 		if (!slabp)
 			return NULL;
 	} else {
@@ -2690,14 +2802,10 @@ static struct slab *alloc_slabmgmt(struc
 	slabp->s_mem = objp + colour_off;
 	slabp->nodeid = nodeid;
 	slabp->free = 0;
+	init_slab_ubps(cachep, slabp);
 	return slabp;
 }
 
-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
-{
-	return (kmem_bufctl_t *) (slabp + 1);
-}
-
 static void cache_init_objs(struct kmem_cache *cachep,
 			    struct slab *slabp)
 {
@@ -2705,7 +2813,7 @@ static void cache_init_objs(struct kmem_
 
 	for (i = 0; i < cachep->num; i++) {
 		void *objp = index_to_obj(cachep, slabp, i);
-#if DEBUG
+#if SLAB_DEBUG
 		/* need to poison the objs? */
 		if (cachep->flags & SLAB_POISON)
 			poison_obj(cachep, objp, POISON_FREE);
@@ -2763,7 +2871,7 @@ static void *slab_get_obj(struct kmem_ca
 
 	slabp->inuse++;
 	next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
+#if SLAB_DEBUG
 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
 	WARN_ON(slabp->nodeid != nodeid);
 #endif
@@ -2777,7 +2885,7 @@ static void slab_put_obj(struct kmem_cac
 {
 	unsigned int objnr = obj_to_index(cachep, slabp, objp);
 
-#if DEBUG
+#if SLAB_DEBUG
 	/* Verify that the slab belongs to the intended node */
 	WARN_ON(slabp->nodeid != nodeid);
 
@@ -2865,7 +2973,7 @@ static int cache_grow(struct kmem_cache 
 	 * 'nodeid'.
 	 */
 	if (!objp)
-		objp = kmem_getpages(cachep, local_flags, nodeid);
+		objp = kmem_getpages(cachep, local_flags & ~__GFP_UBC, nodeid);
 	if (!objp)
 		goto failed;
 
@@ -2884,10 +2992,22 @@ static int cache_grow(struct kmem_cache 
 	check_irq_off();
 	spin_lock(&l3->list_lock);
 
+	/*
+	 * The cache could have been grown by another process while we were
+	 * allocating the slab. Bail out if there are too many free objects.
+	 */
+	if (unlikely(l3->free_objects > l3->free_limit)) {
+		kmem_freepages(cachep, objp);
+		if (OFF_SLAB(cachep))
+			kmem_cache_free(cachep->slabp_cache, slabp);
+		goto out;
+	}
+
 	/* Make slab active. */
 	list_add_tail(&slabp->list, &(l3->slabs_free));
 	STATS_INC_GROWN(cachep);
 	l3->free_objects += cachep->num;
+out:
 	spin_unlock(&l3->list_lock);
 	return 1;
 opps1:
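The cache_grow() hunk above adds a recheck after l3->list_lock is retaken: the page allocation happened unlocked, so a concurrent grower may already have pushed free_objects past free_limit, in which case the freshly built slab is released instead of linked in. A userspace model of that recheck-under-lock pattern (compile with -pthread; counters stand in for real slabs):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static long free_objects, discarded;
static const long free_limit = 64;

static void cache_grow(long objs_per_slab)
{
	/* the page allocation would happen here, list_lock not held */
	pthread_mutex_lock(&list_lock);
	if (free_objects > free_limit)
		discarded++;		/* kmem_freepages() in the patch */
	else
		free_objects += objs_per_slab;	/* slab goes on slabs_free */
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		cache_grow(16);
	printf("free_objects=%ld discarded=%ld\n", free_objects, discarded);
	return 0;
}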
@@ -2898,7 +3018,7 @@ failed:
 	return 0;
 }
 
-#if DEBUG
+#if SLAB_DEBUG
 
 /*
  * Perform extra freeing checks:
@@ -3111,12 +3231,12 @@ static inline void cache_alloc_debugchec
 						gfp_t flags)
 {
 	might_sleep_if(flags & __GFP_WAIT);
-#if DEBUG
+#if SLAB_DEBUG
 	kmem_flagcheck(cachep, flags);
 #endif
 }
 
-#if DEBUG
+#if SLAB_DEBUG
 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 				gfp_t flags, void *objp, void *caller)
 {
@@ -3403,6 +3523,7 @@ __cache_alloc_node(struct kmem_cache *ca
 	flags &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(flags);
+	WARN_ON((flags & __GFP_FS) && current->journal_info);
 
 	if (slab_should_failslab(cachep, flags))
 		return NULL;
@@ -3500,6 +3621,11 @@ __cache_alloc(struct kmem_cache *cachep,
 	kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
 				 flags);
 	prefetchw(objp);
+	if (objp && should_charge(cachep->flags, flags) &&
+			ub_slab_charge(cachep, objp, flags)) {
+		kmem_cache_free(cachep, objp);
+		objp = NULL;
+	}
 
 	if (likely(objp))
 		kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
@@ -3536,6 +3662,7 @@ static void free_block(struct kmem_cache
 		/* fixup slab chains */
 		if (slabp->inuse == 0) {
 			if (l3->free_objects > l3->free_limit) {
+				STATS_INC_SHRUNK(cachep);
 				l3->free_objects -= cachep->num;
 				/* No need to drop any previously held
 				 * lock here, even if we have a off-slab slab
@@ -3564,7 +3691,7 @@ static void cache_flusharray(struct kmem
 	int node = numa_node_id();
 
 	batchcount = ac->batchcount;
-#if DEBUG
+#if SLAB_DEBUG
 	BUG_ON(!batchcount || batchcount > ac->avail);
 #endif
 	check_irq_off();
@@ -3585,7 +3712,7 @@ static void cache_flusharray(struct kmem
 
 	free_block(cachep, ac->entry, batchcount, node);
 free_done:
-#if STATS
+#if SLAB_STATS
 	{
 		int i = 0;
 		struct list_head *p;
@@ -3622,6 +3749,9 @@ static inline void __cache_free(struct k
 
 	kmemcheck_slab_free(cachep, objp, obj_size(cachep));
 
+	if (should_uncharge(cachep->flags))
+		ub_slab_uncharge(cachep, objp);
+
 	/*
 	 * Skip calling cache_free_alien() when the platform is not numa.
 	 * This will avoid cache misses that happen while accessing slabp (which
@@ -4002,7 +4132,7 @@ fail:
 
 struct ccupdate_struct {
 	struct kmem_cache *cachep;
-	struct array_cache *new[NR_CPUS];
+	struct array_cache *new[0];
 };
 
 static void do_ccupdate_local(void *info)
@@ -4024,7 +4154,8 @@ static int do_tune_cpucache(struct kmem_
 	struct ccupdate_struct *new;
 	int i;
 
-	new = kzalloc(sizeof(*new), gfp);
+	new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
+		      gfp);
 	if (!new)
 		return -ENOMEM;
 
@@ -4099,7 +4230,7 @@ static int enable_cpucache(struct kmem_c
 	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
 		shared = 8;
 
-#if DEBUG
+#if SLAB_DEBUG
 	/*
 	 * With debugging enabled, large batchcount lead to excessively long
 	 * periods with disabled local interrupts. Limit the batchcount
@@ -4166,6 +4297,7 @@ static void cache_reap(struct work_struc
 		/* Give up. Setup the next iteration. */
 		goto out;
 
+	{KSTAT_PERF_ENTER(cache_reap)
 	list_for_each_entry(searchp, &cache_chain, next) {
 		check_irq_on();
 
@@ -4206,6 +4338,7 @@ next:
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
 	next_reap_node();
+	KSTAT_PERF_LEAVE(cache_reap)}
 out:
 	/* Set up the next iteration */
 	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
@@ -4219,7 +4352,7 @@ static void print_slabinfo_header(struct
 	 * Output format version, so at least we can change it
 	 * without _too_ many complaints.
 	 */
-#if STATS
+#if SLAB_STATS
 	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
 #else
 	seq_puts(m, "slabinfo - version: 2.1\n");
@@ -4228,14 +4361,83 @@ static void print_slabinfo_header(struct
 		 "<objperslab> <pagesperslab>");
 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
-#if STATS
+#if SLAB_STATS
 	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
-		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow> <shrunk>");
 	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
 #endif
 	seq_putc(m, '\n');
 }
 
+#define SHOW_TOP_SLABS	10
+
+static unsigned long get_cache_size(struct kmem_cache *cachep)
+{
+	unsigned long flags;
+	unsigned long slabs;
+	struct kmem_list3 *l3;
+	struct list_head *lh;
+	int node;
+
+	slabs = 0;
+
+	for_each_online_node (node) {
+		l3 = cachep->nodelists[node];
+		if (l3 == NULL)
+			continue;
+
+		spin_lock_irqsave(&l3->list_lock, flags);
+		list_for_each (lh, &l3->slabs_full)
+			slabs++;
+		list_for_each (lh, &l3->slabs_partial)
+			slabs++;
+		list_for_each (lh, &l3->slabs_free)
+			slabs++;
+		spin_unlock_irqrestore(&l3->list_lock, flags);
+	}
+
+	return slabs * (PAGE_SIZE << cachep->gfporder) +
+		(OFF_SLAB(cachep) ?
+		 cachep->slabp_cache->buffer_size * slabs : 0);
+}
+
+void show_slab_info(void)
+{
+	int i, j;
+	unsigned long size;
+	struct kmem_cache *ptr;
+	unsigned long sizes[SHOW_TOP_SLABS];
+	struct kmem_cache *top[SHOW_TOP_SLABS];
+	unsigned long flags;
+
+	memset(top, 0, sizeof(top));
+	memset(sizes, 0, sizeof(sizes));
+
+	printk("Top %d caches:\n", SHOW_TOP_SLABS);
+
+	spin_lock_irqsave(&cache_chain_lock, flags);
+	list_for_each_entry (ptr, &cache_chain, next) {
+		size = get_cache_size(ptr);
+
+		j = 0;
+		for (i = 1; i < SHOW_TOP_SLABS; i++)
+			if (sizes[i] < sizes[j])
+				j = i;
+
+		if (size > sizes[j]) {
+			sizes[j] = size;
+			top[j] = ptr;
+		}
+	}
+
+	for (i = 0; i < SHOW_TOP_SLABS; i++)
+		if (top[i])
+			printk("%-21s: size %10lu objsize %10u\n",
+					top[i]->name, sizes[i],
+					top[i]->buffer_size);
+	spin_unlock_irqrestore(&cache_chain_lock, flags);
+}
+
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
 	loff_t n = *pos;
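show_slab_info() above selects the ten biggest caches with a replace-the-minimum scan: for each cache it finds the smallest entry in a fixed-size array and displaces it if the new size is larger. That is O(n * SHOW_TOP_SLABS) with no allocation, which is why it is safe under the cache_chain_lock spinlock. The same selection in miniature:

#include <stdio.h>

#define TOP 3

int main(void)
{
	unsigned long input[] = { 5, 40, 12, 99, 7, 63, 1 };
	unsigned long top[TOP] = { 0 };
	int n = sizeof(input) / sizeof(input[0]);

	for (int k = 0; k < n; k++) {
		int j = 0;

		for (int i = 1; i < TOP; i++)	/* find current minimum */
			if (top[i] < top[j])
				j = i;
		if (input[k] > top[j])		/* displace it if larger */
			top[j] = input[k];
	}
	for (int i = 0; i < TOP; i++)
		printf("%lu ", top[i]);		/* the three largest, unordered */
	printf("\n");
	return 0;
}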
@@ -4314,19 +4516,20 @@ static int s_show(struct seq_file *m, vo
 	if (error)
 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
 
-	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+	seq_printf(m, "%-21s %6lu %6lu %6u %4u %4d",
 		   name, active_objs, num_objs, cachep->buffer_size,
 		   cachep->num, (1 << cachep->gfporder));
 	seq_printf(m, " : tunables %4u %4u %4u",
 		   cachep->limit, cachep->batchcount, cachep->shared);
 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
 		   active_slabs, num_slabs, shared_avail);
-#if STATS
+#if SLAB_STATS
 	{			/* list3 stats */
 		unsigned long high = cachep->high_mark;
 		unsigned long allocs = cachep->num_allocations;
 		unsigned long grown = cachep->grown;
 		unsigned long reaped = cachep->reaped;
+		unsigned long shrunk = cachep->shrunk;
 		unsigned long errors = cachep->errors;
 		unsigned long max_freeable = cachep->max_freeable;
 		unsigned long node_allocs = cachep->node_allocs;
@@ -4334,9 +4537,10 @@ static int s_show(struct seq_file *m, vo
 		unsigned long overflows = cachep->node_overflow;
 
 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-				%4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
+				%4lu %4lu %4lu %4lu %4lu %4lu",
+				allocs, high, grown,
 				reaped, errors, max_freeable, node_allocs,
-				node_frees, overflows);
+				node_frees, overflows, shrunk);
 	}
 	/* cpu stats */
 	{
@@ -4374,6 +4578,61 @@ static const struct seq_operations slabi
 	.show = s_show,
 };
 
+/* Show objects belonging to each beancounter */
+static int check_ubcs_on_slab(struct kmem_cache *c, struct slab *s,
+		struct user_beancounter *ub)
+{
+	int i, sum = 0;
+	struct user_beancounter **ubcs;
+
+	ubcs = slab_ubcs(c, s);
+	for (i = 0; i < c->num; i++)
+		if (ubcs[i] == ub)
+			sum++;
+
+	return sum;
+}
+
+static int check_ubcs_on_cache(struct kmem_cache *c,
+		struct user_beancounter *ub)
+{
+	int node, sum = 0;
+	struct kmem_list3 *l3;
+	unsigned long flags;
+	struct slab *slab;
+
+	for_each_online_node(node) {
+		l3 = c->nodelists[node];
+		if (l3 == NULL)
+			continue;
+
+		spin_lock_irqsave(&l3->list_lock, flags);
+		list_for_each_entry(slab, &l3->slabs_full, list)
+			sum += check_ubcs_on_slab(c, slab, ub);
+		list_for_each_entry(slab, &l3->slabs_partial, list)
+			sum += check_ubcs_on_slab(c, slab, ub);
+		spin_unlock_irqrestore(&l3->list_lock, flags);
+	}
+
+	return sum;
+}
+
+void slab_walk_ub(struct user_beancounter *ub,
+		void (*show)(const char *name, int count, void *v), void *v)
+{
+	struct kmem_cache *c;
+	int cnt;
+
+	mutex_lock(&cache_chain_mutex);
+	list_for_each_entry(c, &cache_chain, next) {
+		if (c->flags & SLAB_UBC) {
+			cnt = check_ubcs_on_cache(c, ub);
+			show(c->name, cnt, v);
+		}
+	}
+	mutex_unlock(&cache_chain_mutex);
+}
+
 #define MAX_SLABINFO_WRITE 128
 /**
  * slabinfo_write - Tuning for the slab allocator
@@ -4630,3 +4889,33 @@ size_t ksize(const void *objp)
 	return obj_size(virt_to_cache(objp));
 }
 EXPORT_SYMBOL(ksize);
+
+static void __slab_obj_walk(struct kmem_cache *c, struct slab *s, void (*f)(void *))
+{
+	int i;
+
+	for (i = 0; i < c->num; i++)
+		f(index_to_obj(c, s, i));
+}
+
+void slab_obj_walk(struct kmem_cache *c, void (*f)(void *))
+{
+	int node;
+	struct kmem_list3 *l3;
+	unsigned long flags;
+	struct slab *slab;
+
+	for_each_online_node(node) {
+		l3 = c->nodelists[node];
+		if (l3 == NULL)
+			continue;
+
+		spin_lock_irqsave(&l3->list_lock, flags);
+		list_for_each_entry(slab, &l3->slabs_full, list)
+			__slab_obj_walk(c, slab, f);
+		list_for_each_entry(slab, &l3->slabs_partial, list)
+			__slab_obj_walk(c, slab, f);
+		spin_unlock_irqrestore(&l3->list_lock, flags);
+	}
+}
+EXPORT_SYMBOL(slab_obj_walk);
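slab_obj_walk() visits every object slot on a cache's full and partial slabs while holding each node's list_lock with interrupts disabled, so the callback must not sleep and should be cheap. A simplified userspace model of the traversal shape (struct slab here is a stand-in, and the locking is reduced to comments):

#include <stdio.h>

struct slab { int nr_objs; int objs[4]; };

static void walk(struct slab *slabs, int n, void (*f)(int))
{
	/* lock(list_lock) -- the callback runs with the lock held */
	for (int s = 0; s < n; s++)
		for (int i = 0; i < slabs[s].nr_objs; i++)
			f(slabs[s].objs[i]);
	/* unlock(list_lock) */
}

static void show(int obj)
{
	printf("obj %d\n", obj);
}

int main(void)
{
	struct slab slabs[2] = {
		{ 4, { 1, 2, 3, 4 } },		/* a "full" slab */
		{ 2, { 5, 6 } },		/* a "partial" slab */
	};

	walk(slabs, 2, show);
	return 0;
}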
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/slub.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/slub.c
--- linux-2.6.32-504.3.3.el6.orig/mm/slub.c	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/slub.c	2015-01-21 12:02:58.903810136 +0300
@@ -29,6 +29,8 @@
 #include <linux/math64.h>
 #include <linux/fault-inject.h>
 
+#include <bc/kmem.h>
+
 /*
  * Lock order:
  *   1. slab_lock(page)
@@ -149,9 +151,11 @@
 
 /*
  * Set of flags that will prevent slab merging
+ *
+ * FIXME: work out how to allow merging of accountable slub caches
  */
 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
-		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
+		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | SLAB_UBC)
 
 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 		SLAB_CACHE_DMA | SLAB_NOTRACK)
@@ -201,6 +205,8 @@ struct track {
 
 enum track_item { TRACK_ALLOC, TRACK_FREE };
 
+static DEFINE_SPINLOCK(cache_chain_lock);
+
 #ifdef CONFIG_SLUB_DEBUG
 static int sysfs_slab_add(struct kmem_cache *);
 static int sysfs_slab_alias(struct kmem_cache *, const char *);
@@ -321,6 +327,90 @@ static inline int oo_objects(struct kmem
 	return x.x & OO_MASK;
 }
 
+#ifdef CONFIG_BEANCOUNTERS
+static inline void inc_cache_grown(struct kmem_cache *s)
+{
+	atomic_inc(&s->grown);
+}
+
+static inline void dec_cache_grown(struct kmem_cache *s)
+{
+	atomic_dec(&s->grown);
+}
+
+unsigned long ub_cache_growth(struct kmem_cache *cachep)
+{
+	return atomic_read(&cachep->grown) << oo_order(cachep->oo);
+}
+
+static void __flush_cpu_slab(struct kmem_cache *s, int cpu);
+
+int kmem_cache_objuse(struct kmem_cache *cachep)
+{
+	return cachep->objuse;
+}
+
+EXPORT_SYMBOL(kmem_cache_objuse);
+
+int kmem_obj_objuse(void *obj)
+{
+	return kmem_cache_objuse(virt_to_head_page(obj)->slab);
+}
+
+EXPORT_SYMBOL(kmem_obj_objuse);
+
+int kmem_dname_objuse(void *obj)
+{
+	struct kmem_cache *s;
+
+	/*
+	 * Allocations larger than PAGE_SIZE/2 go directly through
+	 * __get_free_pages() and aren't associated with any cache.
+	 */
+	s = virt_to_head_page(obj)->slab;
+	if (!s)
+		return PAGE_SIZE;
+	return kmem_cache_objuse(s);
+}
+
+#define page_ubs(pg)	((pg)->slub_ubs)
+
+struct user_beancounter **ub_slab_ptr(struct kmem_cache *s, void *obj)
+{
+	struct page *pg;
+
+	BUG_ON(!(s->flags & SLAB_UBC));
+	pg = virt_to_head_page(obj);
+	return page_ubs(pg) + slab_index(obj, s, page_address(pg));
+}
+
+EXPORT_SYMBOL(ub_slab_ptr);
+
+struct user_beancounter *slab_ub(void *obj)
+{
+	struct page *pg;
+
+	pg = virt_to_head_page(obj);
+	BUG_ON(!(pg->slab->flags & SLAB_UBC));
+	return page_ubs(pg)[slab_index(obj, pg->slab, page_address(pg))];
+}
+
+EXPORT_SYMBOL(slab_ub);
+
+void kmem_mark_nocharge(struct kmem_cache *cachep)
+{
+	cachep->flags |= SLAB_NO_CHARGE;
+}
+#else
+static inline void inc_cache_grown(struct kmem_cache *s)
+{
+}
+
+static inline void dec_cache_grown(struct kmem_cache *s)
+{
+}
+#endif
+
 #ifdef CONFIG_SLUB_DEBUG
 /*
  * Debug settings:
@@ -1105,6 +1195,7 @@ static struct page *allocate_slab(struct
 	struct kmem_cache_order_objects oo = s->oo;
 	gfp_t alloc_gfp;
 
+	flags &= ~__GFP_UBC;
 	flags |= s->allocflags;
 
 	/*
@@ -1149,9 +1240,12 @@ static struct page *allocate_slab(struct
 		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 		1 << oo_order(oo));
 
+	inc_cache_grown(s);
 	return page;
 }
 
+static void __free_slab(struct kmem_cache *s, struct page *page);
+
 static void setup_object(struct kmem_cache *s, struct page *page,
 				void *object)
 {
@@ -1174,6 +1268,18 @@ static struct page *new_slab(struct kmem
 	if (!page)
 		goto out;
 
+#ifdef CONFIG_BEANCOUNTERS
+	if (s->flags & SLAB_UBC) {
+		BUG_ON(page_ubs(page) != NULL);
+		page_ubs(page) = kzalloc(page->objects * sizeof(void *),
+				flags & ~__GFP_UBC);
+		if (page_ubs(page) == NULL) {
+			__free_slab(s, page);
+			page = NULL;
+			goto out;
+		}
+	}
+#endif
 	inc_slabs_node(s, page_to_nid(page), page->objects);
 	page->slab = s;
 	page->flags |= 1 << PG_slab;
@@ -1225,6 +1331,13 @@ static void __free_slab(struct kmem_cach
 
 	__ClearPageSlab(page);
 	reset_page_mapcount(page);
+#ifdef CONFIG_BEANCOUNTERS
+	if (page_ubs(page) != NULL) {
+		BUG_ON(!(s->flags & SLAB_UBC));
+		kfree(page_ubs(page));
+		page_ubs(page) = NULL;
+	}
+#endif
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += pages;
 	__free_pages(page, order);
@@ -1249,6 +1362,8 @@ static void free_slab(struct kmem_cache 
 		call_rcu(head, rcu_free_slab);
 	} else
 		__free_slab(s, page);
+
+	dec_cache_grown(s);
 }
 
 static void discard_slab(struct kmem_cache *s, struct page *page)
@@ -1721,6 +1836,7 @@ static __always_inline void *slab_alloc(
 
 	lockdep_trace_alloc(gfpflags);
 	might_sleep_if(gfpflags & __GFP_WAIT);
+	WARN_ON((gfpflags & __GFP_FS) && current->journal_info);
 
 	if (should_failslab(s->objsize, gfpflags))
 		return NULL;
@@ -1737,8 +1853,15 @@ static __always_inline void *slab_alloc(
 		c->freelist = object[c->offset];
 		stat(c, ALLOC_FASTPATH);
 	}
+
 	local_irq_restore(flags);
 
+	if (object && should_charge(s->flags, gfpflags) &&
+			ub_slab_charge(s, object, gfpflags)) {
+		kmem_cache_free(s, object);
+		object = NULL;
+	}
+
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
 		memset(object, 0, objsize);
 
@@ -1879,6 +2002,9 @@ static __always_inline void slab_free(st
 	c = get_cpu_slab(s, smp_processor_id());
 	kmemcheck_slab_free(s, object, c->objsize);
 	debug_check_no_locks_freed(object, c->objsize);
+
+	if (should_uncharge(s->flags))
+		ub_slab_uncharge(s, x);
 	if (!(s->flags & SLAB_DEBUG_OBJECTS))
 		debug_check_no_obj_freed(object, c->objsize);
 	if (likely(page == c->page && c->node >= 0)) {
@@ -2501,6 +2627,9 @@ static int kmem_cache_open(struct kmem_c
 #ifdef CONFIG_NUMA
 	s->remote_node_defrag_ratio = 1000;
 #endif
+#ifdef CONFIG_BEANCOUNTERS
+	s->objuse = s->size + (sizeof(struct page) / oo_objects(s->oo));
+#endif
 	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
 		goto error;
 
@@ -2634,9 +2763,11 @@ static inline int kmem_cache_close(struc
 void kmem_cache_destroy(struct kmem_cache *s)
 {
 	down_write(&slub_lock);
+	spin_lock_irq(&cache_chain_lock);
 	s->refcount--;
 	if (!s->refcount) {
 		list_del(&s->list);
+		spin_unlock_irq(&cache_chain_lock);
 		up_write(&slub_lock);
 		if (kmem_cache_close(s)) {
 			printk(KERN_ERR "SLUB %s: %s called for cache that "
@@ -2646,8 +2777,10 @@ void kmem_cache_destroy(struct kmem_cach
 		if (s->flags & SLAB_DESTROY_BY_RCU)
 			rcu_barrier();
 		sysfs_slab_remove(s);
-	} else
+	} else {
+		spin_unlock_irq(&cache_chain_lock);
 		up_write(&slub_lock);
+	}
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
@@ -2657,6 +2790,10 @@ EXPORT_SYMBOL(kmem_cache_destroy);
 
 struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);
+#ifdef CONFIG_BEANCOUNTERS
+struct kmem_cache ub_kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
+EXPORT_SYMBOL(ub_kmalloc_caches);
+#endif
 
 static int __init setup_slub_min_order(char *str)
 {
@@ -2698,6 +2835,12 @@ static struct kmem_cache *create_kmalloc
 		const char *name, int size, gfp_t gfp_flags)
 {
 	unsigned int flags = 0;
+	unsigned long irq_flags;
+
+	if (gfp_flags & __GFP_UBC) {
+		flags = SLAB_UBC | SLAB_NO_CHARGE;
+		gfp_flags &= ~__GFP_UBC;
+	}
 
 	if (gfp_flags & SLUB_DMA)
 		flags = SLAB_CACHE_DMA;
@@ -2710,7 +2853,9 @@ static struct kmem_cache *create_kmalloc
 								flags, NULL))
 		goto panic;
 
+	spin_lock_irqsave(&cache_chain_lock, irq_flags);
 	list_add(&s->list, &slab_caches);
+	spin_unlock_irqrestore(&cache_chain_lock, irq_flags);
 
 	if (sysfs_slab_add(s))
 		goto panic;
@@ -2745,6 +2890,7 @@ static noinline struct kmem_cache *dma_k
 	char *text;
 	size_t realsize;
 	unsigned long slabflags;
+	unsigned long irq_flags;
 
 	s = kmalloc_caches_dma[index];
 	if (s)
@@ -2783,7 +2929,9 @@ static noinline struct kmem_cache *dma_k
 		goto unlock_out;
 	}
 
+	spin_lock_irqsave(&cache_chain_lock, irq_flags);
 	list_add(&s->list, &slab_caches);
+	spin_unlock_irqrestore(&cache_chain_lock, irq_flags);
 	kmalloc_caches_dma[index] = s;
 
 	if (slab_state >= SYSFS)
@@ -2847,11 +2995,14 @@ static struct kmem_cache *get_slab(size_
 		index = fls(size - 1);
 
 #ifdef CONFIG_ZONE_DMA
-	if (unlikely((flags & SLUB_DMA)))
+	if (unlikely((flags & SLUB_DMA))) {
+		BUG_ON(flags & __GFP_UBC);
 		return dma_kmalloc_cache(index, flags);
+	}
 
 #endif
-	return &kmalloc_caches[index];
+
+	return __kmalloc_cache(flags, index);
 }
 
 void *__kmalloc(size_t size, gfp_t flags)
@@ -3191,6 +3342,11 @@ void __init kmem_cache_init(void)
 	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
 		sizeof(struct kmem_cache_node), GFP_NOWAIT);
 	kmalloc_caches[0].refcount = -1;
+#ifdef CONFIG_BEANCOUNTERS
+	create_kmalloc_cache(&ub_kmalloc_caches[0], "kmem_cache_node_ubc",
+		sizeof(struct kmem_cache_node), GFP_NOWAIT | __GFP_UBC);
+	ub_kmalloc_caches[0].refcount = -1;
+#endif
 	caches++;
 
 	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
@@ -3203,17 +3359,29 @@ void __init kmem_cache_init(void)
 	if (KMALLOC_MIN_SIZE <= 32) {
 		create_kmalloc_cache(&kmalloc_caches[1],
 				"kmalloc-96", 96, GFP_NOWAIT);
+#ifdef CONFIG_BEANCOUNTERS
+		create_kmalloc_cache(&ub_kmalloc_caches[1],
+				"kmalloc-96-ubc", 96, GFP_NOWAIT | __GFP_UBC);
+#endif
 		caches++;
 	}
 	if (KMALLOC_MIN_SIZE <= 64) {
 		create_kmalloc_cache(&kmalloc_caches[2],
 				"kmalloc-192", 192, GFP_NOWAIT);
+#ifdef CONFIG_BEANCOUNTERS
+		create_kmalloc_cache(&ub_kmalloc_caches[2],
+				"kmalloc-192-ubc", 192, GFP_NOWAIT | __GFP_UBC);
+#endif
 		caches++;
 	}
 
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
 			"kmalloc", 1 << i, GFP_NOWAIT);
+#ifdef CONFIG_BEANCOUNTERS
+		create_kmalloc_cache(&ub_kmalloc_caches[i],
+			"kmalloc-ubc", 1 << i, GFP_NOWAIT | __GFP_UBC);
+#endif
 		caches++;
 	}
 
@@ -3259,9 +3427,14 @@ void __init kmem_cache_init(void)
 	slab_state = UP;
 
 	/* Provide the correct kmalloc names now that the caches are up */
-	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
+	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
 		kmalloc_caches[i].name =
 			kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+#ifdef CONFIG_BEANCOUNTERS
+		ub_kmalloc_caches[i].name =
+			kasprintf(GFP_NOWAIT | __GFP_UBC, "kmalloc-%d-ubc", 1 << i);
+#endif
+	}
 
 #ifdef CONFIG_SMP
 	register_cpu_notifier(&slab_notifier);
@@ -3387,11 +3560,15 @@ struct kmem_cache *kmem_cache_create(con
 	if (s) {
 		if (kmem_cache_open(s, GFP_KERNEL, name,
 				size, align, flags, ctor)) {
+			spin_lock_irq(&cache_chain_lock);
 			list_add(&s->list, &slab_caches);
+			spin_unlock_irq(&cache_chain_lock);
 			up_write(&slub_lock);
 			if (sysfs_slab_add(s)) {
 				down_write(&slub_lock);
+				spin_lock_irq(&cache_chain_lock);
 				list_del(&s->list);
+				spin_unlock_irq(&cache_chain_lock);
 				up_write(&slub_lock);
 				kfree(s);
 				goto err;
@@ -4559,6 +4736,8 @@ static char *create_unique_id(struct kme
 		*p++ = 'a';
 	if (s->flags & SLAB_DEBUG_FREE)
 		*p++ = 'F';
+	if (s->flags & SLAB_UBC)
+		*p++ = 'b';
 	if (!(s->flags & SLAB_NOTRACK))
 		*p++ = 't';
 	if (p != name + 1)
@@ -4711,6 +4890,77 @@ static void print_slabinfo_header(struct
 	seq_putc(m, '\n');
 }
 
+#define SHOW_TOP_SLABS	10
+
+static unsigned long get_cache_size(struct kmem_cache *cache)
+{
+	unsigned long flags;
+	unsigned long slabs;
+	struct kmem_cache_node *n;
+	struct list_head *lh;
+	int cpu, node;
+
+	slabs = 0;
+
+	for_each_online_cpu(cpu)
+		slabs++;
+
+	for_each_online_node(node) {
+		n = get_node(cache, node);
+		if (!n)
+			continue;
+		spin_lock_irqsave(&n->list_lock, flags);
+#ifdef CONFIG_SLUB_DEBUG
+		list_for_each(lh, &n->full)
+			slabs++;
+#endif
+		list_for_each(lh, &n->partial)
+			slabs++;
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	return slabs * (PAGE_SIZE << oo_order(cache->oo));
+}
+
+void show_slab_info(void)
+{
+	int i, j;
+	unsigned long size;
+	struct kmem_cache *ptr;
+	unsigned long sizes[SHOW_TOP_SLABS];
+	struct kmem_cache *top[SHOW_TOP_SLABS];
+	unsigned long flags;
+
+	memset(top, 0, sizeof(top));
+	memset(sizes, 0, sizeof(sizes));
+
+	printk("Top %d caches:\n", SHOW_TOP_SLABS);
+
+	spin_lock_irqsave(&cache_chain_lock, flags);
+	list_for_each_entry(ptr, &slab_caches, list) {
+		size = get_cache_size(ptr);
+
+		j = 0;
+		for (i = 1; i < SHOW_TOP_SLABS; i++) {
+			if (sizes[i] < sizes[j])
+				j = i;
+		}
+		if (size > sizes[j]) {
+			sizes[j] = size;
+			top[j] = ptr;
+		}
+	}
+
+	for (i = 0; i < SHOW_TOP_SLABS; i++) {
+		if (top[i])
+			printk("%-21s: size %10lu objsize %10u\n",
+				top[i]->name, sizes[i],
+				top[i]->size);
+	}
+
+	spin_unlock_irqrestore(&cache_chain_lock, flags);
+}
+
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
 	loff_t n = *pos;
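For SLUB the beancounter pointers cannot live in a slab management area (SLUB has none), so the patch hangs a separately kzalloc'ed array off struct page — page->slub_ubs, allocated in new_slab() and freed in __free_slab() — and indexes it by object number, mirroring ub_slab_ptr() and slab_index(). A userspace model of that side-array indexing (types and sizes are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct ub { const char *name; };

int main(void)
{
	size_t obj_size = 64, nr_objs = 8;
	char *base = malloc(obj_size * nr_objs);	  /* the slab page */
	struct ub **ubs = calloc(nr_objs, sizeof(*ubs)); /* page_ubs() */
	struct ub container = { "ct101" };

	char *obj = base + 3 * obj_size;		/* some object */
	size_t idx = (size_t)(obj - base) / obj_size;	/* slab_index() */

	ubs[idx] = &container;				/* charge */
	printf("obj %zu charged to %s\n", idx, ubs[idx]->name);
	ubs[idx] = NULL;				/* uncharge */

	free(ubs);
	free(base);
	return 0;
}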
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/swap.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/swap.c
--- linux-2.6.32-504.3.3.el6.orig/mm/swap.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/swap.c	2015-01-21 12:02:58.932809366 +0300
@@ -14,6 +14,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/mmgang.h>
 #include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -23,13 +24,14 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm_inline.h>
-#include <linux/buffer_head.h>	/* for try_to_release_page() */
 #include <linux/percpu_counter.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/backing-dev.h>
 #include <linux/memcontrol.h>
+#include <linux/rmap.h>
 
 #include "internal.h"
 
@@ -108,13 +110,15 @@ static void __page_cache_release(struct 
 {
 	if (PageLRU(page)) {
 		unsigned long flags;
-		struct zone *zone = page_zone(page);
+		struct lruvec *lruvec = page_lruvec(page);
 
-		spin_lock_irqsave(&zone->lru_lock, flags);
+		spin_lock_irqsave(&lruvec->lru_lock, flags);
 		VM_BUG_ON(!PageLRU(page));
+		VM_BUG_ON(__page_lruvec(page) != lruvec);
 		__ClearPageLRU(page);
-		del_page_from_lru(zone, page);
-		spin_unlock_irqrestore(&zone->lru_lock, flags);
+		del_page_from_lru(lruvec, page);
+		gang_del_user_page(page);
+		spin_unlock_irqrestore(&lruvec->lru_lock, flags);
 	}
 }
 
@@ -269,30 +273,20 @@ static void pagevec_move_tail(struct pag
 {
 	int i;
 	int pgmoved = 0;
-	struct zone *zone = NULL;
+	struct lruvec *lruvec = NULL;
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
-		struct zone *pagezone = page_zone(page);
 
-		if (pagezone != zone) {
-			if (zone)
-				spin_unlock(&zone->lru_lock);
-			zone = pagezone;
-			spin_lock(&zone->lru_lock);
-		}
+		lruvec = relock_page_lru(lruvec, page);
 		if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 			enum lru_list lru = page_lru_base_type(page);
-			struct lruvec *lruvec;
 
-			lruvec = mem_cgroup_lru_move_lists(page_zone(page),
-							   page, lru, lru);
-			list_move_tail(&page->lru, &lruvec->lists[lru]);
+			list_move_tail(&page->lru, &lruvec->lru_list[lru]);
 			pgmoved++;
 		}
 	}
-	if (zone)
-		spin_unlock(&zone->lru_lock);
+	unlock_lruvec(lruvec);
 	__count_vm_events(PGROTATED, pgmoved);
 	release_pages(pvec->pages, pvec->nr, pvec->cold);
 	pagevec_reinit(pvec);
@@ -319,24 +313,13 @@ void  rotate_reclaimable_page(struct pag
 	}
 }
 
-static void update_page_reclaim_stat(struct zone *zone, struct page *page,
-				     int file, int rotated)
+static void update_page_reclaim_stat(struct lruvec *lruvec, enum lru_list lru)
 {
-	struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
-	struct zone_reclaim_stat *memcg_reclaim_stat;
-
-	memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
-
-	reclaim_stat->recent_scanned[file]++;
-	if (rotated)
-		reclaim_stat->recent_rotated[file]++;
-
-	if (!memcg_reclaim_stat)
-		return;
+	int file = is_file_lru(lru);
 
-	memcg_reclaim_stat->recent_scanned[file]++;
-	if (rotated)
-		memcg_reclaim_stat->recent_rotated[file]++;
+	lruvec->recent_scanned[file]++;
+	if (is_active_lru(lru))
+		lruvec->recent_rotated[file]++;
 }
 
 /*
@@ -344,22 +327,37 @@ static void update_page_reclaim_stat(str
  */
 void activate_page(struct page *page)
 {
-	struct zone *zone = page_zone(page);
+	struct lruvec *lruvec;
 
-	spin_lock_irq(&zone->lru_lock);
+	local_irq_disable();
+	lruvec = lock_page_lru(page);
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
-		int file = page_is_file_cache(page);
 		int lru = page_lru_base_type(page);
-		del_page_from_lru_list(zone, page, lru);
+
+		del_page_from_lru_list(lruvec, page, lru);
+
+		if (page->mapping && !PageAnon(page) && !page_mapped(page)) {
+			struct gang_set *gs = get_mapping_gang(page->mapping);
+
+			if (!page_in_gang(page, gs)) {
+				ClearPageLRU(page);
+				spin_unlock_irq(&lruvec->lru_lock);
+				gang_mod_user_page(page, gs,
+						GFP_ATOMIC|__GFP_NOFAIL);
+				local_irq_disable();
+				lruvec = lock_page_lru(page);
+				SetPageLRU(page);
+			}
+		}
 
 		SetPageActive(page);
 		lru += LRU_ACTIVE;
-		add_page_to_lru_list(zone, page, lru);
+		add_page_to_lru_list(lruvec, page, lru);
 		__count_vm_event(PGACTIVATE);
 
-		update_page_reclaim_stat(zone, page, file, 1);
+		update_page_reclaim_stat(lruvec, lru);
 	}
-	spin_unlock_irq(&zone->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 }
 
 /*
@@ -378,14 +376,15 @@ void mark_page_accessed(struct page *pag
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
 	}
+	ClearPageIdle(page);
 }
-
 EXPORT_SYMBOL(mark_page_accessed);
 
 void __lru_cache_add(struct page *page, enum lru_list lru)
 {
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
 
+	VM_BUG_ON(!__page_lruvec(page));
 	page_cache_get(page);
 	if (!pagevec_add(pvec, page))
 		____pagevec_lru_add(pvec, lru);
@@ -424,13 +423,14 @@ void lru_cache_add_lru(struct page *page
  */
 void add_page_to_unevictable_list(struct page *page)
 {
-	struct zone *zone = page_zone(page);
+	struct lruvec *lruvec;
 
-	spin_lock_irq(&zone->lru_lock);
+	local_irq_disable();
+	lruvec = lock_page_lru(page);
 	SetPageUnevictable(page);
 	SetPageLRU(page);
-	add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
-	spin_unlock_irq(&zone->lru_lock);
+	add_page_to_lru_list(lruvec, page, LRU_UNEVICTABLE);
+	spin_unlock_irq(&lruvec->lru_lock);
 }
 
 /*
@@ -454,9 +454,9 @@ void add_page_to_unevictable_list(struct
  * be write it out by flusher threads as this is much more effective
  * than the single-page writeout from reclaim.
  */
-static void lru_deactivate(struct page *page, struct zone *zone)
+static void lru_deactivate(struct page *page, struct lruvec *lruvec)
 {
-	int lru, file;
+	enum lru_list lru;
 	bool active;
 
 	if (!PageLRU(page))
@@ -470,13 +470,11 @@ static void lru_deactivate(struct page *
 		return;
 
 	active = PageActive(page);
-
-	file = page_is_file_cache(page);
 	lru = page_lru_base_type(page);
-	del_page_from_lru_list(zone, page, lru + active);
+	del_page_from_lru_list(lruvec, page, lru + active);
 	ClearPageActive(page);
 	ClearPageReferenced(page);
-	add_page_to_lru_list(zone, page, lru);
+	add_page_to_lru_list(lruvec, page, lru);
 
 	if (PageWriteback(page) || PageDirty(page)) {
 		/*
@@ -486,40 +484,35 @@ static void lru_deactivate(struct page *
 		 */
 		SetPageReclaim(page);
 	} else {
-		struct lruvec *lruvec;
 		/*
 		 * The page's writeback has finished while it sat in the
 		 * pagevec; move the page to the tail of the inactive list.
 		 */
-		lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
-		list_move_tail(&page->lru, &lruvec->lists[lru]);
+		list_move_tail(&page->lru, &lruvec->lru_list[lru]);
 		__count_vm_event(PGROTATED);
 	}
 
 	if (active)
 		__count_vm_event(PGDEACTIVATE);
-	update_page_reclaim_stat(zone, page, file, 0);
+	update_page_reclaim_stat(lruvec, lru);
 }
 
 static void ____pagevec_lru_deactivate(struct pagevec *pvec)
 {
 	int i;
-	struct zone *zone = NULL;
+	struct lruvec *lruvec = NULL;
+	unsigned long flags;
 
+	local_irq_save(flags);
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
-		struct zone *pagezone = page_zone(page);
 
-		if (pagezone != zone) {
-			if (zone)
-				spin_unlock_irq(&zone->lru_lock);
-			zone = pagezone;
-			spin_lock_irq(&zone->lru_lock);
-		}
-		lru_deactivate(page, zone);
+		lruvec = relock_page_lru(lruvec, page);
+		lru_deactivate(page, lruvec);
 	}
-	if (zone)
-		spin_unlock_irq(&zone->lru_lock);
+	unlock_lruvec(lruvec);
+	local_irq_restore(flags);
 
 	release_pages(pvec->pages, pvec->nr, pvec->cold);
 	pagevec_reinit(pvec);
@@ -612,19 +605,17 @@ int lru_add_drain_all(void)
 void release_pages(struct page **pages, int nr, int cold)
 {
 	int i;
-	struct pagevec pages_to_free;
-	struct zone *zone = NULL;
-	unsigned long uninitialized_var(flags);
+	LIST_HEAD(pages_to_free);
+	struct lruvec *lruvec = NULL;
+	unsigned long flags;
 
-	pagevec_init(&pages_to_free, cold);
+	local_irq_save(flags);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pages[i];
 
 		if (unlikely(PageCompound(page))) {
-			if (zone) {
-				spin_unlock_irqrestore(&zone->lru_lock, flags);
-				zone = NULL;
-			}
+			unlock_lruvec(lruvec);
+			lruvec = NULL;
 			put_compound_page(page);
 			continue;
 		}
@@ -633,33 +624,19 @@ void release_pages(struct page **pages, 
 			continue;
 
 		if (PageLRU(page)) {
-			struct zone *pagezone = page_zone(page);
-
-			if (pagezone != zone) {
-				if (zone)
-					spin_unlock_irqrestore(&zone->lru_lock,
-									flags);
-				zone = pagezone;
-				spin_lock_irqsave(&zone->lru_lock, flags);
-			}
+			lruvec = relock_lruvec(lruvec, page_lruvec(page));
 			VM_BUG_ON(!PageLRU(page));
 			__ClearPageLRU(page);
-			del_page_from_lru(zone, page);
+			del_page_from_lru(lruvec, page);
+			gang_del_user_page(page);
 		}
 
-		if (!pagevec_add(&pages_to_free, page)) {
-			if (zone) {
-				spin_unlock_irqrestore(&zone->lru_lock, flags);
-				zone = NULL;
-			}
-			__pagevec_free(&pages_to_free);
-			pagevec_reinit(&pages_to_free);
-  		}
+		list_add(&page->lru, &pages_to_free);
 	}
-	if (zone)
-		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	unlock_lruvec(lruvec);
+	local_irq_restore(flags);
 
-	pagevec_free(&pages_to_free);
+	free_hot_cold_page_list(&pages_to_free, cold);
 }
 
 /*
@@ -683,17 +660,17 @@ EXPORT_SYMBOL(__pagevec_release);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* used by __split_huge_page_refcount() */
-void lru_add_page_tail(struct zone* zone,
+void lru_add_page_tail(struct lruvec *lruvec,
 		       struct page *page, struct page *page_tail)
 {
 	int active;
 	enum lru_list lru;
-	const int file = 0;
 
 	VM_BUG_ON(!PageHead(page));
 	VM_BUG_ON(PageCompound(page_tail));
 	VM_BUG_ON(PageLRU(page_tail));
-	VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
+	VM_BUG_ON(page_lruvec(page_tail) != lruvec);
+	VM_BUG_ON(!spin_is_locked(&lruvec->lru_lock));
 
 	SetPageLRU(page_tail);
 
@@ -706,7 +683,7 @@ void lru_add_page_tail(struct zone* zone
 			active = 0;
 			lru = LRU_INACTIVE_ANON;
 		}
-		update_page_reclaim_stat(zone, page_tail, file, active);
+		update_page_reclaim_stat(lruvec, lru);
 	} else {
 		SetPageUnevictable(page_tail);
 		lru = LRU_UNEVICTABLE;
@@ -723,7 +700,7 @@ void lru_add_page_tail(struct zone* zone
 		 * Use the standard add function to put page_tail on the list,
 		 * but then correct its position so they all end up in order.
 		 */
-		add_page_to_lru_list(zone, page_tail, lru);
+		add_page_to_lru_list(lruvec, page_tail, lru);
 		list_head = page_tail->lru.prev;
 		list_move_tail(&page_tail->lru, list_head);
 	}
@@ -737,59 +714,32 @@ void lru_add_page_tail(struct zone* zone
 void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 {
 	int i;
-	struct zone *zone = NULL;
+	struct lruvec *lruvec = NULL;
 
 	VM_BUG_ON(is_unevictable_lru(lru));
 
+	local_irq_disable();
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
-		struct zone *pagezone = page_zone(page);
-		int file;
-		int active;
-
-		if (pagezone != zone) {
-			if (zone)
-				spin_unlock_irq(&zone->lru_lock);
-			zone = pagezone;
-			spin_lock_irq(&zone->lru_lock);
-		}
+
+		lruvec = relock_lruvec(lruvec, page_lruvec(page));
 		VM_BUG_ON(PageActive(page));
 		VM_BUG_ON(PageUnevictable(page));
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		active = is_active_lru(lru);
-		file = is_file_lru(lru);
-		if (active)
+		if (is_active_lru(lru))
 			SetPageActive(page);
-		update_page_reclaim_stat(zone, page, file, active);
-		add_page_to_lru_list(zone, page, lru);
+		update_page_reclaim_stat(lruvec, lru);
+		add_page_to_lru_list(lruvec, page, lru);
 	}
-	if (zone)
-		spin_unlock_irq(&zone->lru_lock);
+	unlock_lruvec(lruvec);
+	local_irq_enable();
 	release_pages(pvec->pages, pvec->nr, pvec->cold);
 	pagevec_reinit(pvec);
 }
 
 EXPORT_SYMBOL(____pagevec_lru_add);
 
-/*
- * Try to drop buffers from the pages in a pagevec
- */
-void pagevec_strip(struct pagevec *pvec)
-{
-	int i;
-
-	for (i = 0; i < pagevec_count(pvec); i++) {
-		struct page *page = pvec->pages[i];
-
-		if (page_has_private(page) && trylock_page(page)) {
-			if (page_has_private(page))
-				try_to_release_page(page, 0);
-			unlock_page(page);
-		}
-	}
-}
-
 /**
  * pagevec_lookup - gang pagecache lookup
  * @pvec:	Where the resulting pages are placed
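Throughout the swap.c hunks, the per-zone lru_lock juggling ("if (pagezone != zone) ...") is replaced by per-lruvec locking via relock_page_lru()/relock_lruvec(): the held lock is kept while consecutive pages share a lruvec and swapped only when they differ, with the callers disabling interrupts around the whole batch. A userspace model of the relock idiom (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

struct lruvec { pthread_mutex_t lock; int id; };

static struct lruvec *relock(struct lruvec *locked, struct lruvec *want)
{
	if (locked == want)
		return locked;		/* common case: same lruvec */
	if (locked)
		pthread_mutex_unlock(&locked->lock);
	pthread_mutex_lock(&want->lock);
	return want;
}

int main(void)
{
	struct lruvec v[2] = {
		{ PTHREAD_MUTEX_INITIALIZER, 0 },
		{ PTHREAD_MUTEX_INITIALIZER, 1 },
	};
	int page_vec[] = { 0, 0, 1, 1, 0 };	/* lruvec id per page */
	struct lruvec *locked = NULL;

	for (int i = 0; i < 5; i++) {
		locked = relock(locked, &v[page_vec[i]]);
		printf("page %d handled under lruvec %d\n", i, locked->id);
	}
	if (locked)
		pthread_mutex_unlock(&locked->lock);
	return 0;
}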
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/swap_state.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/swap_state.c
--- linux-2.6.32-504.3.3.el6.orig/mm/swap_state.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/swap_state.c	2015-01-21 12:02:58.690815790 +0300
@@ -8,6 +8,7 @@
  */
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/rmap.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -18,9 +19,14 @@
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
 #include <linux/page_cgroup.h>
+#include <linux/mmgang.h>
 
 #include <asm/pgtable.h>
 
+#include <bc/vmpages.h>
+#include <bc/io_acct.h>
+#include <bc/kmem.h>
+
 /*
  * swapper_space is a fiction, retained to simplify the path through
  * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
@@ -46,15 +52,17 @@ struct address_space swapper_space = {
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
 	.backing_dev_info = &swap_backing_dev_info,
 };
+EXPORT_SYMBOL(swapper_space);
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
 
-static struct {
+struct {
 	unsigned long add_total;
 	unsigned long del_total;
 	unsigned long find_success;
 	unsigned long find_total;
 } swap_cache_info;
+EXPORT_SYMBOL(swap_cache_info);
 
 void show_swap_cache_info(void)
 {
@@ -118,6 +126,7 @@ int add_to_swap_cache(struct page *page,
 	}
 	return error;
 }
+EXPORT_SYMBOL(add_to_swap_cache);
 
 /*
  * This must be called only on pages that have
@@ -140,11 +149,12 @@ void __delete_from_swap_cache(struct pag
 /**
  * add_to_swap - allocate swap space for a page
  * @page: page we want to move to swap
+ * @ub: user_beancounter to charge the swap entry to
  *
  * Allocate swap space for the page and add the page to the
  * swap cache.  Caller needs to hold the page lock. 
  */
-int add_to_swap(struct page *page)
+int add_to_swap(struct page *page, struct user_beancounter *ub)
 {
 	swp_entry_t entry;
 	int err;
@@ -152,7 +162,7 @@ int add_to_swap(struct page *page)
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageUptodate(page));
 
-	entry = get_swap_page();
+	entry = get_swap_page(ub);
 	if (!entry.val)
 		return 0;
 
@@ -162,6 +172,11 @@ int add_to_swap(struct page *page)
 			return 0;
 		}
 
+	if (PageVSwap(page) && remove_from_vswap(page)) {
+		swapcache_free(entry, NULL);
+		return 0;
+	}
+
 	/*
 	 * Radix-tree node allocations from PF_MEMALLOC contexts could
 	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
@@ -188,6 +203,7 @@ int add_to_swap(struct page *page)
 		return 0;
 	}
 }
+EXPORT_SYMBOL(add_to_swap);
 
 /*
  * This must be called only on pages that have
@@ -208,6 +224,7 @@ void delete_from_swap_cache(struct page 
 	swapcache_free(entry, page);
 	page_cache_release(page);
 }
+EXPORT_SYMBOL(delete_from_swap_cache);
 
 /* 
  * If we are the only user, then try to free up the swap cache. 
@@ -275,6 +292,34 @@ struct page * lookup_swap_cache(swp_entr
 	return page;
 }
 
+static struct user_beancounter *get_swapin_ub(struct page *page,
+					      swp_entry_t entry,
+					      struct vm_area_struct *vma)
+{
+	struct user_beancounter *ub;
+
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+	rcu_read_lock();
+	ub = get_swap_ub(entry);
+	if (!ub || !get_beancounter_rcu(ub)) {
+		/* speedup unuse pass */
+		if (ub)
+			ub_unuse_swap_page(page);
+		ub = get_beancounter(get_exec_ub());
+	}
+	rcu_read_unlock();
+#else
+	/* can be NULL for shmem, see shmem_swapin() */
+	if (vma && vma->vm_mm)
+		ub = mm_ub(vma->vm_mm);
+	else
+		ub = get_exec_ub();
+	get_beancounter(ub);
+#endif
+
+	return ub;
+}
+
 /* 
  * Locate a page of swap in physical memory, reserving swap cache space
  * and reading the disk if it is not already cached.
@@ -286,6 +331,7 @@ struct page *read_swap_cache_async(swp_e
 {
 	struct page *found_page, *new_page = NULL;
 	int err;
+	struct user_beancounter *ub;
 
 	do {
 		/*
@@ -304,6 +350,12 @@ struct page *read_swap_cache_async(swp_e
 			new_page = alloc_page_vma(gfp_mask, vma, addr);
 			if (!new_page)
 				break;		/* Out of memory */
+
+			ub = get_swapin_ub(new_page, entry, vma);
+			err = gang_add_user_page(new_page, get_ub_gs(ub), gfp_mask);
+			put_beancounter(ub);
+			if (err)
+				break;
 		}
 
 		/*
@@ -351,6 +403,7 @@ struct page *read_swap_cache_async(swp_e
 			/*
 			 * Initiate read into locked page and return.
 			 */
 			lru_cache_add_anon(new_page);
 			swap_readpage(new_page);
 			return new_page;
@@ -365,10 +418,14 @@ struct page *read_swap_cache_async(swp_e
 		swapcache_free(entry, NULL);
 	} while (err != -ENOMEM);
 
-	if (new_page)
+	if (new_page) {
+		if (page_gang(new_page))
+			gang_del_user_page(new_page);
 		page_cache_release(new_page);
+	}
 	return found_page;
 }
+EXPORT_SYMBOL(read_swap_cache_async);
 
 /**
  * swapin_readahead - swap in pages in hope we need them soon
@@ -416,3 +473,117 @@ struct page *swapin_readahead(swp_entry_
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 	return read_swap_cache_async(entry, gfp_mask, vma, addr);
 }
+
+#ifdef CONFIG_MEMORY_VSWAP
+
+static int __add_to_vswap(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(PageSwapCache(page));
+
+	if (PageVSwap(page)) {
+		if (atomic_inc_not_zero(&page->vswap_count))
+			return 1;
+		/*
+		 * wait for put_vswap_page() completion,
+		 * see note in __remove_from_vswap()
+		 */
+		while (PageVSwap(page))
+			cpu_relax();
+	}
+
+	if (unlikely(PageTransHuge(page)) && split_huge_page(page))
+		return 0;
+
+	atomic_set(&page->vswap_count, 1);
+	SetPageVSwap(page);
+	inc_zone_page_state(page, NR_VSWAP);
+	return 1;
+}
+
+int add_to_vswap(struct page *page)
+{
+	int ret = SWAP_FAIL;
+
+	if (__add_to_vswap(page)) {
+		ret = try_to_unmap(page, TTU_VSWAP);
+		put_vswap_page(page);
+	}
+	return ret;
+}
+
+void __remove_from_vswap(struct page *page)
+{
+	/*
+	 * Either in atomic -- under pte-lock
+	 * or in add_to_vswap() under page-lock.
+	 */
+	VM_BUG_ON(!in_atomic() && !PageLocked(page));
+	__dec_zone_page_state(page, NR_VSWAP);
+	ClearPageVSwap(page);
+}
+
+static int remove_vswap_pte(struct page *page, struct vm_area_struct *vma,
+			    unsigned long addr, void *data)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int ret = SWAP_AGAIN;
+	swp_entry_t entry;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+		goto out;
+
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = *ptep;
+
+	if (!is_swap_pte(pte))
+		goto out_unlock;
+
+	entry = pte_to_swp_entry(pte);
+	if (!is_vswap_entry(entry) || vswap_entry_to_page(entry) != page)
+		goto out_unlock;
+
+	pte = mk_pte(page, vma->vm_page_prot);
+	if (is_write_vswap_entry(entry)) {
+		pte = pte_mkwrite(pte);
+		ClearPageCheckpointed(page);
+	}
+	inc_mm_counter(mm, anon_rss);
+	dec_mm_counter(mm, swap_usage);
+
+	flush_icache_page(vma, page);
+	set_pte_at(mm, addr, ptep, pte);
+	page_add_anon_rmap(page, vma, addr);
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, addr, pte);
+	put_vswap_page(page);
+out_unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return ret;
+}
+
+int remove_from_vswap(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page));
+	if (rmap_walk(page, remove_vswap_pte, NULL) != SWAP_AGAIN ||
+	    PageVSwap(page))
+		return -EBUSY;
+	return 0;
+}
+
+#endif /* CONFIG_MEMORY_VSWAP */
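The vswap entry points above are driven from page reclaim. A minimal sketch
of the intended caller pattern, taken from the mm/vmscan.c hunk later in this
patch (sc->use_vswap and SWP_VSWAP_NUM are used there):

	if (sc->use_vswap && SWP_VSWAP_NUM) {
		switch (add_to_vswap(page)) {
		case SWAP_FAIL:		/* could not unmap: reactivate */
			goto activate_locked;
		case SWAP_AGAIN:	/* transient failure: keep, retry */
			goto keep_locked;
		case SWAP_MLOCK:	/* page became mlocked meanwhile */
			goto cull_mlocked;
		}
	}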
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/swapfile.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/swapfile.c
--- linux-2.6.32-504.3.3.el6.orig/mm/swapfile.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/swapfile.c	2015-01-21 12:02:58.922809631 +0300
@@ -31,12 +31,15 @@
 #include <linux/syscalls.h>
 #include <linux/memcontrol.h>
 #include <linux/oom.h>
+#include <linux/pram.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
 
+#include <bc/vmpages.h>
+
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
 				 unsigned char);
 static void free_swap_count_continuations(struct swap_info_struct *);
@@ -44,7 +47,9 @@ static void free_swap_count_continuation
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
+EXPORT_SYMBOL(nr_swap_pages);
 long total_swap_pages;
+EXPORT_SYMBOL(total_swap_pages);
 static int least_priority;
 
 static const char Bad_file[] = "Bad swap file entry ";
@@ -437,7 +442,7 @@ no_page:
 	return 0;
 }
 
-swp_entry_t get_swap_page(void)
+swp_entry_t get_swap_page(struct user_beancounter *ub)
 {
 	struct swap_info_struct *si;
 	pgoff_t offset;
@@ -467,6 +472,8 @@ swp_entry_t get_swap_page(void)
 		/* This is called for allocating swap entry for cache */
 		offset = scan_swap_map(si, SWAP_HAS_CACHE);
 		if (offset) {
+			/* store swap entry owner */
+			ub_swapentry_get(si, offset, ub);
 			spin_unlock(&swap_lock);
 			return swp_entry(type, offset);
 		}
@@ -479,6 +486,16 @@ noswap:
 	return (swp_entry_t) {0};
 }
 
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+
+struct user_beancounter *get_swap_ub(swp_entry_t entry)
+{
+	return rcu_dereference(swap_info[swp_type(entry)]
+			->swap_ubs[swp_offset(entry)]);
+}
+
+#endif
+
 /* The only caller of this function is now the suspend routine */
 swp_entry_t get_swap_page_of_type(int type)
 {
@@ -570,12 +587,23 @@ static unsigned char swap_entry_free(str
 	if (!count)
 		mem_cgroup_uncharge_swap(entry);
 
+	if (usage == SWAP_HAS_CACHE) {
+		/* page removed from swap cache, charge swap entry instead */
+		if (count)
+			ub_swapentry_charge(p, offset);
+	} else {
+		/* last user is gone, uncharge swap entry */
+		if (!count && !has_cache)
+			ub_swapentry_uncharge(p, offset);
+	}
+
 	usage = count | has_cache;
 	p->swap_map[offset] = usage;
 
 	/* free if no reference */
 	if (!usage) {
 		struct gendisk *disk = p->bdev->bd_disk;
+		ub_swapentry_put(p, offset);
 		if (offset < p->lowest_bit)
 			p->lowest_bit = offset;
 		if (offset > p->highest_bit)
@@ -666,9 +694,537 @@ int reuse_swap_page(struct page *page)
 			SetPageDirty(page);
 		}
 	}
+	if (count <= 1 && PageVSwap(page))
+		count += page_vswapcount(page);
 	return count <= 1;
 }
 
+#ifdef CONFIG_PSWAP
+static signed char pswap_type[MAX_SWAPFILES]; /* pswap type -> swap type map */
+
+/*
+ * pswap_reserve - reserve a swap entry
+ * @entry: the swap entry to reserve
+ *
+ * Reservation of a swap entry is, in fact, equivalent to incrementing its
+ * refcount so that the entry will not be recycled even if it has been freed by
+ * all of its users, and setting a bit in a special pswap reservation mask. The
+ * mask is saved to PRAM storage on swapoff and restored on a subsequent
+ * swapon so that reserved entries survive a kexec reboot.
+ *
+ * On success, the function returns a special pseudo swap entry that can be
+ * used to restore the original swap entry (see pswap_restore() below). On
+ * failure 0 is returned.
+ *
+ * Note, the same entry can be reserved many times. In that case, the entry's
+ * refcount will be incremented only once.
+ *
+ * Reserved entries are freed from userspace by writing a non-zero value to
+ * /proc/sys/vm/prune_pswap.
+ */
+swp_entry_t pswap_reserve(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	unsigned char count, has_cache;
+	unsigned long offset;
+	int ptype;
+	int retry;
+	swp_entry_t ret;
+
+	ret = (swp_entry_t) {0};
+	offset = swp_offset(entry);
+
+again:
+	retry = 0;
+
+	si = swap_info_get(entry);
+	if (!si || !(si->flags & SWP_WRITEOK))
+		goto out;
+
+	ptype = si->pswap_type;
+	if (ptype < 0)
+		goto out_unlock;
+
+	count = si->swap_map[offset];
+	has_cache = count & SWAP_HAS_CACHE;
+	count &= ~SWAP_HAS_CACHE;
+
+	if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
+		goto out_unlock;
+
+	ret = swp_entry(ptype, offset);
+
+	if (__test_and_set_bit(offset, si->pswap_reserved))
+		goto out_unlock;
+
+	if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+		count++;
+	else if (swap_count_continued(si, offset, count))
+		count = COUNT_CONTINUED;
+	else
+		retry = 1;
+
+	si->swap_map[offset] = count | has_cache;
+
+out_unlock:
+	spin_unlock(&swap_lock);
+	if (retry) {
+		ret = (swp_entry_t) {0};
+		if (add_swap_count_continuation(entry, GFP_ATOMIC) == 0)
+			goto again;
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(pswap_reserve);
+
+/*
+ * pswap_restore - restore a previously reserved swap entry
+ * @entry: pseudo swap entry returned by pswap_reserve() (see above)
+ * @ub: user_beancounter to charge the swap entry to
+ *
+ * The function increments the refcount of the swap entry corresponding to the
+ * pseudo swap entry passed to it.
+ *
+ * On success, the function returns the original swap entry (the one that was
+ * previously reserved using pswap_reserve()). On failure, 0 is returned.
+ */
+swp_entry_t pswap_restore(swp_entry_t entry, struct user_beancounter *ub)
+{
+	struct swap_info_struct *si;
+	unsigned char count, has_cache;
+	unsigned long offset;
+	int type, ptype;
+	int retry;
+	swp_entry_t ret;
+
+	ret = (swp_entry_t) {0};
+
+	ptype = swp_type(entry);
+	offset = swp_offset(entry);
+
+	if (ptype >= MAX_SWAPFILES)
+		goto out;
+
+	type = pswap_type[ptype];
+	if (type < 0)
+		goto out;
+
+	entry = swp_entry(type, offset);
+
+again:
+	retry = 0;
+
+	si = swap_info_get(entry);
+	if (!si || !(si->flags & SWP_WRITEOK))
+		goto out;
+
+	if (si->pswap_type < 0)
+		goto out_unlock;
+
+	count = si->swap_map[offset];
+	has_cache = count & SWAP_HAS_CACHE;
+	count &= ~SWAP_HAS_CACHE;
+
+	if (!count)
+		goto out_unlock;
+
+	if (!__test_and_clear_bit(offset, si->pswap_reserved)) {
+		BUG_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX);
+		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+			count++;
+		else if (swap_count_continued(si, offset, count))
+			count = COUNT_CONTINUED;
+		else
+			retry = 1;
+	}
+
+	if (retry)
+		goto out_unlock;
+
+	si->swap_map[offset] = count | has_cache;
+
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+	if (si->swap_ubs[offset] != ub)
+		ub_swapentry_recharge(si, offset, ub);
+#endif
+
+	ret = entry;
+
+out_unlock:
+	spin_unlock(&swap_lock);
+	if (retry) {
+		if (add_swap_count_continuation(entry, GFP_ATOMIC) == 0)
+			goto again;
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(pswap_restore);
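+#if 0	/* usage sketch: a hypothetical checkpoint caller, not in this patch */
+	swp_entry_t pentry;
+
+	pentry = pswap_reserve(entry);
+	if (!pentry.val)
+		return -ENOSPC;		/* entry cannot be preserved */
+	/* ... kexec reboot; swapon() reloads the reservation mask ... */
+	entry = pswap_restore(pentry, ub);
+	if (!entry.val)
+		return -ESTALE;		/* reservation was lost */
+#endif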
+
+static void pswap_unuse(struct swap_info_struct *si)
+{
+	unsigned int i;
+
+	if (si->pswap_type < 0)
+		return;
+
+	for_each_bit(i, si->pswap_reserved, si->max) {
+		swp_entry_t entry = swp_entry(si->type, i);
+		struct page *page = NULL;
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_NONE,
+		};
+		int err = 0;
+
+		if (!(si->swap_map[i] & SWAP_HAS_CACHE))
+			goto next;
+
+		page = find_get_page(&swapper_space, entry.val);
+		if (!page || !PageDirty(page) || PageWriteback(page))
+			goto next;
+
+		lock_page(page);
+		if (likely(PageDirty(page) && !PageWriteback(page)))
+			err = swap_writepage(page, &wbc);
+		else
+			unlock_page(page);
+
+		if (err)
+			printk(KERN_ERR "PSWAP: "
+			       "Failed to write entry %08lx: %d\n",
+			       entry.val, err);
+next:
+		if (page)
+			page_cache_release(page);
+		swap_free(entry);
+	}
+}
+
+static void pswap_unuse_cancel(struct swap_info_struct *si)
+{
+	unsigned int i;
+
+	if (si->pswap_type < 0)
+		return;
+
+	for_each_bit(i, si->pswap_reserved, si->max) {
+		if (!si->swap_map[i]) {
+			si->swap_map[i] = 1;
+			ub_swapentry_get(si, i, get_ub0());
+			ub_swapentry_charge(si, i);
+		} else {
+			swp_entry_t entry = swp_entry(si->type, i);
+
+			if (swap_duplicate(entry)) {
+				printk(KERN_ERR
+				       "PSWAP: reserved entry %08lx lost\n",
+				       entry.val);
+				__clear_bit(i, si->pswap_reserved);
+			}
+		}
+	}
+}
+
+static void pswap_load(struct swap_info_struct *si,
+		       unsigned long max, unsigned char *map)
+{
+	struct pram_stream stream;
+	char name[64];
+	unsigned int i;
+	size_t size;
+	int err;
+
+	sprintf(name, "pswap.%pU", si->uuid);
+
+	size = BITS_TO_LONGS(max) * sizeof(long);
+	si->pswap_reserved = vmalloc(size);
+	if (!si->pswap_reserved) {
+		printk(KERN_ERR "PSWAP: Failed to init: no memory\n");
+		pram_destroy(name);
+		return;
+	}
+
+	memset(si->pswap_reserved, 0, size);
+
+	err = pram_open(name, PRAM_READ, &stream);
+	if (err) {
+		if (err == -ENOENT)
+			err = 0;
+		goto out;
+	}
+
+	err = -EIO;
+	if (pram_read(&stream, &si->pswap_type, 1) != 1)
+		goto out_close_stream;
+
+	BUG_ON(si->pswap_type < 0);
+
+	if (pram_read(&stream, si->pswap_reserved, size) != size) {
+		memset(si->pswap_reserved, 0, size);
+		goto out_close_stream;
+	}
+
+	err = 0;
+
+	for_each_bit(i, si->pswap_reserved, max) {
+		if (map[i]) {
+			printk(KERN_ERR "PSWAP: bad reserved entry %08x\n", i);
+			__clear_bit(i, si->pswap_reserved);
+			continue;
+		}
+		map[i] = 1;
+		nr_swap_pages--;
+		si->inuse_pages++;
+		ub_swapentry_get(si, i, get_ub0());
+		ub_swapentry_charge(si, i);
+	}
+
+out_close_stream:
+	pram_close(&stream, 0);
+out:
+	if (err) {
+		printk(KERN_ERR "PSWAP: Failed to load: %d\n", err);
+		si->pswap_type = -1;
+	}
+}
+
+static void pswap_save(struct swap_info_struct *si)
+{
+	struct pram_stream stream;
+	char name[64];
+	size_t size;
+	int err = 0;
+
+	if (si->pswap_type < 0)
+		return;
+
+	if (bitmap_empty(si->pswap_reserved, si->max))
+		goto out;
+
+	sprintf(name, "pswap.%pU", si->uuid);
+
+	err = pram_open(name, PRAM_WRITE, &stream);
+	if (err)
+		goto out;
+
+	err = -EIO;
+	if (pram_write(&stream, &si->pswap_type, 1) != 1)
+		goto out_close_stream;
+
+	size = BITS_TO_LONGS(si->max) * sizeof(long);
+	if (pram_write(&stream, si->pswap_reserved, size) != size)
+		goto out_close_stream;
+
+	err = 0;
+
+out_close_stream:
+	pram_close(&stream, err);
+out:
+	if (err)
+		printk(KERN_ERR "PSWAP: Failed to save: %d\n", err);
+
+	vfree(si->pswap_reserved);
+	si->pswap_reserved = NULL;
+}
+
+static void pswap_install(struct swap_info_struct *si,
+			  unsigned long max, unsigned char *map)
+{
+	int ptype = si->pswap_type;
+	int prune, free;
+
+	prune = free = 0;
+
+	if (!si->pswap_reserved) {
+		si->pswap_type = -1;
+		return;
+	}
+
+	spin_lock(&swap_lock);
+	if (ptype >= 0) {
+		BUG_ON(ptype >= MAX_SWAPFILES);
+		if (pswap_type[ptype] < 0) {
+			pswap_type[ptype] = si->type;
+			goto out_unlock;
+		}
+		printk(KERN_ERR "PSWAP: ptype %d busy\n", ptype);
+		prune = 1;
+	}
+
+	for (ptype = 0; ptype < MAX_SWAPFILES; ptype++) {
+		if (pswap_type[ptype] < 0)
+			break;
+	}
+
+	if (ptype < MAX_SWAPFILES) {
+		pswap_type[ptype] = si->type;
+		si->pswap_type = ptype;
+		goto out_unlock;
+	}
+
+	printk(KERN_ERR "PSWAP: no free ptype slots\n");
+	prune = free = 1;
+
+out_unlock:
+	spin_unlock(&swap_lock);
+	if (prune) {
+		unsigned int i;
+
+		for_each_bit(i, si->pswap_reserved, max) {
+			map[i] = 0;
+			ub_swapentry_uncharge(si, i);
+			ub_swapentry_put(si, i);
+		}
+		if (!free)
+			memset(si->pswap_reserved, 0,
+			       BITS_TO_LONGS(max) * sizeof(long));
+	}
+	if (free) {
+		vfree(si->pswap_reserved);
+		si->pswap_reserved = NULL;
+		si->pswap_type = -1;
+	}
+}
+
+static void pswap_uninstall(struct swap_info_struct *si)
+{
+	int ptype = si->pswap_type;
+
+	if (ptype < 0)
+		return;
+
+	spin_lock(&swap_lock);
+	BUG_ON(ptype >= MAX_SWAPFILES);
+	pswap_type[ptype] = -1;
+	si->pswap_type = -1;
+	spin_unlock(&swap_lock);
+}
+
+static void pswap_init(struct swap_info_struct *si,
+		       unsigned long max, unsigned char *map)
+{
+	pswap_load(si, max, map);
+	pswap_install(si, max, map);
+}
+
+static void pswap_fini(struct swap_info_struct *si)
+{
+	pswap_save(si);
+	pswap_uninstall(si);
+}
+
+static void pswap_prune(void)
+{
+	int type;
+	unsigned int i;
+	struct swap_info_struct *si;
+
+	spin_lock(&swap_lock);
+	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+		si = swap_info[type];
+		if ((si->flags & SWP_WRITEOK) && si->pswap_type >= 0) {
+			for_each_bit(i, si->pswap_reserved, si->max)
+				swap_entry_free(si, swp_entry(type, i), 1);
+			memset(si->pswap_reserved, 0,
+			       BITS_TO_LONGS(si->max) * sizeof(long));
+		}
+	}
+	spin_unlock(&swap_lock);
+}
+
+int sysctl_prune_pswap;
+
+int prune_pswap_sysctl_handler(ctl_table *table, int write,
+		void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (write && sysctl_prune_pswap)
+		pswap_prune();
+	return 0;
+}
+
+static int __init init_pswap(void)
+{
+	memset(pswap_type, -1, sizeof(pswap_type));
+	return 0;
+}
+__initcall(init_pswap);
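+/*
+ * Note: as documented at pswap_reserve(), leftover reservations are
+ * dropped from userspace by writing a non-zero value to
+ * /proc/sys/vm/prune_pswap, which invokes pswap_prune() above.
+ */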
+#else /* !CONFIG_PSWAP */
+static inline void pswap_unuse(struct swap_info_struct *si)
+{
+}
+
+static inline void pswap_unuse_cancel(struct swap_info_struct *si)
+{
+}
+
+static inline void pswap_init(struct swap_info_struct *si,
+			      unsigned long max, unsigned char *map)
+{
+}
+
+static inline void pswap_fini(struct swap_info_struct *si)
+{
+}
+#endif /* CONFIG_PSWAP */
+
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+
+void ub_unuse_swap_page(struct page *page)
+{
+	struct swap_info_struct *p;
+	swp_entry_t entry;
+
+	if (!PageSwapCache(page))
+		return;
+
+	entry.val = page_private(page);
+	p = swap_info_get(entry);
+	if (p) {
+		ub_swapentry_recharge(p, swp_offset(entry), &ub0);
+		spin_unlock(&swap_lock);
+	}
+}
+
+void ub_unuse_swap(struct user_beancounter *ub)
+{
+	struct swap_info_struct *si;
+	unsigned int type, i;
+
+	spin_lock(&swap_lock);
+
+	if (ub->ub_swapentries)
+		printk(KERN_NOTICE "UB: %d has %ld swap entries to unuse.\n",
+				ub->ub_uid, ub->ub_swapentries);
+
+	for (type = 0; type < nr_swapfiles && ub->ub_swapentries; type++) {
+		si = swap_info[type];
+		if (!(si->flags & SWP_USED))
+			continue;
+
+		si->flags += SWP_SCANNING;
+		spin_unlock(&swap_lock);
+
+		for (i = 0; i < si->max && ub->ub_swapentries; i++) {
+			if (si->swap_ubs[i] != ub)
+				continue;
+
+			spin_lock(&swap_lock);
+			if (si->swap_ubs[i] == ub)
+				ub_swapentry_recharge(si, i, &ub0);
+			spin_unlock(&swap_lock);
+		}
+
+		spin_lock(&swap_lock);
+		si->flags -= SWP_SCANNING;
+	}
+
+	spin_unlock(&swap_lock);
+}
+
+#endif /* CONFIG_BC_SWAP_ACCOUNTING */
+
 /*
  * If swap is getting full, or if there are no more mappings of this page,
  * then try_to_free_swap is called to free its swap space.
@@ -715,6 +1271,7 @@ int free_swap_and_cache(swp_entry_t entr
 {
 	struct swap_info_struct *p;
 	struct page *page = NULL;
+	int uninitialized_var(full);
 
 	if (non_swap_entry(entry))
 		return 1;
@@ -723,7 +1280,11 @@ int free_swap_and_cache(swp_entry_t entr
 	if (p) {
 		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
 			page = find_get_page(&swapper_space, entry.val);
+			full = ub_swap_full(get_swap_ub(entry));
 			if (page && !trylock_page(page)) {
+				if (!page_mapped(page))
+					ub_swapentry_recharge(p,
+						swp_offset(entry), &ub0);
 				page_cache_release(page);
 				page = NULL;
 			}
@@ -736,7 +1297,7 @@ int free_swap_and_cache(swp_entry_t entr
 		 * Also recheck PageSwapCache now page is locked (above).
 		 */
 		if (PageSwapCache(page) && !PageWriteback(page) &&
-				(!page_mapped(page) || vm_swap_full())) {
+				(!page_mapped(page) || full || vm_swap_full())) {
 			delete_from_swap_cache(page);
 			SetPageDirty(page);
 		}
@@ -745,6 +1306,7 @@ int free_swap_and_cache(swp_entry_t entr
 	}
 	return p != NULL;
 }
+EXPORT_SYMBOL(free_swap_and_cache);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 /**
@@ -881,6 +1443,7 @@ static int unuse_pte(struct vm_area_stru
 	spinlock_t *ptl;
 	pte_t *pte;
 	int ret = 1;
+	struct mm_struct *mm = vma->vm_mm;
 
 	swapcache = page;
 	page = ksm_might_need_to_copy(page, vma, addr);
@@ -900,10 +1463,10 @@ static int unuse_pte(struct vm_area_stru
 		goto out;
 	}
 
-	inc_mm_counter(vma->vm_mm, anon_rss);
 	dec_mm_counter(vma->vm_mm, swap_usage);
+	inc_mm_counter(mm, anon_rss);
 	get_page(page);
-	set_pte_at(vma->vm_mm, addr, pte,
+	set_pte_at(mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
 
 	if (page == swapcache)
@@ -1112,6 +1675,8 @@ static int try_to_unuse(unsigned int typ
 	unsigned int i = 0;
 	int retval = 0;
 
+	pswap_unuse(si);
+
 	/*
 	 * When searching mms for an entry, a good strategy is to
 	 * start at the first mm we freed the previous entry from
@@ -1184,6 +1749,19 @@ static int try_to_unuse(unsigned int typ
 		lock_page(page);
 		wait_on_page_writeback(page);
 
+		/* If the read failed, we cannot map a not-uptodate page
+		 * to user space. Actually, we are in serious trouble: we
+		 * do not even know which process to kill. So the only
+		 * option left is to stop swapoff() and let someone kill
+		 * processes to zap the invalid pages.
+		 */
+		if (unlikely(!PageUptodate(page))) {
+			unlock_page(page);
+			page_cache_release(page);
+			retval = -EIO;
+			break;
+		}
+
 		/*
 		 * Remove all references to entry.
 		 */
@@ -1240,6 +1818,7 @@ static int try_to_unuse(unsigned int typ
 			mmput(start_mm);
 			start_mm = new_start_mm;
 		}
+
 		if (retval) {
 			unlock_page(page);
 			page_cache_release(page);
@@ -1304,6 +1883,8 @@ static int try_to_unuse(unsigned int typ
 	}
 
 	mmput(start_mm);
+	if (retval != 0)
+		pswap_unuse_cancel(si);
 	return retval;
 }
 
@@ -1560,6 +2141,10 @@ SYSCALL_DEFINE1(swapoff, const char __us
 	int i, type, prev;
 	int err;
 
+	/* The VE admin check is just to be on the safe side: the admin may
+	 * affect swaps only if he has access to the special file, i.e. if he
+	 * has been granted access to the block device or if the swap file
+	 * lies in an area visible to him. */
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
@@ -1648,6 +2233,8 @@ SYSCALL_DEFINE1(swapoff, const char __us
 	if (p->flags & SWP_CONTINUED)
 		free_swap_count_continuations(p);
 
+	pswap_fini(p);
+
 	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	drain_mmlist();
@@ -1669,6 +2256,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+	ub_swap_fini(p);
 	/* Destroy swap account information */
 	swap_cgroup_swapoff(type);
 
@@ -1774,21 +2362,60 @@ static const struct seq_operations swaps
 	.show =		swap_show
 };
 
+#include <linux/virtinfo.h>
+
+static int swap_show_ve(struct seq_file *swap, void *v)
+{
+	struct user_beancounter *old_ub;
+	struct sysinfo si;
+	int ret;
+
+	si_swapinfo(&si);
+	old_ub = set_exec_ub(current->mm->mm_ub);
+	ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, &si);
+	(void)set_exec_ub(old_ub);
+	if (ret & NOTIFY_FAIL)
+		goto out;
+
+	seq_printf(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+	if (!si.totalswap)
+		goto out;
+	seq_printf(swap, "%-40s%s\t%lu\t%lu\t%d\n",
+			"/dev/null",
+			"partition",
			si.totalswap << (PAGE_SHIFT - 10),
+			(si.totalswap - si.freeswap) << (PAGE_SHIFT - 10),
+			-1);
+out:
+	return 0;
+}
+
 static int swaps_open(struct inode *inode, struct file *file)
 {
+	if (!ve_is_super(get_exec_env()))
+		return single_open(file, &swap_show_ve, NULL);
 	return seq_open(file, &swaps_op);
 }
 
+static int swaps_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *f = file->private_data;
+
+	if (f->op != &swaps_op)
+		return single_release(inode, file);
+	return seq_release(inode, file);
+}
+
 static const struct file_operations proc_swaps_operations = {
 	.open		= swaps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= swaps_release,
 };
 
 static int __init procswaps_init(void)
 {
-	proc_create("swaps", 0, NULL, &proc_swaps_operations);
+	proc_create("swaps", 0, &glob_proc_root, &proc_swaps_operations);
 	return 0;
 }
 __initcall(procswaps_init);
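Inside a container, swaps_open() above routes /proc/swaps to swap_show_ve(),
which reports a single synthetic "/dev/null" partition sized from the
virtualized si_swapinfo() figures. Illustrative output (values made up):

	Filename				Type		Size	Used	Priority
	/dev/null                               partition	1048576	262144	-1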
@@ -1963,6 +2590,8 @@ SYSCALL_DEFINE2(swapon, const char __use
 		goto bad_swap;
 	}
 
+	memcpy(p->uuid, swap_header->info.sws_uuid, sizeof(p->uuid));
+
 	p->lowest_bit  = 1;
 	p->cluster_next = 1;
 	p->cluster_nr = 0;
@@ -2047,6 +2676,13 @@ SYSCALL_DEFINE2(swapon, const char __use
 		goto bad_swap;
 	}
 
+	if (ub_swap_init(p, maxpages)) {
+		error = -ENOMEM;
+		goto bad_swap;
+	}
+
+	pswap_init(p, maxpages, swap_map);
+
 	if (p->bdev) {
 		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
 			p->flags |= SWP_SOLIDSTATE;
@@ -2139,6 +2775,44 @@ void si_swapinfo(struct sysinfo *val)
 }
 
 /*
+ * Forcibly mark a swap entry in the swap map as belonging to shared memory.
+ * Used when injecting shared memory during container migration.
+ */
+int swap_convert_to_shmem(swp_entry_t entry)
+{
+	struct swap_info_struct *p;
+	unsigned long offset, type;
+	int err;
+
+	if (non_swap_entry(entry))
+		return -EINVAL;
+
+	type = swp_type(entry);
+	if (type >= nr_swapfiles)
+		return -EINVAL;
+
+	p = swap_info[type];
+	offset = swp_offset(entry);
+
+	spin_lock(&swap_lock);
+
+	err = -EINVAL;
+	if (unlikely(offset >= p->max))
+		goto out;
+
+	if ((p->swap_map[offset] & ~SWAP_HAS_CACHE) != 1)
+		goto out;
+
+	p->swap_map[offset] = SWAP_MAP_SHMEM |
+			      (p->swap_map[offset] & SWAP_HAS_CACHE);
+	err = 0;
+out:
+	spin_unlock(&swap_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(swap_convert_to_shmem);
+
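+#if 0	/* usage sketch: hypothetical migration restore path, not in this
+	 * patch; err and abort_restore are illustrative names */
+	/*
+	 * After duplicating the entry for an injected tmpfs page, flip its
+	 * swap-map count to SWAP_MAP_SHMEM so swapoff treats it as
+	 * shmem-owned:
+	 */
+	err = swap_convert_to_shmem(entry);
+	if (err)
+		goto abort_restore;
+#endif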
+/*
  * Verify that a swap entry is valid and increment its swap map count.
  *
  * Returns error code in following case.
@@ -2149,7 +2823,7 @@ void si_swapinfo(struct sysinfo *val)
  * - swap-cache reference is requested but the entry is not used. -> ENOENT
  * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
 	struct swap_info_struct *p;
 	unsigned long offset, type;
@@ -2178,9 +2852,11 @@ static int __swap_duplicate(swp_entry_t 
 	if (usage == SWAP_HAS_CACHE) {
 
 		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
-		if (!has_cache && count)
+		if (!has_cache && count) {
 			has_cache = SWAP_HAS_CACHE;
-		else if (has_cache)		/* someone else added cache */
+			/* page now in swapcache, drop swap entry charge */
+			ub_swapentry_uncharge(p, offset);
+		} else if (has_cache)		/* someone else added cache */
 			err = -EEXIST;
 		else				/* no users remaining */
 			err = -ENOENT;
@@ -2209,6 +2885,7 @@ bad_file:
 	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
 	goto out;
 }
+EXPORT_SYMBOL(__swap_duplicate);
 
 /*
  * Help swapoff by noting that swap entry belongs to shmem/tmpfs
@@ -2230,6 +2907,7 @@ int swap_duplicate(swp_entry_t entry)
 		err = add_swap_count_continuation(entry, GFP_ATOMIC);
 	return err;
 }
+EXPORT_SYMBOL(swap_duplicate);
 
 /*
  * @entry: swap entry for which we allocate swap cache.
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/util.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/util.c
--- linux-2.6.32-504.3.3.el6.orig/mm/util.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/util.c	2015-01-21 12:02:52.403982670 +0300
@@ -13,6 +13,7 @@
 
 #include <asm/uaccess.h>
 #include <linux/kmemtrace.h>
+#include <linux/mount.h>
 
 /**
  * kstrdup - allocate space for and copy an existing string
@@ -302,7 +303,14 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned lon
 			return PTR_ERR(file);
 	}
 
-	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE | MAP_EXECPRIO | MAP_CPT);
+
+	/* Ugly fix for PSBM-23133 vdavydov@ */
+	if (file && file->f_op && file->f_op->mmap &&
+	    (flags & MAP_TYPE) == MAP_SHARED &&
+	    S_ISREG(file->f_path.dentry->d_inode->i_mode) &&
+	    (file->f_path.mnt->mnt_sb->s_type->fs_flags & FS_HAS_MMAP_PREP))
+		file->f_op->mmap(file, NULL);
 
 	down_write(&current->mm->mmap_sem);
 	retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/vmalloc.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/vmalloc.c
--- linux-2.6.32-504.3.3.el6.orig/mm/vmalloc.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/vmalloc.c	2015-01-21 12:02:58.691815764 +0300
@@ -32,13 +32,16 @@
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
 
+#include <bc/kmem.h>
+#include <bc/debug.h>
+
 struct vfree_deferred {
 	struct llist_head list;
 	struct work_struct wq;
 };
 static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
 
-static void __vunmap(const void *, int);
+static void __vunmap(const void *, int, int);
 
 static void free_work(struct work_struct *w)
 {
@@ -47,7 +50,7 @@ static void free_work(struct work_struct
 	while (llnode) {
 		void *p = llnode;
 		llnode = llist_next(llnode);
-		__vunmap(p, 1);
+		__vunmap(p, 1, 0);
 	}
 }
 
@@ -1470,7 +1473,7 @@ struct vm_struct *remove_vm_area(const v
 	return NULL;
 }
 
-static void __vunmap(const void *addr, int deallocate_pages)
+static void __vunmap(const void *addr, int deallocate_pages, int uncharge)
 {
 	struct vm_struct *area;
 
@@ -1495,6 +1498,8 @@ static void __vunmap(const void *addr, i
 	if (deallocate_pages) {
 		int i;
 
+		if (uncharge)
+			dec_vmalloc_charged(area);
 		for (i = 0; i < area->nr_pages; i++) {
 			struct page *page = area->pages[i];
 
@@ -1538,7 +1543,7 @@ void vfree(const void *addr)
 		llist_add((struct llist_node *)addr, &p->list);
 		schedule_work(&p->wq);
 	} else
-		__vunmap(addr, 1);
+		__vunmap(addr, 1, 1);
 }
 EXPORT_SYMBOL(vfree);
 
@@ -1556,7 +1561,7 @@ void vunmap(const void *addr)
 	BUG_ON(in_interrupt());
 	might_sleep();
 	if (addr)
-		__vunmap(addr, 0);
+		__vunmap(addr, 0, 0);
 }
 EXPORT_SYMBOL(vunmap);
 
@@ -1643,10 +1648,12 @@ static void *__vmalloc_area_node(struct 
 
 	if (map_vm_area(area, prot, &pages))
 		goto fail;
+
+	inc_vmalloc_charged(area, gfp_mask);
 	return area->addr;
 
 fail:
-	vfree(area->addr);
+	__vunmap(area->addr, 1, 0);
 	return NULL;
 }
 
@@ -1745,6 +1752,26 @@ void *vmalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc);
 
+void *ub_vmalloc(unsigned long size)
+{
+	return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL);
+}
+EXPORT_SYMBOL(ub_vmalloc);
+
+void *vmalloc_best(unsigned long size)
+{
+	return vmalloc(size);
+}
+
+EXPORT_SYMBOL(vmalloc_best);
+
+void *ub_vmalloc_best(unsigned long size)
+{
+	return ub_vmalloc(size);
+}
+
+EXPORT_SYMBOL(ub_vmalloc_best);
+
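+/*
+ * Note: ub_vmalloc() is vmalloc() with GFP_KERNEL_UBC, i.e. the pages are
+ * charged to the caller's beancounter; the *_best() variants are plain
+ * aliases as defined above.
+ */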
 /**
  *	vzalloc - allocate virtually contiguous memory with zero fill
  *	@size:	allocation size
@@ -1803,6 +1830,13 @@ void *vmalloc_node(unsigned long size, i
 }
 EXPORT_SYMBOL(vmalloc_node);
 
+void *ub_vmalloc_node(unsigned long size, int node)
+{
+	return __vmalloc_node(size, 1, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL,
+					node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ub_vmalloc_node);
+
 /**
  * vzalloc_node - allocate memory on a specific node with zero fill
  * @size:	allocation size
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/vmscan.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/vmscan.c
--- linux-2.6.32-504.3.3.el6.orig/mm/vmscan.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/vmscan.c	2015-01-21 12:02:58.891810455 +0300
@@ -12,6 +12,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/mmgang.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
@@ -26,6 +27,7 @@
 #include <linux/buffer_head.h>	/* for try_to_release_page(),
 					buffer_heads_over_limit */
 #include <linux/mm_inline.h>
+#include <linux/mmgang.h>
 #include <linux/pagevec.h>
 #include <linux/backing-dev.h>
 #include <linux/rmap.h>
@@ -48,9 +50,12 @@
 #include <asm/div64.h>
 
 #include <linux/swapops.h>
+#include <linux/vzstat.h>
 
 #include "internal.h"
 
+#include <bc/dcache.h>
+
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -58,39 +63,62 @@ struct scan_control {
 	/* Number of pages freed so far during a call to shrink_zones() */
 	unsigned long nr_reclaimed;
 
+	/* Reclaimed swapbacked pages */
+	unsigned long nr_reclaim_swapout;
+
 	/* How many pages shrink_list() should reclaim */
 	unsigned long nr_to_reclaim;
 
-	unsigned long hibernation_mode;
-
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
-	int may_writepage;
+	unsigned hibernation_mode:1;
+
+	unsigned may_writepage:1;
 
 	/* Can mapped pages be reclaimed? */
-	int may_unmap;
+	unsigned may_unmap:1;
 
 	/* Can pages be swapped as part of reclaim? */
-	int may_swap;
+	unsigned may_swap:1;
 
-	int swappiness;
+	unsigned near_oom:1;
+
+	unsigned all_unreclaimable:1;
+
+	/* Can move anon pages to shadow lru */
+	unsigned may_shade_anon:1;
+
+	/* Can move file pages to shadow lru */
+	unsigned may_shade_file:1;
 
-	int all_unreclaimable;
+	/* Use virtual swap for anonymous pages */
+	unsigned use_vswap:1;
+
+	int swappiness;
 
 	int order;
 
-	/*
-	 * The memory cgroup that hit its limit and as a result is the
-	 * primary target of this reclaim invocation.
-	 */
-	struct mem_cgroup *target_mem_cgroup;
+	/* Topmost priority for current invocation */
+	int max_priority;
+
+	/* Scan (total_size >> priority) pages at once */
+	int priority;
+
+	/* Reclaim this gang-set */
+	struct gang_set *gs;
 
 	/*
 	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
 	 * are scanned.
 	 */
 	nodemask_t	*nodemask;
+
+	/*
+	 * The memory cgroup that hit its limit and as a result is the
+	 * primary target of this reclaim invocation.
+	 */
+	struct mem_cgroup *target_mem_cgroup;
 };
 
 struct mem_cgroup_zone {
@@ -133,56 +161,30 @@ struct mem_cgroup_zone {
  */
 int vm_swappiness = 60;
 unsigned long vm_total_pages;	/* The total number of pages which the VM controls */
+int vm_sync_reclaim = 0;
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-static bool global_reclaim(struct scan_control *sc)
-{
-	return !sc->target_mem_cgroup;
-}
+static unsigned long
+reclaimable_pages(struct lruvec *lruvec, struct scan_control *sc);
 
-static bool scanning_global_lru(struct mem_cgroup_zone *mz)
-{
-	return !mz->mem_cgroup;
-}
-#else
 static bool global_reclaim(struct scan_control *sc)
 {
-	return true;
-}
-
-static bool scanning_global_lru(struct mem_cgroup_zone *mz)
-{
-	return true;
+	return sc->gs == NULL;
 }
-#endif
 
-static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
+static struct lruvec *local_lruvec(struct zone *zone, struct scan_control *sc)
 {
-	if (!scanning_global_lru(mz))
-		return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
-
-	return &mz->zone->reclaim_stat;
+	return &mem_zone_gang(sc->gs, zone)->lruvec;
 }
 
-static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
-				       enum lru_list lru)
-{
-	if (!scanning_global_lru(mz))
-		return mem_cgroup_zone_nr_pages(mz->mem_cgroup, mz->zone, lru);
-
-	return zone_page_state(mz->zone, NR_LRU_BASE + lru);
-}
-
-
 /*
  * Add a shrinker callback to be called from the vm
  */
 void register_shrinker(struct shrinker *shrinker)
 {
-	shrinker->nr = 0;
+	atomic_long_set(&shrinker->nr_in_batch, 0);
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
@@ -229,6 +231,13 @@ unsigned long shrink_slab(unsigned long 
 	if (scanned == 0)
 		scanned = SWAP_CLUSTER_MAX;
 
+	/* Disable fs-related IO for direct reclaim */
+	if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
+		gfp_mask &= ~__GFP_FS;
+
+	if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+		return 0;
+
 	if (!down_read_trylock(&shrinker_rwsem)) {
 		/* Assume we'll be able to shrink next time */
 		ret = 1;
@@ -237,19 +246,24 @@ unsigned long shrink_slab(unsigned long 
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
 		unsigned long long delta;
-		unsigned long total_scan;
-		unsigned long max_pass;
+		long max_pass, total_scan;
 
 		max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
+		if (max_pass <= 0)
+			continue;
+
+		total_scan = atomic_long_xchg(&shrinker->nr_in_batch, 0);
+
 		delta = (4 * scanned) / shrinker->seeks;
 		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
-		shrinker->nr += delta;
-		if (shrinker->nr < 0) {
+		total_scan += delta;
+
+		if (total_scan < 0) {
 			printk(KERN_ERR "shrink_slab: %pF negative objects to "
 			       "delete nr=%ld\n",
-			       shrinker->shrink, shrinker->nr);
-			shrinker->nr = max_pass;
+			       shrinker->shrink, total_scan);
+			total_scan = max_pass;
 		}
 
 		/*
@@ -257,17 +271,17 @@ unsigned long shrink_slab(unsigned long 
 		 * never try to free more than twice the estimate number of
 		 * freeable entries.
 		 */
-		if (shrinker->nr > max_pass * 2)
-			shrinker->nr = max_pass * 2;
-
-		total_scan = shrinker->nr;
-		shrinker->nr = 0;
+		if (total_scan > max_pass * 2)
+			total_scan = max_pass * 2;
 
 		while (total_scan >= SHRINK_BATCH) {
 			long this_scan = SHRINK_BATCH;
 			int shrink_ret;
 			int nr_before;
 
+			if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+				goto done;
+
 			nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
 			shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
 								gfp_mask);
@@ -281,8 +295,9 @@ unsigned long shrink_slab(unsigned long 
 			cond_resched();
 		}
 
-		shrinker->nr += total_scan;
+		atomic_long_add(total_scan, &shrinker->nr_in_batch);
 	}
+done:
 	up_read(&shrinker_rwsem);
 out:
 	cond_resched();
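For reference, a minimal shrinker sketch under the batched protocol above
(the demo_* names are hypothetical): a nr_to_scan of 0 is a query that must
return the number of freeable objects, and a non-positive count makes
shrink_slab() skip the shrinker.

	static int demo_shrink(struct shrinker *shrink, int nr_to_scan,
			       gfp_t gfp_mask)
	{
		if (nr_to_scan)
			demo_free_objects(nr_to_scan);	/* hypothetical */
		return demo_count_objects();		/* hypothetical */
	}

	static struct shrinker demo_shrinker = {
		.shrink	= demo_shrink,
		.seeks	= DEFAULT_SEEKS,
	};
	/* registered once with register_shrinker(&demo_shrinker); */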
@@ -599,9 +614,8 @@ enum page_references {
 	PAGEREF_ACTIVATE,
 };
 
-static enum page_references page_check_references(struct page *page,
-						  struct mem_cgroup_zone *mz,
-						  struct scan_control *sc)
+static enum page_references
+page_check_references(struct page *page, struct scan_control *sc)
 {
 	int referenced_ptes, referenced_page;
 	unsigned long vm_flags;
@@ -618,6 +632,9 @@ static enum page_references page_check_r
 		return PAGEREF_RECLAIM;
 
 	if (referenced_ptes) {
+		if (sc->near_oom)
+			return PAGEREF_KEEP;
+
 		if (PageAnon(page))
 			return PAGEREF_ACTIVATE;
 		/*
@@ -636,7 +653,13 @@ static enum page_references page_check_r
 		 */
 		SetPageReferenced(page);
 
-		if (referenced_page)
+		if (referenced_page || referenced_ptes > 1)
+			return PAGEREF_ACTIVATE;
+
+		/*
+		 * Activate file-backed executable pages after first usage.
+		 */
+		if (vm_flags & VM_EXEC)
 			return PAGEREF_ACTIVATE;
 
 		return PAGEREF_KEEP;
@@ -653,15 +676,14 @@ static enum page_references page_check_r
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
+					struct zone *zone,
 					struct scan_control *sc,
-					struct mem_cgroup_zone *mz,
 					enum pageout_io sync_writeback,
-					int priority,
 					unsigned long *ret_nr_dirty,
 					unsigned long *ret_nr_writeback)
 {
 	LIST_HEAD(ret_pages);
-	struct pagevec freed_pvec;
+	LIST_HEAD(free_pages);
 	int pgactivate = 0;
 	unsigned long nr_dirty = 0;
 	unsigned long nr_congested = 0;
@@ -670,12 +692,12 @@ static unsigned long shrink_page_list(st
 
 	cond_resched();
 
-	pagevec_init(&freed_pvec, 1);
 	while (!list_empty(page_list)) {
 		enum page_references references;
 		struct address_space *mapping;
 		struct page *page;
 		int may_enter_fs;
+		bool may_shade;
 
 		cond_resched();
 
@@ -686,7 +708,7 @@ static unsigned long shrink_page_list(st
 			goto keep;
 
 		VM_BUG_ON(PageActive(page));
-		VM_BUG_ON(page_zone(page) != mz->zone);
+		VM_BUG_ON(page_zone(page) != zone);
 
 		sc->nr_scanned++;
 
@@ -703,7 +725,10 @@ static unsigned long shrink_page_list(st
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
-		if (PageWriteback(page)) {
+		may_shade = PageSwapBacked(page) ? sc->may_shade_anon
+						 : sc->may_shade_file;
+
+		if (!may_shade && PageWriteback(page)) {
 			/*
 			 * memcg doesn't have any dirty pages throttling so we
 			 * could easily OOM just because too many pages are in
@@ -741,7 +766,7 @@ static unsigned long shrink_page_list(st
 			wait_on_page_writeback(page);
 		}
 
-		references = page_check_references(page, mz, sc);
+		references = page_check_references(page, sc);
 		switch (references) {
 		case PAGEREF_ACTIVATE:
 			goto activate_locked;
@@ -757,11 +782,22 @@ static unsigned long shrink_page_list(st
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!(sc->gfp_mask & __GFP_IO))
-				goto keep_locked;
-			if (!add_to_swap(page))
-				goto activate_locked;
-			may_enter_fs = 1;
+			if (sc->use_vswap && SWP_VSWAP_NUM) {
+				switch (add_to_vswap(page)) {
+				case SWAP_FAIL:
+					goto activate_locked;
+				case SWAP_AGAIN:
+					goto keep_locked;
+				case SWAP_MLOCK:
+					goto cull_mlocked;
+				}
+			} else {
+				if (!(sc->gfp_mask & __GFP_IO))
+					goto keep_locked;
+				if (!add_to_swap(page, get_gang_ub(page_gang(page))))
+					goto activate_locked;
+				may_enter_fs = 1;
+			}
 		}
 
 		mapping = page_mapping(page);
@@ -783,6 +819,19 @@ static unsigned long shrink_page_list(st
 			}
 		}
 
+		if (may_shade) {
+			/* move page to shadow gang */
+			if (gang_mod_shadow_page(page))
+				goto keep_locked;
+
+			if (PageSwapBacked(page))
+				/* account vswapout */
+				sc->nr_reclaim_swapout++;
+
+			nr_reclaimed++;
+			goto keep_locked;
+		}
+
 		if (PageDirty(page)) {
 			nr_dirty++;
 
@@ -792,7 +841,8 @@ static unsigned long shrink_page_list(st
 			 * unless under significant pressure.
 			 */
 			if (page_is_file_cache(page) &&
-					(!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+			    (!current_is_kswapd() ||
+			     sc->priority >= sc->max_priority - 2)) {
 				/*
 				 * Immediately reclaim when written back.
 				 * Similar in principal to deactivate_page()
@@ -889,11 +939,14 @@ static unsigned long shrink_page_list(st
 		 */
 		__clear_page_locked(page);
 free_it:
+		gang_del_user_page(page);
 		nr_reclaimed++;
-		if (!pagevec_add(&freed_pvec, page)) {
-			__pagevec_free(&freed_pvec);
-			pagevec_reinit(&freed_pvec);
-		}
+
+		/*
+		 * Is there need to periodically free_page_list? It would
+		 * appear not as the counts should be low
+		 */
+		list_add(&page->lru, &free_pages);
 		continue;
 
 cull_mlocked:
@@ -905,7 +958,8 @@ cull_mlocked:
 
 activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
-		if (PageSwapCache(page) && vm_swap_full())
+		if (PageSwapCache(page) && (vm_swap_full() ||
+			(sc->gs && ub_swap_full(get_gangs_ub(sc->gs)))))
 			try_to_free_swap(page);
 		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
@@ -924,11 +978,11 @@ keep:
 	 * will encounter the same problem
 	 */
 	if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
-		zone_set_flag(mz->zone, ZONE_CONGESTED);
+		zone_set_flag(zone, ZONE_CONGESTED);
+
+	free_hot_cold_page_list(&free_pages, 1);
 
 	list_splice(&ret_pages, page_list);
-	if (pagevec_count(&freed_pvec))
-		__pagevec_free(&freed_pvec);
 	count_vm_events(PGACTIVATE, pgactivate);
 	trace_mm_pagereclaim_free(nr_reclaimed);
         *ret_nr_dirty += nr_dirty;
@@ -946,7 +1000,8 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file,
+			struct lruvec **locked)
 {
 	bool all_lru_mode;
 	int ret = -EINVAL;
@@ -1013,7 +1068,14 @@ int __isolate_lru_page(struct page *page
 		}
 	}
 
+	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+		return ret;
+
 	if (likely(get_page_unless_zero(page))) {
+		if (locked && !try_relock_page_lru(locked, page)) {
+			put_page(page);
+			return -EINVAL;
+		}
 		/*
 		 * Be careful not to clear PageLRU until after we're
 		 * sure the page is not being freed elsewhere -- the
@@ -1047,31 +1109,42 @@ int __isolate_lru_page(struct page *page
  * returns how many pages were moved onto *@dst.
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
-		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, isolate_mode_t mode,
-		int file)
+		struct lruvec *lruvec, struct list_head *dst,
+		unsigned long *nr_scanned, struct scan_control *sc,
+		isolate_mode_t mode, enum lru_list lru)
 {
+	struct list_head *src = &lruvec->lru_list[lru];
+	struct lruvec *locked = lruvec;
 	unsigned long nr_taken = 0;
 	unsigned long nr_lumpy_taken = 0, nr_lumpy_dirty = 0, nr_lumpy_failed = 0;
 	unsigned long scan;
+	int file = is_file_lru(lru);
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 		struct page *page;
 		unsigned long pfn;
 		unsigned long end_pfn;
 		unsigned long page_pfn;
-		int zone_id;
+		int zone_id, isolated_pages;
 
 		page = lru_to_page(src);
+
+		if (is_lru_milestone(lruvec, &page->lru)) {
+			remove_lru_milestone(lruvec, lru);
+			continue;
+		}
+
 		prefetchw_prev_lru_page(page, src, flags);
 
 		VM_BUG_ON(!PageLRU(page));
 
-		switch (__isolate_lru_page(page, mode, file)) {
+		switch (__isolate_lru_page(page, mode, file, NULL)) {
 		case 0:
 			mem_cgroup_lru_del(page);
 			list_move(&page->lru, dst);
-			nr_taken += hpage_nr_pages(page);
+			isolated_pages = hpage_nr_pages(page);
+			lruvec->nr_pages[lru] -= isolated_pages;
+			nr_taken += isolated_pages;
 			break;
 
 		case -EBUSY:
@@ -1083,7 +1156,7 @@ static unsigned long isolate_lru_pages(u
 			BUG();
 		}
 
-		if (!order)
+		if (COMPACTION_BUILD || !sc->order)
 			continue;
 
 		/*
@@ -1097,8 +1170,8 @@ static unsigned long isolate_lru_pages(u
 		 */
 		zone_id = page_zone_id(page);
 		page_pfn = page_to_pfn(page);
-		pfn = page_pfn & ~((1 << order) - 1);
-		end_pfn = pfn + (1 << order);
+		pfn = page_pfn & ~((1 << sc->order) - 1);
+		end_pfn = pfn + (1 << sc->order);
 		for (; pfn < end_pfn; pfn++) {
 			struct page *cursor_page;
 
@@ -1125,12 +1198,14 @@ static unsigned long isolate_lru_pages(u
 			    !PageSwapCache(cursor_page))
 				break;
 
-			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
-				unsigned int isolated_pages;
+			if (!PageLRU(cursor_page))
+				continue;
 
+			if (__isolate_lru_page(cursor_page, mode, file, &locked) == 0) {
 				mem_cgroup_lru_del(cursor_page);
 				list_move(&cursor_page->lru, dst);
 				isolated_pages = hpage_nr_pages(cursor_page);
+				locked->nr_pages[page_lru(cursor_page)] -= isolated_pages;
 				nr_taken += isolated_pages;
 				nr_lumpy_taken += isolated_pages;
 				if (PageDirty(cursor_page))
@@ -1160,11 +1235,14 @@ static unsigned long isolate_lru_pages(u
 		/* If we break out of the loop above, lumpy reclaim failed */
 		if (pfn < end_pfn)
 			nr_lumpy_failed++;
+
+		/* Switch back to target lruvec */
+		locked = relock_lruvec(locked, lruvec);
 	}
 
-	*scanned = scan;
+	*nr_scanned = max(scan, 1ul);
 
-	trace_mm_vmscan_lru_isolate(order,
+	trace_mm_vmscan_lru_isolate(sc->order,
 			nr_to_scan, scan,
 			nr_taken,
 			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
@@ -1172,23 +1250,6 @@ static unsigned long isolate_lru_pages(u
 	return nr_taken;
 }
 
-static unsigned long isolate_pages(unsigned long nr, struct mem_cgroup_zone *mz,
-				   struct list_head *dst,
-				   unsigned long *scanned, int order,
-				   isolate_mode_t mode, int active, int file)
-{
-	struct lruvec *lruvec;
-	int lru = LRU_BASE;
-
-	lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
-	if (active)
-		lru += LRU_ACTIVE;
-	if (file)
-		lru += LRU_FILE;
-	return isolate_lru_pages(nr, &lruvec->lists[lru], dst,
-				 scanned, order, mode, file);
-}
-
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
@@ -1244,17 +1305,18 @@ int isolate_lru_page(struct page *page)
 	int ret = -EBUSY;
 
 	if (PageLRU(page)) {
-		struct zone *zone = page_zone(page);
+		struct lruvec *lruvec;
 
-		spin_lock_irq(&zone->lru_lock);
+		local_irq_disable();
+		lruvec = lock_page_lru(page);
 		if (PageLRU(page) && get_page_unless_zero(page)) {
 			int lru = page_lru(page);
 			ret = 0;
 			ClearPageLRU(page);
 
-			del_page_from_lru_list(zone, page, lru);
+			del_page_from_lru_list(lruvec, page, lru);
 		}
-		spin_unlock_irq(&zone->lru_lock);
+		spin_unlock_irq(&lruvec->lru_lock);
 	}
 	return ret;
 }
@@ -1281,6 +1343,10 @@ static int too_many_isolated(struct zone
 		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
 	}
 
+	if (isolated > inactive)
+		isolated = zone_page_state_snapshot(zone, file ?
+				NR_ISOLATED_FILE : NR_ISOLATED_ANON);
+
 	return isolated > inactive;
 }
 
@@ -1288,22 +1354,20 @@ static int too_many_isolated(struct zone
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
  */
-static unsigned long shrink_inactive_list(unsigned long max_scan,
-			struct mem_cgroup_zone *mz, struct scan_control *sc,
-			int priority, int file)
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+		     struct scan_control *sc, enum lru_list lru)
 {
 	LIST_HEAD(page_list);
-	struct pagevec pvec;
+	LIST_HEAD(pages_to_free);
 	unsigned long nr_scanned = 0;
 	unsigned long nr_reclaimed = 0;
-        unsigned long nr_dirty = 0;
-        unsigned long nr_writeback = 0;
-	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
-	struct zone *zone = mz->zone;
-	int order = 0;
-
-	if (!COMPACTION_BUILD)
-		order = sc->order;
+	unsigned long nr_dirty = 0;
+	unsigned long nr_writeback = 0;
+	isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
+	struct zone *zone = lruvec_zone(lruvec);
+	int file = is_file_lru(lru);
+	struct lruvec *locked;
 
 	while (unlikely(too_many_isolated(zone, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1313,10 +1377,14 @@ static unsigned long shrink_inactive_lis
 			return SWAP_CLUSTER_MAX;
 	}
 
-	pagevec_init(&pvec, 1);
-
 	lru_add_drain();
-	spin_lock_irq(&zone->lru_lock);
+
+	if (!sc->may_unmap)
+		reclaim_mode |= ISOLATE_UNMAPPED;
+	if (!sc->may_writepage)
+		reclaim_mode |= ISOLATE_CLEAN;
+
+	spin_lock_irq(&lruvec->lru_lock);
 	do {
 		struct page *page;
 		unsigned long nr_taken;
@@ -1327,17 +1395,18 @@ static unsigned long shrink_inactive_lis
 		unsigned long nr_anon;
 		unsigned long nr_file;
 
-		nr_taken = isolate_pages(SWAP_CLUSTER_MAX, mz, &page_list,
-					 &nr_scan, order,
-					 ISOLATE_INACTIVE, 0, file);
+		nr_taken = isolate_lru_pages(SWAP_CLUSTER_MAX, lruvec, &page_list,
+					 &nr_scan, sc, reclaim_mode, lru);
 		if (global_reclaim(sc)) {
-			zone->pages_scanned += nr_scan;
+			atomic_long_add(nr_scan, &zone->pages_scanned);
 			if (current_is_kswapd())
 				__count_zone_vm_events(PGSCAN_KSWAPD, zone,
 						       nr_scan);
 			else
 				__count_zone_vm_events(PGSCAN_DIRECT, zone,
 						       nr_scan);
+		} else {
+			atomic_long_add(nr_scan, &lruvec->pages_scanned);
 		}
 
 		if (nr_taken == 0)
@@ -1360,16 +1429,16 @@ static unsigned long shrink_inactive_lis
 		__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
 		__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
 
-		reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
-		reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
-		reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
-		reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
+		lruvec->recent_scanned[0] += count[LRU_INACTIVE_ANON];
+		lruvec->recent_scanned[0] += count[LRU_ACTIVE_ANON];
+		lruvec->recent_scanned[1] += count[LRU_INACTIVE_FILE];
+		lruvec->recent_scanned[1] += count[LRU_ACTIVE_FILE];
 
-		spin_unlock_irq(&zone->lru_lock);
+		spin_unlock_irq(&lruvec->lru_lock);
 
 		nr_scanned += nr_scan;
-		nr_freed = shrink_page_list(&page_list, sc, mz,
-					PAGEOUT_IO_ASYNC, priority,
+		nr_freed = shrink_page_list(&page_list, zone, sc,
+					PAGEOUT_IO_ASYNC,
 					&nr_dirty, &nr_writeback);
 
 		nr_reclaimed += nr_freed;
@@ -1379,38 +1448,58 @@ static unsigned long shrink_inactive_lis
 			__count_vm_events(KSWAPD_STEAL, nr_freed);
 		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
 
-		spin_lock(&zone->lru_lock);
 		/*
 		 * Put back any unfreeable pages.
 		 */
+		locked = NULL;
+		memset(count, 0, sizeof(count));
+		local_irq_disable();
 		while (!list_empty(&page_list)) {
 			int lru;
 			page = lru_to_page(&page_list);
 			VM_BUG_ON(PageLRU(page));
 			list_del(&page->lru);
 			if (unlikely(!page_evictable(page, NULL))) {
-				spin_unlock_irq(&zone->lru_lock);
+				unlock_lruvec(locked);
+				locked = NULL;
+				local_irq_enable();
 				putback_lru_page(page);
-				spin_lock_irq(&zone->lru_lock);
+				local_irq_disable();
 				continue;
 			}
+			locked = relock_lruvec(locked, __page_lruvec(page));
 			SetPageLRU(page);
 			lru = page_lru(page);
-			add_page_to_lru_list(zone, page, lru);
-			if (is_active_lru(lru)) {
-				int file = is_file_lru(lru);
-				int numpages = hpage_nr_pages(page);
-				reclaim_stat->recent_rotated[file] += numpages;
-			}
-			if (!pagevec_add(&pvec, page)) {
-				spin_unlock_irq(&zone->lru_lock);
-				__pagevec_release(&pvec);
-				spin_lock_irq(&zone->lru_lock);
+			add_page_to_lru_list(locked, page, lru);
+			/* XXX - mess with active/inactive? */
+			if (locked == lruvec)
+				count[lru] += hpage_nr_pages(page);
+			if (put_page_testzero(page)) {
+				__ClearPageLRU(page);
+				__ClearPageActive(page);
+				del_page_from_lru_list(locked, page, lru);
+				gang_del_user_page(page);
+
+				if (unlikely(PageCompound(page))) {
+					spin_unlock_irq(&locked->lru_lock);
+					(*get_compound_page_dtor(page))(page);
+					locked = NULL;
+					local_irq_disable();
+				} else
+					list_add(&page->lru, &pages_to_free);
 			}
 		}
+
+		locked = relock_lruvec(locked, lruvec);
+
 		__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
 		__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
 
+		lruvec->recent_rotated[0] += count[LRU_ACTIVE_ANON];
+		lruvec->recent_rotated[0] += count[LRU_INACTIVE_ANON];
+		lruvec->recent_rotated[1] += count[LRU_ACTIVE_FILE];
+		lruvec->recent_rotated[1] += count[LRU_INACTIVE_FILE];
+
 		/*
 		 * If reclaim is isolating dirty pages under writeback, it implies
 		 * that the long-lived page allocation rate is exceeding the page
@@ -1435,36 +1524,22 @@ static unsigned long shrink_inactive_lis
 		 *                     isolated page is PageWriteback
 		 */
 		if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY-priority))) {
-			spin_unlock_irq(&zone->lru_lock);
+			(nr_taken >> (sc->max_priority - sc->priority))) {
+			spin_unlock_irq(&lruvec->lru_lock);
 			wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
-			spin_lock_irq(&zone->lru_lock);
+			spin_lock_irq(&lruvec->lru_lock);
 		}
-  	} while (nr_scanned < max_scan);
+  	} while (nr_scanned < nr_to_scan);
 
 done:
-	spin_unlock_irq(&zone->lru_lock);
-	pagevec_release(&pvec);
+	spin_unlock_irq(&lruvec->lru_lock);
+	free_hot_cold_page_list(&pages_to_free, 1);
 	trace_mm_pagereclaim_shrinkinactive(nr_scanned, file, 
-				nr_reclaimed, priority);
+				nr_reclaimed, sc->priority);
 	return nr_reclaimed;
 }
 
 /*
- * We are about to scan this zone at a certain priority level.  If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone.  This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
-	if (priority < zone->prev_priority)
-		zone->prev_priority = priority;
-}
-
-/*
  * This moves pages from the active list to the inactive list.
  *
  * We move them the other way if the page is referenced by one or more
@@ -1482,45 +1557,70 @@ static inline void note_zone_scanning_pr
  * But we had to alter page->flags anyway.
  */
 
-static void move_active_pages_to_lru(struct zone *zone,
+static void move_active_pages_to_lru(struct zone *zone, struct lruvec *lruvec,
 				     struct list_head *list,
+				     struct list_head *pages_to_free,
 				     enum lru_list lru)
 {
 	unsigned long pgmoved = 0;
-	struct pagevec pvec;
 	struct page *page;
+	struct lruvec *locked = lruvec;
 
-	pagevec_init(&pvec, 1);
+	if (is_file_lru(lru) && buffer_heads_over_limit) {
+		spin_unlock_irq(&locked->lru_lock);
+		list_for_each_entry(page, list, lru) {
+			if (page_has_private(page) && trylock_page(page)) {
+				if (page_has_private(page))
+					try_to_release_page(page, 0);
+				unlock_page(page);
+			}
+		}
+		spin_lock_irq(&locked->lru_lock);
+	}
 
 	while (!list_empty(list)) {
-		struct lruvec *lruvec;
+		int numpages;
 
 		page = lru_to_page(list);
 
+		if (!COMPACTION_BUILD)
+			locked = relock_lruvec(locked, __page_lruvec(page));
+
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
 
-		lruvec = mem_cgroup_lru_add_list(zone, page, lru);
-		list_move(&page->lru, &lruvec->lists[lru]);
-		pgmoved += hpage_nr_pages(page);
-
-		if (!pagevec_add(&pvec, page) || list_empty(list)) {
-			spin_unlock_irq(&zone->lru_lock);
-			if (buffer_heads_over_limit)
-				pagevec_strip(&pvec);
-			__pagevec_release(&pvec);
-			spin_lock_irq(&zone->lru_lock);
+		list_move(&page->lru, &locked->lru_list[lru]);
+
+		numpages = hpage_nr_pages(page);
+		locked->nr_pages[lru] += numpages;
+		pgmoved += numpages;
+
+		if (put_page_testzero(page)) {
+			__ClearPageLRU(page);
+			__ClearPageActive(page);
+			del_page_from_lru_list(locked, page, lru);
+			gang_del_user_page(page);
+
+			if (unlikely(PageCompound(page))) {
+				spin_unlock_irq(&locked->lru_lock);
+				(*get_compound_page_dtor(page))(page);
+				spin_lock_irq(&lruvec->lru_lock);
+				locked = lruvec;
+			} else
+				list_add(&page->lru, pages_to_free);
 		}
 	}
 	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 	if (!is_active_lru(lru))
 		__count_vm_events(PGDEACTIVATE, pgmoved);
+	if (!COMPACTION_BUILD)
+		locked = relock_lruvec(locked, lruvec);
 }
 
-static void shrink_active_list(unsigned long nr_pages,
-			       struct mem_cgroup_zone *mz,
+static void shrink_active_list(unsigned long nr_to_scan,
+			       struct lruvec *lruvec,
 			       struct scan_control *sc,
-			       int priority, int file)
+			       enum lru_list lru)
 {
 	unsigned long nr_taken;
 	unsigned long pgscanned;
@@ -1529,33 +1629,35 @@ static void shrink_active_list(unsigned 
 	LIST_HEAD(l_active);
 	LIST_HEAD(l_inactive);
 	struct page *page;
-	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
 	unsigned long nr_rotated = 0;
-	int order = 0;
-	struct zone *zone = mz->zone;
-
-	if (!COMPACTION_BUILD)
-		order = sc->order;
+	isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
+	int file = is_file_lru(lru);
+	struct zone *zone = lruvec_zone(lruvec);
 
+	{KSTAT_PERF_ENTER(refill_inact)
 	lru_add_drain();
-	spin_lock_irq(&zone->lru_lock);
 
-	nr_taken = isolate_pages(nr_pages, mz, &l_hold,
-				 &pgscanned, order,
-				 ISOLATE_ACTIVE, 1, file);
+	if (!sc->may_unmap)
+		reclaim_mode |= ISOLATE_UNMAPPED;
+	if (!sc->may_writepage)
+		reclaim_mode |= ISOLATE_CLEAN;
+
+	spin_lock_irq(&lruvec->lru_lock);
+
+	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+				 &pgscanned, sc, reclaim_mode, lru);
 
 	if (global_reclaim(sc))
-		zone->pages_scanned += pgscanned;
+		atomic_long_add(nr_taken, &zone->pages_scanned);
+	else
+		atomic_long_add(nr_taken, &lruvec->pages_scanned);
 
-	reclaim_stat->recent_scanned[file] += nr_taken;
+	lruvec->recent_scanned[file] += nr_taken;
 
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
-	if (file)
-		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
-	else
-		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
-	spin_unlock_irq(&zone->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	while (!list_empty(&l_hold)) {
 		cond_resched();
@@ -1589,70 +1691,75 @@ static void shrink_active_list(unsigned 
 		list_add(&page->lru, &l_inactive);
 	}
 
-	/*
-	 * Move pages back to the lru list.
-	 */
-	spin_lock_irq(&zone->lru_lock);
+	spin_lock_irq(&lruvec->lru_lock);
 	/*
 	 * Count referenced pages from currently used mappings as rotated,
 	 * even though only some of them are actually re-activated.  This
 	 * helps balance scan pressure between file and anonymous pages in
 	 * get_scan_ratio.
 	 */
-	reclaim_stat->recent_rotated[file] += nr_rotated;
+	lruvec->recent_rotated[file] += nr_rotated;
 
-	move_active_pages_to_lru(zone, &l_active,
+	move_active_pages_to_lru(zone, lruvec, &l_active, &l_hold,
 						LRU_ACTIVE + file * LRU_FILE);
-	move_active_pages_to_lru(zone, &l_inactive,
+	move_active_pages_to_lru(zone, lruvec, &l_inactive, &l_hold,
 						LRU_BASE   + file * LRU_FILE);
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
-	spin_unlock_irq(&zone->lru_lock);
-	trace_mm_pagereclaim_shrinkactive(pgscanned, file, priority);  
-}
-
-static int inactive_anon_is_low_global(struct zone *zone)
-{
-	unsigned long active, inactive;
+	spin_unlock_irq(&lruvec->lru_lock);
 
-	active = zone_page_state(zone, NR_ACTIVE_ANON);
-	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+	free_hot_cold_page_list(&l_hold, 1);
 
-	if (inactive * zone->inactive_ratio < active)
-		return 1;
-
-	return 0;
+	trace_mm_pagereclaim_shrinkactive(pgscanned, file, sc->priority);
+	KSTAT_PERF_LEAVE(refill_inact)}
 }
 
 /**
  * inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @zone: zone to check
- * @sc:   scan control of this context
+ * @lruvec: lruvec whose anon lists to check
  *
  * Returns true if the zone does not have enough inactive anon pages,
  * meaning some active anon pages need to be deactivated.
+ *
+ * The inactive anon list should be small enough that the VM never has to
+ * do too much work, but large enough that each inactive page has a chance
+ * to be referenced again before it is swapped out.
+ *
+ * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
+ * INACTIVE_ANON pages on this zone's LRU, maintained by the
+ * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
+ * the anonymous pages are kept on the inactive list.
+ *
+ * total     target    max
+ * memory    ratio     inactive anon
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
  */
-static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
-{
-	if (!scanning_global_lru(mz))
-		return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
-						       mz->zone);
-
-	return inactive_anon_is_low_global(mz->zone);
-}
-
-static int inactive_file_is_low_global(struct zone *zone)
+static int inactive_anon_is_low(struct lruvec *lruvec)
 {
 	unsigned long active, inactive;
+	unsigned long gb, inactive_ratio;
 
-	active = zone_page_state(zone, NR_ACTIVE_FILE);
-	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+	active = lruvec->nr_pages[LRU_ACTIVE_ANON];
+	inactive = lruvec->nr_pages[LRU_INACTIVE_ANON];
 
-	return (active > inactive);
+	gb = (inactive + active) >> (30 - PAGE_SHIFT);
+	if (gb)
+		inactive_ratio = int_sqrt(10 * gb);
+	else
+		inactive_ratio = 1;
+
+	return inactive * inactive_ratio < active;
 }
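+/*
+ * Sanity check of the math above against the table (illustrative,
+ * assuming PAGE_SHIFT == 12): with 1GB of anon pages gb == 1 and
+ * inactive_ratio == int_sqrt(10) == 3; with 100GB gb == 100 and
+ * inactive_ratio == int_sqrt(1000) == 31 -- matching the 1GB and
+ * 100GB rows of the target ratio table.
+ */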
 
 /**
  * inactive_file_is_low - check if file pages need to be deactivated
- * @mz: memory cgroup and zone to check
+ * @lruvec: lruvec whose file lists to check
  *
  * When the system is doing streaming IO, memory pressure here
  * ensures that active file pages get deactivated, until more
@@ -1664,37 +1771,116 @@ static int inactive_file_is_low_global(s
  * This uses a different ratio than the anonymous pages, because
  * the page cache uses a use-once replacement algorithm.
  */
-static int inactive_file_is_low(struct mem_cgroup_zone *mz)
+static int inactive_file_is_low(struct lruvec *lruvec)
 {
-	if (!scanning_global_lru(mz))
-		return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
-						       mz->zone);
+	unsigned long active, inactive;
 
-	return inactive_file_is_low_global(mz->zone);
+	active = lruvec->nr_pages[LRU_ACTIVE_FILE];
+	inactive = lruvec->nr_pages[LRU_INACTIVE_FILE];
+
+	return (active > inactive);
 }
 
-static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
+static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
 {
-	if (file)
-		return inactive_file_is_low(mz);
+	if (is_file_lru(lru))
+		return inactive_file_is_low(lruvec);
 	else
-		return inactive_anon_is_low(mz);
+		return inactive_anon_is_low(lruvec);
 }
 
-static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-				 struct mem_cgroup_zone *mz,
-				 struct scan_control *sc, int priority)
+#ifdef CONFIG_MEMORY_GANGS
+/*
+ * For an active list, returns true if it is older than the
+ * corresponding inactive list; inactive lists are never reported
+ * as old here.
+ */
+static bool lru_list_is_old(struct lruvec *lruvec, enum lru_list lru)
 {
-	int file = is_file_lru(lru);
+	struct gang *gang = lruvec_gang(lruvec);
+
+	if (is_active_lru(lru))
+		return time_before(gang->timestamp[lru],
+				   gang->timestamp[lru - LRU_ACTIVE]);
+	else
+		return false;
+}
+
+static inline int zone_max_priority(struct zone *zone)
+{
+	return find_last_bit(zone->vmscan_mask, NR_VMSCAN_PRIORITIES);
+}
 
+#else /* CONFIG_MEMORY_GANGS */
+static bool lru_list_is_old(struct lruvec *lruvec, enum lru_list lru)
+{
+	return false;
+}
+
+static inline int zone_max_priority(struct zone *zone)
+{
+	return DEF_PRIORITY;
+}
+
+#endif /* CONFIG_MEMORY_GANGS */
+
+static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
+				 struct lruvec *lruvec, struct scan_control *sc)
+{
 	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(mz, file))
-			shrink_active_list(nr_to_scan, mz, sc, priority, file);
+		if (lru_list_is_old(lruvec, lru) ||
+		    inactive_list_is_low(lruvec, lru))
+		    shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		return 0;
 	}
 
-	return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
+	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
+}
+
+static inline int no_swap_space(struct scan_control *sc)
+{
+	if (sc->gs && !ub_resource_excess(get_gangs_ub(sc->gs),
+				UB_SWAPPAGES, UB_SOFT))
+		return 1;
+	if (sc->use_vswap && SWP_VSWAP_NUM)
+		return 0;
+	if (nr_swap_pages <= 0)
+		return 1;
+	return 0;
+}
+
+/*
+ * Smallish @nr_to_scan values are deposited in @nr_saved_scan until
+ * SWAP_CLUSTER_MAX pages have been collected to scan in one batch.
+ */
+static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
+				       unsigned long *nr_saved_scan)
+{
+	unsigned long nr;
+
+	*nr_saved_scan += nr_to_scan;
+	nr = *nr_saved_scan;
+
+	if (nr >= SWAP_CLUSTER_MAX)
+		*nr_saved_scan = 0;
+	else
+		nr = 0;
+
+	return nr;
+}
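+/*
+ * Worked example (assuming SWAP_CLUSTER_MAX == 32): three calls with
+ * nr_to_scan == 13 accumulate 13, 26 and 39 pages in *nr_saved_scan;
+ * the first two return 0 and the third returns the whole batch of 39
+ * while resetting *nr_saved_scan, so tiny scan targets are batched
+ * instead of trickling into the lru scanners a few pages at a time.
+ */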
+
+#ifdef CONFIG_MEMORY_GANGS
+int vm_force_scan_thresh = 50;
+
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return zone->force_scan;
+}
+#else
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return false;
 }
+#endif
 
 /*
  * Determine how aggressively the anon and file LRU lists should be
@@ -1705,30 +1891,70 @@ static unsigned long shrink_list(enum lr
  * percent[0] specifies how much pressure to put on ram/swap backed
  * memory, while percent[1] determines pressure on the file LRUs.
  */
-static void get_scan_ratio(struct mem_cgroup_zone *mz, struct scan_control *sc,
-					unsigned long *percent)
+static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+							unsigned long *nr)
 {
-	unsigned long anon, file, free, zonefile;
+	struct zone *zone = lruvec_zone(lruvec);
+	unsigned long anon, file, free;
 	unsigned long anon_prio, file_prio;
 	unsigned long ap, fp;
-	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+	enum lru_list l;
+	bool force_scan = false;
+	unsigned long fraction[2], denominator;
+	int priority;
 
-	anon  = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
-		zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
-	file  = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
-		zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+	/*
+	 * If the zone or memcg is small, nr[l] can be 0.  This
+	 * results in no scanning on this priority and a potential
+	 * priority drop.  Global direct reclaim can go to the next
+	 * zone and tends to have no problems. Global kswapd is for
+	 * zone balancing and it needs to scan a minimum amount. When
+	 * reclaiming for a memcg, a priority drop can cause high
+	 * latencies, so it's better to scan a minimum amount there as
+	 * well.
+	 */
+	if (global_reclaim(sc)) {
+		if (zone_force_scan(zone) ||
+		    (current_is_kswapd() && zone_is_all_unreclaimable(zone)))
+			force_scan = true;
+		/* DEF_PRIORITY at the home priority, lowering down to zero. */
+		priority = sc->priority + (DEF_PRIORITY - lruvec->priority);
+		if (priority <= 0) {
+			priority = 0;
+			force_scan = true;
+		}
+	} else {
+		force_scan = true;
+		/* Internal reclaimer uses priorities from DEF_PRIORITY to 0 */
+		priority = sc->priority;
+	}
+
+	anon  = lruvec->nr_pages[LRU_ACTIVE_ANON] +
+		lruvec->nr_pages[LRU_INACTIVE_ANON];
+	file  = lruvec->nr_pages[LRU_ACTIVE_FILE] +
+		lruvec->nr_pages[LRU_INACTIVE_FILE];
+
+	/* If we have no swap space, do not bother scanning anon pages. */
+	if (!sc->may_swap || no_swap_space(sc)) {
+		fraction[0] = 0;
+		fraction[1] = 1;
+		denominator = 1;
+		goto out;
+	}
 
 	if (global_reclaim(sc)) {
-		free  = zone_page_state(mz->zone, NR_FREE_PAGES);
-		zonefile =
-		    zone_page_state(mz->zone, NR_LRU_BASE + LRU_ACTIVE_FILE) +
-		    zone_page_state(mz->zone, NR_LRU_BASE + LRU_INACTIVE_FILE);
-		/* If we have very few page cache pages,
-		   force-scan anon pages. */
-		if (unlikely(zonefile + free <= high_wmark_pages(mz->zone))) {
-			percent[0] = 100;
-			percent[1] = 0;
-			return;
+		free = zone_page_state(zone, NR_FREE_PAGES) +
+			zone_page_state(zone, NR_ACTIVE_FILE) +
+			zone_page_state(zone, NR_INACTIVE_FILE);
+		/*
+		 * If we have very few page cache pages,
+		 * force-scan anon pages.
+		 */
+		if (unlikely(free <= high_wmark_pages(zone))) {
+			fraction[0] = 1;
+			fraction[1] = 0;
+			denominator = 1;
+			goto out;
 		}
 	}
 
@@ -1743,18 +1969,18 @@ static void get_scan_ratio(struct mem_cg
 	 *
 	 * anon in [0], file in [1]
 	 */
-	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
-		spin_lock_irq(&mz->zone->lru_lock);
-		reclaim_stat->recent_scanned[0] /= 2;
-		reclaim_stat->recent_rotated[0] /= 2;
-		spin_unlock_irq(&mz->zone->lru_lock);
+	if (unlikely(lruvec->recent_scanned[0] > anon / 4)) {
+		spin_lock_irq(&lruvec->lru_lock);
+		lruvec->recent_scanned[0] /= 2;
+		lruvec->recent_rotated[0] /= 2;
+		spin_unlock_irq(&lruvec->lru_lock);
 	}
 
-	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
-		spin_lock_irq(&mz->zone->lru_lock);
-		reclaim_stat->recent_scanned[1] /= 2;
-		reclaim_stat->recent_rotated[1] /= 2;
-		spin_unlock_irq(&mz->zone->lru_lock);
+	if (unlikely(lruvec->recent_scanned[1] > file / 4)) {
+		spin_lock_irq(&lruvec->lru_lock);
+		lruvec->recent_scanned[1] /= 2;
+		lruvec->recent_rotated[1] /= 2;
+		spin_unlock_irq(&lruvec->lru_lock);
 	}
 
 	/*
@@ -1769,75 +1995,49 @@ static void get_scan_ratio(struct mem_cg
 	 * proportional to the fraction of recently scanned pages on
 	 * each list that were recently referenced and in active use.
 	 */
-	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
-	ap /= reclaim_stat->recent_rotated[0] + 1;
+	ap = anon_prio * (lruvec->recent_scanned[0] + 1);
+	ap /= lruvec->recent_rotated[0] + 1;
 
-	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
-	fp /= reclaim_stat->recent_rotated[1] + 1;
-
-	/* Normalize to percentages */
-	percent[0] = 100 * ap / (ap + fp + 1);
-	percent[1] = 100 - percent[0];
-}
+	fp = file_prio * (lruvec->recent_scanned[1] + 1);
+	fp /= lruvec->recent_rotated[1] + 1;
 
-/*
- * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
- * until we collected @swap_cluster_max pages to scan.
- */
-static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
-				       unsigned long *nr_saved_scan)
-{
-	unsigned long nr;
+	fraction[0] = ap;
+	fraction[1] = fp;
+	denominator = ap + fp + 1;
+out:
+	for_each_evictable_lru(l) {
+		int file = is_file_lru(l);
+		unsigned long scan;
 
-	*nr_saved_scan += nr_to_scan;
-	nr = *nr_saved_scan;
+		scan = lruvec->nr_pages[l] >> priority;
+		scan = (scan * fraction[file]) / denominator;
 
-	if (nr >= SWAP_CLUSTER_MAX)
-		*nr_saved_scan = 0;
-	else
-		nr = 0;
+		nr[l] = nr_scan_try_batch(scan, &lruvec->nr_saved_scan[l]);
 
-	return nr;
+		if (!nr[l] && fraction[file] &&
+		    (force_scan || lru_list_is_old(lruvec, l)))
+			nr[l] = SWAP_CLUSTER_MAX;
+	}
 }
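+/*
+ * Scale of the result (illustrative, assuming DEF_PRIORITY == 12):
+ * a list of 2^20 pages yields scan = 2^20 >> 12 == 256 pages per pass
+ * at the starting priority, scaled by the anon/file fraction and then
+ * batched by nr_scan_try_batch(); every step down in priority doubles
+ * the scan window.
+ */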
 
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
-				   struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
-	unsigned long percent[2];	/* anon @ 0; file @ 1 */
 	enum lru_list l;
-	unsigned long nr_reclaimed = 0;
+	unsigned long nr_reclaimed = sc->nr_reclaimed;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
-	int noswap = 0;
+	struct gang *gang = lruvec_gang(lruvec);
 
-	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || (nr_swap_pages <= 0)) {
-		noswap = 1;
-		percent[0] = 0;
-		percent[1] = 100;
-	} else
-		get_scan_ratio(mz, sc, percent);
-
-	for_each_evictable_lru(l) {
-		int file = is_file_lru(l);
-		unsigned long scan;
-
-		scan = zone_nr_lru_pages(mz, l);
-		if (priority || noswap || !sc->swappiness) {
-			scan >>= priority;
-			scan = (scan * percent[file]) / 100;
-		}
-		nr[l] = nr_scan_try_batch(scan,
-					  &reclaim_stat->nr_saved_scan[l]);
-	}
+	get_scan_count(lruvec, sc, nr);
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
+		cond_resched();
+
 		for_each_evictable_lru(l) {
 			if (nr[l]) {
 				nr_to_scan = min_t(unsigned long,
@@ -1845,40 +2045,66 @@ static void shrink_mem_cgroup_zone(int p
 				nr[l] -= nr_to_scan;
 
 				nr_reclaimed += shrink_list(l, nr_to_scan,
-							    mz, sc, priority);
+							    lruvec, sc);
 			}
 		}
+
+		/*
+		 * Update vmscan priority after meeting milestones in lru lists.
+		 */
+		if (test_bit(GANG_NEED_RESCHED, &gang->flags)) {
+			clear_bit(GANG_NEED_RESCHED, &gang->flags);
+			update_vmscan_priority(gang);
+		}
+
+		/*
+		 * Abort scanning if the lruvec is no longer suitable for this priority
+		 */
+		if (lruvec->priority < sc->priority)
+			break;
+
 		/*
 		 * On large memory systems, scan >> priority can become
 		 * really large. This is fine for the starting priority;
-		 * we want to put equal scanning pressure on each zone.
-		 * However, if the VM has a harder time of freeing pages,
 		 * with multiple processes reclaiming pages, the total
 		 * freeing target can get unreasonably large.
 		 */
-		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
+		if (nr_reclaimed >= nr_to_reclaim &&
+		    sc->priority < sc->max_priority)
 			break;
 	}
 
-	sc->nr_reclaimed += nr_reclaimed;
-	trace_mm_pagereclaim_shrinkzone(nr_reclaimed, priority);
+	sc->nr_reclaimed = nr_reclaimed;
+
+	trace_mm_pagereclaim_shrinkgang(zone_to_nid(gang_zone(gang)),
+					zone_idx(gang_zone(gang)),
+					get_gang_ub(gang)->ub_uid,
+					sc->gs ? 2 : gang_in_shadow(gang),
+					sc->priority, sc->nr_reclaimed);
 
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_anon_is_low(mz) && nr_swap_pages > 0)
-		shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
+	if ((lru_list_is_old(lruvec, LRU_ACTIVE_ANON) ||
+	     inactive_anon_is_low(lruvec)) && !no_swap_space(sc))
+		shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON);
+
+	/*
+	 * Update vmscan priority after each serious scanning pass.
+	 */
+	if (sc->priority < sc->max_priority)
+		update_vmscan_priority(gang);
 
 	throttle_vm_writeout(sc->gfp_mask);
 }
 
 /* Use reclaim/compaction for costly allocs or under memory pressure */
-static bool in_reclaim_compaction(int priority, struct scan_control *sc)
+static bool in_reclaim_compaction(struct scan_control *sc)
 {
 	if (COMPACTION_BUILD && sc->order &&
 			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
-			 priority < DEF_PRIORITY - 2))
+			 sc->priority < sc->max_priority - 2))
 		return true;
 
 	return false;
@@ -1894,14 +2120,13 @@ static bool in_reclaim_compaction(int pr
 static inline bool should_continue_reclaim(struct zone *zone,
 					unsigned long nr_reclaimed,
 					unsigned long nr_scanned,
-					int priority,
 					struct scan_control *sc)
 {
 	unsigned long pages_for_compaction;
 	unsigned long inactive_lru_pages;
 
 	/* If not in reclaim/compaction mode, stop */
-	if (!in_reclaim_compaction(priority, sc))
+	if (!in_reclaim_compaction(sc))
 		return false;
 
 	/* Consider stopping depending on scan and reclaim activity */
@@ -1949,8 +2174,174 @@ static inline bool should_continue_recla
 	}
 }
 
-static void shrink_zone(int priority, struct zone *zone,
-			struct scan_control *sc)
+
+#ifdef CONFIG_MEMORY_GANGS
+
+int vm_usage_factor = 2;
+int vm_shadow_factor = 8;
+int vm_age_factor = 8;
+
+/*
+ * All Magic Happens Here!
+ */
+void update_vmscan_priority(struct gang *gang)
+{
+	unsigned long age, max_age, now = jiffies;
+	unsigned long present, portion;
+	struct user_beancounter *ub;
+	long limit, usage, shadow;
+	u64 p;
+	int priority;
+
+	age = max(now - gang->timestamp[LRU_INACTIVE_FILE],
+		  now - gang->timestamp[LRU_ACTIVE_FILE]);
+	present = gang->lruvec.nr_pages[LRU_ACTIVE_FILE] +
+		  gang->lruvec.nr_pages[LRU_INACTIVE_FILE];
+	if (nr_swap_pages > 0) {
+		age = max(age, max(now - gang->timestamp[LRU_INACTIVE_ANON],
+				   now - gang->timestamp[LRU_ACTIVE_ANON]));
+		present += gang->lruvec.nr_pages[LRU_ACTIVE_ANON] +
+			   gang->lruvec.nr_pages[LRU_INACTIVE_ANON];
+	}
+
+	max_age = max(1ul, now - gang_zone(gang)->eldest_timestamp);
+
+	ub = get_gang_ub(gang);
+
+	limit = clamp(ub->ub_parms[UB_PHYSPAGES].limit, 1ul, totalram_pages);
+	usage = ub->ub_parms[UB_PHYSPAGES].held;
+	shadow = ub->ub_parms[UB_SHADOWPAGES].held;
+
+	priority = DEF_PRIORITY;
+
+	priority += usage * vm_usage_factor / limit;
+
+	if (gang_in_shadow(gang))
+		priority += shadow * vm_shadow_factor / limit;
+
+	if (unlikely(gang_of_junk(gang))) {
+		priority = DEF_PRIORITY;
+
+		/* Junk must die */
+		if (present) {
+			priority += vm_usage_factor;
+			priority += vm_shadow_factor;
+		}
+	}
+
+	p = (u64)vm_age_factor * age;
+	portion = gang->portion;
+	if (portion && present) {
+		p *= present;
+		do_div(p, portion);
+	}
+	do_div(p, max_age);
+	priority += p;
+
+	priority = clamp(priority, 0, MAX_VMSCAN_PRIORITY);
+
+	if (gang->lruvec.priority != priority) {
+		trace_mm_pagereclaim_reschedule(zone_to_nid(gang_zone(gang)),
+						zone_idx(gang_zone(gang)),
+						get_gang_ub(gang)->ub_uid,
+						gang_in_shadow(gang),
+						usage, shadow, limit,
+						age, max_age, priority);
+		set_gang_priority(gang, priority);
+	}
+}
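+/*
+ * Rough feel for the formula (illustrative, assuming DEF_PRIORITY == 12
+ * and the default factors above): a gang whose beancounter sits at half
+ * of its UB_PHYSPAGES limit gains usage * vm_usage_factor / limit == 1,
+ * and a gang whose lists are as old as the zone's eldest milestone
+ * (age == max_age, present == portion) gains the full vm_age_factor of
+ * 8, i.e. priority 21 before clamping.  Older, heavier gangs therefore
+ * land on higher vmscan priorities and get reclaimed first.
+ */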
+
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
+{
+	unsigned long nr_reclaimed, nr_scanned;
+	struct list_head **iter, *curr, *next;
+	struct gang *gang;
+	int round;
+
+restart:
+	nr_reclaimed = sc->nr_reclaimed;
+	nr_scanned = sc->nr_scanned;
+
+	round = atomic_read(zone->vmscan_round + sc->priority);
+	iter = zone->vmscan_iter + sc->priority;
+	rcu_read_lock();
+	do {
+		do {
+			curr = rcu_dereference(*iter);
+			next = rcu_dereference(curr->next);
+
+			if (next >= zone->vmscan_prio &&
+			    next < zone->vmscan_prio + NR_VMSCAN_PRIORITIES) {
+				int priority;
+
+				/* Get next active priority */
+				priority = next - zone->vmscan_prio + 1;
+				priority = find_next_bit(zone->vmscan_mask,
+						NR_VMSCAN_PRIORITIES, priority);
+				if (priority < NR_VMSCAN_PRIORITIES) {
+					next = zone->vmscan_prio + priority;
+					next = rcu_dereference(next->next);
+				} else
+					next = zone->vmscan_prio + sc->priority;
+			}
+		} while (cmpxchg(iter, curr, next) != curr);
+
+		if (next >= zone->vmscan_prio &&
+		    next < zone->vmscan_prio + NR_VMSCAN_PRIORITIES) {
+			/* We are back at our home priority; the round is complete. */
+			if (next == zone->vmscan_prio + sc->priority) {
+				atomic_inc(zone->vmscan_round + sc->priority);
+				break;
+			}
+			continue;
+		}
+
+		gang = list_entry(next, struct gang, vmscan_list);
+		if (pin_mem_gang(gang))
+			continue;
+		rcu_read_unlock();
+
+		shrink_lruvec(&gang->lruvec, sc);
+
+		unpin_mem_gang(gang);
+		rcu_read_lock();
+
+		/*
+		 * On lower priorities we should stop as soon as the job is
+		 * done, otherwise we can generate immoderate pressure and
+		 * reclaim too many pages. On the default priority we scan
+		 * till the end of the round to put equal pressure on all
+		 * zones and lrus.
+		 */
+		if (sc->nr_reclaimed >= sc->nr_to_reclaim &&
+		    sc->priority < zone_max_priority(zone))
+			break;
+
+		/*
+		 * Switch to the next vmscan-priority if current round was
+		 * completed for the current priority while we are here.
+		 */
+	} while (atomic_read(zone->vmscan_round + sc->priority) - round < 1);
+	rcu_read_unlock();
+
+	trace_mm_pagereclaim_shrinkzone(zone_to_nid(zone), zone_idx(zone),
+					sc->priority, sc->nr_reclaimed);
+
+	if (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+					 sc->nr_scanned - nr_scanned, sc))
+		goto restart;
+}
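+/*
+ * How the lockless iterator above works (descriptive summary): the
+ * zone keeps one round-robin cursor per priority in zone->vmscan_iter,
+ * advanced with cmpxchg() so concurrent reclaimers each pick distinct
+ * gangs without taking a lock.  The zone->vmscan_prio list heads act
+ * as sentinels between priorities; hitting our own sentinel completes
+ * a round and bumps zone->vmscan_round for this priority.
+ */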
+
+static void wakeup_kswapd_timer_fn(unsigned long data)
+{
+	struct pglist_data *pgdat = (struct pglist_data *)data;
+
+	wake_up_interruptible(&pgdat->kswapd_wait);
+}
+
+#else /* CONFIG_MEMORY_GANGS */
+
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_reclaimed, nr_scanned;
 
@@ -1996,6 +2387,8 @@ static void shrink_zone(int priority, st
 					 sc));
 }
 
+#endif /* CONFIG_MEMORY_GANGS */
+
 /* Returns true if compaction should go ahead for a high-order request */
 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 {
@@ -2052,8 +2445,7 @@ static inline bool compaction_ready(stru
  * high-order allocation and compaction is ready to begin. This indicates to
  * the caller that it should retry the allocation or fail.
  */
-static bool shrink_zones(int priority, struct zonelist *zonelist,
-					struct scan_control *sc)
+static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
 	struct zoneref *z;
@@ -2070,14 +2462,18 @@ static bool shrink_zones(int priority, s
 		 * to global LRU.
 		 */
 		if (global_reclaim(sc)) {
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+			int max_priority = zone_max_priority(zone);
+
+			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL) ||
+			    sc->priority > max_priority)
 				continue;
-			note_zone_scanning_priority(zone, priority);
 
 			if (zone_is_all_unreclaimable(zone) &&
-						priority != DEF_PRIORITY)
+			    sc->priority < max_priority)
 				continue;	/* Let kswapd poll it */
+
 			sc->all_unreclaimable = 0;
+
 			if (COMPACTION_BUILD) {
 				/*
 				 * If we already have plenty of memory free for
@@ -2093,17 +2489,37 @@ static bool shrink_zones(int priority, s
 					continue;
 				}
 			}
+
+			shrink_zone(zone, sc);
 		} else {
-			/*
-			 * Ignore cpuset limitation here. We just want to reduce
-			 * # of used pages by us regardless of memory shortage.
-			 */
-			sc->all_unreclaimable = 0;
-			mem_cgroup_note_reclaim_priority(sc->target_mem_cgroup,
-							priority);
+			struct lruvec *lruvec = local_lruvec(zone, sc);
+			unsigned long reclaimable = reclaimable_pages(lruvec, sc);
+
+			if (reclaimable < SWAP_CLUSTER_MAX)
+				continue;
+
+			if (atomic_long_read(&lruvec->pages_scanned) < 6 * reclaimable)
+				sc->all_unreclaimable = 0;
+
+			if (atomic_long_read(&lruvec->pages_scanned) < 3 * reclaimable)
+				sc->near_oom = 0;
+
+			shrink_lruvec(lruvec, sc);
 		}
 
-		shrink_zone(priority, zone, sc);
+		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+			break;
+	}
+
+	/* Huge reclaim progress, reset internal OOM countdown counter */
+	if (!global_reclaim(sc) && sc->nr_reclaimed >= sc->nr_to_reclaim) {
+		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+			if (!populated_zone(zone) ||
+			    !cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+				continue;
+
+			atomic_long_set(&local_lruvec(zone, sc)->pages_scanned, 0);
+		}
 	}
 
 	return should_abort_reclaim;
@@ -2128,7 +2544,6 @@ static bool shrink_zones(int priority, s
 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					struct scan_control *sc)
 {
-	int priority;
 	unsigned long ret = 0;
 	unsigned long total_scanned = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2139,29 +2554,47 @@ static unsigned long do_try_to_free_page
 	unsigned long writeback_threshold;
 	bool should_abort_reclaim;
 
+	KSTAT_PERF_ENTER(ttfp);
 	get_mems_allowed();
 	delayacct_freepages_start();
 
 	if (global_reclaim(sc))
 		count_vm_event(ALLOCSTALL);
-	/*
-	 * mem_cgroup will not do shrink_slab.
-	 */
+
+	sc->max_priority = DEF_PRIORITY;
+
 	if (global_reclaim(sc)) {
 		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+			int priority;
 
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
 
 			lru_pages += zone_reclaimable_pages(zone);
+
+			priority = zone_max_priority(zone);
+			if (priority > sc->max_priority &&
+			    priority < NR_VMSCAN_PRIORITIES)
+				sc->max_priority = priority;
+		}
+	} else {
+		sc->near_oom = 1;
+		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+			struct lruvec *lruvec = local_lruvec(zone, sc);
+			unsigned long reclaimable = reclaimable_pages(lruvec, sc);
+
+			lru_pages += reclaimable;
+
+			if (atomic_long_read(&lruvec->pages_scanned) < 3 * reclaimable)
+				sc->near_oom = 0;
 		}
 	}
 
-	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+	for (sc->priority = sc->max_priority; sc->priority >= 0; sc->priority--) {
 		sc->nr_scanned = 0;
-		if (!priority)
+		if (sc->priority <= sc->max_priority - DEF_PRIORITY)
 			disable_swap_token();
-		should_abort_reclaim = shrink_zones(priority, zonelist, sc);
+		should_abort_reclaim = shrink_zones(zonelist, sc);
 		if (should_abort_reclaim)
 			break;
 		/*
@@ -2174,6 +2607,14 @@ static unsigned long do_try_to_free_page
 				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 				reclaim_state->reclaimed_slab = 0;
 			}
+		} else if (sc->gfp_mask & __GFP_FS) {
+			if (ub_dcache_reclaim(get_gangs_ub(sc->gs),
+						sc->nr_scanned/4 + 1, lru_pages + 1))
+				sc->all_unreclaimable = 0;
+			if (reclaim_state) {
+				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+				reclaim_state->reclaimed_slab = 0;
+			}
 		}
 		total_scanned += sc->nr_scanned;
 		if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
@@ -2189,14 +2630,19 @@ static unsigned long do_try_to_free_page
 		 * writeout.  So in laptop mode, write out the whole world.
 		 */
 		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
-		if (total_scanned > writeback_threshold) {
-			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+		if (!sc->gs && total_scanned > writeback_threshold) {
+			wakeup_flusher_threads(NULL, laptop_mode ? 0 : total_scanned);
 			sc->may_writepage = 1;
 		}
 
+		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) {
+			ret = 1;
+			goto out;
+		}
+
 		/* Take a nap, wait for some writeback to complete */
-		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    priority < DEF_PRIORITY - 2) {
+		if (!sc->gs && !sc->hibernation_mode && sc->nr_scanned &&
+		    sc->priority < sc->max_priority - 2) {
 			struct zone *preferred_zone;
 
 			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
@@ -2205,8 +2651,7 @@ static unsigned long do_try_to_free_page
 		}
 	}
 	/* top priority shrink_zones still had more to do? don't OOM, then */
-	if (!sc->all_unreclaimable && global_reclaim(sc))
-		ret = sc->nr_reclaimed;
+	ret = sc->nr_reclaimed ?: !sc->all_unreclaimable;
 out:
 	/*
 	 * Now that we've scanned all the zones at this priority level, note
@@ -2215,26 +2660,15 @@ out:
 	 * level.  This affects only the decision whether or not to bring
 	 * mapped pages onto the inactive list.
 	 */
-	if (priority < 0)
-		priority = 0;
+	if (sc->priority < 0)
+		sc->priority = 0;
 
 #ifdef CONFIG_NUMA
 	trace_mm_directreclaim_reclaimall(zonelist[0]._zonerefs->zone->node,
-						sc->nr_reclaimed, priority);
+						sc->nr_reclaimed, sc->priority);
 #else
-	trace_mm_directreclaim_reclaimall(0, sc->nr_reclaimed, priority);
+	trace_mm_directreclaim_reclaimall(0, sc->nr_reclaimed, sc->priority);
 #endif
-	if (global_reclaim(sc)) {
-		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-				continue;
-
-			zone->prev_priority = priority;
-		}
-	} else
-		mem_cgroup_record_reclaim_priority(sc->target_mem_cgroup,
-						   priority);
 
 	delayacct_freepages_end();
 	put_mems_allowed();
@@ -2243,6 +2677,7 @@ out:
 	if (should_abort_reclaim)
 		return 1;
 
+	KSTAT_PERF_LEAVE(ttfp);
 	return ret;
 }
 
@@ -2273,6 +2708,37 @@ unsigned long try_to_free_pages(struct z
 	return nr_reclaimed;
 }
 
+unsigned long try_to_free_gang_pages(struct gang_set *gs, gfp_t gfp_mask)
+{
+	struct zonelist *zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.nr_to_reclaim = SWAP_CLUSTER_MAX,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+		.use_vswap = 1,
+		.swappiness = 100,
+		.gs = gs,
+		.may_shade_anon = 1,
+		.may_shade_file = 1,
+	};
+	unsigned long progress;
+
+	if (test_bit(UB_PAGECACHE_ISOLATION, &get_gangs_ub(gs)->ub_flags))
+		sc.may_shade_file = 0;
+
+	progress = do_try_to_free_pages(zonelist, &sc);
+
+	if (sc.nr_reclaim_swapout) {
+		ub_percpu_add(get_gangs_ub(gs), vswapout, sc.nr_reclaim_swapout);
+		ub_reclaim_rate_limit(get_gangs_ub(gs), gfp_mask & __GFP_WAIT,
+				      sc.nr_reclaim_swapout);
+	}
+
+	return progress;
+}
+
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
@@ -2361,27 +2827,43 @@ static int sleeping_prematurely(pg_data_
 	return 0;
 }
 
-static void age_active_anon(struct zone *zone, struct scan_control *sc,
-			    int priority)
+static void age_active_lists(struct zone *zone, struct scan_control *sc)
 {
-	struct mem_cgroup *memcg;
+	struct gang *gang;
 
-	if (!total_swap_pages)
-		return;
+	rcu_read_lock();
+	for_each_gang(gang, zone) {
+		struct lruvec *lruvec = &gang->lruvec;
 
-	memcg = mem_cgroup_iter(NULL, NULL, NULL);
-	do {
-		struct mem_cgroup_zone mz = {
-			.mem_cgroup = memcg,
-			.zone = zone,
-		};
-
-		if (inactive_anon_is_low(&mz))
-			shrink_active_list(SWAP_CLUSTER_MAX, &mz,
-					   sc, priority, 0);
-
-		memcg = mem_cgroup_iter(NULL, memcg, NULL);
-	} while (memcg);
+		/*
+		 * This keeps the active/inactive ratio balanced in all containers.
+		 */
+		if (lru_list_is_old(lruvec, LRU_ACTIVE_ANON) ||
+				inactive_anon_is_low(lruvec)) {
+			if (pin_mem_gang(gang))
+				continue;
+			rcu_read_unlock();
+			shrink_active_list(SWAP_CLUSTER_MAX,
+					lruvec, sc,
+					LRU_ACTIVE_ANON);
+			rcu_read_lock();
+			unpin_mem_gang(gang);
+		}
+		/*
+		 * Balance aged active file lrus as well as the anon ones.
+		 */
+		if (lru_list_is_old(lruvec, LRU_ACTIVE_FILE)) {
+			if (pin_mem_gang(gang))
+				continue;
+			rcu_read_unlock();
+			shrink_active_list(SWAP_CLUSTER_MAX,
+					lruvec, sc,
+					LRU_ACTIVE_FILE);
+			rcu_read_lock();
+			unpin_mem_gang(gang);
+		}
+	}
+	rcu_read_unlock();
 }
 
 /*
@@ -2408,7 +2890,6 @@ static void age_active_anon(struct zone 
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
 	int all_zones_ok;
-	int priority;
 	int i;
 	unsigned long total_scanned;
 	unsigned long total_reclaimed = 0;
@@ -2425,13 +2906,8 @@ static unsigned long balance_pgdat(pg_da
 		.swappiness = vm_swappiness,
 		.order = order,
 		.target_mem_cgroup = NULL,
+		.max_priority = DEF_PRIORITY,
 	};
-	/*
-	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to
-	 * free_pages == high_wmark_pages(zone).
-	 */
-	int temp_priority[MAX_NR_ZONES];
 
 loop_again:
 	total_scanned = 0;
@@ -2439,16 +2915,95 @@ loop_again:
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
-	for (i = 0; i < pgdat->nr_zones; i++)
-		temp_priority[i] = DEF_PRIORITY;
+#ifdef CONFIG_MEMORY_GANGS
+	if (time_after_eq(jiffies, pgdat->next_milestone)) {
+		unsigned long now = jiffies;
+		bool reused = false;
 
-	for (priority = DEF_PRIORITY; priority >= 1; priority--) {
+		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+			unsigned long ts, old_ts = now;
+			struct zone *zone = pgdat->node_zones + i;
+			struct gang *gang;
+
+			rcu_read_lock();
+			for_each_gang(gang, zone) {
+				spin_lock_irq(&gang->lruvec.lru_lock);
+				if (insert_lru_milestone(gang, now, &ts))
+					reused = true;
+				spin_unlock_irq(&gang->lruvec.lru_lock);
+
+				if (time_before(ts, old_ts))
+					old_ts = ts;
+			}
+			rcu_read_unlock();
+
+			zone->eldest_timestamp = old_ts;
+		}
+
+		 * Double the interval if a milestone was reused, halve it otherwise.
+		 * Double interval if milestone was reused and halve otherwise.
+		 */
+		if (reused)
+			pgdat->milestone_interval *= 2;
+		else
+			pgdat->milestone_interval /= 2;
+		pgdat->milestone_interval = clamp(pgdat->milestone_interval,
+				MIN_MILESTONE_INTERVAL, MAX_MILESTONE_INTERVAL);
+		pgdat->next_milestone = now + pgdat->milestone_interval;
+		mod_timer(&pgdat->milestone_timer, pgdat->next_milestone);
+	}
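+	/*
+	 * The sampling interval thus adapts (assuming insert_lru_milestone()
+	 * reports reuse when an unconsumed milestone had to be recycled):
+	 * it doubles while the lists rotate slower than we sample and
+	 * halves once every milestone is consumed, staying within the
+	 * clamp range above.
+	 */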
+
+	for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+		struct zone *zone = pgdat->node_zones + i;
+		struct gang *gang;
+		int max_priority;
+		int nr_tiny, nr_total;
+
+		if (!populated_zone(zone))
+			continue;
+
+		nr_tiny = nr_total = 0;
+
+		rcu_read_lock();
+		for_each_gang(gang, zone) {
+			struct lruvec *lruvec = &gang->lruvec;
+			unsigned long size;
+
+			update_vmscan_priority(gang);
+
+			size = max(lruvec->nr_pages[LRU_ACTIVE_FILE],
+				   lruvec->nr_pages[LRU_INACTIVE_FILE]);
+			if (nr_swap_pages > 0)
+				size = max3(size,
+					lruvec->nr_pages[LRU_ACTIVE_ANON],
+					lruvec->nr_pages[LRU_INACTIVE_ANON]);
+			if (size) {
+				if (!(size >> DEF_PRIORITY))
+					nr_tiny++;
+				nr_total++;
+			}
+		}
+		rcu_read_unlock();
+
+		if (nr_tiny * 100 > nr_total * vm_force_scan_thresh)
+			zone->force_scan = true;
+		else
+			zone->force_scan = false;
+
+		max_priority = zone_max_priority(zone);
+		if (max_priority > sc.max_priority &&
+				max_priority < NR_VMSCAN_PRIORITIES)
+			sc.max_priority = max_priority;
+	}
+#endif /* CONFIG_MEMORY_GANGS */
+
+	for (sc.priority = sc.max_priority; sc.priority >= 0; sc.priority--) {
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;
 		int has_under_min_watermark_zone = 0;
 
 		/* The swap token gets in the way of swapout... */
-		if (!priority)
+		if (sc.priority <= sc.max_priority - DEF_PRIORITY)
 			disable_swap_token();
 
 		all_zones_ok = 1;
@@ -2459,19 +3014,20 @@ loop_again:
 		 */
 		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
 			struct zone *zone = pgdat->node_zones + i;
+			int max_priority = zone_max_priority(zone);
 
 			if (!populated_zone(zone))
 				continue;
 
 			if (zone_is_all_unreclaimable(zone) &&
-			    priority != DEF_PRIORITY)
+			    sc.priority < max_priority)
 				continue;
 
 			/*
-			 * Do some background aging of the anon list, to give
+			 * Do some background aging of the active lists, to give
 			 * pages a chance to be referenced before reclaiming.
 			 */
-			age_active_anon(zone, &sc, priority);
+			age_active_lists(zone, &sc);
 
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
@@ -2499,6 +3055,7 @@ loop_again:
 		 */
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
+			int max_priority = zone_max_priority(zone);
 			int nr_slab;
 			int nid, zid;
 			unsigned long balance_gap;
@@ -2508,15 +3065,17 @@ loop_again:
 				continue;
 
 			if (zone_is_all_unreclaimable(zone) &&
-					priority != DEF_PRIORITY)
+			    sc.priority < max_priority)
 				continue;
 
 			if (!zone_watermark_ok(zone, order,
 					high_wmark_pages(zone), end_zone, 0))
 				all_zones_ok = 0;
-			temp_priority[i] = priority;
+
+			if (sc.priority > max_priority)
+				continue;
+
 			sc.nr_scanned = 0;
-			note_zone_scanning_priority(zone, priority);
 
 			nid = pgdat->node_id;
 			zid = zone_idx(zone);
@@ -2541,7 +3100,7 @@ loop_again:
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone) + balance_gap,
 					end_zone, 0))
-				shrink_zone(priority, zone, &sc);
+				shrink_zone(zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
@@ -2550,7 +3109,7 @@ loop_again:
 
 			if (zone_is_all_unreclaimable(zone))
 				continue;
-			if (nr_slab == 0 && zone->pages_scanned >=
+			if (nr_slab == 0 && atomic_long_read(&zone->pages_scanned) >=
 					(zone_reclaimable_pages(zone) * 6))
 					zone_set_flag(zone,
 						      ZONE_ALL_UNRECLAIMABLE);
@@ -2601,7 +3160,7 @@ loop_again:
 		 * OK, kswapd is getting into trouble.  Take a nap, then take
 		 * another pass across the zones.
 		 */
-		if (total_scanned && (priority < DEF_PRIORITY - 2)) {
+		if (total_scanned && (sc.priority < sc.max_priority - 2)) {
 			if (has_under_min_watermark_zone)
 				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
 			else
@@ -2619,16 +3178,6 @@ loop_again:
 		cond_resched();
 	}
 out:
-	/*
-	 * Note within each zone the priority level at which this zone was
-	 * brought into a happy state.  So that the next thread which scans this
-	 * zone will start out at that priority level.
-	 */
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		zone->prev_priority = temp_priority[i];
-	}
 	if (!all_zones_ok) {
 		cond_resched();
 
@@ -2688,6 +3237,14 @@ static int kswapd(void *p)
 		set_cpus_allowed_ptr(tsk, cpumask);
 	current->reclaim_state = &reclaim_state;
 
+#ifdef CONFIG_MEMORY_GANGS
+	setup_timer(&pgdat->milestone_timer, wakeup_kswapd_timer_fn,
+			(unsigned long)pgdat);
+	pgdat->milestone_interval = MIN_MILESTONE_INTERVAL;
+	pgdat->next_milestone = jiffies + pgdat->milestone_interval;
+	mod_timer(&pgdat->milestone_timer, pgdat->next_milestone);
+#endif
+
 	/*
 	 * Tell the memory management that we're a "memory allocator",
 	 * and that if we need more memory we should get access to it
@@ -2835,6 +3392,22 @@ unsigned long zone_reclaimable_pages(str
 	return nr;
 }
 
+static unsigned long
+reclaimable_pages(struct lruvec *lruvec, struct scan_control *sc)
+{
+	unsigned long nr;
+
+	nr = lruvec->nr_pages[LRU_INACTIVE_FILE] +
+	     lruvec->nr_pages[LRU_ACTIVE_FILE];
+
+	if (!no_swap_space(sc)) {
+		nr += lruvec->nr_pages[LRU_INACTIVE_ANON] +
+		      lruvec->nr_pages[LRU_ACTIVE_ANON];
+	}
+
+	return nr;
+}
+
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -2856,6 +3429,7 @@ unsigned long shrink_all_memory(unsigned
 		.hibernation_mode = 1,
 		.swappiness = vm_swappiness,
 		.order = 0,
+		.priority = DEF_PRIORITY,
 	};
 	struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 	struct task_struct *p = current;
@@ -3019,7 +3593,6 @@ static int __zone_reclaim(struct zone *z
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	int priority = ZONE_RECLAIM_PRIORITY;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -3078,12 +3651,11 @@ static int __zone_reclaim(struct zone *z
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
 		 */
-		priority = ZONE_RECLAIM_PRIORITY;
+		sc.priority = ZONE_RECLAIM_PRIORITY;
 		do {
-			note_zone_scanning_priority(zone, priority);
-			shrink_zone(priority, zone, &sc);
-			priority--;
-		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+			shrink_zone(zone, &sc);
+			sc.priority--;
+		} while (sc.priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
 
 	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3114,7 +3686,7 @@ static int __zone_reclaim(struct zone *z
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 	trace_mm_directreclaim_reclaimzone(zone->node,
-				sc.nr_reclaimed, priority);
+				sc.nr_reclaimed, sc.priority);
 	return sc.nr_reclaimed >= nr_pages;
 }
 
@@ -3259,30 +3831,28 @@ int page_evictable(struct page *page, st
  * Restrictions: zone->lru_lock must be held, page must be on LRU and must
  * have PageUnevictable set.
  */
-static void check_move_unevictable_page(struct page *page, struct zone *zone)
+static void check_move_unevictable_page(struct page *page, struct lruvec *lruvec)
 {
-	struct lruvec *lruvec;
-
 	VM_BUG_ON(PageActive(page));
 retry:
 	ClearPageUnevictable(page);
 	if (page_evictable(page, NULL)) {
 		enum lru_list l = page_lru_base_type(page);
+		struct zone *zone = lruvec_zone(lruvec);
+		int numpages = hpage_nr_pages(page);
 
-		__dec_zone_state(zone, NR_UNEVICTABLE);
-		lruvec = mem_cgroup_lru_move_lists(zone, page,
-						   LRU_UNEVICTABLE, l);
-		list_move(&page->lru, &lruvec->lists[l]);
-		__inc_zone_state(zone, NR_INACTIVE_ANON + l);
+		__mod_zone_page_state(zone, NR_UNEVICTABLE, -numpages);
+		lruvec->nr_pages[LRU_UNEVICTABLE] -= numpages;
+		list_move(&page->lru, &lruvec->lru_list[l]);
+		lruvec->nr_pages[l] += numpages;
+		__mod_zone_page_state(zone, NR_LRU_BASE + l, numpages);
 		__count_vm_event(UNEVICTABLE_PGRESCUED);
 	} else {
 		/*
 		 * rotate unevictable list
 		 */
 		SetPageUnevictable(page);
-		lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE,
-						   LRU_UNEVICTABLE);
-		list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]);
+		list_move(&page->lru, &lruvec->lru_list[LRU_UNEVICTABLE]);
 		if (page_evictable(page, NULL))
 			goto retry;
 	}
@@ -3300,7 +3870,7 @@ void scan_mapping_unevictable_pages(stru
 	pgoff_t next = 0;
 	pgoff_t end   = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
 			 PAGE_CACHE_SHIFT;
-	struct zone *zone;
+	struct lruvec *lruvec;
 	struct pagevec pvec;
 
 	if (mapping->nrpages == 0)
@@ -3312,30 +3882,23 @@ void scan_mapping_unevictable_pages(stru
 		int i;
 		int pg_scanned = 0;
 
-		zone = NULL;
-
+		lruvec = NULL;
+		local_irq_disable();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
-			struct zone *pagezone = page_zone(page);
 
 			pg_scanned++;
 			if (page_index > next)
 				next = page_index;
 			next++;
 
-			if (pagezone != zone) {
-				if (zone)
-					spin_unlock_irq(&zone->lru_lock);
-				zone = pagezone;
-				spin_lock_irq(&zone->lru_lock);
-			}
-
-			if (PageLRU(page) && PageUnevictable(page))
-				check_move_unevictable_page(page, zone);
+			if (try_relock_page_lru(&lruvec, page) &&
+					PageUnevictable(page))
+				check_move_unevictable_page(page, lruvec);
 		}
-		if (zone)
-			spin_unlock_irq(&zone->lru_lock);
+		unlock_lruvec(lruvec);
+		local_irq_enable();
 		pagevec_release(&pvec);
 
 		count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
@@ -3354,17 +3917,17 @@ void scan_mapping_unevictable_pages(stru
  * back onto @zone's unevictable list.
  */
 #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
+static void scan_zone_unevictable_pages(struct zone *zone, struct lruvec *lruvec)
 {
-	struct list_head *l_unevictable = &zone->lruvec.lists[LRU_UNEVICTABLE];
+	struct list_head *l_unevictable = &lruvec->lru_list[LRU_UNEVICTABLE];
 	unsigned long scan;
-	unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
+	unsigned long nr_to_scan = lruvec->nr_pages[LRU_UNEVICTABLE];
 
 	while (nr_to_scan > 0) {
 		unsigned long batch_size = min(nr_to_scan,
 						SCAN_UNEVICTABLE_BATCH_SIZE);
 
-		spin_lock_irq(&zone->lru_lock);
+		spin_lock_irq(&lruvec->lru_lock);
 		for (scan = 0;  scan < batch_size; scan++) {
 			struct page *page = lru_to_page(l_unevictable);
 
@@ -3374,11 +3937,11 @@ static void scan_zone_unevictable_pages(
 			prefetchw_prev_lru_page(page, l_unevictable, flags);
 
 			if (likely(PageLRU(page) && PageUnevictable(page)))
-				check_move_unevictable_page(page, zone);
+				check_move_unevictable_page(page, lruvec);
 
 			unlock_page(page);
 		}
-		spin_unlock_irq(&zone->lru_lock);
+		spin_unlock_irq(&lruvec->lru_lock);
 
 		nr_to_scan -= batch_size;
 	}
@@ -3399,9 +3962,19 @@ static void scan_zone_unevictable_pages(
 static void scan_all_zones_unevictable_pages(void)
 {
 	struct zone *zone;
+	struct gang *gang;
 
 	for_each_zone(zone) {
-		scan_zone_unevictable_pages(zone);
+		rcu_read_lock();
+		for_each_gang(gang, zone) {
+			if (pin_mem_gang(gang))
+				continue;
+			rcu_read_unlock();
+			scan_zone_unevictable_pages(zone, &gang->lruvec);
+			rcu_read_lock();
+			unpin_mem_gang(gang);
+		}
+		rcu_read_unlock();
 	}
 }
 
@@ -3442,6 +4015,7 @@ static ssize_t write_scan_unevictable_no
 {
 	struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
 	struct zone *zone;
+	struct gang *gang;
 	unsigned long res;
 	unsigned long req = strict_strtoul(buf, 10, &res);
 
@@ -3451,7 +4025,16 @@ static ssize_t write_scan_unevictable_no
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
 		if (!populated_zone(zone))
 			continue;
-		scan_zone_unevictable_pages(zone);
+		rcu_read_lock();
+		for_each_gang(gang, zone) {
+			if (pin_mem_gang(gang))
+				continue;
+			rcu_read_unlock();
+			scan_zone_unevictable_pages(zone, &gang->lruvec);
+			rcu_read_lock();
+			unpin_mem_gang(gang);
+		}
+		rcu_read_unlock();
 	}
 	return 1;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/mm/vmstat.c linux-2.6.32-504.3.3.el6-042stab103_6/mm/vmstat.c
--- linux-2.6.32-504.3.3.el6.orig/mm/vmstat.c	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/mm/vmstat.c	2015-01-21 12:02:58.892810428 +0300
@@ -15,6 +15,7 @@
 #include <linux/cpu.h>
 #include <linux/vmstat.h>
 #include <linux/sched.h>
+#include <linux/virtinfo.h>
 #include <linux/math64.h>
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
@@ -36,6 +37,20 @@ static void sum_vm_events(unsigned long 
 	}
 }
 
+unsigned long vm_events(enum vm_event_item i)
+{
+	int cpu;
+	unsigned long sum;
+	struct vm_event_state *st;
+
+	sum = 0;
+	for_each_online_cpu(cpu) {
+		st = &per_cpu(vm_event_states, cpu);
+		sum += st->event[i];
+	}
+
+	return sum;
+}
+
 /*
  * Accumulate the vm event counters across all CPUs.
  * The result is unavoidably approximate - it can change
@@ -790,6 +805,9 @@ static const char * const vmstat_text[] 
 	"nr_isolated_anon",
 	"nr_isolated_file",
 	"nr_shmem",
+#ifdef CONFIG_MEMORY_VSWAP
+	"nr_vswap",
+#endif
 #ifdef CONFIG_NUMA
 	"numa_hit",
 	"numa_miss",
@@ -805,6 +823,10 @@ static const char * const vmstat_text[] 
 	"pgpgout",
 	"pswpin",
 	"pswpout",
+#ifdef CONFIG_MEMORY_VSWAP
+	"vswpin",
+	"vswpout",
+#endif
 
 	TEXTS_FOR_ZONES("pgalloc")
 
@@ -867,6 +889,20 @@ static const char * const vmstat_text[] 
 #endif
 };
 
+#ifdef CONFIG_MEMORY_GANGS
+static unsigned long get_zone_junk_pages(struct zone *zone)
+{
+	struct gang *gang = zone_junk_gang(zone);
+	unsigned long junk = 0;
+	enum lru_list lru;
+
+	for_each_lru(lru)
+		junk += gang->lruvec.nr_pages[lru];
+
+	return junk;
+}
+#endif
+
 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 							struct zone *zone)
 {
@@ -884,7 +920,7 @@ static void zoneinfo_show_print(struct s
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   zone->pages_scanned,
+		   atomic_long_read(&zone->pages_scanned),
 		   zone->spanned_pages,
 		   zone->present_pages);
 
@@ -920,13 +956,33 @@ static void zoneinfo_show_print(struct s
 	}
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
-		   "\n  prev_priority:     %i"
-		   "\n  start_pfn:         %lu"
-		   "\n  inactive_ratio:    %u",
-			   zone_is_all_unreclaimable(zone),
-		   zone->prev_priority,
-		   zone->zone_start_pfn,
-		   zone->inactive_ratio);
+		   "\n  start_pfn:         %lu",
+		   zone_is_all_unreclaimable(zone),
+		   zone->zone_start_pfn);
+
+#ifdef CONFIG_MEMORY_GANGS
+	seq_printf(m,
+		   "\n  junk_pages:        %ld"
+		   "\n  eldest_page:       %u"
+		   "\n  committed:         %ld"
+		   "\n  overcommit:        %ld %%"
+		   "\n  force_scan:        %d",
+		   get_zone_junk_pages(zone),
+		   jiffies_to_msecs(jiffies - zone->eldest_timestamp),
+		   zone->committed,
+		   zone->committed * 100 / zone->present_pages - 100,
+		   zone->force_scan);
+
+	seq_printf(m,
+		   "\n  vmscan_priorities: ");
+	seq_bitmap_list(m, zone->vmscan_mask, NR_VMSCAN_PRIORITIES);
+
+	seq_printf(m,
+		   "\n  vmscan_rounds:     ");
+	for (i = 0; i < NR_VMSCAN_PRIORITIES; i++)
+		seq_printf(m, " %u", atomic_read(zone->vmscan_round + i));
+#endif
+
 	seq_putc(m, '\n');
 }
 
@@ -965,30 +1021,40 @@ static void *vmstat_start(struct seq_fil
 	unsigned long *v;
 #ifdef CONFIG_VM_EVENT_COUNTERS
 	unsigned long *e;
+#define VMSTAT_BUFSIZE	(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + \
+				sizeof(struct vm_event_state))
+#else
+#define VMSTAT_BUFSIZE	(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long))
 #endif
 	int i;
 
 	if (*pos >= ARRAY_SIZE(vmstat_text))
 		return NULL;
 
-#ifdef CONFIG_VM_EVENT_COUNTERS
-	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
-			+ sizeof(struct vm_event_state), GFP_KERNEL);
-#else
-	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
-			GFP_KERNEL);
-#endif
+	v = kmalloc(VMSTAT_BUFSIZE, GFP_KERNEL);
 	m->private = v;
 	if (!v)
 		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		v[i] = global_page_state(i);
+
+	if (ve_is_super(get_exec_env())) {
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+			v[i] = global_page_state(i);
 #ifdef CONFIG_VM_EVENT_COUNTERS
-	e = v + NR_VM_ZONE_STAT_ITEMS;
-	all_vm_events(e);
-	e[PGPGIN] /= 2;		/* sectors -> kbytes */
-	e[PGPGOUT] /= 2;
-#endif
+		e = v + NR_VM_ZONE_STAT_ITEMS;
+		all_vm_events(e);
+		e[PGPGIN] /= 2;		/* sectors -> kbytes */
+		e[PGPGOUT] /= 2;
+#endif
+	} else
+		memset(v, 0, VMSTAT_BUFSIZE);
+
+	if (virtinfo_notifier_call(VITYPE_GENERAL,
+				VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) {
+		kfree(v);
+		m->private = NULL;
+		return ERR_PTR(-ENOMSG);
+	}
+
 	return v + *pos;
 }
 
@@ -1107,7 +1173,7 @@ static int __init setup_vmstat(void)
 #ifdef CONFIG_PROC_FS
 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
-	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
+	proc_create("vmstat", S_IRUGO, &glob_proc_root, &proc_vmstat_file_operations);
 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #endif
 	return 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/8021q/vlan.c linux-2.6.32-504.3.3.el6-042stab103_6/net/8021q/vlan.c
--- linux-2.6.32-504.3.3.el6.orig/net/8021q/vlan.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/8021q/vlan.c	2015-01-21 12:02:51.198014682 +0300
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/rculist.h>
 #include <net/p8022.h>
@@ -112,7 +113,7 @@ struct vlan_group *vlan_group_alloc(stru
 {
 	struct vlan_group *grp;
 
-	grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL);
+	grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL_UBC);
 	if (!grp)
 		return NULL;
 
@@ -135,7 +136,7 @@ int vlan_group_prealloc_vid(struct vlan_
 		return 0;
 
 	size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN;
-	array = kzalloc(size, GFP_KERNEL);
+	array = kzalloc(size, GFP_KERNEL_UBC);
 	if (array == NULL)
 		return -ENOBUFS;
 
@@ -149,13 +150,14 @@ static void vlan_rcu_free(struct rcu_hea
 	vlan_group_free(container_of(rcu, struct vlan_group, rcu));
 }
 
-void unregister_vlan_dev(struct net_device *dev)
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 {
 	struct vlan_dev_info *vlan = vlan_dev_info(dev);
 	struct net_device *real_dev = vlan->real_dev;
 	const struct net_device_ops *ops = real_dev->netdev_ops;
 	struct vlan_group *grp;
 	u16 vlan_id = vlan->vlan_id;
+	struct ve_struct *env;
 
 	ASSERT_RTNL();
 
@@ -172,9 +174,12 @@ void unregister_vlan_dev(struct net_devi
 	vlan_group_set_device(grp, vlan_id, NULL);
 	grp->nr_vlans--;
 
-	synchronize_net();
+	if (!head)
+		synchronize_net();
 
-	unregister_netdevice(dev);
+	env = set_exec_env(dev->owner_env);
+	unregister_netdevice_queue(dev, head);
+	set_exec_env(env);
 
 	/* If the group is now empty, kill off the group. */
 	if (grp->nr_vlans == 0) {
@@ -473,6 +478,7 @@ static int vlan_device_event(struct noti
 	struct vlan_group *grp;
 	int i, flgs;
 	struct net_device *vlandev;
+	LIST_HEAD(list);
 
 	if (is_vlan_dev(dev))
 		__vlan_device_event(dev, event);
@@ -607,8 +613,9 @@ static int vlan_device_event(struct noti
 			if (grp->nr_vlans == 1)
 				i = VLAN_GROUP_ARRAY_LEN;
 
-			unregister_vlan_dev(vlandev);
+			unregister_vlan_dev(vlandev, &list);
 		}
+		unregister_netdevice_many(&list);
 		break;
 	}
 
@@ -620,6 +627,17 @@ static struct notifier_block vlan_notifi
 	.notifier_call = vlan_device_event,
 };
 
+static inline int vlan_check_caps(void)
+{
+	if (capable(CAP_NET_ADMIN))
+		return 1;
+#ifdef CONFIG_VE
+	if (capable(CAP_VE_NET_ADMIN))
+		return 1;
+#endif
+	return 0;
+}
+
 /*
  *	VLAN IOCTL handler.
  *	o execute requested action or pass command to the device driver
@@ -661,7 +679,7 @@ static int vlan_ioctl_handler(struct net
 	switch (args.cmd) {
 	case SET_VLAN_INGRESS_PRIORITY_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!vlan_check_caps())
 			break;
 		vlan_dev_set_ingress_priority(dev,
 					      args.u.skb_priority,
@@ -671,7 +689,7 @@ static int vlan_ioctl_handler(struct net
 
 	case SET_VLAN_EGRESS_PRIORITY_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!vlan_check_caps())
 			break;
 		err = vlan_dev_set_egress_priority(dev,
 						   args.u.skb_priority,
@@ -680,7 +698,7 @@ static int vlan_ioctl_handler(struct net
 
 	case SET_VLAN_FLAG_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!vlan_check_caps())
 			break;
 		err = vlan_dev_change_flags(dev,
 					    args.vlan_qos ? args.u.flag : 0,
@@ -689,7 +707,7 @@ static int vlan_ioctl_handler(struct net
 
 	case SET_VLAN_NAME_TYPE_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!vlan_check_caps())
 			break;
 		if ((args.u.name_type >= 0) &&
 		    (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
@@ -705,16 +723,16 @@ static int vlan_ioctl_handler(struct net
 
 	case ADD_VLAN_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!vlan_check_caps())
 			break;
 		err = register_vlan_device(dev, args.u.VID);
 		break;
 
 	case DEL_VLAN_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!vlan_check_caps())
 			break;
-		unregister_vlan_dev(dev);
+		unregister_vlan_dev(dev, NULL);
 		err = 0;
 		break;
 
@@ -745,47 +763,28 @@ out:
 
 static int vlan_init_net(struct net *net)
 {
+	struct vlan_net *vn = net_generic(net, vlan_net_id);
 	int err;
-	struct vlan_net *vn;
-
-	err = -ENOMEM;
-	vn = kzalloc(sizeof(struct vlan_net), GFP_KERNEL);
-	if (vn == NULL)
-		goto err_alloc;
-
-	err = net_assign_generic(net, vlan_net_id, vn);
-	if (err < 0)
-		goto err_assign;
 
 	vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD;
 
 	err = vlan_proc_init(net);
-	if (err < 0)
-		goto err_proc;
 
-	return 0;
-
-err_proc:
-	/* nothing */
-err_assign:
-	kfree(vn);
-err_alloc:
 	return err;
 }
 
 static void vlan_exit_net(struct net *net)
 {
-	struct vlan_net *vn;
-
-	vn = net_generic(net, vlan_net_id);
 	rtnl_kill_links(net, &vlan_link_ops);
+
 	vlan_proc_cleanup(net);
-	kfree(vn);
 }
 
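+/*
+ * With .id and .size set, the pernet framework allocates and frees
+ * struct vlan_net itself, so init/exit no longer have to.
+ */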
 static struct pernet_operations vlan_net_ops = {
 	.init = vlan_init_net,
 	.exit = vlan_exit_net,
+	.id   = &vlan_net_id,
+	.size = sizeof(struct vlan_net),
 };
 
 static int __init vlan_proto_init(void)
@@ -795,7 +794,7 @@ static int __init vlan_proto_init(void)
 	pr_info("%s v%s %s\n", vlan_fullname, vlan_version, vlan_copyright);
 	pr_info("All bugs added by %s\n", vlan_buggyright);
 
-	err = register_pernet_gen_device(&vlan_net_id, &vlan_net_ops);
+	err = register_pernet_device(&vlan_net_ops);
 	if (err < 0)
 		goto err0;
 
@@ -820,7 +819,7 @@ err4:
 err3:
 	unregister_netdevice_notifier(&vlan_notifier_block);
 err2:
-	unregister_pernet_gen_device(vlan_net_id, &vlan_net_ops);
+	unregister_pernet_device(&vlan_net_ops);
 err0:
 	return err;
 }
@@ -840,7 +839,7 @@ static void __exit vlan_cleanup_module(v
 	for (i = 0; i < VLAN_GRP_HASH_SIZE; i++)
 		BUG_ON(!hlist_empty(&vlan_group_hash[i]));
 
-	unregister_pernet_gen_device(vlan_net_id, &vlan_net_ops);
+	unregister_pernet_device(&vlan_net_ops);
 	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 
 	vlan_gvrp_uninit();
diff -upr linux-2.6.32-504.3.3.el6.orig/net/8021q/vlan.h linux-2.6.32-504.3.3.el6-042stab103_6/net/8021q/vlan.h
--- linux-2.6.32-504.3.3.el6.orig/net/8021q/vlan.h	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/8021q/vlan.h	2015-01-21 12:02:51.144016116 +0300
@@ -34,7 +34,7 @@ void vlan_dev_get_realdev_name(const str
 int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id);
 void vlan_setup(struct net_device *dev);
 int register_vlan_dev(struct net_device *dev);
-void unregister_vlan_dev(struct net_device *dev);
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
 void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev);
 
 static inline u32 vlan_get_ingress_priority(struct net_device *dev,
diff -upr linux-2.6.32-504.3.3.el6.orig/net/8021q/vlan_dev.c linux-2.6.32-504.3.3.el6-042stab103_6/net/8021q/vlan_dev.c
--- linux-2.6.32-504.3.3.el6.orig/net/8021q/vlan_dev.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/8021q/vlan_dev.c	2015-01-21 12:02:45.451167249 +0300
@@ -24,6 +24,7 @@
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/sched.h>
 #include <linux/ethtool.h>
 #include <net/arp.h>
 
@@ -260,6 +261,7 @@ static int vlan_dev_hard_header(struct s
 static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev)
 {
+	struct ve_struct *env;
 	struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
 	unsigned int len;
 	int ret;
@@ -294,7 +296,10 @@ static netdev_tx_t vlan_dev_hard_start_x
 
 	skb->dev = vlan_dev_info(dev)->real_dev;
 	len = skb->len;
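+	/* Transmit in the context of the VE that owns the real device. */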
+	skb->owner_env = skb->dev->owner_env;
+	env = set_exec_env(skb->owner_env);
 	ret = dev_queue_xmit(skb);
+	set_exec_env(env);
 
 	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
 		struct vlan_pcpu_stats *stats;
@@ -316,6 +321,7 @@ static netdev_tx_t vlan_dev_hard_start_x
 static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
 						    struct net_device *dev)
 {
+	struct ve_struct *env;
 	u16 vlan_tci;
 	unsigned int len;
 	int ret;
@@ -326,7 +332,10 @@ static netdev_tx_t vlan_dev_hwaccel_hard
 
 	skb->dev = vlan_dev_info(dev)->real_dev;
 	len = skb->len;
+	skb->owner_env = skb->dev->owner_env;
+	env = set_exec_env(skb->owner_env);
 	ret = dev_queue_xmit(skb);
+	set_exec_env(env);
 
 	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
 		struct vlan_pcpu_stats *stats;
@@ -926,4 +935,6 @@ void vlan_setup(struct net_device *dev)
 	dev->ethtool_ops	= &vlan_ethtool_ops;
 
 	memset(dev->broadcast, 0, ETH_ALEN);
+	if (!ve_is_super(get_exec_env()))
+		dev->vz_features |= NETIF_F_VIRTUAL;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bluetooth/l2cap.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bluetooth/l2cap.c
--- linux-2.6.32-504.3.3.el6.orig/net/bluetooth/l2cap.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bluetooth/l2cap.c	2015-01-21 12:02:41.305277321 +0300
@@ -1979,7 +1979,8 @@ static int l2cap_sock_shutdown(struct so
 		l2cap_sock_clear_timer(sk);
 		__l2cap_sock_close(sk, 0);
 
-		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
+		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+		    !(current->flags & PF_EXITING))
 			err = bt_sock_wait_state(sk, BT_CLOSED,
 							sk->sk_lingertime);
 	}
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bluetooth/rfcomm/sock.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bluetooth/rfcomm/sock.c
--- linux-2.6.32-504.3.3.el6.orig/net/bluetooth/rfcomm/sock.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bluetooth/rfcomm/sock.c	2015-01-21 12:02:41.306277294 +0300
@@ -990,7 +990,8 @@ static int rfcomm_sock_shutdown(struct s
 		sk->sk_shutdown = SHUTDOWN_MASK;
 		__rfcomm_sock_close(sk);
 
-		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
+		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+		    !(current->flags & PF_EXITING))
 			err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
 	}
 	release_sock(sk);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bluetooth/sco.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bluetooth/sco.c
--- linux-2.6.32-504.3.3.el6.orig/net/bluetooth/sco.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bluetooth/sco.c	2015-01-21 12:02:41.306277294 +0300
@@ -761,7 +761,8 @@ static int sco_sock_shutdown(struct sock
 		sco_sock_clear_timer(sk);
 		__sco_sock_close(sk);
 
-		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
+		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+		    !(current->flags & PF_EXITING))
 			err = bt_sock_wait_state(sk, BT_CLOSED,
 							sk->sk_lingertime);
 	}
@@ -781,7 +782,8 @@ static int sco_sock_release(struct socke
 
 	sco_sock_close(sk);
 
-	if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) {
+	if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+	    !(current->flags & PF_EXITING)) {
 		lock_sock(sk);
 		err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
 		release_sock(sk);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bridge/br.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br.c
--- linux-2.6.32-504.3.3.el6.orig/net/bridge/br.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br.c	2015-01-21 12:02:49.616056678 +0300
@@ -17,6 +17,7 @@
 #include <linux/etherdevice.h>
 #include <linux/init.h>
 #include <linux/llc.h>
+#include <linux/cpt_image.h>
 #include <net/llc.h>
 #include <net/stp.h>
 
@@ -24,6 +25,11 @@
 
 int (*br_should_route_hook)(struct sk_buff *skb);
 
+static struct netdev_rst br_netdev_rst = {
+	.cpt_object = CPT_OBJ_NET_BR,
+	.ndo_rst = br_rst,
+};
+
 static const struct stp_proto br_stp_proto = {
 	.rcv	= br_stp_rcv,
 };
@@ -53,7 +59,7 @@ static int __init br_init(void)
 	if (err)
 		goto err_out;
 
-	err = register_pernet_subsys(&br_net_ops);
+	err = register_pernet_device(&br_net_ops);
 	if (err)
 		goto err_out1;
 
@@ -72,18 +78,20 @@ static int __init br_init(void)
 	brioctl_set(br_ioctl_deviceless_stub);
 	br_handle_frame_hook = br_handle_frame;
 	br_get_br_dev_for_port_hook = __br_get_br_dev_for_port_rcu;
+	br_hard_xmit_hook = br_xmit;
 
 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
 	br_fdb_test_addr_hook = br_fdb_test_addr;
 #endif
 
+	register_netdev_rst(&br_netdev_rst);
 	return 0;
 err_out4:
 	unregister_netdevice_notifier(&br_device_notifier);
 err_out3:
 	br_netfilter_fini();
 err_out2:
-	unregister_pernet_subsys(&br_net_ops);
+	unregister_pernet_device(&br_net_ops);
 err_out1:
 	br_fdb_fini();
 err_out:
@@ -93,13 +101,14 @@ err_out:
 
 static void __exit br_deinit(void)
 {
+	unregister_netdev_rst(&br_netdev_rst);
 	stp_proto_unregister(&br_stp_proto);
 
 	br_netlink_fini();
 	unregister_netdevice_notifier(&br_device_notifier);
 	brioctl_set(NULL);
 
-	unregister_pernet_subsys(&br_net_ops);
+	unregister_pernet_device(&br_net_ops);
 
 	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 
@@ -110,6 +119,7 @@ static void __exit br_deinit(void)
 
 	br_handle_frame_hook = NULL;
 	br_get_br_dev_for_port_hook = NULL;
+	br_hard_xmit_hook = NULL;
 	br_fdb_fini();
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bridge/br_device.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_device.c
--- linux-2.6.32-504.3.3.el6.orig/net/bridge/br_device.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_device.c	2015-01-21 12:02:49.631056281 +0300
@@ -17,6 +17,9 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/list.h>
+#include <linux/nsproxy.h>
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
 
 #include <asm/uaccess.h>
 #include "br_private.h"
@@ -40,13 +43,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *
 	skb_reset_mac_header(skb);
 	skb_pull(skb, ETH_HLEN);
 
+	skb->brmark = BR_ALREADY_SEEN;
+
 	if (is_broadcast_ether_addr(dest))
 		br_flood_deliver(br, skb);
 	else if (is_multicast_ether_addr(dest)) {
-		if (unlikely(netpoll_tx_running(dev))) {
-			br_flood_deliver(br, skb);
-			goto out;
-		}
 		if (br_multicast_rcv(br, NULL, skb)) {
 			kfree_skb(skb);
 			goto out;
@@ -59,7 +60,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *
 		else
 			br_flood_deliver(br, skb);
 	} else if ((dst = __br_fdb_get(br, dest)) != NULL)
-		br_deliver(dst->dst, skb);
+		br_deliver(dst->dst, skb, 1);
 	else
 		br_flood_deliver(br, skb);
 
@@ -67,6 +68,36 @@ out:
 	return NETDEV_TX_OK;
 }
 
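+/*
+ * Transmit through the bridge on behalf of a port device when the
+ * bridge runs in via_phys_dev mode. Returns nonzero if the frame was
+ * delivered to a known destination port (the caller then drops the
+ * original skb).
+ */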
+int br_xmit(struct sk_buff *skb, struct net_bridge_port *port)
+{
+	struct net_bridge *br = port->br;
+	const unsigned char *dest = skb->data;
+	struct net_bridge_fdb_entry *dst;
+	int ret = 0;
+
+	if (!br->via_phys_dev)
+		return 0;
+
+	br->dev->stats.tx_packets++;
+	br->dev->stats.tx_bytes += skb->len;
+
+	skb_reset_mac_header(skb);
+	skb_pull(skb, ETH_HLEN);
+
+	skb->brmark = BR_ALREADY_SEEN;
+
+	if (is_multicast_ether_addr(dest))
+		br_xmit_deliver(br, port, skb);
+	else if ((dst = __br_fdb_get(br, dest)) != NULL)
+		ret = br_deliver(dst->dst, skb, 0);
+	else
+		br_xmit_deliver(br, port, skb);
+
+	skb_push(skb, ETH_HLEN);
+
+	return ret;
+}
+
 static int br_dev_open(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
@@ -205,87 +236,154 @@ static int br_set_tx_csum(struct net_dev
 	return 0;
 }
 
-#ifdef CONFIG_NET_POLL_CONTROLLER
-static void br_poll_controller(struct net_device *br_dev)
-{
-}
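+/*
+ * Restore the bridge's ports from a checkpoint image: walk the
+ * nested CPT_OBJ_NET_BR_DEV records and re-add each named device.
+ */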
+static int br_rst_nested_dev(loff_t start, struct cpt_br_image *bri,
+			 struct net_bridge *br, struct rst_ops *ops,
+			 struct cpt_context *ctx)
+{
+	struct net_device *dev;
+	int ret = 0;
+	loff_t pos;
+
+	pos = start + bri->cpt_hdrlen;
+
+	while (pos < start + bri->cpt_next) {
+		struct cpt_br_nested_dev o;
+
+		ret = ops->get_object(CPT_OBJ_NET_BR_DEV, pos, &o, sizeof(o), ctx);
+		if (ret)
+			break;
+
+		dev = dev_get_by_name(dev_net(br->dev), o.name);
+		if (!dev) {
+			printk(KERN_ERR "%s: failed to find nested dev '%s'\n", __func__, o.name);
+			WARN_ON(1);
+			ret = -ENODEV;
+			break;
+		}
 
-static void br_netpoll_cleanup(struct net_device *dev)
-{
-	struct net_bridge *br = netdev_priv(dev);
-	struct net_bridge_port *p, *n;
+		ret = br_add_if(br, dev);
+		dev_put(dev);
+		if (ret)
+			break;
 
-	list_for_each_entry_safe(p, n, &br->port_list, list) {
-		br_netpoll_disable(p);
+		pos += o.cpt_next;
 	}
+	return ret;
 }
 
-static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
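+/*
+ * Restore a bridge from its checkpoint image: recreate the device,
+ * restore the STP parameters, register it and re-attach its ports.
+ */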
+int br_rst(loff_t start, struct cpt_netdev_image *di,
+		struct rst_ops *ops, struct cpt_context *ctx)
 {
-	struct net_bridge *br = netdev_priv(dev);
-	struct net_bridge_port *p, *n;
-	int err = 0;
-
-	br->dev->npinfo = NULL;
-	list_for_each_entry_safe(p, n, &br->port_list, list) {
-		if (!p->dev)
-			continue;
+	struct net *net = current->nsproxy->net_ns;
+	struct cpt_br_image bri;
+	struct net_device *dev;
+	struct net_bridge *br;
+	loff_t pos;
+	int ret;
+
+	pos = start + di->cpt_hdrlen;
+	ret = ops->get_object(CPT_OBJ_NET_BR, pos,
+			&bri, sizeof(bri), ctx);
+	if (ret)
+		goto out;
 
-		err = br_netpoll_enable(p);
-		if (err)
-			goto fail;
-	}
+	dev = new_bridge_dev(net, di->cpt_name);
+	if (!dev)
+		return -ENOMEM;
+
+	br = netdev_priv(dev);
+
+	memcpy(&br->designated_root, &bri.designated_root, 8);
+	memcpy(&br->bridge_id, &bri.bridge_id, 8);
+	br->root_path_cost = bri.root_path_cost;
+	br->max_age = clock_t_to_jiffies(bri.max_age);
+	br->hello_time = clock_t_to_jiffies(bri.hello_time);
+	br->forward_delay = bri.forward_delay;
+	br->bridge_max_age = bri.bridge_max_age;
+	br->bridge_hello_time = bri.bridge_hello_time;
+	br->bridge_forward_delay = clock_t_to_jiffies(bri.bridge_forward_delay);
+	br->ageing_time = clock_t_to_jiffies(bri.ageing_time);
+	br->root_port = bri.root_port;
+	br->stp_enabled = bri.stp_enabled;
+	br->via_phys_dev = bri.via_phys_dev;
+
+	SET_NETDEV_DEVTYPE(dev, &br_type);
+
+	ret = register_netdevice(dev);
+	if (ret)
+		goto out_free;
+
+	ret = br_sysfs_addbr(dev);
+	if (ret)
+		goto out_unreg;
 
+	ret = br_rst_nested_dev(pos, &bri, br, ops, ctx);
 out:
-	return err;
+	return ret;
 
-fail:
-	br_netpoll_cleanup(dev);
+out_unreg:
+	unregister_netdevice(dev);
+	goto out;
+out_free:
+	free_netdev(dev);
 	goto out;
 }
 
-int br_netpoll_enable(struct net_bridge_port *p)
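+/* Dump one CPT_OBJ_NET_BR_DEV record per enslaved port. */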
+static void br_cpt_nested_dev(struct net_bridge *br, struct cpt_ops *ops,
+			      struct cpt_context *ctx)
 {
-	struct netpoll *np;
-	int err = 0;
+	struct net_bridge_port *p;
 
-	np = kzalloc(sizeof(*p->np), GFP_KERNEL);
-	err = -ENOMEM;
-	if (!np)
-		goto out;
+	list_for_each_entry(p, &br->port_list, list) {
+		struct cpt_br_nested_dev o;
+		loff_t saved_obj;
 
-	np->dev = p->dev;
+		ops->push_object(&saved_obj, ctx);
 
-	err = __netpoll_setup(np);
-	if (err) {
-		kfree(np);
-		goto out;
+		o.cpt_next = CPT_NULL;
+		o.cpt_object = CPT_OBJ_NET_BR_DEV;
+		o.cpt_hdrlen = sizeof(o);
+		o.cpt_content = CPT_CONTENT_NAME;
+		BUILD_BUG_ON(IFNAMSIZ != 16);
+		memcpy(o.name, p->dev->name, IFNAMSIZ);
+
+		ops->write(&o, sizeof(o), ctx);
+
+		ops->pop_object(&saved_obj, ctx);
 	}
 
-	p->np = np;
 
-out:
-	return err;
 }
 
-void br_netpoll_disable(struct net_bridge_port *p)
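+/*
+ * Checkpoint a bridge: dump its STP state followed by the list of
+ * enslaved ports.
+ */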
+static void br_cpt(struct net_device *dev, struct cpt_ops *ops, struct cpt_context *ctx)
 {
-	struct netpoll *np = p->np;
-
-	if (!np)
-		return;
+	struct cpt_br_image v;
+	struct net_bridge *br = netdev_priv(dev);
 
-	p->np = NULL;
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_BR;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_VOID;
+
+	memcpy(&v.designated_root, &br->designated_root, 8);
+	memcpy(&v.bridge_id, &br->bridge_id, 8);
+	v.root_path_cost = br->root_path_cost;
+	v.max_age = jiffies_to_clock_t(br->max_age);
+	v.hello_time = jiffies_to_clock_t(br->hello_time);
+	v.forward_delay = br->forward_delay;
+	v.bridge_max_age = br->bridge_max_age;
+	v.bridge_hello_time = br->bridge_hello_time;
+	v.bridge_forward_delay = jiffies_to_clock_t(br->bridge_forward_delay);
+	v.ageing_time = jiffies_to_clock_t(br->ageing_time);
+	v.root_port = br->root_port;
+	v.stp_enabled = br->stp_enabled;
+	v.via_phys_dev = br->via_phys_dev;
 
-	/* Wait for transmitting packets to finish before freeing. */
-	synchronize_rcu_bh();
+	ops->write(&v, sizeof(v), ctx);
 
-	__netpoll_cleanup(np);
-	kfree(np);
+	br_cpt_nested_dev(br, ops, ctx);
 }
 
-#endif
-
 static const struct ethtool_ops br_ethtool_ops = {
 	.get_drvinfo    = br_getinfo,
 	.get_link	= ethtool_op_get_link,
@@ -308,10 +406,7 @@ static const struct net_device_ops br_ne
 	.ndo_set_multicast_list	 = br_dev_set_multicast_list,
 	.ndo_change_mtu		 = br_change_mtu,
 	.ndo_do_ioctl		 = br_dev_ioctl,
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	.ndo_netpoll_cleanup	 = br_netpoll_cleanup,
-	.ndo_poll_controller	 = br_poll_controller,
-#endif
+	.ndo_cpt		 = br_cpt,
 };
 
 static const struct net_device_ops_ext br_netdev_ops_ext = {
@@ -334,9 +429,6 @@ void br_dev_setup(struct net_device *dev
 
 	dev->netdev_ops = &br_netdev_ops;
 	set_netdev_ops_ext(dev, &br_netdev_ops_ext);
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	netdev_extended(dev)->netpoll_data.ndo_netpoll_setup = br_netpoll_setup;
-#endif
 	dev->destructor = br_dev_free;
 	SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
 	dev->tx_queue_len = 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bridge/br_forward.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_forward.c
--- linux-2.6.32-504.3.3.el6.orig/net/bridge/br_forward.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_forward.c	2015-01-21 12:02:47.120122941 +0300
@@ -68,16 +68,6 @@ static void __br_deliver(const struct ne
 {
 	skb->dev = to->dev;
 
-	if (unlikely(netpoll_tx_running(to->dev))) {
-		if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))
-			kfree_skb(skb);
-		else {
-			skb_push(skb, ETH_HLEN);
-			br_netpoll_send_skb(to, skb);
-		}
-		return;
-	}
-
 	NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
 		br_forward_finish);
 }
@@ -100,14 +90,26 @@ static void __br_forward(const struct ne
 }
 
 /* called with rcu_read_lock */
-void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+int br_deliver(const struct net_bridge_port *to, struct sk_buff *skb, int free)
 {
 	if (should_deliver(to, skb)) {
+		if (!free) {
+			struct sk_buff *skb2;
+
+			if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+				to->dev->stats.tx_dropped++;
+				return 1;
+			}
+			skb = skb2;
+		}
 		__br_deliver(to, skb);
-		return;
+		return 1;
 	}
 
-	kfree_skb(skb);
+	if (free)
+		kfree_skb(skb);
+
+	return 0;
 }
 
 /* called with rcu_read_lock */
@@ -202,10 +204,32 @@ void br_flood_deliver(struct net_bridge 
 	br_flood(br, skb, NULL, __br_deliver);
 }
 
+/* called with rcu_read_lock */
+void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port,
+						struct sk_buff *skb)
+{
+	struct net_bridge_port *p;
+
+	list_for_each_entry_rcu(p, &br->port_list, list) {
+		if (p == port)
+			continue;
+		if (should_deliver(p, skb)) {
+			struct sk_buff *skb2;
+
+			if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+				br->dev->stats.tx_dropped++;
+				return;
+			}
+			__br_deliver(p, skb2);
+		}
+	}
+}
+
 /* called under bridge lock */
 void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
 		      struct sk_buff *skb2)
 {
+	skb->brmark = BR_ALREADY_SEEN;
 	br_flood(br, skb, skb2, __br_forward);
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bridge/br_if.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_if.c
--- linux-2.6.32-504.3.3.el6.orig/net/bridge/br_if.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_if.c	2015-01-21 12:02:51.153015876 +0300
@@ -12,6 +12,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/nsproxy.h>
 #include <linux/netdevice.h>
 #include <linux/netpoll.h>
 #include <linux/ethtool.h>
@@ -147,25 +148,32 @@ static void del_nbp(struct net_bridge_po
 
 	list_del_rcu(&p->list);
 
-	rcu_assign_pointer(dev->br_port, NULL);
-
 	dev->priv_flags &= ~IFF_BRIDGE_PORT;
 
+	smp_wmb(); /* Pairs with handle_bridge() and bridge_hard_start_xmit() */
+
+	synchronize_net();
+
+	rcu_assign_pointer(dev->br_port, NULL);
+
 	br_multicast_del_port(p);
 
 	kobject_uevent(&p->kobj, KOBJ_REMOVE);
 	kobject_del(&p->kobj);
 
-	br_netpoll_disable(p);
-
 	call_rcu(&p->rcu, destroy_nbp_rcu);
 }
 
 /* called with RTNL */
-static void del_br(struct net_bridge *br)
+static void del_br(struct net_bridge *br, struct list_head *head)
 {
 	struct net_bridge_port *p, *n;
 
+	if (br->master_dev) {
+		dev_put(br->master_dev);
+		rcu_assign_pointer(br->master_dev, NULL);
+	}
+
 	list_for_each_entry_safe(p, n, &br->port_list, list) {
 		del_nbp(p);
 	}
@@ -173,10 +181,10 @@ static void del_br(struct net_bridge *br
 	del_timer_sync(&br->gc_timer);
 
 	br_sysfs_delbr(br->dev);
-	unregister_netdevice(br->dev);
+	unregister_netdevice_queue(br->dev, head);
 }
 
-static struct net_device *new_bridge_dev(struct net *net, const char *name)
+struct net_device *new_bridge_dev(struct net *net, const char *name)
 {
 	struct net_bridge *br;
 	struct net_device *dev;
@@ -280,7 +288,7 @@ static struct net_bridge_port *new_nbp(s
 	return p;
 }
 
-static struct device_type br_type = {
+struct device_type br_type = {
 	.name	= "bridge",
 };
 
@@ -339,7 +347,7 @@ int br_del_bridge(struct net *net, const
 	}
 
 	else
-		del_br(netdev_priv(dev));
+		del_br(netdev_priv(dev), NULL);
 
 	rtnl_unlock();
 	return ret;
@@ -419,8 +427,6 @@ int br_add_if(struct net_bridge *br, str
 	if (err)
 		goto put_back;
 
-	call_netdevice_notifiers(NETDEV_JOIN, dev);
-
 	err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
 				   SYSFS_BRIDGE_PORT_ATTR);
 	if (err)
@@ -434,15 +440,14 @@ int br_add_if(struct net_bridge *br, str
 	if (err)
 		goto err2;
 
-	if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
-		goto err3;
-
 	rcu_assign_pointer(dev->br_port, p);
 
 	dev_disable_lro(dev);
 
 	dev->priv_flags |= IFF_BRIDGE_PORT;
 
+	smp_wmb(); /* Pairs with handle_bridge() and bridge_hard_start_xmit() */
+
 	list_add_rcu(&p->list, &br->port_list);
 
 	br_features_recompute(br);
@@ -453,6 +458,10 @@ int br_add_if(struct net_bridge *br, str
 	if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) &&
 	    (br->dev->flags & IFF_UP))
 		br_stp_enable_port(p);
+	if (!(dev->vz_features & NETIF_F_VIRTUAL) && !br->master_dev) {
+		dev_hold(dev);
+		rcu_assign_pointer(br->master_dev, dev);
+	}
 	spin_unlock_bh(&br->lock);
 
 	br_ifinfo_notify(RTM_NEWLINK, p);
@@ -462,8 +471,6 @@ int br_add_if(struct net_bridge *br, str
 	kobject_uevent(&p->kobj, KOBJ_ADD);
 
 	return 0;
-err3:
-	sysfs_remove_link(br->ifobj, p->dev->name);
 err2:
 	br_fdb_delete_by_port(br, p, 1);
 err1:
@@ -489,6 +496,16 @@ int br_del_if(struct net_bridge *br, str
 
 	spin_lock_bh(&br->lock);
 	br_stp_recalculate_bridge_id(br);
+	if (br->master_dev == dev) {
+		rcu_assign_pointer(br->master_dev, NULL);
+		dev_put(dev);
+		list_for_each_entry(p, &br->port_list, list)
+			if (!(p->dev->vz_features & NETIF_F_VIRTUAL)) {
+				dev_hold(p->dev);
+				rcu_assign_pointer(br->master_dev, p->dev);
+				break;
+			}
+	}
 	spin_unlock_bh(&br->lock);
 
 	br_features_recompute(br);
@@ -499,15 +516,14 @@ int br_del_if(struct net_bridge *br, str
 void br_net_exit(struct net *net)
 {
 	struct net_device *dev;
+	LIST_HEAD(list);
 
 	rtnl_lock();
-restart:
-	for_each_netdev(net, dev) {
-		if (dev->priv_flags & IFF_EBRIDGE) {
-			del_br(netdev_priv(dev));
-			goto restart;
-		}
-	}
+	for_each_netdev(net, dev)
+		if (dev->priv_flags & IFF_EBRIDGE)
+			del_br(netdev_priv(dev), &list);
+
+	unregister_netdevice_many(&list);
 	rtnl_unlock();
 
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bridge/br_input.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_input.c
--- linux-2.6.32-504.3.3.el6.orig/net/bridge/br_input.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_input.c	2015-01-21 12:02:45.171174681 +0300
@@ -64,6 +64,7 @@ static int br_pass_frame_up(struct sk_bu
 	struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
 	struct net_bridge *br = netdev_priv(brdev);
 	struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats);
+	struct net_device *master_dev = NULL;
 
 	u64_stats_update_begin(&brstats->syncp);
 	brstats->rx_packets++;
@@ -71,7 +72,16 @@ static int br_pass_frame_up(struct sk_bu
 	u64_stats_update_end(&brstats->syncp);
 
 	indev = skb->dev;
-	skb->dev = brdev;
+
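+	/*
+	 * In via_phys_dev mode the frame goes up through the bridge's
+	 * master physical device rather than the bridge device itself.
+	 */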
+	if (br->via_phys_dev)
+		master_dev = rcu_dereference(br->master_dev);
+
+	if (!master_dev)
+		skb->dev = brdev;
+	else {
+		skb->brmark = BR_ALREADY_SEEN;
+		skb->dev = master_dev;
+	}
 
 	skb = br_vlan_workaround(skb);
 	if (!skb)
@@ -90,6 +100,7 @@ int br_handle_frame_finish(struct sk_buf
 	struct net_bridge_fdb_entry *dst;
 	struct net_bridge_mdb_entry *mdst;
 	struct sk_buff *skb2;
+	int err = 0;
 
 	if (!p || p->state == BR_STATE_DISABLED)
 		goto drop;
@@ -110,7 +121,7 @@ int br_handle_frame_finish(struct sk_buf
 	/* The packet skb2 goes to the local host (NULL to skip). */
 	skb2 = NULL;
 
-	if (br->dev->flags & IFF_PROMISC)
+	if ((br->dev->flags & IFF_PROMISC) && !br->via_phys_dev)
 		skb2 = skb;
 
 	dst = NULL;
@@ -138,16 +149,20 @@ int br_handle_frame_finish(struct sk_buf
 		skb = NULL;
 	}
 
+	if (skb2 == skb)
+		skb2 = skb_clone(skb, GFP_ATOMIC);
+
+	if (skb2)
+		err = br_pass_frame_up(skb2);
+
 	if (skb) {
 		if (dst)
-			br_forward(dst->dst, skb, skb2);
+			br_forward(dst->dst, skb, NULL);
 		else
-			br_flood_forward(br, skb, skb2);
+			br_flood_forward(br, skb, NULL);
 	}
 
-	if (skb2)
-		return br_pass_frame_up(skb2);
-
+	return err;
 out:
 	return 0;
 drop:
@@ -200,6 +215,8 @@ struct sk_buff *br_handle_frame(struct n
 
 forward:
 	switch (p->state) {
+		struct net_device *out;
+
 	case BR_STATE_FORWARDING:
 		rhook = rcu_dereference(br_should_route_hook);
 		if (rhook != NULL) {
@@ -209,7 +226,15 @@ forward:
 		}
 		/* fall through */
 	case BR_STATE_LEARNING:
-		if (!compare_ether_addr(p->br->dev->dev_addr, dest))
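+		/* A frame the bridge has already processed must not re-enter it. */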
+		if (skb->brmark == BR_ALREADY_SEEN)
+			return skb;
+
+		if (p->br->via_phys_dev)
+			out = rcu_dereference(p->br->master_dev);
+		else
+			out = p->br->dev;
+
+		if (out && !compare_ether_addr(out->dev_addr, dest))
 			skb->pkt_type = PACKET_HOST;
 
 		NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bridge/br_ioctl.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_ioctl.c
--- linux-2.6.32-504.3.3.el6.orig/net/bridge/br_ioctl.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_ioctl.c	2015-01-21 12:02:45.149175267 +0300
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/if_bridge.h>
 #include <linux/netdevice.h>
+#include <linux/nsproxy.h>
 #include <linux/times.h>
 #include <net/net_namespace.h>
 #include <asm/uaccess.h>
@@ -140,6 +141,7 @@ static int old_dev_ioctl(struct net_devi
 		b.root_port = br->root_port;
 
 		b.stp_enabled = (br->stp_enabled != BR_NO_STP);
+		b.via_phys_dev = br->via_phys_dev;
 		b.ageing_time = jiffies_to_clock_t(br->ageing_time);
 		b.hello_timer_value = br_timer_value(&br->hello_timer);
 		b.tcn_timer_value = br_timer_value(&br->tcn_timer);
@@ -262,6 +264,13 @@ static int old_dev_ioctl(struct net_devi
 		br_stp_set_enabled(br, args[1]);
 		return 0;
 
+	case BRCTL_SET_VIA_ORIG_DEV:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		br->via_phys_dev = args[1] ? 1 : 0;
+		return 0;
+
 	case BRCTL_SET_BRIDGE_PRIORITY:
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
@@ -371,6 +380,9 @@ static int old_deviceless(struct net *ne
 
 int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
 {
+	if (!(get_exec_env()->features & VE_FEATURE_BRIDGE))
+		return -ENOTTY;
+
 	switch (cmd) {
 	case SIOCGIFBR:
 	case SIOCSIFBR:
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bridge/br_netfilter.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_netfilter.c
--- linux-2.6.32-504.3.3.el6.orig/net/bridge/br_netfilter.c	2014-12-12 23:29:26.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_netfilter.c	2015-01-21 12:02:45.169174735 +0300
@@ -141,8 +141,18 @@ static inline struct rtable *bridge_pare
 static inline struct net_device *bridge_parent(const struct net_device *dev)
 {
 	struct net_bridge_port *port = rcu_dereference(dev->br_port);
+	struct net_device *master_dev;
+	struct net_bridge *br;
 
-	return port ? port->br->dev : NULL;
+	if (!port)
+		return NULL;
+
+	br = port->br;
+	master_dev = rcu_dereference(br->master_dev);
+	if (br->via_phys_dev && master_dev)
+		return master_dev;
+	else
+		return br->dev;
 }
 
 static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
@@ -799,8 +809,7 @@ static unsigned int br_nf_local_out(unsi
 #if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
-	if (skb->nfct != NULL &&
-	    (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb)) &&
+	if ((skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb)) &&
 	    skb->len > skb->dev->mtu &&
 	    !skb_is_gso(skb)) {
 		/* BUG: Should really parse the IP options here. */
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bridge/br_private.h linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_private.h
--- linux-2.6.32-504.3.3.el6.orig/net/bridge/br_private.h	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_private.h	2015-01-21 12:02:49.616056678 +0300
@@ -19,6 +19,10 @@
 #include <linux/u64_stats_sync.h>
 #include <net/route.h>
 
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/vzcalluser.h>
+
 #define BR_HASH_BITS 8
 #define BR_HASH_SIZE (1 << BR_HASH_BITS)
 
@@ -158,6 +162,8 @@ struct net_bridge
 
 	struct br_cpu_netstats __percpu *stats;
 
+	struct net_device		*master_dev;
+	unsigned char			via_phys_dev;
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 	struct list_head		age_list;
@@ -249,6 +255,13 @@ static inline int br_is_root_bridge(cons
 extern void br_dev_setup(struct net_device *dev);
 extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
 			       struct net_device *dev);
+extern netdev_tx_t br_xmit(struct sk_buff *skb, struct net_bridge_port *port);
+struct cpt_context;
+struct rst_ops;
+struct cpt_netdev_image;
+extern int br_rst(loff_t start, struct cpt_netdev_image *di,
+		struct rst_ops *ops, struct cpt_context *ctx);
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
 {
@@ -309,17 +322,20 @@ extern void br_fdb_update(struct net_bri
 			  const unsigned char *addr);
 
 /* br_forward.c */
-extern void br_deliver(const struct net_bridge_port *to,
-		struct sk_buff *skb);
+extern int br_deliver(const struct net_bridge_port *to,
+		struct sk_buff *skb, int free);
 extern int br_dev_queue_push_xmit(struct sk_buff *skb);
 extern void br_forward(const struct net_bridge_port *to,
 		struct sk_buff *skb, struct sk_buff *skb0);
 extern int br_forward_finish(struct sk_buff *skb);
 extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb);
+extern void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb);
 extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
 			     struct sk_buff *skb2);
 
 /* br_if.c */
+extern struct device_type br_type;
+extern struct net_device *new_bridge_dev(struct net *net, const char *name);
 extern void br_port_carrier_check(struct net_bridge_port *p);
 extern int br_add_bridge(struct net *net, const char *name);
 extern int br_del_bridge(struct net *net, const char *name);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/bridge/br_sysfs_br.c linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_sysfs_br.c
--- linux-2.6.32-504.3.3.el6.orig/net/bridge/br_sysfs_br.c	2014-12-12 23:29:30.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/bridge/br_sysfs_br.c	2015-01-21 12:02:45.149175267 +0300
@@ -183,6 +183,28 @@ static ssize_t store_stp_state(struct de
 static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state,
 		   store_stp_state);
 
+static ssize_t show_via_phys_dev_state(struct device *cd,
+				struct device_attribute *attr, char *buf)
+{
+	struct net_bridge *br = to_bridge(cd);
+	return sprintf(buf, "%d\n", br->via_phys_dev);
+}
+
+static int set_via_phys_dev_state(struct net_bridge *br, unsigned long val)
+{
+	br->via_phys_dev = val ? 1 : 0;
+	return 0;
+}
+
+static ssize_t store_via_phys_dev_state(struct device *cd,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	return store_bridge_parm(cd, buf, len, set_via_phys_dev_state);
+}
+
+static DEVICE_ATTR(via_phys_dev, S_IRUGO | S_IWUSR, show_via_phys_dev_state,
+			 store_via_phys_dev_state);
+
 static ssize_t show_priority(struct device *d, struct device_attribute *attr,
 			     char *buf)
 {
@@ -631,6 +653,7 @@ static struct attribute *bridge_attrs[] 
 	&dev_attr_max_age.attr,
 	&dev_attr_ageing_time.attr,
 	&dev_attr_stp_state.attr,
+	&dev_attr_via_phys_dev.attr,
 	&dev_attr_priority.attr,
 	&dev_attr_bridge_id.attr,
 	&dev_attr_root_id.attr,
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/datagram.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/datagram.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/datagram.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/datagram.c	2015-01-21 12:02:43.282224833 +0300
@@ -58,6 +58,8 @@
 #include <trace/events/skb.h>
 #include <net/busy_poll.h>
 
+#include <bc/net.h>
+
 /*
  *	Is a socket 'connection oriented' ?
  */
@@ -728,6 +730,7 @@ unsigned int datagram_poll(struct file *
 {
 	struct sock *sk = sock->sk;
 	unsigned int mask;
+	int no_ubc_space;
 
 	sock_poll_wait(file, sk->sk_sleep, wait);
 	mask = 0;
@@ -737,8 +740,14 @@ unsigned int datagram_poll(struct file *
 		mask |= POLLERR;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= POLLRDHUP;
-	if (sk->sk_shutdown == SHUTDOWN_MASK)
+	if (sk->sk_shutdown == SHUTDOWN_MASK) {
+		no_ubc_space = 0;
 		mask |= POLLHUP;
+	} else {
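+		/*
+		 * Writability below also requires beancounter space;
+		 * if there is none, queue the socket to be woken up
+		 * once space becomes available.
+		 */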
+		no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
+		if (no_ubc_space)
+			ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
+	}
 
 	/* readable? */
 	if (!skb_queue_empty(&sk->sk_receive_queue) ||
@@ -755,7 +764,7 @@ unsigned int datagram_poll(struct file *
 	}
 
 	/* writable? */
-	if (sock_writeable(sk))
+	if (!no_ubc_space && sock_writeable(sk))
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 	else
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/dev.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/dev.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/dev.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/dev.c	2015-01-21 12:02:58.539819797 +0300
@@ -136,9 +136,13 @@
 #include <linux/cpu_rmap.h>
 #include <linux/net_tstamp.h>
 #include <linux/hashtable.h>
+#include <linux/fence-watchdog.h>
 
 #include "net-sysfs.h"
 
+#include <bc/beancounter.h>
+#include <bc/kmem.h>
+
 /* Instead of increasing this, you should create a hash table. */
 #define MAX_GRO_SKBS 8
 
@@ -210,15 +214,9 @@ static DEFINE_SPINLOCK(napi_hash_lock);
 static unsigned int napi_gen_id;
 static DEFINE_HASHTABLE(napi_hash, 8);
 
-static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
-{
-	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
-	return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)];
-}
-
-static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
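+/* Bump the per-net device-list generation counter, skipping zero. */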
+static inline void dev_base_seq_inc(struct net *net)
 {
-	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+	while (++net->dev_base_seq == 0);
 }
 
 /* Device list insertion */
@@ -233,6 +231,9 @@ static int list_netdevice(struct net_dev
 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 	write_unlock_bh(&dev_base_lock);
+
+	dev_base_seq_inc(net);
+
 	return 0;
 }
 
@@ -247,6 +248,8 @@ static void unlist_netdevice(struct net_
 	hlist_del(&dev->name_hlist);
 	hlist_del(&dev->index_hlist);
 	write_unlock_bh(&dev_base_lock);
+
+	dev_base_seq_inc(dev_net(dev));
 }
 
 /*
@@ -1036,15 +1039,10 @@ int dev_change_name(struct net_device *d
 		return err;
 
 rollback:
-	/* For now only devices in the initial network namespace
-	 * are in sysfs.
-	 */
-	if (net == &init_net) {
-		ret = device_rename(&dev->dev, dev->name);
-		if (ret) {
-			memcpy(dev->name, oldname, IFNAMSIZ);
-			return ret;
-		}
+	ret = device_rename(&dev->dev, dev->name);
+	if (ret) {
+		memcpy(dev->name, oldname, IFNAMSIZ);
+		return ret;
 	}
 
 	write_lock_bh(&dev_base_lock);
@@ -1126,8 +1124,10 @@ EXPORT_SYMBOL(netdev_features_change);
 void netdev_state_change(struct net_device *dev)
 {
 	if (dev->flags & IFF_UP) {
+		struct ve_struct *vesave = set_exec_env(dev->owner_env);
 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+		set_exec_env(vesave);
 	}
 }
 EXPORT_SYMBOL(netdev_state_change);
@@ -1256,6 +1256,7 @@ EXPORT_SYMBOL(dev_open);
 
 static int __dev_close_many(struct list_head *head)
 {
+	struct ve_struct *old_env;
 	struct net_device *dev;
 	struct net_device_extended *nde;
 
@@ -1269,7 +1270,9 @@ static int __dev_close_many(struct list_
 		 *	Tell people we are going down, so that they can
 		 *	prepare to death, when device is still operating.
 		 */
+		old_env = set_exec_env(dev->owner_env);
 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+		set_exec_env(old_env);
 
 		clear_bit(__LINK_STATE_START, &dev->state);
 
@@ -1296,8 +1299,11 @@ static int __dev_close_many(struct list_
 		 *	We allow it to be called even after a DETACH hot-plug
 		 *	event.
 		 */
-		if (ops->ndo_stop)
+		if (ops->ndo_stop) {
+			old_env = set_exec_env(dev->owner_env);
 			ops->ndo_stop(dev);
+			set_exec_env(old_env);
+		}
 
 		/*
 		 *	Device is now down.
@@ -1316,6 +1322,7 @@ static int __dev_close_many(struct list_
 
 int dev_close_many(struct list_head *head)
 {
+	struct ve_struct *old_env;
 	struct net_device *dev;
 	struct net_device_extended *nde, *tmp;
 	LIST_HEAD(tmp_list);
@@ -1331,8 +1338,10 @@ int dev_close_many(struct list_head *hea
 	 */
 	list_for_each_entry(nde, head, unreg_list) {
 		dev = nde->dev;
+		old_env = set_exec_env(dev->owner_env);
 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
 		call_netdevice_notifiers(NETDEV_DOWN, dev);
+		set_exec_env(old_env);
 	}
 
 	/* rollback_registered_many needs the complete original list */
@@ -1655,6 +1664,7 @@ int dev_forward_skb(struct net_device *d
 	skb->mark = 0;
 	secpath_reset(skb);
 	nf_reset(skb);
+	skb_init_brmark(skb);
 	return netif_rx(skb);
 }
 EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -2248,6 +2258,36 @@ int netif_skb_features(struct sk_buff *s
 }
 EXPORT_SYMBOL(netif_skb_features);
 
+#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
+int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port);
+EXPORT_SYMBOL(br_hard_xmit_hook);
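+/*
+ * Let the bridge intercept a packet transmitted directly through one
+ * of its ports (via_phys_dev mode); returns nonzero when the bridge
+ * delivered the packet and the caller should drop the original skb.
+ */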
+static __inline__ int bridge_hard_start_xmit(struct sk_buff *skb,
+						struct net_device *dev)
+{
+	struct net_bridge_port *port;
+
+	if (!br_hard_xmit_hook)
+		return 0;
+
+	if (skb->brmark == BR_ALREADY_SEEN)
+		return 0;
+	if (!(skb->dev->priv_flags & IFF_BRIDGE_PORT))
+		return 0;
+
+	smp_rmb(); /* Pairs with smp_wmb in del_nbp() and in br_add_if() */
+
+	port = rcu_dereference(dev->br_port);
+	if (!port) {
+		WARN_ON(1);
+		return 0;
+	}
+
+	return br_hard_xmit_hook(skb, port);
+}
+#else
+#define bridge_hard_start_xmit(skb, dev)	(0)
+#endif
+
 /*
  * Returns true if either:
  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
@@ -2265,13 +2305,50 @@ static inline int skb_needs_linearize(st
 				!(features & NETIF_F_SG)));
 }
 
-int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
-			struct netdev_queue *txq)
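+/*
+ * Transmit a single skb (or gso segment) on the device: give the
+ * bridge a chance to take the packet first, then hand it to the
+ * driver's ndo_start_xmit().
+ */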
+static inline int dev_hard_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int rc;
 	unsigned int skb_len;
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	if (unlikely(fence_wdog_check_timer())) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+#endif
+	/*
+	 * The bridge must handle the packet while its dst information
+	 * is still set: an skb with no dst can cause an oops in NAT.
+	 */
+	rc = bridge_hard_start_xmit(skb, dev);
+
+	/*
+	 * If the device doesn't need skb->dst, release it right now
+	 * while it's hot in this cpu cache.
+	 */
+	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+		skb_dst_drop(skb);
+
+	if (rc > 0) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	if (!list_empty(&ptype_all))
+		dev_queue_xmit_nit(skb, dev);
+
+	skb_len = skb->len;
+	rc = ops->ndo_start_xmit(skb, dev);
+	trace_net_dev_xmit(skb, rc, dev, skb_len);
+	return rc;
+}
+
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
+			struct netdev_queue *txq)
+{
+	int rc;
+
 	if (likely(!skb->next)) {
 		int features;
 
@@ -2318,19 +2395,8 @@ int dev_hard_start_xmit(struct sk_buff *
 			}
 		}
 
-		/*
-		 * If device doesnt need skb->dst, release it right now while
-		 * its hot in this cpu cache
-		 */
-		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
-			skb_dst_drop(skb);
+		rc = dev_hard_xmit(skb, dev);
 
-		if (!list_empty(&ptype_all))
-			dev_queue_xmit_nit(skb, dev);
-
-		skb_len = skb->len;
-		rc = ops->ndo_start_xmit(skb, dev);
-		trace_net_dev_xmit(skb, rc, dev, skb_len);
 		if (rc == NETDEV_TX_OK)
 			txq_trans_update(txq);
 		/*
@@ -2357,12 +2423,8 @@ gso:
 		skb->next = nskb->next;
 		nskb->next = NULL;
 
-		if (!list_empty(&ptype_all))
-			dev_queue_xmit_nit(nskb, dev);
+		rc = dev_hard_xmit(nskb, dev);
 
-		skb_len = nskb->len;
-		rc = ops->ndo_start_xmit(nskb, dev);
-		trace_net_dev_xmit(nskb, rc, dev, skb_len);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			nskb->next = skb->next;
 			skb->next = nskb;
@@ -3141,8 +3203,16 @@ static inline struct sk_buff *handle_bri
 	struct net_bridge_port *port;
 
 	if (skb->pkt_type == PACKET_LOOPBACK ||
-	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
+	    !(skb->dev->priv_flags & IFF_BRIDGE_PORT))
+		return skb;
+
+	smp_rmb(); /* Pairs with smp_wmb in del_nbp() and in br_add_if() */
+
+	port = rcu_dereference(skb->dev->br_port);
+	if (!port) {
+		WARN_ON(1);
 		return skb;
+	}
 
 	/* RHEL only: skbs received on inactive slaves
 	 * due the vlan hwaccel path should not pass along */
@@ -3323,6 +3393,7 @@ int __netif_receive_skb(struct sk_buff *
 	struct net_device *exact_dev;
 	int ret = NET_RX_DROP;
 	__be16 type;
+	struct ve_struct *old_ve;
 
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);
@@ -3366,6 +3437,16 @@ int __netif_receive_skb(struct sk_buff *
 	skb_reset_transport_header(skb);
 	skb->mac_len = skb->network_header - skb->mac_header;
 
+#ifdef CONFIG_VE
+	/*
+	 * The skb might have been allocated in a VE context other than
+	 * the one its device works in, so set the correct owner_env.
+	 */
+	skb->owner_env = skb->dev->owner_env;
+	BUG_ON(skb->owner_env == NULL);
+#endif
+	old_ve = set_exec_env(skb->owner_env);
+
 	pt_prev = NULL;
 
 	rcu_read_lock();
@@ -3377,14 +3458,18 @@ int __netif_receive_skb(struct sk_buff *
 	}
 #endif
 
-	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
-		    ptype->dev == orig_dev) {
-			if (pt_prev)
-				ret = deliver_skb(skb, pt_prev, orig_dev);
-			pt_prev = ptype;
+#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
+	if (skb->brmark != BR_ALREADY_SEEN) {
+		list_for_each_entry_rcu(ptype, &ptype_all, list) {
+			if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
+			    ptype->dev == orig_dev) {
+				if (pt_prev)
+					ret = deliver_skb(skb, pt_prev, orig_dev);
+				pt_prev = ptype;
+			}
 		}
 	}
+#endif
 
 #ifdef CONFIG_NET_CLS_ACT
 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
@@ -3441,6 +3526,7 @@ ncls:
 
 out:
 	rcu_read_unlock();
+	(void)set_exec_env(old_ve);
 	return ret;
 }
 
@@ -4096,6 +4182,10 @@ static void net_rx_action(struct softirq
 
 	local_irq_disable();
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	fence_wdog_check_timer();
+#endif
+
 	while (!list_empty(list)) {
 		struct napi_struct *n;
 		int work, weight;
@@ -4680,8 +4770,13 @@ static int __dev_set_promiscuity(struct 
 			return -EOVERFLOW;
 		}
 	}
-	if (dev->flags != old_flags) {
-		printk(KERN_INFO "device %s %s promiscuous mode\n",
+	/*
+	 * Promiscuous mode on LOOPBACK/POINTOPOINT devices does
+	 * not mean anything.
+	 */
+	if ((dev->flags != old_flags) &&
+			!(dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) {
+		ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n",
 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
 							       "left");
 		if (audit_enabled) {
@@ -5841,16 +5936,25 @@ int dev_ioctl(struct net *net, unsigned 
 	 *	- require strict serialization.
 	 *	- do not return a value
 	 */
+	case SIOCSIFMTU:
+	case SIOCSIFHWADDR:
 	case SIOCSIFFLAGS:
+	case SIOCSIFTXQLEN:
+		if (!capable(CAP_NET_ADMIN) &&
+		    !capable(CAP_VE_NET_ADMIN))
+			return -EPERM;
+		dev_load(net, ifr.ifr_name);
+		rtnl_lock();
+		ret = dev_ifsioc(net, &ifr, cmd);
+		rtnl_unlock();
+		return ret;
+
 	case SIOCSIFMETRIC:
-	case SIOCSIFMTU:
 	case SIOCSIFMAP:
-	case SIOCSIFHWADDR:
 	case SIOCSIFSLAVE:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 	case SIOCSIFHWBROADCAST:
-	case SIOCSIFTXQLEN:
 	case SIOCSMIIREG:
 	case SIOCBONDENSLAVE:
 	case SIOCBONDRELEASE:
@@ -5930,12 +6034,11 @@ EXPORT_SYMBOL(dev_get_phys_port_id);
  */
 static int dev_new_index(struct net *net)
 {
-	static int ifindex;
 	for (;;) {
-		if (++ifindex <= 0)
-			ifindex = 1;
-		if (!__dev_get_by_index(net, ifindex))
-			return ifindex;
+		if (++net->ifindex <= 0)
+			net->ifindex = 1;
+		if (!__dev_get_by_index(net, net->ifindex))
+			return net->ifindex;
 	}
 }
 
@@ -5951,12 +6054,14 @@ static void rollback_registered_many(str
 {
 	struct net_device *dev;
 	struct net_device_extended *nde, *tmp;
+	struct ve_struct *old_env;
 
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
 
 	list_for_each_entry_safe(nde, tmp, head, unreg_list) {
 		dev = nde->dev;
+
 		/* Some devices call without registering
 		 * for initialization unwind. Remove those
 		 * devices and proceed with the remaining.
@@ -5988,6 +6093,8 @@ static void rollback_registered_many(str
 
 	list_for_each_entry(nde, head, unreg_list) {
 		dev = nde->dev;
+		old_env = set_exec_env(dev->owner_env);
+
 		/* Shutdown queueing discipline. */
 		dev_shutdown(dev);
 
@@ -6011,11 +6118,15 @@ static void rollback_registered_many(str
 
 		/* Remove entries from kobject tree */
 		netdev_unregister_kobject(dev);
+
+		set_exec_env(old_env);
 	}
 
 	/* Process any work delayed until the end of the batch */
 	nde = list_first_entry(head, struct net_device_extended, unreg_list);
+	old_env = set_exec_env(nde->dev->owner_env);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, nde->dev);
+	set_exec_env(old_env);
 
 	synchronize_net();
 
@@ -6269,6 +6380,10 @@ int register_netdevice(struct net_device
 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 	BUG_ON(!net);
 
+	ret = -EPERM;
+	if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev))
+		goto out;
+
 	spin_lock_init(&dev->addr_list_lock);
 	netdev_set_addr_lockdep_class(dev);
 	netdev_init_queue_locks(dev);
@@ -6358,6 +6473,10 @@ int register_netdevice(struct net_device
 
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 
+	dev->owner_env = get_exec_env();
+	netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub());
+	netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub());
+
 	dev_init_scheduler(dev);
 	dev_hold(dev);
 	list_netdevice(dev);
@@ -6460,6 +6579,65 @@ out:
 }
 EXPORT_SYMBOL(register_netdev);
 
+/*
+ * We do horrible things here -- we leave a netdevice in a
+ * "leaked" state, which means we release as many resources as
+ * possible but the device remains present in the namespace
+ * because someone still holds a reference to it.
+ *
+ * The idea is to be able to force-stop a VE anyway.
+ */
+static void ve_netdev_leak(struct net_device *dev)
+{
+	struct napi_struct *p, *n;
+
+	dev->is_leaked = 1;
+	barrier();
+
+	/*
+	 * Make sure we can no longer tx/rx network
+	 * packets to the outside world.
+	 */
+	WARN_ON_ONCE(dev->flags & IFF_UP);
+	WARN_ON_ONCE(dev->qdisc != &noop_qdisc);
+
+	rtnl_lock();
+
+	/*
+	 * No addresses and no napi contexts after this point.
+	 */
+	dev_addr_flush(dev);
+	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+		netif_napi_del(p);
+
+	/*
+	 * No release_net() here since the device remains
+	 * present in the namespace.
+	 */
+
+	__rtnl_unlock();
+
+	put_beancounter(netdev_bc(dev)->exec_ub);
+	put_beancounter(netdev_bc(dev)->owner_ub);
+
+	netdev_bc(dev)->exec_ub		= get_beancounter(get_ub0());
+	netdev_bc(dev)->owner_ub	= get_beancounter(get_ub0());
+
+	/*
+	 * Since we've already wrecked the device and releasing it
+	 * the normal way is no longer possible, take an extra
+	 * reference to make sure it remains here forever.
+	 */
+	dev_hold(dev);
+
+	synchronize_net();
+
+	pr_emerg("Device (%s:%d:%u:%p) marked as leaked\n",
+		 dev->name, atomic_read(&dev->refcnt) - 1,
+		 VEID(dev->owner_env), dev);
+	dst_cache_dump();
+}
+
 /**
  * set_netdev_hw_features() - Set netdev's hw_features
  * @netdev:       Network device
@@ -6520,9 +6698,10 @@ EXPORT_SYMBOL(get_netdev_ops_ext);
  * We can get stuck here if buggy protocols don't correctly
  * call dev_put.
  */
-static void netdev_wait_allrefs(struct net_device *dev)
+static int netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
+	int i = 0;
 
 	rebroadcast_time = warning_time = jiffies;
 	while (atomic_read(&dev->refcnt) != 0) {
@@ -6554,12 +6733,25 @@ static void netdev_wait_allrefs(struct n
 
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
 			printk(KERN_EMERG "unregister_netdevice: "
-			       "waiting for %s to become free. Usage "
-			       "count = %d\n",
-			       dev->name, atomic_read(&dev->refcnt));
+			       "waiting for %s=%p to become free. Usage "
+			       "count = %d ve=%u\n",
+			       dev->name, dev, atomic_read(&dev->refcnt),
+			       VEID(get_exec_env()));
 			warning_time = jiffies;
 		}
+
+		/*
+		 * If a reference to the device has been leaked we
+		 * might get stuck in this loop forever, leaving the
+		 * VE no chance to stop.
+		 */
+		if (++i > 200) { /* give 50 seconds to try */
+			ve_netdev_leak(dev);
+			return -EBUSY;
+		}
 	}
+
+	return 0;
 }
 
 /* The sequence is:
@@ -6589,6 +6781,7 @@ static void netdev_wait_allrefs(struct n
 void netdev_run_todo(void)
 {
 	struct list_head list;
+	struct ve_struct *old_ve;
 
 	/* Snapshot list, allow later requests */
 	list_replace_init(&net_todo_list, &list);
@@ -6601,6 +6794,7 @@ void netdev_run_todo(void)
 	if (!list_empty(&list))
 		rcu_barrier();
 
+	old_ve = get_exec_env();
 	while (!list_empty(&list)) {
 		struct net_device *dev
 			= list_first_entry(&list, struct net_device, todo_list);
@@ -6613,11 +6807,17 @@ void netdev_run_todo(void)
 			continue;
 		}
 
+		(void)set_exec_env(dev->owner_env);
 		dev->reg_state = NETREG_UNREGISTERED;
 
 		on_each_cpu(flush_backlog, dev, 1);
 
-		netdev_wait_allrefs(dev);
+		/*
+		 * Even if a device gets stuck here we still have
+		 * to proceed with the rest of the list.
+		 */
+		if (netdev_wait_allrefs(dev))
+			continue;
 
 		/* paranoia */
 		BUG_ON(atomic_read(&dev->refcnt));
@@ -6625,12 +6825,21 @@ void netdev_run_todo(void)
 		WARN_ON(dev->ip6_ptr);
 		WARN_ON(dev->dn_ptr);
 
+		put_beancounter(netdev_bc(dev)->exec_ub);
+		put_beancounter(netdev_bc(dev)->owner_ub);
+		netdev_bc(dev)->exec_ub = NULL;
+		netdev_bc(dev)->owner_ub = NULL;
+
+		/* This must be the very last action: after it,
+		 * 'dev' may point to freed memory.
+		 */
 		if (dev->destructor)
 			dev->destructor(dev);
 
 		/* Free network device */
 		kobject_put(&dev->dev.kobj);
 	}
+	(void)set_exec_env(old_ve);
 }
 
 /**
@@ -6707,7 +6916,7 @@ static void netdev_stats64_to_stats(stru
 }
 
 /**
- *	dev_get_stats	- get network device statistics
+ *	__dev_get_stats	- get network device statistics
  *	@dev: device to get statistics from
  *
  *	Get network statistics from device. The device driver may provide
@@ -6715,7 +6924,7 @@ static void netdev_stats64_to_stats(stru
  *	dev->netdev_ops->get_stats; otherwise the internal statistics
  *	structure is used.
  */
-const struct net_device_stats *dev_get_stats(struct net_device *dev)
+static const struct net_device_stats *__dev_get_stats(struct net_device *dev)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
@@ -6731,6 +6940,63 @@ const struct net_device_stats *dev_get_s
 	dev_txq_stats_fold(dev, &dev->stats);
 	return &dev->stats;
 }
+
+#ifdef CONFIG_VE
+#define add_stats(res, a, b, field) 		\
+do {						\
+	res->field = a->field + b->field;	\
+} while (0)
+
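+/* Field-by-field sum of two sets of device statistics into *res. */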
+static struct net_device_stats *summ_stats(struct net_device_stats *res,
+					   const struct net_device_stats *a,
+					   const struct net_device_stats *b)
+{
+	add_stats(res, a, b, rx_packets);
+	add_stats(res, a, b, tx_packets);
+	add_stats(res, a, b, rx_bytes);
+	add_stats(res, a, b, tx_bytes);
+	add_stats(res, a, b, rx_errors);
+	add_stats(res, a, b, tx_errors);
+	add_stats(res, a, b, rx_dropped);
+	add_stats(res, a, b, tx_dropped);
+	add_stats(res, a, b, multicast);
+	add_stats(res, a, b, collisions);
+	add_stats(res, a, b, rx_length_errors);
+	add_stats(res, a, b, rx_over_errors);
+	add_stats(res, a, b, rx_crc_errors);
+	add_stats(res, a, b, rx_frame_errors);
+	add_stats(res, a, b, rx_fifo_errors);
+	add_stats(res, a, b, rx_missed_errors);
+	add_stats(res, a, b, tx_aborted_errors);
+	add_stats(res, a, b, tx_carrier_errors);
+	add_stats(res, a, b, tx_fifo_errors);
+	add_stats(res, a, b, tx_heartbeat_errors);
+	add_stats(res, a, b, tx_window_errors);
+	add_stats(res, a, b, rx_compressed);
+	add_stats(res, a, b, tx_compressed);
+
+	return res;
+}
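+/*
+ * For a VE-owned device report the sum of the live counters and the
+ * values saved in dev->s_stats.
+ */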
+const struct net_device_stats *dev_get_stats(struct net_device *dev)
+{
+	const struct net_device_stats *curr = __dev_get_stats(dev);
+	struct net_device_stats *saved, *buff;
+
+	saved = &dev->s_stats;
+	buff  = &dev->b_stats;
+
+	if (!ve_is_super(dev->owner_env))
+		return summ_stats(buff, curr, saved);
+	else
+		return curr;
+}
+#else
+const struct net_device_stats *dev_get_stats(struct net_device *dev)
+{
+	return __dev_get_stats(dev);
+}
+#endif
 EXPORT_SYMBOL(dev_get_stats);
 
 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
@@ -6902,13 +7168,13 @@ struct net_device *alloc_netdev_mqs(int 
 	/* ensure 32-byte alignment of whole construct */
 	alloc_size += NETDEV_ALIGN - 1;
 
-	p = kzalloc(alloc_size, GFP_KERNEL);
+	p = kzalloc(alloc_size, GFP_KERNEL_UBC);
 	if (!p) {
 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
 		return NULL;
 	}
 
-	tx = kcalloc(txqs, sizeof(struct netdev_queue), GFP_KERNEL);
+	tx = kcalloc(txqs, sizeof(struct netdev_queue), GFP_KERNEL_UBC);
 	if (!tx) {
 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
 		       "tx qdiscs.\n");
@@ -6960,6 +7226,7 @@ struct net_device *alloc_netdev_mqs(int 
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
+	INIT_LIST_HEAD(&netdev_extended(dev)->unreg_list);
 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 	setup(dev);
 	strcpy(dev->name, name);
@@ -6989,6 +7256,13 @@ void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
 
+	if (dev->is_leaked) {
+		pr_emerg("%s: device %s=%p is leaked\n",
+			 __func__, dev->name, dev);
+		dump_stack();
+		return;
+	}
+
 	release_net(dev_net(dev));
 
 	kfree(netdev_extended(dev)->_tx_ext);
@@ -7049,7 +7323,7 @@ void unregister_netdevice_queue(struct n
 	ASSERT_RTNL();
 
 	if (head) {
-		list_add_tail(&netdev_extended(dev)->unreg_list, head);
+		list_move_tail(&netdev_extended(dev)->unreg_list, head);
 	} else {
 		rollback_registered(dev);
 		/* Finish processing unregister after unlock */
@@ -7099,6 +7373,45 @@ void unregister_netdev(struct net_device
 }
 EXPORT_SYMBOL(unregister_netdev);
 
+#if defined(CONFIG_SYSFS) && defined(CONFIG_VE)
+extern int ve_netdev_add(struct device *dev, struct ve_struct *ve);
+extern int ve_netdev_delete(struct device *dev, struct ve_struct *ve);
+
+/**
+ * netdev_fixup_sysfs - create/remove dirlinks to the net device directory
+ * @dev: net device
+ * @op: operation type (registration/deregistration)
+ */
+static void netdev_fixup_sysfs(struct net_device *dev,
+				unsigned long op)
+{
+	struct ve_struct *ve = get_exec_env();
+	int err;
+
+	/* The super VE does not need to patch sysfs entries */
+	if (ve_is_super(ve))
+		return;
+
+	if (op == NETDEV_REGISTER)
+		err = ve_netdev_add(&dev->dev, ve);
+	else
+		err = ve_netdev_delete(&dev->dev, ve);
+
+	/*
+	 * Just report the error in case the state can't be
+	 * properly reverted while changing the net device
+	 * namespace.
+	 */
+	WARN_ON(err);
+}
+#else
+static inline void netdev_fixup_sysfs(struct net_device *dev,
+					unsigned long op)
+{
+}
+#endif
+
 /**
  *	dev_change_net_namespace - move device to different nethost namespace
  *	@dev: device
@@ -7113,9 +7426,16 @@ EXPORT_SYMBOL(unregister_netdev);
  *	Callers must hold the rtnl semaphore.
  */
 
-int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat,
+		struct user_beancounter *exec_ub)
 {
 	int err;
+	struct user_beancounter *tmp_ub;
+#ifdef CONFIG_VE
+	struct ve_struct *cur_ve = get_exec_env();
+	struct ve_struct *src_ve = dev->owner_env;
+	struct ve_struct *dst_ve = net->owner_ve;
+#endif
 
 	ASSERT_RTNL();
 
@@ -7124,17 +7444,6 @@ int dev_change_net_namespace(struct net_
 	if (dev->features & NETIF_F_NETNS_LOCAL)
 		goto out;
 
-#ifdef CONFIG_SYSFS
-	/* Don't allow real devices to be moved when sysfs
-	 * is enabled.
-	 */
-	err = -EINVAL;
-	if (dev->dev.parent) {
-		printk(KERN_INFO "real device is not allowed to be moved\n");
-		goto out;
-	}
-#endif
-
 	/* Ensure the device has been registered */
 	err = -EINVAL;
 	if (dev->reg_state != NETREG_REGISTERED)
@@ -7168,6 +7477,11 @@ int dev_change_net_namespace(struct net_
 	err = -ENODEV;
 	unlist_netdevice(dev);
 
+	dev->owner_env = dst_ve;
+	tmp_ub = netdev_bc(dev)->exec_ub;
+	netdev_bc(dev)->exec_ub = get_beancounter(exec_ub);
+	put_beancounter(tmp_ub);
+
 	synchronize_net();
 
 	/* Shutdown queueing discipline. */
@@ -7180,8 +7494,11 @@ int dev_change_net_namespace(struct net_
 	   This is wanted because this way 8021q and macvlan know
 	   the device is just moving and can keep their slaves up.
 	*/
+	set_exec_env(src_ve);
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
+	(void)set_exec_env(cur_ve);
+
 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
 	/*
@@ -7190,7 +7507,10 @@ int dev_change_net_namespace(struct net_
 	dev_unicast_flush(dev);
 	dev_addr_discard(dev);
 
+	set_exec_env(src_ve);
+	netdev_fixup_sysfs(dev, NETDEV_UNREGISTER);
 	netdev_unregister_kobject(dev);
+	set_exec_env(cur_ve);
 
 	/* Actually switch the network namespace */
 	dev_net_set(dev, net);
@@ -7204,14 +7524,19 @@ int dev_change_net_namespace(struct net_
 	}
 
 	/* Fixup kobjects */
+	set_exec_env(dst_ve);
 	err = netdev_register_kobject(dev);
+	netdev_fixup_sysfs(dev, NETDEV_REGISTER);
+	set_exec_env(cur_ve);
 	WARN_ON(err);
 
 	/* Add the device back in the hashes */
 	list_netdevice(dev);
 
 	/* Notify protocols, that a new device appeared. */
+	set_exec_env(dst_ve);
 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
+	(void)set_exec_env(cur_ve);
 
 	/*
 	 *	Prevent userspace races by waiting until the network
@@ -7226,6 +7551,14 @@ out:
 }
 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
 
+int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+{
+	struct user_beancounter *ub = get_exec_ub();
+
+	return __dev_change_net_namespace(dev, net, pat, ub);
+}
+EXPORT_SYMBOL(__dev_change_net_namespace);
+
 static int dev_cpu_callback(struct notifier_block *nfb,
 			    unsigned long action,
 			    void *ocpu)
@@ -7310,7 +7643,7 @@ static struct hlist_head *netdev_create_
 	int i;
 	struct hlist_head *hash;
 
-	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
+	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL_UBC);
 	if (hash != NULL)
 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
 			INIT_HLIST_HEAD(&hash[i]);
@@ -7443,14 +7776,13 @@ static struct pernet_operations __net_in
 
 static void __net_exit default_device_exit(struct net *net)
 {
-	struct net_device *dev;
+	struct net_device *dev, *aux;
 	/*
-	 * Push all migratable of the network devices back to the
+	 * Push all migratable network devices back to the
 	 * initial network namespace
 	 */
 	rtnl_lock();
-restart:
-	for_each_netdev(net, dev) {
+	for_each_netdev_safe(net, dev, aux) {
 		int err;
 		char fb_name[IFNAMSIZ];
 
@@ -7458,11 +7790,9 @@ restart:
 		if (dev->features & NETIF_F_NETNS_LOCAL)
 			continue;
 
-		/* Delete virtual devices */
-		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
-			dev->rtnl_link_ops->dellink(dev);
-			goto restart;
-		}
+		/* Leave virtual devices for the generic cleanup */
+		if (dev->rtnl_link_ops)
+			continue;
 
 		/* Push remaining network devices to init_net */
 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
@@ -7472,13 +7802,41 @@ restart:
 				__func__, dev->name, err);
 			BUG();
 		}
-		goto restart;
 	}
 	rtnl_unlock();
 }
 
+static void __net_exit default_device_exit_batch(struct list_head *net_list)
+{
+	/* At exit all network devices must be removed from a network
+	 * namespace.  Do this in the reverse order of registration.
+	 * Do this across as many network namespaces as possible to
+	 * improve batching efficiency.
+	 */
+	struct net_device *dev;
+	struct net *net;
+	LIST_HEAD(dev_kill_list);
+
+	rtnl_lock();
+	list_for_each_entry(net, net_list, exit_list) {
+		struct ve_struct *old_env;
+
+		old_env = set_exec_env(net->owner_ve);
+		for_each_netdev_reverse(net, dev) {
+			if (dev->rtnl_link_ops)
+				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
+			else
+				unregister_netdevice_queue(dev, &dev_kill_list);
+		}
+		set_exec_env(old_env);
+	}
+	unregister_netdevice_many(&dev_kill_list);
+	rtnl_unlock();
+}
+
 static struct pernet_operations __net_initdata default_device_ops = {
 	.exit = default_device_exit,
+	.exit_batch = default_device_exit_batch,
 };
 
 /*
@@ -7575,3 +7933,32 @@ static int __init initialize_hashrnd(voi
 
 late_initcall_sync(initialize_hashrnd);
 
+static LIST_HEAD(dev_cpt_operations);
+
+void register_netdev_rst(struct netdev_rst *ops)
+{
+	rtnl_lock();
+	list_add_tail(&ops->list, &dev_cpt_operations);
+	__rtnl_unlock();
+}
+EXPORT_SYMBOL(register_netdev_rst);
+
+void unregister_netdev_rst(struct netdev_rst *ops)
+{
+	rtnl_lock();
+	list_del(&ops->list);
+	__rtnl_unlock();
+}
+EXPORT_SYMBOL(unregister_netdev_rst);
+
+struct netdev_rst *netdev_find_rst(int cpt_object, struct netdev_rst *ops)
+{
+	ops = list_prepare_entry(ops, &dev_cpt_operations, list);
+
+	list_for_each_entry_continue(ops, &dev_cpt_operations, list)
+		if (ops->cpt_object == cpt_object)
+			return ops;
+
+	return NULL;
+}
+EXPORT_SYMBOL(netdev_find_rst);
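
The dev.c hunks above repeatedly bracket notifier and kobject calls with
set_exec_env()/set_exec_ub() pairs so the work runs in the VE that owns the
device.  A minimal sketch of the idiom, assuming set_exec_env() returns the
previously active context as the calls above imply; run_in_ve() is a
hypothetical name used only for illustration:

/* Hedged sketch of the context-switch idiom used throughout dev.c. */
static void run_in_ve(struct ve_struct *target, void (*fn)(void *), void *arg)
{
	struct ve_struct *old_env = set_exec_env(target);

	fn(arg);			/* observes @target as the current VE */
	set_exec_env(old_env);		/* restore on every return path */
}
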
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/dst.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/dst.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/dst.c	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/dst.c	2015-01-21 12:02:47.078124054 +0300
@@ -242,6 +242,7 @@ again:
 #if RT_CACHE_DEBUG >= 2
 	atomic_dec(&dst_total);
 #endif
+	dst->flags |= DST_FREE;
 	kmem_cache_free(dst->ops->kmem_cachep, dst);
 
 	dst = child;
@@ -313,6 +314,7 @@ static int dst_dev_event(struct notifier
 	switch (event) {
 	case NETDEV_UNREGISTER:
 	case NETDEV_DOWN:
+		dst_gc_task(NULL);
 		mutex_lock(&dst_gc_mutex);
 		for (dst = dst_busy_list; dst; dst = dst->next) {
 			last = dst;
@@ -349,3 +351,18 @@ void __init dst_init(void)
 EXPORT_SYMBOL(__dst_free);
 EXPORT_SYMBOL(dst_alloc);
 EXPORT_SYMBOL(dst_destroy);
+
+void dst_dump_one(struct dst_entry *d)
+{
+	printk("\tdev %p err %d obs %d flags %x i/o %p/%p ref %d use %d\n",
+			d->dev, (int)d->error, (int)d->obsolete, d->flags,
+			d->input, d->output, atomic_read(&d->__refcnt), d->__use);
+}
+EXPORT_SYMBOL(dst_dump_one);
+
+void dst_cache_dump(void)
+{
+	ip_rt_dump_dsts();
+	if (ip6_rt_dump_dsts)
+		ip6_rt_dump_dsts();
+}
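
The new DST_FREE bit is set immediately before the entry returns to its slab,
so the dump helpers above can tell released entries from live ones.  A sketch
of the consumer side, assuming no live entry ever carries the bit:

/* Hedged sketch: DST_FREE is assumed to be set only in dst_destroy(). */
static bool dst_was_freed(const struct dst_entry *d)
{
	return d->flags & DST_FREE;	/* likely use-after-free if true */
}
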
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/ethtool.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/ethtool.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/ethtool.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/ethtool.c	2015-01-21 12:02:45.692160850 +0300
@@ -2026,12 +2026,17 @@ int dev_ethtool(struct net *net, struct 
 	case ETHTOOL_GRXCLSRLCNT:
 	case ETHTOOL_GRXCLSRULE:
 	case ETHTOOL_GRXCLSRLALL:
+		break;
+	case ETHTOOL_SEEPROM:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		break;
 	case ETHTOOL_GRSSH:
 	case ETHTOOL_GFEATURES:
 	case ETHTOOL_GET_TS_INFO:
 		break;
 	default:
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			return -EPERM;
 	}
 
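
The permission dispatch above leans on case fall-through: commands listed
before a break share that branch's (possibly absent) check, and everything
unlisted lands in default, where a container-level CAP_VE_NET_ADMIN now
suffices.  A reduced model with hypothetical command names:

/* Reduced model of the dispatch; the CMD_* values are hypothetical. */
enum { CMD_GET_INFO, CMD_WRITE_EEPROM };

static int check_cmd_perm(u32 cmd)
{
	switch (cmd) {
	case CMD_GET_INFO:		/* read-only: no privilege needed */
		break;
	case CMD_WRITE_EEPROM:		/* destructive: host admin only */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		break;
	default:			/* container admin is enough */
		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
			return -EPERM;
	}
	return 0;
}
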
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/fib_rules.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/fib_rules.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/fib_rules.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/fib_rules.c	2015-01-21 12:02:43.283224806 +0300
@@ -20,7 +20,7 @@ int fib_default_rule_add(struct fib_rule
 {
 	struct fib_rule *r;
 
-	r = kzalloc(ops->rule_size, GFP_KERNEL);
+	r = kzalloc(ops->rule_size, GFP_KERNEL_UBC);
 	if (r == NULL)
 		return -ENOMEM;
 
@@ -90,7 +90,7 @@ static void flush_route_cache(struct fib
 		ops->flush_cache(ops);
 }
 
-int fib_rules_register(struct fib_rules_ops *ops)
+static int __fib_rules_register(struct fib_rules_ops *ops)
 {
 	int err = -EEXIST;
 	struct fib_rules_ops *o;
@@ -120,6 +120,28 @@ errout:
 	return err;
 }
 
+struct fib_rules_ops *
+fib_rules_register(struct fib_rules_ops *tmpl, struct net *net)
+{
+	struct fib_rules_ops *ops;
+	int err;
+
+	ops = kmemdup(tmpl, sizeof (*ops), GFP_KERNEL);
+	if (ops == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->rules_list);
+	ops->fro_net = net;
+
+	err = __fib_rules_register(ops);
+	if (err) {
+		kfree(ops);
+		ops = ERR_PTR(err);
+	}
+
+	return ops;
+}
+
 EXPORT_SYMBOL_GPL(fib_rules_register);
 
 void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
@@ -133,6 +155,15 @@ void fib_rules_cleanup_ops(struct fib_ru
 }
 EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops);
 
+static void fib_rules_put_rcu(struct rcu_head *head)
+{
+	struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu);
+	struct net *net = ops->fro_net;
+
+	release_net(net);
+	kfree(ops);
+}
+
 void fib_rules_unregister(struct fib_rules_ops *ops)
 {
 	struct net *net = ops->fro_net;
@@ -142,8 +173,7 @@ void fib_rules_unregister(struct fib_rul
 	fib_rules_cleanup_ops(ops);
 	spin_unlock(&net->rules_mod_lock);
 
-	synchronize_rcu();
-	release_net(net);
+	call_rcu(&ops->rcu, fib_rules_put_rcu);
 }
 
 EXPORT_SYMBOL_GPL(fib_rules_unregister);
@@ -256,7 +286,7 @@ static int fib_nl_newrule(struct sk_buff
 	if (err < 0)
 		goto errout;
 
-	rule = kzalloc(ops->rule_size, GFP_KERNEL);
+	rule = kzalloc(ops->rule_size, GFP_KERNEL_UBC);
 	if (rule == NULL) {
 		err = -ENOMEM;
 		goto errout;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/filter.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/filter.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/filter.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/filter.c	2015-01-21 12:02:43.283224806 +0300
@@ -540,7 +540,7 @@ int sk_attach_filter(struct sock_fprog *
 	if (fprog->filter == NULL)
 		return -EINVAL;
 
-	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC);
 	if (!fp)
 		return -ENOMEM;
 	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/neighbour.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/neighbour.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/neighbour.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/neighbour.c	2015-01-21 12:02:49.932048291 +0300
@@ -21,6 +21,8 @@
 #include <linux/socket.h>
 #include <linux/netdevice.h>
 #include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -39,6 +41,7 @@
 #include <linux/inetdevice.h>
 #include <net/addrconf.h>
 #endif
+#include <bc/beancounter.h>
 
 #define NEIGH_DEBUG 1
 
@@ -268,17 +271,19 @@ static struct neighbour *neigh_alloc(str
 	int entries;
 
 	entries = atomic_inc_return(&tbl->entries) - 1;
+	n = ERR_PTR(-ENOBUFS);
 	if (entries >= tbl->gc_thresh3 ||
 	    (entries >= tbl->gc_thresh2 &&
 	     time_after(now, tbl->last_flush + 5 * HZ))) {
 		if (!neigh_forced_gc(tbl) &&
-		    entries >= tbl->gc_thresh3)
+		    entries >= tbl->gc_thresh3 &&
+		    !restoring_ve(get_exec_env()))
 			goto out_entries;
 	}
 
 	n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
 	if (!n)
-		goto out_entries;
+		goto out_nomem;
 
 	skb_queue_head_init(&n->arp_queue);
 	rwlock_init(&n->lock);
@@ -295,6 +300,8 @@ static struct neighbour *neigh_alloc(str
 out:
 	return n;
 
+out_nomem:
+	n = ERR_PTR(-ENOMEM);
 out_entries:
 	atomic_dec(&tbl->entries);
 	goto out;
@@ -324,6 +331,17 @@ static void neigh_hash_free(struct neigh
 		free_pages((unsigned long)hash, get_order(size));
 }
 
+static void init_hash_rnd(struct neigh_table *tbl)
+{
+	int i;
+
+	for (i = 0; i < NEIGH_NUM_HASH_RND; i++) {
+		get_random_bytes(&tbl->hash_rnd[i],
+				 sizeof(tbl->hash_rnd[i]));
+		tbl->hash_rnd[i] |= 1;
+	}
+}
+
 static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
 {
 	struct neighbour **new_hash, **old_hash;
@@ -340,7 +358,7 @@ static void neigh_hash_grow(struct neigh
 	new_hash_mask = new_entries - 1;
 	old_hash = tbl->hash_buckets;
 
-	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+	init_hash_rnd(tbl);
 	for (i = 0; i < old_entries; i++) {
 		struct neighbour *n, *next;
 
@@ -413,12 +431,11 @@ struct neighbour *neigh_create(struct ne
 	u32 hash_val;
 	int key_len = tbl->key_len;
 	int error;
-	struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+	struct neighbour *n1, *rc, *n;
 
-	if (!n) {
-		rc = ERR_PTR(-ENOBUFS);
+	rc = n = neigh_alloc(tbl);
+	if (IS_ERR(n))
 		goto out;
-	}
 
 	memcpy(n->primary_key, pkey, key_len);
 	n->dev = dev;
@@ -629,6 +646,13 @@ void neigh_destroy(struct neighbour *nei
 
 	NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
 
+	if (neigh->dev->is_leaked) {
+		printk(KERN_WARNING
+		       "Destroying neighbour %p on leaked device\n", neigh);
+		dump_stack();
+		return;
+	}
+
 	if (!neigh->dead) {
 		printk(KERN_WARNING
 		       "Destroying alive neighbour %p\n", neigh);
@@ -738,10 +762,21 @@ static void neigh_periodic_work(struct w
 			if (atomic_read(&n->refcnt) == 1 &&
 			    (state == NUD_FAILED ||
 			     time_after(jiffies, n->used + n->parms->gc_staletime))) {
+				struct net_device *dev = n->dev;
+				struct ve_struct *ve;
+				struct user_beancounter *ub;
+
 				*np = n->next;
 				n->dead = 1;
 				write_unlock(&n->lock);
+
+				ve = set_exec_env(dev->owner_env);
+				ub = set_exec_ub(netdev_bc(dev)->owner_ub);
+
 				neigh_cleanup_and_release(n);
+
+				set_exec_ub(ub);
+				set_exec_env(ve);
 				continue;
 			}
 			write_unlock(&n->lock);
@@ -804,6 +839,11 @@ static void neigh_timer_handler(unsigned
 	struct neighbour *neigh = (struct neighbour *)arg;
 	unsigned state;
 	int notify = 0;
+	struct ve_struct *env;
+	struct user_beancounter *ub;
+
+	env = set_exec_env(neigh->dev->owner_env);
+	ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub);
 
 	write_lock(&neigh->lock);
 
@@ -889,6 +929,8 @@ out:
 		neigh_update_notify(neigh);
 
 	neigh_release(neigh);
+	(void)set_exec_ub(ub);
+	(void)set_exec_env(env);
 }
 
 int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
@@ -1295,9 +1337,16 @@ static void neigh_proxy_process(unsigned
 		if (tdif <= 0) {
 			struct net_device *dev = skb->dev;
 			__skb_unlink(skb, &tbl->proxy_queue);
-			if (tbl->proxy_redo && netif_running(dev))
+			if (tbl->proxy_redo && netif_running(dev)) {
+				struct ve_struct *ve;
+				struct user_beancounter *ub;
+
+				ve = set_exec_env(dev->owner_env);
+				ub = set_exec_ub(netdev_bc(dev)->owner_ub);
 				tbl->proxy_redo(skb);
-			else
+				set_exec_ub(ub);
+				set_exec_env(ve);
+			} else
 				kfree_skb(skb);
 
 			dev_put(dev);
@@ -1466,7 +1515,7 @@ void neigh_table_init_no_netlink(struct 
 	if (!tbl->hash_buckets || !tbl->phash_buckets)
 		panic("cannot allocate neighbour cache hashes");
 
-	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+	init_hash_rnd(tbl);
 
 	rwlock_init(&tbl->lock);
 	INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
@@ -1776,7 +1825,7 @@ static int neightbl_fill_info(struct sk_
 			.ndtc_entries		= atomic_read(&tbl->entries),
 			.ndtc_last_flush	= jiffies_to_msecs(flush_delta),
 			.ndtc_last_rand		= jiffies_to_msecs(rand_delta),
-			.ndtc_hash_rnd		= tbl->hash_rnd,
+			.ndtc_hash_rnd		= tbl->hash_rnd[0],
 			.ndtc_hash_mask		= tbl->hash_mask,
 			.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
 		};
@@ -2108,6 +2157,35 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
+			    u32 pid, u32 seq, int type, unsigned int flags,
+			    struct neigh_table *tbl)
+{
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	ndm->ndm_family	 = tbl->family;
+	ndm->ndm_pad1    = 0;
+	ndm->ndm_pad2    = 0;
+	ndm->ndm_flags	 = pn->flags | NTF_PROXY;
+	ndm->ndm_type	 = NDA_DST;
+	ndm->ndm_ifindex = pn->dev->ifindex;
+	ndm->ndm_state	 = NUD_NONE;
+
+	NLA_PUT(skb, NDA_DST, tbl->key_len, pn->key);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
 static void neigh_update_notify(struct neighbour *neigh)
 {
 	call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
@@ -2153,23 +2231,78 @@ out:
 	return rc;
 }
 
+static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+			     struct netlink_callback *cb)
+{
+	struct pneigh_entry *n;
+	struct net *net = sock_net(skb->sk);
+	int rc, h, s_h = cb->args[3];
+	int idx, s_idx = idx = cb->args[4];
+
+	read_lock_bh(&tbl->lock);
+
+	for (h = 0; h <= PNEIGH_HASHMASK; h++) {
+		if (h < s_h)
+			continue;
+		if (h > s_h)
+			s_idx = 0;
+		for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
+			if (dev_net(n->dev) != net)
+				continue;
+			if (idx < s_idx)
+				goto next;
+			if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+					    cb->nlh->nlmsg_seq,
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI, tbl) <= 0) {
+				read_unlock_bh(&tbl->lock);
+				rc = -1;
+				goto out;
+			}
+		next:
+			idx++;
+		}
+	}
+
+	read_unlock_bh(&tbl->lock);
+	rc = skb->len;
+out:
+	cb->args[3] = h;
+	cb->args[4] = idx;
+	return rc;
+}
+
 static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct neigh_table *tbl;
 	int t, family, s_t;
+	int proxy = 0;
+	int err = 0;
 
 	read_lock(&neigh_tbl_lock);
 	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+
+	/* check whether a full ndmsg structure is present; the family
+	 * member sits at the same offset in both structures
+	 */
+	if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) &&
+	    ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY)
+		proxy = 1;
+
 	s_t = cb->args[0];
 
-	for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) {
+	for (tbl = neigh_tables, t = 0; tbl && (err >= 0);
+	     tbl = tbl->next, t++) {
 		if (t < s_t || (family && tbl->family != family))
 			continue;
 		if (t > s_t)
 			memset(&cb->args[1], 0, sizeof(cb->args) -
 						sizeof(cb->args[0]));
-		if (neigh_dump_table(tbl, skb, cb) < 0)
-			break;
+		if (proxy)
+			err = pneigh_dump_table(tbl, skb, cb);
+		else
+			err = neigh_dump_table(tbl, skb, cb);
 	}
 	read_unlock(&neigh_tbl_lock);
 
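
neigh_alloc() above now reports why it failed by encoding an errno in the
returned pointer, and neigh_create() tests it with IS_ERR() instead of
comparing against NULL.  A reduced model of the convention, using the same
table fields:

/* Reduced model of the ERR_PTR convention adopted by neigh_alloc(). */
static struct neighbour *example_alloc(struct neigh_table *tbl, gfp_t gfp)
{
	struct neighbour *n;

	if (atomic_read(&tbl->entries) >= tbl->gc_thresh3)
		return ERR_PTR(-ENOBUFS);	/* table over its hard limit */

	n = kmem_cache_zalloc(tbl->kmem_cachep, gfp);
	return n ? n : ERR_PTR(-ENOMEM);	/* allocator failure */
}
/* Callers: if (IS_ERR(n)) handle PTR_ERR(n). */
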
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/net-sysfs.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/net-sysfs.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/net-sysfs.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/net-sysfs.c	2015-01-21 12:02:51.339010941 +0300
@@ -354,6 +354,27 @@ static struct device_attribute net_class
 	{}
 };
 
+#ifdef CONFIG_VE
+struct device_attribute ve_net_class_attributes[] = {
+	__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
+	__ATTR(iflink, S_IRUGO, show_iflink, NULL),
+	__ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
+	__ATTR(features, S_IRUGO, show_features, NULL),
+	__ATTR(type, S_IRUGO, show_type, NULL),
+	__ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
+	__ATTR(address, S_IRUGO, show_address, NULL),
+	__ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
+	__ATTR(carrier, S_IRUGO, show_carrier, NULL),
+	__ATTR(dormant, S_IRUGO, show_dormant, NULL),
+	__ATTR(operstate, S_IRUGO, show_operstate, NULL),
+	__ATTR(mtu, S_IRUGO, show_mtu, NULL),
+	__ATTR(flags, S_IRUGO, show_flags, NULL),
+	__ATTR(tx_queue_len, S_IRUGO, show_tx_queue_len, NULL),
+	{}
+};
+EXPORT_SYMBOL(ve_net_class_attributes);
+#endif
+
 /* Show a given attribute in the statistics group */
 static ssize_t netstat_show(const struct device *d,
 			    struct device_attribute *attr, char *buf,
@@ -1174,9 +1195,6 @@ static int netdev_uevent(struct device *
 	struct net_device *dev = to_net_dev(d);
 	int retval;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		return 0;
-
 	/* pass interface to uevent. */
 	retval = add_uevent_var(env, "INTERFACE=%s", dev->name);
 	if (retval)
@@ -1208,7 +1226,7 @@ static void netdev_release(struct device
 	kfree((char *)dev_ext_frozen - dev->padded);
 }
 
-static struct class net_class = {
+struct class net_class = {
 	.name = "net",
 	.dev_release = netdev_release,
 #ifdef CONFIG_SYSFS
@@ -1218,6 +1236,13 @@ static struct class net_class = {
 	.dev_uevent = netdev_uevent,
 #endif
 };
+EXPORT_SYMBOL(net_class);
+
+#ifndef CONFIG_VE
+#define visible_net_class net_class
+#else
+#define visible_net_class (*get_exec_env()->net_class)
+#endif
 
 /* Delete sysfs entries but hold kobject reference until after all
  * netdev references are gone.
@@ -1228,7 +1253,8 @@ void netdev_unregister_kobject(struct ne
 
 	kobject_get(&dev->kobj);
 
-	if (dev_net(net) != &init_net)
+	if (dev_net(net)->owner_ve->ve_netns &&
+	    dev_net(net)->owner_ve->ve_netns != net->nd_net)
 		return;
 
 	remove_queue_kobjects(net);
@@ -1243,7 +1269,7 @@ int netdev_register_kobject(struct net_d
 	const struct attribute_group **groups = net->sysfs_groups;
 	int error = 0;
 
-	dev->class = &net_class;
+	dev->class = &visible_net_class;
 	dev->platform_data = net;
 	dev->groups = groups;
 
@@ -1258,7 +1284,8 @@ int netdev_register_kobject(struct net_d
 #endif
 #endif /* CONFIG_SYSFS */
 
-	if (dev_net(net) != &init_net)
+	if (dev_net(net)->owner_ve->ve_netns &&
+	    dev_net(net)->owner_ve->ve_netns != net->nd_net)
 		return 0;
 
 	error = device_add(dev);
@@ -1347,7 +1374,32 @@ void netdev_initialize_kobject(struct ne
 	device_initialize(device);
 }
 
+void prepare_sysfs_netdev(void)
+{
+#ifdef CONFIG_VE
+	get_ve0()->net_class = &net_class;
+#endif
+}
+
 int netdev_kobject_init(void)
 {
+	prepare_sysfs_netdev();
 	return class_register(&net_class);
 }
+
+/*
+ * device_add helper to detect devices of the net class, so that
+ * the ve_device property is not added to them
+ */
+int is_dev_netdev(struct device *dev)
+{
+	/*
+	 * At early boot stages the class is NULL and the VE's
+	 * visible_net_class is NULL too, which would make every device
+	 * look like a netdev; of course this must be avoided.
+	 */
+	if (!dev->class)
+		return 0;
+
+	return (dev->class == &visible_net_class);
+}
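
visible_net_class makes class membership depend on the calling VE, so each
container registers its devices against a private copy of the "net" class and
keeps its sysfs tree isolated; prepare_sysfs_netdev() above wires up only ve0.
A purely hypothetical sketch of what a container start-up path might do,
assuming ve->net_class is the pointer the macro dereferences:

/* Hypothetical sketch, not part of the patch: give a new VE its own
 * copy of the net class. */
static int ve_init_net_class(struct ve_struct *ve)
{
	struct class *cls;

	cls = kmemdup(&net_class, sizeof(*cls), GFP_KERNEL);
	if (!cls)
		return -ENOMEM;

	ve->net_class = cls;
	return class_register(cls);
}
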
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/net_namespace.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/net_namespace.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/net_namespace.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/net_namespace.c	2015-01-21 12:02:58.119830945 +0300
@@ -1,6 +1,7 @@
 #include <linux/workqueue.h>
 #include <linux/rtnetlink.h>
 #include <linux/cache.h>
+#include <linux/proc_fs.h>
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/delay.h>
@@ -8,6 +9,7 @@
 #include <linux/idr.h>
 #include <linux/rculist.h>
 #include <linux/nsproxy.h>
+#include <linux/netdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/file.h>
 #include <net/net_namespace.h>
@@ -22,7 +24,7 @@ static struct list_head *first_device = 
 static DEFINE_MUTEX(net_mutex);
 
 LIST_HEAD(net_namespace_list);
-EXPORT_SYMBOL_GPL(net_namespace_list);
+EXPORT_SYMBOL(net_namespace_list);
 
 struct net init_net = {
 	.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
@@ -33,6 +35,25 @@ EXPORT_SYMBOL(init_net);
 
 static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
 
+struct net_context {
+	struct ve_struct *ve;
+	struct user_beancounter *ub;
+};
+
+static void set_net_context(struct net *net, struct net_context *ctx)
+{
+	ctx->ve = set_exec_env(net->owner_ve);
+	ctx->ub = get_exec_ub();
+	if (net->loopback_dev)
+		set_exec_ub(netdev_bc(net->loopback_dev)->exec_ub);
+}
+
+static void restore_net_context(struct net_context *ctx)
+{
+	set_exec_env(ctx->ve);
+	set_exec_ub(ctx->ub);
+}
+
 static struct net_generic *net_alloc_generic(void)
 {
 	struct net_generic *ng;
@@ -45,27 +66,87 @@ static struct net_generic *net_alloc_gen
 	return ng;
 }
 
+static int ops_init(const struct pernet_operations *ops, struct net *net)
+{
+	int err;
+	if (ops->id && ops->size) {
+		void *data = kzalloc(ops->size, GFP_KERNEL);
+		if (!data)
+			return -ENOMEM;
+
+		err = net_assign_generic(net, *ops->id, data);
+		if (err) {
+			kfree(data);
+			return err;
+		}
+	}
+	if (ops->init)
+		return ops->init(net);
+	return 0;
+}
+
+static void ops_free(const struct pernet_operations *ops, struct net *net)
+{
+	if (ops->id && ops->size) {
+		int id = *ops->id;
+		kfree(net_generic(net, id));
+	}
+}
+
+static void ops_exit_list(const struct pernet_operations *ops,
+			  struct list_head *net_exit_list)
+{
+	struct net *net;
+
+	if (ops->exit) {
+		list_for_each_entry(net, net_exit_list, exit_list) {
+			struct ve_struct *old_env;
+
+			old_env = set_exec_env(net->owner_ve);
+			ops->exit(net);
+			set_exec_env(old_env);
+		}
+	}
+	if (ops->exit_batch)
+		ops->exit_batch(net_exit_list);
+}
+
+static void ops_free_list(const struct pernet_operations *ops,
+			  struct list_head *net_exit_list)
+{
+	struct net *net;
+	if (ops->size && ops->id) {
+		list_for_each_entry(net, net_exit_list, exit_list)
+			ops_free(ops, net);
+	}
+}
+
 /*
  * setup_net runs the initializers for the network namespace object.
  */
 static __net_init int setup_net(struct net *net)
 {
 	/* Must be called with net_mutex held */
-	struct pernet_operations *ops;
+	const struct pernet_operations *ops, *saved_ops;
 	int error = 0;
+	LIST_HEAD(net_exit_list);
+
+#ifdef CONFIG_VE
+	net->owner_ve = get_exec_env();
+#endif
 
 	atomic_set(&net->count, 1);
 
+	net->dev_base_seq = 1;
+
 #ifdef NETNS_REFCNT_DEBUG
 	atomic_set(&net->use_count, 0);
 #endif
 
 	list_for_each_entry(ops, &pernet_list, list) {
-		if (ops->init) {
-			error = ops->init(net);
-			if (error < 0)
-				goto out_undo;
-		}
+		error = ops_init(ops, net);
+		if (error < 0)
+			goto out_undo;
 	}
 out:
 	return error;
@@ -74,10 +155,14 @@ out_undo:
 	/* Walk through the list backwards calling the exit functions
 	 * for the pernet modules whose init functions did not fail.
 	 */
-	list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
-		if (ops->exit)
-			ops->exit(net);
-	}
+	list_add(&net->exit_list, &net_exit_list);
+	saved_ops = ops;
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+		ops_exit_list(ops, &net_exit_list);
+
+	ops = saved_ops;
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+		ops_free_list(ops, &net_exit_list);
 
 	rcu_barrier();
 	goto out;
@@ -112,6 +197,8 @@ out_free:
 
 static void net_free(struct net *net)
 {
+	struct completion *sysfs_completion;
+
 #ifdef NETNS_REFCNT_DEBUG
 	if (unlikely(atomic_read(&net->use_count) != 0)) {
 		printk(KERN_EMERG "network namespace not free! Usage: %d\n",
@@ -119,8 +206,11 @@ static void net_free(struct net *net)
 		return;
 	}
 #endif
+	sysfs_completion = net->sysfs_completion;
 	kfree(net->gen);
 	kmem_cache_free(net_cachep, net);
+	if (sysfs_completion)
+		complete(sysfs_completion);
 }
 
 static struct net *net_create(void)
@@ -153,18 +243,29 @@ struct net *copy_net_ns(unsigned long fl
 	return net_create();
 }
 
+static DEFINE_SPINLOCK(cleanup_list_lock);
+static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
+
 static void cleanup_net(struct work_struct *work)
 {
-	struct pernet_operations *ops;
-	struct net *net;
-
-	net = container_of(work, struct net, work);
+	const struct pernet_operations *ops;
+	struct net *net, *tmp;
+	LIST_HEAD(net_kill_list);
+	LIST_HEAD(net_exit_list);
+
+	/* Atomically snapshot the list of namespaces to clean up */
+	spin_lock_irq(&cleanup_list_lock);
+	list_replace_init(&cleanup_list, &net_kill_list);
+	spin_unlock_irq(&cleanup_list_lock);
 
 	mutex_lock(&net_mutex);
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
-	list_del_rcu(&net->list);
+	list_for_each_entry(net, &net_kill_list, cleanup_list) {
+		list_del_rcu(&net->list);
+		list_add_tail(&net->exit_list, &net_exit_list);
+	}
 	rtnl_unlock();
 
 	/*
@@ -175,10 +276,16 @@ static void cleanup_net(struct work_stru
 	synchronize_rcu();
 
 	/* Run all of the network namespace exit methods */
-	list_for_each_entry_reverse(ops, &pernet_list, list) {
-		if (ops->exit)
-			ops->exit(net);
-	}
+	list_for_each_entry_reverse(ops, &pernet_list, list)
+		ops_exit_list(ops, &net_exit_list);
+
+	/* Free the net generic variables */
+	list_for_each_entry_reverse(ops, &pernet_list, list)
+		ops_free_list(ops, &net_exit_list);
+
+	list_for_each_entry(net, &net_kill_list, cleanup_list)
+		if (net->owner_ve->ve_netns == net)
+			net->owner_ve->ve_netns = NULL;
 
 	mutex_unlock(&net_mutex);
 
@@ -188,14 +295,23 @@ static void cleanup_net(struct work_stru
 	rcu_barrier();
 
 	/* Finally it is safe to free my network namespace structure */
-	net_free(net);
+	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
+		list_del_init(&net->exit_list);
+		net_free(net);
+	}
 }
+static DECLARE_WORK(net_cleanup_work, cleanup_net);
 
 void __put_net(struct net *net)
 {
 	/* Cleanup the network namespace in process context */
-	INIT_WORK(&net->work, cleanup_net);
-	queue_work(netns_wq, &net->work);
+	unsigned long flags;
+
+	spin_lock_irqsave(&cleanup_list_lock, flags);
+	list_add(&net->cleanup_list, &cleanup_list);
+	spin_unlock_irqrestore(&cleanup_list_lock, flags);
+
+	queue_work(netns_wq, &net_cleanup_work);
 }
 EXPORT_SYMBOL_GPL(__put_net);
 
@@ -307,18 +423,25 @@ static int __init net_ns_init(void)
 pure_initcall(net_ns_init);
 
 #ifdef CONFIG_NET_NS
-static int register_pernet_operations(struct list_head *list,
-				      struct pernet_operations *ops)
+
+static int __register_pernet_operations(struct list_head *list,
+					struct pernet_operations *ops)
 {
-	struct net *net, *undo_net;
+	struct net *net;
 	int error;
+	LIST_HEAD(net_exit_list);
 
 	list_add_tail(&ops->list, list);
-	if (ops->init) {
+	if (ops->init || (ops->id && ops->size)) {
 		for_each_net(net) {
-			error = ops->init(net);
+			struct net_context ctx;
+
+			set_net_context(net, &ctx);
+			error = ops_init(ops, net);
+			restore_net_context(&ctx);
 			if (error)
 				goto out_undo;
+			list_add_tail(&net->exit_list, &net_exit_list);
 		}
 	}
 	return 0;
@@ -326,45 +449,82 @@ static int register_pernet_operations(st
 out_undo:
 	/* If I have an error cleanup all namespaces I initialized */
 	list_del(&ops->list);
-	if (ops->exit) {
-		for_each_net(undo_net) {
-			if (undo_net == net)
-				goto undone;
-			ops->exit(undo_net);
-		}
-	}
-undone:
+	ops_exit_list(ops, &net_exit_list);
+	ops_free_list(ops, &net_exit_list);
 	return error;
 }
 
-static void unregister_pernet_operations(struct pernet_operations *ops)
+static void __unregister_pernet_operations(struct pernet_operations *ops)
 {
 	struct net *net;
+	LIST_HEAD(net_exit_list);
 
 	list_del(&ops->list);
-	if (ops->exit)
-		for_each_net(net)
-			ops->exit(net);
+	for_each_net(net)
+		list_add_tail(&net->exit_list, &net_exit_list);
+	ops_exit_list(ops, &net_exit_list);
+	ops_free_list(ops, &net_exit_list);
 }
 
 #else
 
+static int __register_pernet_operations(struct list_head *list,
+					struct pernet_operations *ops)
+{
+	int err = 0;
+	err = ops_init(ops, &init_net);
+	if (err)
+		ops_free(ops, &init_net);
+	return err;
+}
+
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+	LIST_HEAD(net_exit_list);
+	list_add(&init_net.exit_list, &net_exit_list);
+	ops_exit_list(ops, &net_exit_list);
+	ops_free_list(ops, &net_exit_list);
+}
+
+#endif /* CONFIG_NET_NS */
+
+static DEFINE_IDA(net_generic_ids);
+
 static int register_pernet_operations(struct list_head *list,
 				      struct pernet_operations *ops)
 {
-	if (ops->init == NULL)
-		return 0;
-	return ops->init(&init_net);
+	int error;
+
+	if (ops->id) {
+again:
+		error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+		if (error < 0) {
+			if (error == -EAGAIN) {
+				ida_pre_get(&net_generic_ids, GFP_KERNEL);
+				goto again;
+			}
+			return error;
+		}
+		max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
+	}
+	error = __register_pernet_operations(list, ops);
+	if (error) {
+		rcu_barrier();
+		if (ops->id)
+			ida_remove(&net_generic_ids, *ops->id);
+	}
+
+	return error;
 }
 
 static void unregister_pernet_operations(struct pernet_operations *ops)
 {
-	if (ops->exit)
-		ops->exit(&init_net);
+	__unregister_pernet_operations(ops);
+	rcu_barrier();
+	if (ops->id)
+		ida_remove(&net_generic_ids, *ops->id);
 }
-#endif
-
-static DEFINE_IDA(net_generic_ids);
 
 /**
  *      register_pernet_subsys - register a network namespace subsystem
@@ -412,38 +572,6 @@ void unregister_pernet_subsys(struct per
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 
-int register_pernet_gen_subsys(int *id, struct pernet_operations *ops)
-{
-	int rv;
-
-	mutex_lock(&net_mutex);
-again:
-	rv = ida_get_new_above(&net_generic_ids, 1, id);
-	if (rv < 0) {
-		if (rv == -EAGAIN) {
-			ida_pre_get(&net_generic_ids, GFP_KERNEL);
-			goto again;
-		}
-		goto out;
-	}
-	rv = register_pernet_operations(first_device, ops);
-	if (rv < 0)
-		ida_remove(&net_generic_ids, *id);
-out:
-	mutex_unlock(&net_mutex);
-	return rv;
-}
-EXPORT_SYMBOL_GPL(register_pernet_gen_subsys);
-
-void unregister_pernet_gen_subsys(int id, struct pernet_operations *ops)
-{
-	mutex_lock(&net_mutex);
-	unregister_pernet_operations(ops);
-	ida_remove(&net_generic_ids, id);
-	mutex_unlock(&net_mutex);
-}
-EXPORT_SYMBOL_GPL(unregister_pernet_gen_subsys);
-
 /**
  *      register_pernet_device - register a network namespace device
  *	@ops:  pernet operations structure for the subsystem
@@ -475,31 +603,6 @@ int register_pernet_device(struct pernet
 }
 EXPORT_SYMBOL_GPL(register_pernet_device);
 
-int register_pernet_gen_device(int *id, struct pernet_operations *ops)
-{
-	int error;
-	mutex_lock(&net_mutex);
-again:
-	error = ida_get_new_above(&net_generic_ids, 1, id);
-	if (error) {
-		if (error == -EAGAIN) {
-			ida_pre_get(&net_generic_ids, GFP_KERNEL);
-			goto again;
-		}
-		goto out;
-	}
-	max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *id);
-	error = register_pernet_operations(&pernet_list, ops);
-	if (error)
-		ida_remove(&net_generic_ids, *id);
-	else if (first_device == &pernet_list)
-		first_device = &ops->list;
-out:
-	mutex_unlock(&net_mutex);
-	return error;
-}
-EXPORT_SYMBOL_GPL(register_pernet_gen_device);
-
 /**
  *      unregister_pernet_device - unregister a network namespace netdevice
  *	@ops: pernet operations structure to manipulate
@@ -519,17 +622,6 @@ void unregister_pernet_device(struct per
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
 
-void unregister_pernet_gen_device(int id, struct pernet_operations *ops)
-{
-	mutex_lock(&net_mutex);
-	if (&ops->list == first_device)
-		first_device = first_device->next;
-	unregister_pernet_operations(ops);
-	ida_remove(&net_generic_ids, id);
-	mutex_unlock(&net_mutex);
-}
-EXPORT_SYMBOL_GPL(unregister_pernet_gen_device);
-
 static void net_generic_release(struct rcu_head *rcu)
 {
 	struct net_generic *ng;
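
__put_net() above may fire from any context, so it only queues the namespace
under an irq-safe spinlock; a single work item then drains the whole backlog
per pass via list_replace_init().  The generic form of the idiom, with
illustrative names:

/* Generic producer/consumer snapshot idiom; names are illustrative. */
static LIST_HEAD(pending);
static DEFINE_SPINLOCK(pending_lock);

static void drain_work_fn(struct work_struct *work)
{
	LIST_HEAD(batch);

	spin_lock_irq(&pending_lock);
	list_replace_init(&pending, &batch);	/* atomically take everything */
	spin_unlock_irq(&pending_lock);

	/* walk @batch without the lock; producers keep filling @pending */
}
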
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/netpoll.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/netpoll.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/netpoll.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/netpoll.c	2015-01-21 12:02:42.521245036 +0300
@@ -237,8 +237,12 @@ static void refill_skbs(void)
 static void zap_completion_queue(void)
 {
 	unsigned long flags;
-	struct softnet_data *sd = &get_cpu_var(softnet_data);
+	struct softnet_data *sd;
 
+	if (in_irq() || irqs_disabled())
+		return;
+
+	sd = &get_cpu_var(softnet_data);
 	if (sd->completion_queue) {
 		struct sk_buff *clist;
 
@@ -779,7 +783,7 @@ int netpoll_setup(struct netpoll *np)
 		return -ENODEV;
 	}
 
-	if ((ndev->priv_flags & IFF_BRIDGE_PORT) || (ndev->flags & IFF_SLAVE)) {
+	if (ndev->flags & IFF_SLAVE) {
 		printk(KERN_ERR "%s: %s is a slave device, aborting.\n",
 		       np->name, np->dev_name);
 		err = -EBUSY;
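
zap_completion_queue() frees skbs through per-CPU state and can now be reached
from paths running in hard-irq context or with interrupts disabled, so it
bails out early there.  The added guard condition in isolation:

/* Sketch of the guard: only touch the per-CPU completion queue from a
 * context where that is safe. */
static bool may_zap_completion_queue(void)
{
	return !in_irq() && !irqs_disabled();
}
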
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/rtnetlink.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/rtnetlink.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/rtnetlink.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/rtnetlink.c	2015-01-21 12:02:51.145016090 +0300
@@ -289,7 +289,7 @@ static LIST_HEAD(link_ops);
 int __rtnl_link_register(struct rtnl_link_ops *ops)
 {
 	if (!ops->dellink)
-		ops->dellink = unregister_netdevice;
+		ops->dellink = unregister_netdevice_queue;
 
 	list_add_tail(&ops->list, &link_ops);
 	return 0;
@@ -318,13 +318,13 @@ EXPORT_SYMBOL_GPL(rtnl_link_register);
 static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
 {
 	struct net_device *dev;
-restart:
+	LIST_HEAD(list_kill);
+
 	for_each_netdev(net, dev) {
-		if (dev->rtnl_link_ops == ops) {
-			ops->dellink(dev);
-			goto restart;
-		}
+		if (dev->rtnl_link_ops == ops)
+			ops->dellink(dev, &list_kill);
 	}
+	unregister_netdevice_many(&list_kill);
 }
 
 void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
@@ -1135,6 +1135,18 @@ static u32 get_ext_mask(struct nlattr *t
 	return 0;
 }
 
+/*
+ * New iproute2 user-space tools send requests in struct ifinfomsg format,
+ * while old ones use struct rtgenmsg. So guess which one is passed depending
+ * on the payload size.
+ */
+static int compat_parse_size(const struct nlmsghdr *nlh)
+{
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct ifinfomsg)))
+		return sizeof(struct rtgenmsg);
+	return sizeof(struct ifinfomsg);
+}
+
 static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
@@ -1146,14 +1158,16 @@ static int rtnl_dump_ifinfo(struct sk_bu
 	struct hlist_head *head;
 	struct hlist_node *node;
 	int err;
+	struct netlink_callback_extended *cb_ext = nl_callback_extended(cb);
 
 	s_h = cb->args[0];
 	s_idx = cb->args[1];
 
-	nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX,
-		    ifla_policy);
+	if (nlmsg_parse(cb->nlh, compat_parse_size(cb->nlh), tb, IFLA_MAX, ifla_policy))
+		memset(tb, 0, sizeof(tb));
 
 	ext_filter_mask = get_ext_mask(tb);
+	cb_ext->seq = net->dev_base_seq;
 
 	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
 		idx = 0;
@@ -1173,6 +1187,7 @@ static int rtnl_dump_ifinfo(struct sk_bu
 
 			if (err <= 0)
 				goto out;
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 cont:
 			idx++;
 		}
@@ -1671,7 +1686,7 @@ static int rtnl_dellink(struct sk_buff *
 	if (!ops)
 		return -EOPNOTSUPP;
 
-	ops->dellink(dev);
+	ops->dellink(dev, NULL);
 	return 0;
 }
 
@@ -1870,6 +1885,7 @@ static int rtnl_getlink(struct sk_buff *
 {
 	struct net *net = sock_net(skb->sk);
 	struct ifinfomsg *ifm;
+	char ifname[IFNAMSIZ];
 	struct nlattr *tb[IFLA_MAX+1];
 	struct net_device *dev = NULL;
 	struct sk_buff *nskb;
@@ -1880,16 +1896,24 @@ static int rtnl_getlink(struct sk_buff *
 	if (err < 0)
 		return err;
 
+	if (tb[IFLA_IFNAME])
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+	else
+		ifname[0] = '\0';
+
 	ext_filter_mask = get_ext_mask(tb);
 
 	ifm = nlmsg_data(nlh);
-	if (ifm->ifi_index > 0) {
+	if (ifm->ifi_index > 0)
 		dev = dev_get_by_index(net, ifm->ifi_index);
-		if (dev == NULL)
-			return -ENODEV;
-	} else
+	else if (tb[IFLA_IFNAME])
+		dev = dev_get_by_name(net, ifname);
+	else
 		return -EINVAL;
 
+	if (dev == NULL)
+		return -ENODEV;
+
 	nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
 	if (nskb == NULL) {
 		err = -ENOBUFS;
@@ -1919,7 +1943,8 @@ static u16 rtnl_calcit(struct sk_buff *s
 	u32 ext_filter_mask = 0;
 	u16 min_ifinfo_dump_size = 0;
 
-	nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, ifla_policy);
+	if (nlmsg_parse(nlh, compat_parse_size(nlh), tb, IFLA_MAX, ifla_policy))
+		return NLMSG_GOODSIZE;
 
 	ext_filter_mask = get_ext_mask(tb);
 
@@ -1952,6 +1977,8 @@ static int rtnl_dump_all(struct sk_buff 
 		if (rtnl_msg_handlers[idx] == NULL ||
 		    rtnl_msg_handlers[idx][type].dumpit == NULL)
 			continue;
+		if (vz_security_family_check(idx))
+			continue;
 		if (idx > s_idx)
 			memset(&cb->args[0], 0, sizeof(cb->args));
 		if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))
@@ -2197,10 +2224,14 @@ static int rtnetlink_rcv_msg(struct sk_b
 		return 0;
 
 	family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
+	if (vz_security_family_check(family))
+		return -EAFNOSUPPORT;
+
 	sz_idx = type>>2;
 	kind = type&3;
 
-	if (kind != 2 && !netlink_capable(skb, CAP_NET_ADMIN))
+	if (kind != 2 && !netlink_capable(skb, CAP_NET_ADMIN) &&
+	    !netlink_capable(skb, CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
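
With dellink() taking a list head, callers choose between immediate teardown
(NULL, as rtnl_dellink() does above) and batching, where many devices share
one unregister_netdevice_many() round and thus one rcu/notifier cycle.  Shown
standalone, mirroring __rtnl_kill_links() above:

/* Batched teardown, as __rtnl_kill_links() applies it. */
static void kill_links_batched(struct net *net, struct rtnl_link_ops *ops)
{
	struct net_device *dev;
	LIST_HEAD(list_kill);

	for_each_netdev(net, dev)
		if (dev->rtnl_link_ops == ops)
			ops->dellink(dev, &list_kill);	/* queue only */
	unregister_netdevice_many(&list_kill);	/* one teardown round */
}
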
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/scm.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/scm.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/scm.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/scm.c	2015-01-21 12:02:45.361169638 +0300
@@ -36,6 +36,7 @@
 #include <net/compat.h>
 #include <net/scm.h>
 
+#include <bc/kmem.h>
 
 /*
  *	Only allow a user to send credentials, that they could set with
@@ -46,7 +47,9 @@ static __inline__ int scm_check_creds(st
 {
 	const struct cred *cred = current_cred();
 
-	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
+	if ((creds->pid == task_tgid_vnr(current) ||
+	     creds->pid == current->tgid ||
+	     capable(CAP_VE_SYS_ADMIN)) &&
 	    ((creds->uid == cred->uid   || creds->uid == cred->euid ||
 	      creds->uid == cred->suid) || capable(CAP_SETUID)) &&
 	    ((creds->gid == cred->gid   || creds->gid == cred->egid ||
@@ -73,7 +76,7 @@ static int scm_fp_copy(struct cmsghdr *c
 
 	if (!fpl)
 	{
-		fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+		fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_UBC);
 		if (!fpl)
 			return -ENOMEM;
 		*fplp = fpl;
@@ -330,7 +333,7 @@ struct scm_fp_list *scm_fp_dup(struct sc
 		return NULL;
 
 	new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
-			  GFP_KERNEL);
+			  GFP_KERNEL_UBC);
 	if (new_fpl) {
 		for (i = 0; i < fpl->count; i++)
 			get_file(fpl->fp[i]);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/skbuff.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/skbuff.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/skbuff.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/skbuff.c	2015-01-21 12:02:47.479113411 +0300
@@ -69,6 +69,7 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <trace/events/skb.h>
+#include <bc/net.h>
 
 #include "kmap_skb.h"
 
@@ -186,6 +187,9 @@ struct sk_buff *__alloc_skb(unsigned int
 	if (!skb)
 		goto out;
 
+	if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA))
+		goto nobc;
+
 	size = SKB_DATA_ALIGN(size);
 	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
 			gfp_mask, node);
@@ -204,6 +208,7 @@ struct sk_buff *__alloc_skb(unsigned int
 	skb->data = data;
 	skb_reset_tail_pointer(skb);
 	skb->end = skb->tail + size;
+	skb->owner_env = get_exec_env();
 	kmemcheck_annotate_bitfield(skb, flags1);
 	kmemcheck_annotate_bitfield(skb, flags2);
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
@@ -236,6 +241,8 @@ struct sk_buff *__alloc_skb(unsigned int
 out:
 	return skb;
 nodata:
+	ub_skb_free_bc(skb);
+nobc:
 	kmem_cache_free(cache, skb);
 	skb = NULL;
 	goto out;
@@ -268,6 +275,11 @@ struct sk_buff *build_skb(void *data)
 	if (!skb)
 		return NULL;
 
+	if (ub_skb_alloc_bc(skb, GFP_ATOMIC)) {
+		kmem_cache_free(skbuff_head_cache, skb);
+		return NULL;
+	}
+
 	size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	memset(skb, 0, offsetof(struct sk_buff, tail));
@@ -277,6 +289,7 @@ struct sk_buff *build_skb(void *data)
 	skb->data = data;
 	skb_reset_tail_pointer(skb);
 	skb->end = skb->tail + size;
+	skb->owner_env = get_exec_env();
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 	skb->mac_header = ~0U;
 #endif
@@ -426,6 +439,7 @@ static void kfree_skbmem(struct sk_buff 
 	struct sk_buff *other;
 	atomic_t *fclone_ref;
 
+	ub_skb_free_bc(skb);
 	switch (skb->fclone) {
 	case SKB_FCLONE_UNAVAILABLE:
 		kmem_cache_free(skbuff_head_cache, skb);
@@ -458,6 +472,7 @@ static void skb_release_head_state(struc
 #ifdef CONFIG_XFRM
 	secpath_put(skb->sp);
 #endif
+	ub_skb_uncharge(skb);
 	if (skb->destructor) {
 		WARN_ON(in_irq());
 		skb->destructor(skb);
@@ -631,6 +646,11 @@ static void __copy_skb_header(struct sk_
 #endif
 	new->vlan_tci		= old->vlan_tci;
 
+#ifdef CONFIG_VE
+	new->accounted = old->accounted;
+	new->redirected = old->redirected;
+#endif
+	skb_copy_brmark(new, old);
 	skb_copy_secmark(new, old);
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
@@ -657,6 +677,10 @@ static struct sk_buff *__skb_clone(struc
 	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
 	n->cloned = 1;
 	n->nohdr = 0;
+	C(owner_env);
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+	C(brmark);
+#endif
 	n->destructor = NULL;
 	C(tail);
 	C(end);
@@ -665,6 +689,11 @@ static struct sk_buff *__skb_clone(struc
 	C(truesize);
 	atomic_set(&n->users, 1);
 
+#ifdef CONFIG_VE
+	C(accounted);
+	C(redirected);
+#endif
+
 	atomic_inc(&(skb_shinfo(skb)->dataref));
 	skb->cloned = 1;
 
@@ -774,6 +803,10 @@ struct sk_buff *skb_clone(struct sk_buff
 		n->fclone = SKB_FCLONE_UNAVAILABLE;
 	}
 
+	if (ub_skb_alloc_bc(n, gfp_mask)) {
+		kmem_cache_free(skbuff_head_cache, n);
+		return NULL;
+	}
 	return __skb_clone(n, skb);
 }
 EXPORT_SYMBOL(skb_clone);
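
Every skb-head allocation above pairs ub_skb_alloc_bc() with a matching
rollback on the failure path, so the beancounter charge never outlives the
object.  The pattern distilled into one helper (a sketch, not a function from
the patch):

/* Distilled charge-then-rollback pattern; the helper name is illustrative. */
static struct sk_buff *alloc_charged_head(struct kmem_cache *cache, gfp_t gfp)
{
	struct sk_buff *skb = kmem_cache_alloc(cache, gfp);

	if (!skb)
		return NULL;
	if (ub_skb_alloc_bc(skb, gfp)) {	/* charge the container */
		kmem_cache_free(cache, skb);	/* unwind in reverse order */
		return NULL;
	}
	return skb;
}
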
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/sock.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/sock.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/sock.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/sock.c	2015-01-21 12:02:51.390009587 +0300
@@ -129,6 +129,9 @@
 #include <net/cls_cgroup.h>
 #include <net/netprio_cgroup.h>
 
+#include <bc/net.h>
+#include <bc/beancounter.h>
+
 #include <linux/filter.h>
 
 #include <trace/events/sock.h>
@@ -287,7 +290,7 @@ static void sock_warn_obsolete_bsdism(co
 	static char warncomm[TASK_COMM_LEN];
 	if (strcmp(warncomm, current->comm) && warned < 5) {
 		strcpy(warncomm,  current->comm);
-		printk(KERN_WARNING "process `%s' is using obsolete "
+		ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete "
 		       "%s SO_BSDCOMPAT\n", warncomm, name);
 		warned++;
 	}
@@ -322,7 +325,7 @@ int sock_queue_rcv_skb(struct sock *sk, 
 	if (err)
 		goto out;
 
-	if (!sk_rmem_schedule(sk, skb->truesize)) {
+	if (!sk_rmem_schedule(sk, skb)) {
 		err = -ENOBUFS;
 		goto out;
 	}
@@ -1118,6 +1121,7 @@ static void sk_prot_free(struct proto *p
 	slab = prot->slab;
 
 	security_sk_free(sk);
+	ub_sock_uncharge(sk);
 	if (slab != NULL)
 		kmem_cache_free(slab, sk);
 	else
@@ -1169,6 +1173,7 @@ struct sock *sk_alloc(struct net *net, i
 		 */
 		sk->sk_prot = sk->sk_prot_creator = prot;
 		sock_lock_init(sk);
+		sk->owner_env = get_exec_env();
 		sock_net_set(sk, get_net(net));
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
@@ -1288,14 +1293,11 @@ struct sock *sk_clone(const struct sock 
 		if (filter != NULL)
 			sk_filter_charge(newsk, filter);
 
-		if (unlikely(xfrm_sk_clone_policy(newsk))) {
-			/* It is still raw copy of parent, so invalidate
-			 * destructor and make plain sk_free() */
-			newsk->sk_destruct = NULL;
-			sk_free(newsk);
-			newsk = NULL;
-			goto out;
-		}
+		if (ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type, 0) < 0)
+			goto out_err;
+
+		if (unlikely(xfrm_sk_clone_policy(newsk)))
+			 goto out_err;
 
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
@@ -1328,13 +1330,22 @@ struct sock *sk_clone(const struct sock 
 		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
 			net_enable_timestamp();
 	}
-out:
 	return newsk;
+
+out_err:
+	/* It is still a raw copy of the parent, so invalidate
+	 * the destructor and do a plain sk_free() */
+	sock_reset_flag(newsk, SOCK_TIMESTAMP);
+	newsk->sk_destruct = NULL;
+	sk_free(newsk);
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(sk_clone);
 
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
+	extern int sysctl_tcp_use_sg;
+
 	__sk_dst_set(sk, dst);
 	sk->sk_route_caps = dst->dev->features;
 	if (sk->sk_route_caps & NETIF_F_GSO)
@@ -1348,6 +1359,8 @@ void sk_setup_caps(struct sock *sk, stru
 			sk_extended(sk)->sk_gso_max_segs = netdev_extended(dst->dev)->gso_max_segs;
 		}
 	}
+	if (!sysctl_tcp_use_sg)
+		sk->sk_route_caps &= ~NETIF_F_SG;
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
@@ -1508,19 +1521,24 @@ static long sock_wait_for_wmem(struct so
 	return timeo;
 }
 
-
 /*
  *	Generic send/receive buffer handlers
  */
-
-struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
-				     unsigned long data_len, int noblock,
-				     int *errcode)
+struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long header_len,
+				     unsigned long data_len,
+				     unsigned long min_size,
+				     int noblock, int *errcode)
 {
 	struct sk_buff *skb;
 	gfp_t gfp_mask;
 	long timeo;
-	int err;
+	int err, i;
+	unsigned long chunk, size = header_len + data_len;
+	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+
+	err = -EMSGSIZE;
+	if (npages > MAX_SKB_FRAGS)
+		goto failure;
 
 	gfp_mask = sk->sk_allocation;
 	if (gfp_mask & __GFP_WAIT)
@@ -1528,7 +1546,6 @@ struct sk_buff *sock_alloc_send_pskb(str
 
 	timeo = sock_sndtimeo(sk, noblock);
 	while (1) {
-		int npages;
 		err = sock_error(sk);
 		if (err != 0)
 			goto failure;
@@ -1537,74 +1554,97 @@ struct sk_buff *sock_alloc_send_pskb(str
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
 			goto failure;
 
-		err = -EMSGSIZE;
-		npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
-		if (npages > MAX_SKB_FRAGS)
-			goto failure;
-
-		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
-			skb = alloc_skb(header_len, gfp_mask);
-			if (skb) {
-				int i;
-
-				/* No pages, we're done... */
+		if (ub_sock_getwres_other(sk, skb_charge_size(size))) {
+			if (min_size < size) {
+				chunk = min_t(unsigned long,
+					      data_len, min_size);
+				data_len -= chunk;
 				if (!data_len)
-					break;
-
-				skb->truesize += data_len;
-				skb_shinfo(skb)->nr_frags = npages;
-				for (i = 0; i < npages; i++) {
-					struct page *page;
-					skb_frag_t *frag;
-
-					page = alloc_pages(sk->sk_allocation, 0);
-					if (!page) {
-						err = -ENOBUFS;
-						skb_shinfo(skb)->nr_frags = i;
-						kfree_skb(skb);
-						goto failure;
-					}
-
-					frag = &skb_shinfo(skb)->frags[i];
-					frag->page = page;
-					frag->page_offset = 0;
-					frag->size = (data_len >= PAGE_SIZE ?
-						      PAGE_SIZE :
-						      data_len);
-					data_len -= PAGE_SIZE;
+					header_len -= (min_size - chunk);
+				size = min_size;
+				continue;
+			}
+			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+			err = -EAGAIN;
+			if (!timeo)
+				goto failure;
+			err = sock_intr_errno(timeo);
+			if (signal_pending(current))
+				goto failure;
+			timeo = ub_sock_wait_for_space(sk, timeo,
+					skb_charge_size(size));
+			continue;
+		}
+
+		if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
+			ub_sock_retwres_other(sk,
+					skb_charge_size(size),
+					SOCK_MIN_UBCSPACE_CH);
+			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			err = -EAGAIN;
+			if (!timeo)
+				goto failure;
+			err = sock_intr_errno(timeo);
+			if (signal_pending(current))
+				goto failure;
+			timeo = sock_wait_for_wmem(sk, timeo);
+			continue;
+		}
+
+		skb = alloc_skb(header_len, gfp_mask);
+		if (!skb)
+			goto ret_ubc;
+
+		skb->truesize += data_len;
+
+		for (i = 0; npages > 0; i++) {
+			int order = PAGE_ALLOC_COSTLY_ORDER; /* try compound pages first */
+			struct page *page;
+
+			while (order) {
+				if (npages >= 1 << order) {
+					page = alloc_pages(sk->sk_allocation |
+							   __GFP_COMP |
+							   __GFP_NOWARN |
+							   __GFP_NORETRY,
+							   order);
+					if (page)
+						goto fill_page;
 				}
-
-				/* Full success... */
-				break;
+				order--;
 			}
-			err = -ENOBUFS;
-			goto failure;
+			page = alloc_page(sk->sk_allocation);
+			if (!page)
+				goto ret_ubc;
+fill_page:
+			chunk = min_t(unsigned long, data_len,
+				      PAGE_SIZE << order);
+			skb_fill_page_desc(skb, i, page, 0, chunk);
+			data_len -= chunk;
+			npages -= 1 << order;
 		}
-		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
-		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		err = -EAGAIN;
-		if (!timeo)
-			goto failure;
-		if (signal_pending(current))
-			goto interrupted;
-		timeo = sock_wait_for_wmem(sk, timeo);
+
+		break;
 	}
 
+	ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF);
 	skb_set_owner_w(skb, sk);
 	return skb;
 
-interrupted:
-	err = sock_intr_errno(timeo);
+ret_ubc:
+	ub_sock_retwres_other(sk, skb_charge_size(size), SOCK_MIN_UBCSPACE_CH);
+	kfree_skb(skb);
 failure:
 	*errcode = err;
 	return NULL;
 }
-EXPORT_SYMBOL(sock_alloc_send_pskb);
+EXPORT_SYMBOL(sock_alloc_send_skb2);
 
 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 				    int noblock, int *errcode)
 {
-	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+	return sock_alloc_send_skb2(sk, size, 0, size, noblock, errcode);
 }
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
@@ -2049,21 +2089,24 @@ void lock_sock_nested(struct sock *sk, i
 		__lock_sock(sk);
 	sk->sk_lock.owned = 1;
 	spin_unlock(&sk->sk_lock.slock);
+#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE)
 	/*
 	 * The sk_lock has mutex_lock() semantics here:
 	 */
 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+#endif
 	local_bh_enable();
 }
 EXPORT_SYMBOL(lock_sock_nested);
 
 void release_sock(struct sock *sk)
 {
+#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE)
 	/*
 	 * The sk_lock has mutex_unlock() semantics:
 	 */
 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-
+#endif
 	spin_lock_bh(&sk->sk_lock.slock);
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
@@ -2345,7 +2388,8 @@ int proto_register(struct proto *prot, i
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name,
 					sk_alloc_size(prot->obj_size), 0,
-					SLAB_HWCACHE_ALIGN | proto_slab_flags(prot),
+					SLAB_HWCACHE_ALIGN | SLAB_UBC |
+						proto_slab_flags(prot),
 					NULL);
 
 		if (prot->slab == NULL) {
@@ -2364,7 +2408,7 @@ int proto_register(struct proto *prot, i
 			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
 								 prot->rsk_prot->obj_size, 0,
-								 SLAB_HWCACHE_ALIGN, NULL);
+								 SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL);
 
 			if (prot->rsk_prot->slab == NULL) {
 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
@@ -2386,7 +2430,7 @@ int proto_register(struct proto *prot, i
 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
 						  prot->twsk_prot->twsk_obj_size,
 						  0,
-						  SLAB_HWCACHE_ALIGN |
+						  SLAB_HWCACHE_ALIGN | SLAB_UBC |
 							proto_slab_flags(prot),
 						  NULL);
 			if (prot->twsk_prot->twsk_slab == NULL)
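
sock_alloc_send_skb2() above tries to back large datagrams with compound pages
but must not stall the sender, hence __GFP_NORETRY | __GFP_NOWARN on the
opportunistic attempts and an order-0 fallback that may reclaim as usual.  The
strategy distilled, assuming a positive starting order:

/* Distilled page strategy; assumes order > 0 on entry. */
static struct page *alloc_frag_pages(gfp_t gfp, int order)
{
	struct page *page;

	while (order > 0) {
		page = alloc_pages(gfp | __GFP_COMP | __GFP_NOWARN |
				   __GFP_NORETRY, order);
		if (page)
			return page;	/* 1 << order pages in one shot */
		order--;
	}
	return alloc_page(gfp);		/* reliable order-0 fallback */
}
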
diff -upr linux-2.6.32-504.3.3.el6.orig/net/core/stream.c linux-2.6.32-504.3.3.el6-042stab103_6/net/core/stream.c
--- linux-2.6.32-504.3.3.el6.orig/net/core/stream.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/core/stream.c	2015-01-21 12:02:43.285224753 +0300
@@ -112,8 +112,10 @@ EXPORT_SYMBOL(sk_stream_wait_close);
  * sk_stream_wait_memory - Wait for more memory for a socket
  * @sk: socket to wait for memory
  * @timeo_p: for how long
+ * @amount: amount of memory to wait for (in UB space!)
  */
-int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+int __sk_stream_wait_memory(struct sock *sk, long *timeo_p,
+		unsigned long amount)
 {
 	int err = 0;
 	long vm_wait = 0;
@@ -135,7 +137,10 @@ int sk_stream_wait_memory(struct sock *s
 		if (signal_pending(current))
 			goto do_interrupted;
 		clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
-		if (sk_stream_memory_free(sk) && !vm_wait)
+		if (amount == 0) {
+			if (sk_stream_memory_free(sk) && !vm_wait)
+				break;
+		} else if (!ub_sock_sndqueueadd_tcp(sk, amount))
 			break;
 
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -145,6 +150,8 @@ int sk_stream_wait_memory(struct sock *s
 						  sk_stream_memory_free(sk) &&
 						  vm_wait);
 		sk->sk_write_pending--;
+		if (amount > 0)
+			ub_sock_sndqueuedel(sk);
 
 		if (vm_wait) {
 			vm_wait -= current_timeo;
@@ -171,6 +178,10 @@ do_interrupted:
 	goto out;
 }
 
+int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+{
+	return __sk_stream_wait_memory(sk, timeo_p, 0);
+}
 EXPORT_SYMBOL(sk_stream_wait_memory);
 
 int sk_stream_error(struct sock *sk, int flags, int err)
diff -upr linux-2.6.32-504.3.3.el6.orig/net/dccp/ipv6.c linux-2.6.32-504.3.3.el6-042stab103_6/net/dccp/ipv6.c
--- linux-2.6.32-504.3.3.el6.orig/net/dccp/ipv6.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/dccp/ipv6.c	2015-01-21 12:02:45.363169586 +0300
@@ -583,6 +583,8 @@ static struct sock *dccp_v6_request_recv
 	__ip6_dst_store(newsk, dst, NULL, NULL);
 	newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
 						      NETIF_F_TSO);
+	if (!sysctl_tcp_use_sg)
+		newsk->sk_route_caps &= ~NETIF_F_SG;
 	newdp6 = (struct dccp6_sock *)newsk;
 	newinet = inet_sk(newsk);
 	newinet->pinet6 = &newdp6->inet6;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/dccp/minisocks.c linux-2.6.32-504.3.3.el6-042stab103_6/net/dccp/minisocks.c
--- linux-2.6.32-504.3.3.el6.orig/net/dccp/minisocks.c	2014-12-12 23:29:05.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/dccp/minisocks.c	2015-01-21 12:02:43.286224727 +0300
@@ -19,6 +19,8 @@
 #include <net/xfrm.h>
 #include <net/inet_timewait_sock.h>
 
+#include <bc/sock_orphan.h>
+
 #include "ackvec.h"
 #include "ccid.h"
 #include "dccp.h"
@@ -46,7 +48,8 @@ void dccp_time_wait(struct sock *sk, int
 {
 	struct inet_timewait_sock *tw = NULL;
 
-	if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
+	if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets &&
+			ub_timewait_check(sk, &dccp_death_row))
 		tw = inet_twsk_alloc(sk, state);
 
 	if (tw != NULL) {
diff -upr linux-2.6.32-504.3.3.el6.orig/net/decnet/dn_rules.c linux-2.6.32-504.3.3.el6-042stab103_6/net/decnet/dn_rules.c
--- linux-2.6.32-504.3.3.el6.orig/net/decnet/dn_rules.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/decnet/dn_rules.c	2015-01-21 12:02:42.445247053 +0300
@@ -33,7 +33,7 @@
 #include <net/dn_dev.h>
 #include <net/dn_route.h>
 
-static struct fib_rules_ops dn_fib_rules_ops;
+static struct fib_rules_ops *dn_fib_rules_ops;
 
 struct dn_fib_rule
 {
@@ -56,7 +56,7 @@ int dn_fib_lookup(struct flowi *flp, str
 	};
 	int err;
 
-	err = fib_rules_lookup(&dn_fib_rules_ops, flp, 0, &arg);
+	err = fib_rules_lookup(dn_fib_rules_ops, flp, 0, &arg);
 	res->r = arg.rule;
 
 	return err;
@@ -216,7 +216,7 @@ static void dn_fib_rule_flush_cache(stru
 	dn_rt_cache_flush(-1);
 }
 
-static struct fib_rules_ops dn_fib_rules_ops = {
+static struct fib_rules_ops dn_fib_rules_ops_template = {
 	.family		= AF_DECnet,
 	.rule_size	= sizeof(struct dn_fib_rule),
 	.addr_size	= sizeof(u16),
@@ -229,21 +229,23 @@ static struct fib_rules_ops dn_fib_rules
 	.flush_cache	= dn_fib_rule_flush_cache,
 	.nlgroup	= RTNLGRP_DECnet_RULE,
 	.policy		= dn_fib_rule_policy,
-	.rules_list	= LIST_HEAD_INIT(dn_fib_rules_ops.rules_list),
 	.owner		= THIS_MODULE,
 	.fro_net	= &init_net,
 };
 
 void __init dn_fib_rules_init(void)
 {
-	BUG_ON(fib_default_rule_add(&dn_fib_rules_ops, 0x7fff,
+	dn_fib_rules_ops =
+		fib_rules_register(&dn_fib_rules_ops_template, &init_net);
+	BUG_ON(IS_ERR(dn_fib_rules_ops));
+	BUG_ON(fib_default_rule_add(dn_fib_rules_ops, 0x7fff,
 			            RT_TABLE_MAIN, 0));
-	fib_rules_register(&dn_fib_rules_ops);
 }
 
 void __exit dn_fib_rules_cleanup(void)
 {
-	fib_rules_unregister(&dn_fib_rules_ops);
+	fib_rules_unregister(dn_fib_rules_ops);
+	rcu_barrier();
 }
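The dn_rules.c conversion above follows the same pattern as the fib4 and ipmr hunks later in this patch: the file-scope fib_rules_ops becomes a template, and fib_rules_register() hands back a per-namespace copy with fro_net and the rules list initialized. A minimal sketch of that template-copy registration, with toy types standing in for the kernel's (fib_rules_ops_sk and fib_rules_register_sk are hypothetical names):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct net { int id; };

struct fib_rules_ops_sk {
	int family;
	struct net *fro_net;	/* owning namespace, set at registration */
};

/* kmemdup()-style registration: the template stays const and shared,
 * per-net state lives only in the returned copy. */
static struct fib_rules_ops_sk *
fib_rules_register_sk(const struct fib_rules_ops_sk *tmpl, struct net *net)
{
	struct fib_rules_ops_sk *ops = malloc(sizeof(*ops));

	if (!ops)
		return NULL;
	memcpy(ops, tmpl, sizeof(*ops));
	ops->fro_net = net;
	return ops;
}

static const struct fib_rules_ops_sk dn_template = { .family = 12 };

int main(void)
{
	struct net net0 = { .id = 0 }, net1 = { .id = 1 };
	struct fib_rules_ops_sk *a = fib_rules_register_sk(&dn_template, &net0);
	struct fib_rules_ops_sk *b = fib_rules_register_sk(&dn_template, &net1);

	if (!a || !b)
		return 1;
	printf("family=%d nets=%d,%d\n", a->family, a->fro_net->id, b->fro_net->id);
	free(a);
	free(b);
	return 0;
}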
 
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/decnet/netfilter/dn_rtmsg.c linux-2.6.32-504.3.3.el6-042stab103_6/net/decnet/netfilter/dn_rtmsg.c
--- linux-2.6.32-504.3.3.el6.orig/net/decnet/netfilter/dn_rtmsg.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/decnet/netfilter/dn_rtmsg.c	2015-01-21 12:02:45.436167647 +0300
@@ -107,7 +107,8 @@ static inline void dnrmg_receive_user_sk
 	if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
 		return;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!netlink_capable(skb, CAP_NET_ADMIN) &&
+	    !netlink_capable(skb, CAP_VE_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
 
 	/* Eventually we might send routing messages too */
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ieee802154/af_ieee802154.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ieee802154/af_ieee802154.c
--- linux-2.6.32-504.3.3.el6.orig/net/ieee802154/af_ieee802154.c	2014-12-12 23:28:53.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ieee802154/af_ieee802154.c	2015-01-21 12:02:42.671241054 +0300
@@ -147,6 +147,9 @@ static int ieee802154_dev_ioctl(struct s
 	dev_load(sock_net(sk), ifr.ifr_name);
 	dev = dev_get_by_name(sock_net(sk), ifr.ifr_name);
 
+	if (!dev)
+		return -ENODEV;
+
 	if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl)
 		ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/af_inet.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/af_inet.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/af_inet.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/af_inet.c	2015-01-21 12:02:47.886102604 +0300
@@ -115,6 +115,7 @@
 #ifdef CONFIG_IP_MROUTE
 #include <linux/mroute.h>
 #endif
+#include <bc/net.h>
 
 
 /* The inetsw table contains everything that inet_create needs to
@@ -325,6 +326,10 @@ lookup_protocol:
 			goto out_rcu_unlock;
 	}
 
+	err = vz_security_protocol_check(answer->protocol);
+	if (err < 0)
+		goto out_rcu_unlock;
+
 	err = -EPERM;
 	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
 		goto out_rcu_unlock;
@@ -346,6 +351,13 @@ lookup_protocol:
 	if (sk == NULL)
 		goto out;
 
+	err = -ENOBUFS;
+	if (ub_sock_charge(sk, PF_INET, sock->type, kern))
+		goto out_sk_free;
+	/* If the charge was successful, sock_init_data() MUST be called to set
+	 * sk->sk_type; otherwise sk will be uncharged against the wrong resource.
+	 */
+
 	err = 0;
 	sk->sk_no_check = answer_no_check;
 	if (INET_PROTOSW_REUSE & answer_flags)
@@ -404,6 +416,9 @@ out:
 out_rcu_unlock:
 	rcu_read_unlock();
 	goto out;
+out_sk_free:
+	sk_free(sk);
+	return err;
 }
 
 
@@ -418,6 +433,9 @@ int inet_release(struct socket *sock)
 
 	if (sk) {
 		long timeout;
+		struct ve_struct *saved_env;
+
+		saved_env = set_exec_env(sk->owner_env);
 
 		inet_rps_reset_flow(sk);
 
@@ -437,6 +455,8 @@ int inet_release(struct socket *sock)
 			timeout = sk->sk_lingertime;
 		sock->sk = NULL;
 		sk->sk_prot->close(sk, timeout);
+
+		(void)set_exec_env(saved_env);
 	}
 	return 0;
 }
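inet_release() above wraps the protocol close in set_exec_env(sk->owner_env) / set_exec_env(saved_env), so teardown runs in the context of the VE that owns the socket even when the close is driven from outside it. The save/switch/restore idiom, modelled in plain C (ve_struct and set_exec_env() here are simplified stand-ins, not the patch's definitions):

#include <stdio.h>

struct ve_struct { int veid; };

static struct ve_struct ve0 = { .veid = 0 };
static struct ve_struct *exec_env = &ve0;

/* Switch the "execution environment" and hand back the previous one. */
static struct ve_struct *set_exec_env(struct ve_struct *ve)
{
	struct ve_struct *old = exec_env;

	exec_env = ve;
	return old;
}

static void close_in_owner_context(struct ve_struct *owner)
{
	struct ve_struct *saved = set_exec_env(owner);

	printf("closing inside VE %d\n", exec_env->veid);
	(void)set_exec_env(saved);	/* always restore, as inet_release() does */
}

int main(void)
{
	struct ve_struct ve101 = { .veid = 101 };

	close_in_owner_context(&ve101);
	printf("back in VE %d\n", exec_env->veid);
	return 0;
}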
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/arp.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/arp.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/arp.c	2014-12-12 23:29:18.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/arp.c	2015-01-21 12:02:45.363169586 +0300
@@ -224,7 +224,7 @@ int arp_mc_map(__be32 addr, u8 *haddr, s
 
 static u32 arp_hash(const void *pkey, const struct net_device *dev)
 {
-	return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+	return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd[0]);
 }
 
 static int arp_constructor(struct neighbour *neigh)
@@ -1181,7 +1181,8 @@ int arp_ioctl(struct net *net, unsigned 
 	switch (cmd) {
 		case SIOCDARP:
 		case SIOCSARP:
-			if (!capable(CAP_NET_ADMIN))
+			if (!capable(CAP_NET_ADMIN) &&
+					!capable(CAP_VE_NET_ADMIN))
 				return -EPERM;
 		case SIOCGARP:
 			err = copy_from_user(&r, arg, sizeof(struct arpreq));
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/devinet.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/devinet.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/devinet.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/devinet.c	2015-01-21 12:02:57.978834686 +0300
@@ -110,10 +110,11 @@ static inline void devinet_sysctl_unregi
 
 /* Locks all the inet devices. */
 
-static struct in_ifaddr *inet_alloc_ifa(void)
+struct in_ifaddr *inet_alloc_ifa(void)
 {
-	return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
+	return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_UBC);
 }
+EXPORT_SYMBOL(inet_alloc_ifa);
 
 static void inet_rcu_free_ifa(struct rcu_head *head)
 {
@@ -146,7 +147,7 @@ void in_dev_finish_destroy(struct in_dev
 	}
 }
 
-static struct in_device *inetdev_init(struct net_device *dev)
+struct in_device *inetdev_init(struct net_device *dev)
 {
 	struct in_device *in_dev;
 
@@ -183,6 +184,7 @@ out_kfree:
 	in_dev = NULL;
 	goto out;
 }
+EXPORT_SYMBOL(inetdev_init);
 
 static void in_dev_rcu_put(struct rcu_head *head)
 {
@@ -376,10 +378,11 @@ static int __inet_insert_ifa(struct in_i
 	return 0;
 }
 
-static int inet_insert_ifa(struct in_ifaddr *ifa)
+int inet_insert_ifa(struct in_ifaddr *ifa)
 {
 	return __inet_insert_ifa(ifa, NULL, 0);
 }
+EXPORT_SYMBOL(inet_insert_ifa);
 
 static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
 {
@@ -627,7 +630,7 @@ int devinet_ioctl(struct net *net, unsig
 
 	case SIOCSIFFLAGS:
 		ret = -EACCES;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			goto out;
 		break;
 	case SIOCSIFADDR:	/* Set interface address (and family) */
@@ -635,7 +638,7 @@ int devinet_ioctl(struct net *net, unsig
 	case SIOCSIFDSTADDR:	/* Set the destination address */
 	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
 		ret = -EACCES;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			goto out;
 		ret = -EINVAL;
 		if (sin->sin_family != AF_INET)
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/fib_frontend.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/fib_frontend.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/fib_frontend.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/fib_frontend.c	2015-01-21 12:02:51.294012135 +0300
@@ -264,7 +264,8 @@ int fib_validate_source(__be32 src, __be
 	if (fib_lookup(net, &fl, &res))
 		goto last_resort;
 	if (res.type != RTN_UNICAST) {
-		if (res.type != RTN_LOCAL || !accept_local)
+		if (!(dev->vz_features & NETIF_F_VENET) ||
+		    res.type != RTN_LOCAL || !accept_local)
 			goto e_inval_res;
 	}
 	*spec_dst = FIB_RES_PREFSRC(res);
@@ -467,7 +468,7 @@ int ip_rt_ioctl(struct net *net, unsigne
 	switch (cmd) {
 	case SIOCADDRT:		/* Add a route */
 	case SIOCDELRT:		/* Delete a route */
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_VE_NET_ADMIN))
 			return -EPERM;
 
 		if (copy_from_user(&rt, arg, sizeof(rt)))
@@ -976,11 +977,11 @@ static void nl_fib_lookup_exit(struct ne
 	net->ipv4.fibnl = NULL;
 }
 
-static void fib_disable_ip(struct net_device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, int force, int delay)
 {
 	if (fib_sync_down_dev(dev, force))
 		fib_flush(dev_net(dev));
-	rt_cache_flush(dev_net(dev), 0);
+	rt_cache_flush(dev_net(dev), delay);
 	arp_ifdown(dev);
 }
 
@@ -1003,7 +1004,7 @@ static int fib_inetaddr_event(struct not
 			/* Last address was deleted from this interface.
 			   Disable IP.
 			 */
-			fib_disable_ip(dev, 1);
+			fib_disable_ip(dev, 1, 0);
 		} else {
 			rt_cache_flush(dev_net(dev), -1);
 		}
@@ -1018,7 +1019,12 @@ static int fib_netdev_event(struct notif
 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 
 	if (event == NETDEV_UNREGISTER) {
-		fib_disable_ip(dev, 2);
+		fib_disable_ip(dev, 2, -1);
+		return NOTIFY_DONE;
+	}
+
+	if (event == NETDEV_UNREGISTER_BATCH) {
+		rt_cache_flush_batch();
 		return NOTIFY_DONE;
 	}
 
@@ -1036,15 +1042,12 @@ static int fib_netdev_event(struct notif
 		rt_cache_flush(dev_net(dev), -1);
 		break;
 	case NETDEV_DOWN:
-		fib_disable_ip(dev, 0);
+		fib_disable_ip(dev, 0, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
 		rt_cache_flush(dev_net(dev), 0);
 		break;
-	case NETDEV_UNREGISTER_BATCH:
-		rt_cache_flush_batch();
-		break;
 	}
 	return NOTIFY_DONE;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/fib_hash.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/fib_hash.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/fib_hash.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/fib_hash.c	2015-01-21 12:02:43.287224701 +0300
@@ -771,10 +771,10 @@ static int fn_hash_dump(struct fib_table
 void __init fib_hash_init(void)
 {
 	fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
-					 0, SLAB_PANIC, NULL);
+					 0, SLAB_PANIC | SLAB_UBC, NULL);
 
 	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
-					  0, SLAB_PANIC, NULL);
+					  0, SLAB_PANIC | SLAB_UBC, NULL);
 
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/fib_rules.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/fib_rules.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/fib_rules.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/fib_rules.c	2015-01-21 12:02:42.445247053 +0300
@@ -283,13 +283,9 @@ int __net_init fib4_rules_init(struct ne
 	int err;
 	struct fib_rules_ops *ops;
 
-	ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL);
-	if (ops == NULL)
-		return -ENOMEM;
-	INIT_LIST_HEAD(&ops->rules_list);
-	ops->fro_net = net;
-
-	fib_rules_register(ops);
+	ops = fib_rules_register(&fib4_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
 
 	err = fib_default_rules_init(ops);
 	if (err < 0)
@@ -300,12 +296,10 @@ int __net_init fib4_rules_init(struct ne
 fail:
 	/* also cleans all rules already added */
 	fib_rules_unregister(ops);
-	kfree(ops);
 	return err;
 }
 
 void __net_exit fib4_rules_exit(struct net *net)
 {
 	fib_rules_unregister(net->ipv4.rules_ops);
-	kfree(net->ipv4.rules_ops);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/inet_connection_sock.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/inet_connection_sock.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/inet_connection_sock.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/inet_connection_sock.c	2015-01-21 12:02:45.364169559 +0300
@@ -24,6 +24,9 @@
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
 
+#include <bc/net.h>
+#include <bc/sock_orphan.h>
+
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
 EXPORT_SYMBOL(inet_csk_timer_bug_msg);
@@ -204,6 +207,8 @@ have_snum:
 	goto tb_not_found;
 tb_found:
 	if (!hlist_empty(&tb->owners)) {
+		if (sk->sk_reuse > 1)
+			goto success;
 		if (((tb->fastreuse > 0 &&
 		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 		     (tb->fastreuseport > 0 &&
@@ -674,7 +679,7 @@ void inet_csk_destroy_sock(struct sock *
 
 	sk_refcnt_debug_release(sk);
 
-	percpu_counter_dec(sk->sk_prot->orphan_count);
+	ub_dec_orphan_count(sk);
 	sock_put(sk);
 }
 
@@ -754,7 +759,7 @@ void inet_csk_listen_stop(struct sock *s
 
 		sock_orphan(child);
 
-		percpu_counter_inc(sk->sk_prot->orphan_count);
+		ub_inc_orphan_count(sk);
 
 		inet_csk_destroy_sock(child);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/inet_diag.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/inet_diag.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/inet_diag.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/inet_diag.c	2015-01-21 12:02:45.364169559 +0300
@@ -705,6 +705,7 @@ static int inet_diag_dump(struct sk_buff
 	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
 	const struct inet_diag_handler *handler;
 	struct inet_hashinfo *hashinfo;
+	struct ve_struct *ve = get_exec_env();
 
 	handler = inet_diag_lock_handler(cb->nlh->nlmsg_type);
 	if (IS_ERR(handler))
@@ -730,6 +731,8 @@ static int inet_diag_dump(struct sk_buff
 			sk_nulls_for_each(sk, node, &ilb->head) {
 				struct inet_sock *inet = inet_sk(sk);
 
+				if (!ve_accessible(sk->owner_env, ve))
+					continue;
 				if (num < s_num) {
 					num++;
 					continue;
@@ -796,6 +799,8 @@ skip_listen_ht:
 		sk_nulls_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 
+			if (!ve_accessible(sk->owner_env, ve))
+				continue;
 			if (num < s_num)
 				goto next_normal;
 			if (!(r->idiag_states & (1 << sk->sk_state)))
@@ -820,6 +825,8 @@ next_normal:
 			inet_twsk_for_each(tw, node,
 				    &head->twchain) {
 
+				if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve)))
+					continue;
 				if (num < s_num)
 					goto next_dying;
 				if (r->id.idiag_sport != tw->tw_sport &&
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/inet_fragment.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/inet_fragment.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/inet_fragment.c	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/inet_fragment.c	2015-01-21 12:02:45.365169532 +0300
@@ -19,6 +19,7 @@
 #include <linux/random.h>
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
+#include <linux/sched.h>
 
 #include <net/inet_frag.h>
 
@@ -293,6 +294,10 @@ static struct inet_frag_queue *inet_frag
 		return NULL;
 
 	q->net = nf;
+#ifdef CONFIG_VE
+	q->owner_ve = get_exec_env();
+#endif
+
 	f->constructor(q, arg);
 	add_frag_mem_limit(q, f->qsize);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/inet_timewait_sock.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/inet_timewait_sock.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/inet_timewait_sock.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/inet_timewait_sock.c	2015-01-21 12:02:51.272012718 +0300
@@ -14,6 +14,7 @@
 #include <net/inet_timewait_sock.h>
 #include <net/ip.h>
 
+#include <bc/sock_orphan.h>
 
 /*
  * unhash a timewait socket from established hash
@@ -154,9 +155,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance)
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
 {
-	struct inet_timewait_sock *tw =
-		kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
-				 GFP_ATOMIC);
+	struct user_beancounter *ub;
+	struct inet_timewait_sock *tw;
+
+	ub = set_exec_ub(sock_bc(sk)->ub);
+	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+			GFP_ATOMIC);
+	(void)set_exec_ub(ub);
+
 	if (tw != NULL) {
 		const struct inet_sock *inet = inet_sk(sk);
 
@@ -213,6 +219,7 @@ static int inet_twdr_do_twkill_work(stru
 rescan:
 	inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
 		__inet_twsk_del_dead_node(tw);
+		ub_timewait_dec(tw, twdr);
 		spin_unlock(&twdr->death_lock);
 		__inet_twsk_kill(tw, twdr->hashinfo);
 #ifdef CONFIG_NET_NS
@@ -313,6 +320,7 @@ void inet_twsk_deschedule(struct inet_ti
 {
 	spin_lock(&twdr->death_lock);
 	if (inet_twsk_del_dead_node(tw)) {
+		ub_timewait_dec(tw, twdr);
 		inet_twsk_put(tw);
 		if (--twdr->tw_count == 0)
 			del_timer(&twdr->tw_timer);
@@ -359,9 +367,10 @@ void inet_twsk_schedule(struct inet_time
 	spin_lock(&twdr->death_lock);
 
 	/* Unlink it, if it was scheduled */
-	if (inet_twsk_del_dead_node(tw))
+	if (inet_twsk_del_dead_node(tw)) {
+		ub_timewait_dec(tw, twdr);
 		twdr->tw_count--;
-	else
+	} else
 		atomic_inc(&tw->tw_refcnt);
 
 	if (slot >= INET_TWDR_RECYCLE_SLOTS) {
@@ -397,6 +406,7 @@ void inet_twsk_schedule(struct inet_time
 
 	hlist_add_head(&tw->tw_death_node, list);
 
+	ub_timewait_inc(tw, twdr);
 	if (twdr->tw_count++ == 0)
 		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
 	spin_unlock(&twdr->death_lock);
@@ -431,6 +441,7 @@ void inet_twdr_twcal_tick(unsigned long 
 						       &twdr->twcal_row[slot]) {
 				__inet_twsk_del_dead_node(tw);
 				__inet_twsk_kill(tw, twdr->hashinfo);
+				ub_timewait_dec(tw, twdr);
 #ifdef CONFIG_NET_NS
 				NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
 #endif
@@ -465,37 +476,48 @@ out:
 
 EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
 
-void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
+void inet_twsk_purge(struct inet_hashinfo *hashinfo,
 		     struct inet_timewait_death_row *twdr, int family)
 {
 	struct inet_timewait_sock *tw;
 	struct sock *sk;
 	struct hlist_nulls_node *node;
-	int h;
+	unsigned int slot;
 
-	local_bh_disable();
-	for (h = 0; h < (hashinfo->ehash_size); h++) {
-		struct inet_ehash_bucket *head =
-			inet_ehash_bucket(hashinfo, h);
-		spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
+	for (slot = 0; slot < hashinfo->ehash_size; slot++) {
+		struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+		rcu_read_lock();
 restart:
-		spin_lock(lock);
-		sk_nulls_for_each(sk, node, &head->twchain) {
-
+		sk_nulls_for_each_rcu(sk, node, &head->twchain) {
 			tw = inet_twsk(sk);
-			if (!net_eq(twsk_net(tw), net) ||
-			    tw->tw_family != family)
+			if ((tw->tw_family != family) ||
+				atomic_read(&twsk_net(tw)->count))
+				continue;
+
+			if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
 				continue;
 
-			atomic_inc(&tw->tw_refcnt);
-			spin_unlock(lock);
+			if (unlikely((tw->tw_family != family) ||
+				     atomic_read(&twsk_net(tw)->count))) {
+				inet_twsk_put(tw);
+				goto restart;
+			}
+
+			rcu_read_unlock();
+			local_bh_disable();
 			inet_twsk_deschedule(tw, twdr);
+			local_bh_enable();
 			inet_twsk_put(tw);
-
-			goto restart;
+			goto restart_rcu;
 		}
-		spin_unlock(lock);
+		/* If the nulls value we got at the end of this lookup is
+		 * not the expected one, we must restart lookup.
+		 * We probably met an item that was moved to another chain.
+		 */
+		if (get_nulls_value(node) != slot)
+			goto restart;
+		rcu_read_unlock();
 	}
-	local_bh_enable();
 }
 EXPORT_SYMBOL_GPL(inet_twsk_purge);
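The inet_twsk_purge() rewrite above replaces the per-bucket spinlock walk with a lockless RCU traversal over nulls lists: each chain terminates in a marker that encodes its bucket, and if the walk ends on a marker for a different slot, an item was moved to another chain under the reader, so the lookup restarts — that is the get_nulls_value(node) != slot test. A single-threaded C model of that end-marker check (the tagged-pointer encoding below is illustrative, not the kernel's hlist_nulls layout):

#include <stdio.h>
#include <stdint.h>

/* Tagged pointer: odd values are "nulls" markers encoding the bucket slot. */
#define make_nulls(slot)	((void *)(((uintptr_t)(slot) << 1) | 1))
#define is_nulls(p)		((uintptr_t)(p) & 1)
#define nulls_value(p)		((unsigned)((uintptr_t)(p) >> 1))

struct node { int key; void *next; };

/* Walk one chain; if we fall off onto a foreign nulls marker, the item we
 * were on migrated to another chain mid-walk, so the caller must restart. */
static struct node *lookup(void *head, unsigned slot, int key, int *restart)
{
	void *p;

	for (p = head; !is_nulls(p); p = ((struct node *)p)->next)
		if (((struct node *)p)->key == key)
			return p;
	*restart = (nulls_value(p) != slot);
	return NULL;
}

int main(void)
{
	struct node n2 = { .key = 2, .next = make_nulls(0) };
	struct node n1 = { .key = 1, .next = &n2 };
	int restart = 0;
	struct node *hit = lookup(&n1, 0, 2, &restart);

	printf("found=%d restart=%d\n", hit ? hit->key : -1, restart);

	/* Simulate a concurrent move: chain 0 now ends in chain 5's marker. */
	n2.next = make_nulls(5);
	hit = lookup(&n1, 0, 99, &restart);
	printf("found=%d restart=%d\n", hit ? hit->key : -1, restart);
	return 0;
}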
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_forward.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_forward.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_forward.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_forward.c	2015-01-21 12:02:45.452167223 +0300
@@ -94,6 +94,24 @@ int ip_forward(struct sk_buff *skb)
 		goto drop;
 	}
 
+	/*
+	 * We try to optimize forwarding of VE packets:
+	 * we do not decrement the TTL (and so save an skb_cow)
+	 * when forwarding outgoing packets from a VE.
+	 * For incoming packets we still decrement the TTL,
+	 * since such an skb is not cloned and so requires
+	 * no actual cow. Thus there is at least one place on
+	 * the packet path with a mandatory TTL decrement, which
+	 * is sufficient to prevent routing loops.
+	 */
+	iph = ip_hdr(skb);
+	if (
+#ifdef CONFIG_IP_ROUTE_NAT
+	    (rt->rt_flags & RTCF_NAT) == 0 &&	  /* no NAT mangling expected */
+#endif						  /* and */
+	    (skb->dev->vz_features & NETIF_F_VENET)) /* src is VENET device */
+		goto no_ttl_decr;
+
 	/* We are about to mangle packet. Copy it! */
 	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
 		goto drop;
@@ -102,6 +120,8 @@ int ip_forward(struct sk_buff *skb)
 	/* Decrease ttl after skb cow done */
 	ip_decrease_ttl(iph);
 
+no_ttl_decr:
+
 	/*
 	 *	We now generate an ICMP HOST REDIRECT giving the route
 	 *	we calculated.
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_fragment.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_fragment.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_fragment.c	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_fragment.c	2015-01-21 12:02:45.365169532 +0300
@@ -179,10 +179,13 @@ static void ip_evictor(struct net *net)
  */
 static void ip_expire(unsigned long arg)
 {
+	struct inet_frag_queue *q = (struct inet_frag_queue *)arg;
 	struct ipq *qp;
 	struct net *net;
+	struct ve_struct *old_ve;
 
-	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+	qp = container_of(q, struct ipq, q);
+	old_ve = set_exec_env(q->owner_ve);
 	net = container_of(qp->q.net, struct net, ipv4.frags);
 
 	spin_lock(&qp->q.lock);
@@ -230,6 +233,8 @@ out_put:
 out:
 	spin_unlock(&qp->q.lock);
 	ipq_put(qp);
+
+	(void)set_exec_env(old_ve);
 }
 
 /* Find the correct entry in the "incomplete datagrams" queue for
@@ -559,6 +564,7 @@ static int ip_frag_reasm(struct ipq *qp,
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
 		add_frag_mem_limit(&qp->q, clone->truesize);
+		clone->owner_env = head->owner_env;
 	}
 
 	skb_shinfo(head)->frag_list = head->next;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_gre.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_gre.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_gre.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_gre.c	2015-01-21 12:02:51.219014124 +0300
@@ -51,6 +51,9 @@
 #include <net/ip6_route.h>
 #endif
 
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
 /*
    Problems & solutions
    --------------------
@@ -109,6 +112,7 @@
  */
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
+static struct rtnl_link_ops ipgre_tap_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
 
 static int ipgre_net_id __read_mostly;
@@ -445,6 +449,47 @@ static const struct ethtool_ops gre_dev_
 	.get_tso	= ethtool_op_get_tso,
 };
 
+static void ipgre_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx)
+{
+	struct cpt_tunnel_image v;
+	struct ip_tunnel *t;
+	struct ip_tunnel_net *itn;
+	int net_id;
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_VOID;
+
+	if (dev->rtnl_link_ops == &ipgre_link_ops) {
+		net_id = ipgre_net_id;
+		v.cpt_tnl_flags = CPT_TUNNEL_GRE;
+	} else if (dev->rtnl_link_ops == &ipgre_tap_ops) {
+		net_id = gre_tap_net_id;
+		v.cpt_tnl_flags = CPT_TUNNEL_GRE_TAP;
+	} else BUG();
+
+	/* mark fb dev */
+	itn = net_generic(get_exec_env()->ve_netns, net_id);
+	if (dev == itn->fb_tunnel_dev)
+		v.cpt_tnl_flags |= CPT_TUNNEL_FBDEV;
+
+	t = netdev_priv(dev);
+	v.cpt_i_flags = t->parms.i_flags;
+	v.cpt_o_flags = t->parms.o_flags;
+	v.cpt_i_key = t->parms.i_key;
+	v.cpt_o_key = t->parms.o_key;
+	v.cpt_i_seqno = t->i_seqno;
+	v.cpt_o_seqno = t->o_seqno;
+	v.cpt_link = dev->iflink;
+
+	BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+	memcpy(&v.cpt_iphdr, &t->parms.iph, sizeof(t->parms.iph));
+
+	ops->write(&v, sizeof(v), ctx);
+}
+
 static const struct net_device_ops ipgre_netdev_ops = {
 	.ndo_init		= ipgre_tunnel_init,
 	.ndo_uninit		= ip_tunnel_uninit,
@@ -455,6 +500,7 @@ static const struct net_device_ops ipgre
 	.ndo_start_xmit		= ipgre_xmit,
 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
 	.ndo_change_mtu		= ip_tunnel_change_mtu,
+	.ndo_cpt		= ipgre_cpt,
 };
 
 #define GRE_FEATURES (NETIF_F_SG |		\
@@ -466,8 +512,46 @@ static void ipgre_tunnel_setup(struct ne
 {
 	dev->netdev_ops		= &ipgre_netdev_ops;
 	ip_tunnel_setup(dev, ipgre_net_id);
+	dev->vz_features |= NETIF_F_VIRTUAL;
 }
 
+static int ipgre_rst(loff_t start, struct cpt_netdev_image *di,
+		struct rst_ops *ops, struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_tunnel_image v;
+	loff_t pos;
+	struct net *net = get_exec_env()->ve_netns;
+	struct ip_tunnel_net *itn;
+
+	pos = start + di->cpt_hdrlen;
+	err = ops->get_object(CPT_OBJ_NET_IPIP_TUNNEL,
+			pos, &v, sizeof(v), ctx);
+	if (err)
+		return err;
+
+	/* some sanity */
+	if (v.cpt_content != CPT_CONTENT_VOID)
+		return -EINVAL;
+
+	if (v.cpt_tnl_flags & CPT_TUNNEL_GRE)
+		itn = net_generic(net, ipgre_net_id);
+	else if (v.cpt_tnl_flags & CPT_TUNNEL_GRE_TAP)
+		itn = net_generic(net, gre_tap_net_id);
+	else
+		return 1;
+
+	if (itn == NULL)
+		return -EOPNOTSUPP;
+
+	return ip_tunnel_rst(net, itn, &v, di->cpt_name);
+}
+
+static struct netdev_rst ipgre_netdev_rst = {
+	.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL,
+	.ndo_rst = ipgre_rst,
+};
+
 static void __gre_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel;
@@ -550,16 +634,19 @@ free:
 	return err;
 }
 
-static void ipgre_exit_net(struct net *net)
+static void __net_exit ipgre_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
 	ip_tunnel_delete_net(itn);
+	net_assign_generic(net, ipgre_net_id, NULL);
 	kfree(itn);
 }
 
 static struct pernet_operations ipgre_net_ops = {
 	.init = ipgre_init_net,
 	.exit = ipgre_exit_net,
+	.id   = &ipgre_net_id,
+	.size = sizeof(struct ip_tunnel_net),
 };
 
 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -659,6 +746,7 @@ static const struct net_device_ops gre_t
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_change_mtu		= ip_tunnel_change_mtu,
+	.ndo_cpt		= ipgre_cpt,
 };
 
 static void ipgre_tap_setup(struct net_device *dev)
@@ -666,6 +754,7 @@ static void ipgre_tap_setup(struct net_d
 	ether_setup(dev);
 	dev->netdev_ops		= &gre_tap_netdev_ops;
 	ip_tunnel_setup(dev, gre_tap_net_id);
+	dev->vz_features |= NETIF_F_VIRTUAL;
 }
 
 static int ipgre_newlink(struct net_device *dev,
@@ -802,12 +891,15 @@ static void __net_exit ipgre_tap_exit_ne
 {
 	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
 	ip_tunnel_delete_net(itn);
+	net_assign_generic(net, gre_tap_net_id, NULL);
 	kfree(itn);
 }
 
 static struct pernet_operations ipgre_tap_net_ops = {
 	.init = ipgre_tap_init_net,
 	.exit = ipgre_tap_exit_net,
+	.id   = &gre_tap_net_id,
+	.size = sizeof(struct ip_tunnel_net),
 };
 
 static int __init ipgre_init(void)
@@ -816,11 +908,11 @@ static int __init ipgre_init(void)
 
 	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
 
-	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
+	err = register_pernet_device(&ipgre_net_ops);
 	if (err < 0)
 		goto out;
 
-	err = register_pernet_gen_device(&gre_tap_net_id, &ipgre_tap_net_ops);
+	err = register_pernet_device(&ipgre_tap_net_ops);
 	if (err < 0)
 		goto pnet_tap_faied;
 
@@ -839,6 +931,8 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		goto tap_ops_failed;
 
+	register_netdev_rst(&ipgre_netdev_rst);
+
 	return 0;
 
 tap_ops_failed:
@@ -846,18 +940,19 @@ tap_ops_failed:
 rtnl_link_failed:
 	gre_cisco_unregister(&ipgre_protocol);
 add_proto_failed:
-	unregister_pernet_gen_device(gre_tap_net_id, &ipgre_tap_net_ops);
+	unregister_pernet_device(&ipgre_tap_net_ops);
 pnet_tap_faied:
-	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
+	unregister_pernet_device(&ipgre_net_ops);
 out:
 	return err;
 }
 
 static void __exit ipgre_fini(void)
 {
+	unregister_netdev_rst(&ipgre_netdev_rst);
 	gre_cisco_unregister(&ipgre_protocol);
-	unregister_pernet_gen_device(gre_tap_net_id, &ipgre_tap_net_ops);
-	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
+	unregister_pernet_device(&ipgre_tap_net_ops);
+	unregister_pernet_device(&ipgre_net_ops);
 	rtnl_link_unregister(&ipgre_tap_ops);
 	rtnl_link_unregister(&ipgre_link_ops);
 }
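The ndo_cpt/ndo_rst pair added to ip_gre.c above checkpoints a tunnel into a self-describing cpt_tunnel_image record and validates it on restore (object type, header length, content kind) before recreating the device. A toy serialize/validate round trip in the same spirit; the struct and constants below only mimic the cpt_* framing and are not the real image layout:

#include <stdio.h>
#include <string.h>

struct tunnel_image {
	unsigned cpt_object;	/* record type */
	unsigned cpt_hdrlen;	/* lets a reader skip an unknown tail */
	unsigned cpt_i_key, cpt_o_key;
};

#define OBJ_TUNNEL 7

static void cpt(const struct tunnel_image *v, unsigned char *buf)
{
	memcpy(buf, v, sizeof(*v));
}

static int rst(const unsigned char *buf, struct tunnel_image *v)
{
	memcpy(v, buf, sizeof(*v));
	/* some sanity, as ipgre_rst() does before trusting the image */
	if (v->cpt_object != OBJ_TUNNEL || v->cpt_hdrlen < sizeof(*v))
		return -1;
	return 0;
}

int main(void)
{
	unsigned char buf[64];
	struct tunnel_image out, in = {
		.cpt_object = OBJ_TUNNEL, .cpt_hdrlen = sizeof(in),
		.cpt_i_key = 0x1234, .cpt_o_key = 0x5678,
	};

	cpt(&in, buf);
	if (!rst(buf, &out))
		printf("restored keys %#x/%#x\n", out.cpt_i_key, out.cpt_o_key);
	return 0;
}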
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_input.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_input.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_input.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_input.c	2015-01-21 12:02:45.365169532 +0300
@@ -193,6 +193,8 @@ static int ip_local_deliver_finish(struc
 {
 	struct net *net = dev_net(skb->dev);
 
+	if (skb->destructor)
+		skb_orphan(skb);
 	__skb_pull(skb, ip_hdrlen(skb));
 
 	/* Point into the IP datagram, just past the header. */
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_options.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_options.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_options.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_options.c	2015-01-21 12:02:42.490245859 +0300
@@ -327,7 +327,7 @@ int ip_options_compile(struct net *net,
 					pp_ptr = optptr + 2;
 					goto error;
 				}
-				if (skb) {
+				if (rt) {
 					memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
 					opt->is_changed = 1;
 				}
@@ -369,7 +369,7 @@ int ip_options_compile(struct net *net,
 						goto error;
 					}
 					opt->ts = optptr - iph;
-					if (skb) {
+					if (rt)  {
 						memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
 						timeptr = (__be32*)&optptr[optptr[2]+3];
 					}
@@ -599,7 +599,7 @@ int ip_options_rcv_srr(struct sk_buff *s
 	struct rtable *rt2;
 	int err;
 
-	if (!opt->srr)
+	if (!opt->srr || !rt)
 		return 0;
 
 	if (skb->pkt_type != PACKET_HOST)
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_output.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_output.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_output.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_output.c	2015-01-21 12:02:58.043832962 +0300
@@ -317,6 +317,7 @@ int ip_output(struct sk_buff *skb)
 			    ip_finish_output,
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
+EXPORT_SYMBOL(ip_output);
 
 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 {
@@ -459,6 +460,7 @@ int ip_fragment(struct sk_buff *skb, int
 	__be16 not_last_frag;
 	struct rtable *rt = skb_rtable(skb);
 	int err = 0;
+	u8 brmark = skb_get_brmark(skb);
 
 	dev = rt->u.dst.dev;
 
@@ -558,6 +560,7 @@ int ip_fragment(struct sk_buff *skb, int
 				ip_send_check(iph);
 			}
 
+			skb_set_brmark(skb, brmark);
 			err = output(skb);
 
 			if (!err)
@@ -692,6 +695,7 @@ slow_path:
 
 		ip_send_check(iph);
 
+		skb_set_brmark(skb2, brmark);
 		err = output(skb2);
 		if (err)
 			goto fail;
@@ -1481,12 +1485,13 @@ void ip_send_reply(struct sock *sk, stru
 		char			data[40];
 	} replyopts;
 	struct ipcm_cookie ipc;
-	__be32 daddr;
+	__be32 saddr, daddr;
 	struct rtable *rt = skb_rtable(skb);
 
 	if (ip_options_echo(&replyopts.opt, skb))
 		return;
 
+	saddr = ip_hdr(skb)->daddr;
 	daddr = ipc.addr = rt->rt_src;
 	ipc.opt = NULL;
 	ipc.shtx.flags = 0;
@@ -1504,7 +1509,7 @@ void ip_send_reply(struct sock *sk, stru
 		struct flowi fl = { .oif = arg->bound_dev_if,
 				    .nl_u = { .ip4_u =
 					      { .daddr = daddr,
-						.saddr = rt->rt_spec_dst,
+						.saddr = saddr,
 						.tos = RT_TOS(ip_hdr(skb)->tos) } },
 				    /* Not quite clean, but right. */
 				    .uli_u = { .ports =
@@ -1528,8 +1533,11 @@ void ip_send_reply(struct sock *sk, stru
 	sk->sk_priority = skb->priority;
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
-	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
-		       &ipc, &rt, MSG_DONTWAIT);
+	if (ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
+			   &ipc, &rt, MSG_DONTWAIT)) {
+		ip_flush_pending_frames(sk);
+		goto out;
+	}
 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
 		if (arg->csumoffset >= 0)
 			*((__sum16 *)skb_transport_header(skb) +
@@ -1538,7 +1546,7 @@ void ip_send_reply(struct sock *sk, stru
 		skb->ip_summed = CHECKSUM_NONE;
 		ip_push_pending_frames(sk);
 	}
-
+out:
 	bh_unlock_sock(sk);
 
 	ip_rt_put(rt);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_sockglue.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_sockglue.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_sockglue.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_sockglue.c	2015-01-21 12:02:45.748159365 +0300
@@ -954,7 +954,7 @@ mc_msf_out:
 	case IP_IPSEC_POLICY:
 	case IP_XFRM_POLICY:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			break;
 		err = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_tunnel.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_tunnel.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ip_tunnel.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ip_tunnel.c	2015-01-21 12:02:51.146016064 +0300
@@ -41,6 +41,7 @@
 #include <linux/if_vlan.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
+#include <linux/cpt_image.h>
 
 #include <net/sock.h>
 #include <net/ip.h>
@@ -370,6 +371,37 @@ static struct ip_tunnel *ip_tunnel_creat
 	return nt;
 }
 
+int ip_tunnel_rst(struct net *net, struct ip_tunnel_net *itn,
+		  const struct cpt_tunnel_image *v, const char *name)
+{
+	struct ip_tunnel_parm p;
+	struct ip_tunnel *nt;
+
+	strcpy(p.name, name);
+	p.link = 0;
+	if (cpt_object_has(v, cpt_link))
+		p.link = v->cpt_link;
+	p.i_flags = v->cpt_i_flags;
+	p.o_flags = v->cpt_o_flags;
+	p.i_key = v->cpt_i_key;
+	p.o_key = v->cpt_o_key;
+
+	BUILD_BUG_ON(sizeof(v->cpt_iphdr) != sizeof(p.iph));
+	memcpy(&p.iph, &v->cpt_iphdr, sizeof(p.iph));
+
+	if (v->cpt_tnl_flags & CPT_TUNNEL_FBDEV)
+		nt = netdev_priv(itn->fb_tunnel_dev);
+	else
+		nt = ip_tunnel_create(net, itn, &p);
+	if (!nt)
+		return -ENOMEM;
+
+	nt->i_seqno = v->cpt_i_seqno;
+	nt->o_seqno = v->cpt_o_seqno;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_rst);
+
 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
 {
 	if (INET_ECN_is_ce(iph->tos)) {
@@ -688,7 +720,7 @@ int ip_tunnel_ioctl(struct net_device *d
 	case SIOCADDTUNNEL:
 	case SIOCCHGTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			goto done;
 		if (p->iph.ttl)
 			p->iph.frag_off |= htons(IP_DF);
@@ -739,7 +771,7 @@ int ip_tunnel_ioctl(struct net_device *d
 
 	case SIOCDELTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			goto done;
 
 		if (dev == itn->fb_tunnel_dev) {
@@ -786,7 +818,7 @@ static void ip_tunnel_dev_free(struct ne
 	free_netdev(dev);
 }
 
-void ip_tunnel_dellink(struct net_device *dev)
+void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct net *net = dev_net(dev);
 	struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -796,7 +828,7 @@ void ip_tunnel_dellink(struct net_device
 
 	if (itn->fb_tunnel_dev != dev) {
 		ip_tunnel_del(netdev_priv(dev));
-		unregister_netdevice(dev);
+		unregister_netdevice_queue(dev, head);
 	}
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ipconfig.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ipconfig.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ipconfig.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ipconfig.c	2015-01-21 12:02:45.366169505 +0300
@@ -192,19 +192,20 @@ static int __init ic_open_devs(void)
 	struct ic_device *d, **last;
 	struct net_device *dev;
 	unsigned short oflags;
+	struct net *net = get_exec_env()->ve_netns;
 
 	last = &ic_first_dev;
 	rtnl_lock();
 
 	/* bring loopback device up first */
-	for_each_netdev(&init_net, dev) {
+	for_each_netdev(net, dev) {
 		if (!(dev->flags & IFF_LOOPBACK))
 			continue;
 		if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
 			printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
 	}
 
-	for_each_netdev(&init_net, dev) {
+	for_each_netdev(net, dev) {
 		if (dev->flags & IFF_LOOPBACK)
 			continue;
 		if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
@@ -459,9 +460,6 @@ ic_rarp_recv(struct sk_buff *skb, struct
 	unsigned char *sha, *tha;		/* s for "source", t for "target" */
 	struct ic_device *d;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		goto drop;
-
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
 		return NET_RX_DROP;
 
@@ -885,9 +883,6 @@ static int __init ic_bootp_recv(struct s
 	struct ic_device *d;
 	int len, ext_len;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		goto drop;
-
 	/* Perform verifications before taking the lock.  */
 	if (skb->pkt_type == PACKET_OTHERHOST)
 		goto drop;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ipip.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ipip.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ipip.c	2014-12-12 23:29:29.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ipip.c	2015-01-21 12:02:51.238013621 +0300
@@ -106,6 +106,7 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <linux/vzcalluser.h>
 
 #include <net/sock.h>
 #include <net/ip.h>
@@ -116,6 +117,9 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
 #define HASH_SIZE  16
 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 
@@ -144,6 +148,9 @@ static struct ip_tunnel * ipip_tunnel_lo
 	struct ip_tunnel *t;
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 
+	if (ipn == NULL)
+		return NULL;
+
 	for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) {
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
@@ -609,7 +616,7 @@ ipip_tunnel_ioctl (struct net_device *de
 	case SIOCADDTUNNEL:
 	case SIOCCHGTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			goto done;
 
 		err = -EFAULT;
@@ -668,7 +675,7 @@ ipip_tunnel_ioctl (struct net_device *de
 
 	case SIOCDELTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			goto done;
 
 		if (dev == ipn->fb_tunnel_dev) {
@@ -703,11 +710,14 @@ static int ipip_tunnel_change_mtu(struct
 	return 0;
 }
 
+static void ipip_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx);
 static const struct net_device_ops ipip_netdev_ops = {
 	.ndo_uninit	= ipip_tunnel_uninit,
 	.ndo_start_xmit	= ipip_tunnel_xmit,
 	.ndo_do_ioctl	= ipip_tunnel_ioctl,
 	.ndo_change_mtu	= ipip_tunnel_change_mtu,
+	.ndo_cpt	= ipip_cpt,
 
 };
 
@@ -765,25 +775,133 @@ static struct xfrm_tunnel ipip_handler =
 static const char banner[] __initconst =
 	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
 
-static void ipip_destroy_tunnels(struct ipip_net *ipn)
+static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
 {
 	int prio;
 
 	for (prio = 1; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t;
-			while ((t = ipn->tunnels[prio][h]) != NULL)
-				unregister_netdevice(t->dev);
+			struct ip_tunnel *t = ipn->tunnels[prio][h];
+
+			while (t != NULL) {
+				unregister_netdevice_queue(t->dev, head);
+				t = t->next;
+			}
 		}
 	}
 }
 
+static void ipip_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx)
+{
+	struct cpt_tunnel_image v;
+	struct ip_tunnel *t;
+	struct ipip_net *ipn;
+
+	t = netdev_priv(dev);
+	ipn = net_generic(get_exec_env()->ve_netns, ipip_net_id);
+	BUG_ON(ipn == NULL);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_VOID;
+
+	/* mark fb dev */
+	v.cpt_tnl_flags = 0;
+	if (dev == ipn->fb_tunnel_dev)
+		v.cpt_tnl_flags |= CPT_TUNNEL_FBDEV;
+
+	v.cpt_i_flags = t->parms.i_flags;
+	v.cpt_o_flags = t->parms.o_flags;
+	v.cpt_i_key = t->parms.i_key;
+	v.cpt_o_key = t->parms.o_key;
+
+	BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+	memcpy(&v.cpt_iphdr, &t->parms.iph, sizeof(t->parms.iph));
+
+	ops->write(&v, sizeof(v), ctx);
+}
+
+static int ipip_rst(loff_t start, struct cpt_netdev_image *di,
+		struct rst_ops *ops, struct cpt_context *ctx)
+{
+	int err = -ENODEV;
+	struct cpt_tunnel_image v;
+	struct net_device *dev;
+	struct ip_tunnel *t;
+	loff_t pos;
+	int fbdev;
+	struct ipip_net *ipn;
+
+	ipn = net_generic(get_exec_env()->ve_netns, ipip_net_id);
+	if (ipn == NULL)
+		return -EOPNOTSUPP;
+
+	pos = start + di->cpt_hdrlen;
+	err = ops->get_object(CPT_OBJ_NET_IPIP_TUNNEL,
+			pos, &v, sizeof(v), ctx);
+	if (err)
+		return err;
+
+	/* some sanity */
+	if (v.cpt_content != CPT_CONTENT_VOID)
+		return -EINVAL;
+
+	if (v.cpt_tnl_flags & (~CPT_TUNNEL_FBDEV))
+		return 1;
+
+	if (v.cpt_tnl_flags & CPT_TUNNEL_FBDEV) {
+		fbdev = 1;
+		err = 0;
+		dev = ipn->fb_tunnel_dev;
+	} else {
+		fbdev = 0;
+		err = -ENOMEM;
+		dev = alloc_netdev(sizeof(struct ip_tunnel), di->cpt_name,
+				ipip_tunnel_setup);
+		if (!dev)
+			goto out;
+	}
+
+	t = netdev_priv(dev);
+	t->parms.i_flags = v.cpt_i_flags;
+	t->parms.o_flags = v.cpt_o_flags;
+	t->parms.i_key = v.cpt_i_key;
+	t->parms.o_key = v.cpt_o_key;
+
+	BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+	memcpy(&t->parms.iph, &v.cpt_iphdr, sizeof(t->parms.iph));
+
+	if (!fbdev) {
+		ipip_tunnel_init(dev);
+		err = register_netdevice(dev);
+		if (err) {
+			free_netdev(dev);
+			goto out;
+		}
+
+		dev_hold(dev);
+		ipip_tunnel_link(ipn, t);
+	}
+out:
+	return err;
+}
+
+static struct netdev_rst ipip_netdev_rst = {
+	.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL,
+	.ndo_rst = ipip_rst,
+};
+
 static int ipip_init_net(struct net *net)
 {
 	int err;
 	struct ipip_net *ipn;
 
+	if (!(get_exec_env()->features & VE_FEATURE_IPIP))
+		return net_assign_generic(net, ipip_net_id, NULL);
+
 	err = -ENOMEM;
 	ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL);
 	if (ipn == NULL)
@@ -827,18 +945,25 @@ err_alloc:
 static void ipip_exit_net(struct net *net)
 {
 	struct ipip_net *ipn;
+	LIST_HEAD(list);
 
 	ipn = net_generic(net, ipip_net_id);
+	if (ipn == NULL) /* no VE_FEATURE_IPIP */
+		return;
+
 	rtnl_lock();
-	ipip_destroy_tunnels(ipn);
-	unregister_netdevice(ipn->fb_tunnel_dev);
+	ipip_destroy_tunnels(ipn, &list);
+	unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
+	unregister_netdevice_many(&list);
 	rtnl_unlock();
+	net_assign_generic(net, ipip_net_id, NULL);
 	kfree(ipn);
 }
 
 static struct pernet_operations ipip_net_ops = {
 	.init = ipip_init_net,
 	.exit = ipip_exit_net,
+	.id = &ipip_net_id,
 };
 
 static int __init ipip_init(void)
@@ -847,23 +972,26 @@ static int __init ipip_init(void)
 
 	printk(banner);
 
-	err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops);
+	err = register_pernet_device(&ipip_net_ops);
 	if (err < 0)
 		return err;
 	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
 	if (err < 0) {
-		unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops);
+		unregister_pernet_device(&ipip_net_ops);
 		printk(KERN_INFO "ipip init: can't register tunnel\n");
-	}
+	} else
+		register_netdev_rst(&ipip_netdev_rst);
+
 	return err;
 }
 
 static void __exit ipip_fini(void)
 {
+	unregister_netdev_rst(&ipip_netdev_rst);
 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
 		printk(KERN_INFO "ipip close: can't deregister tunnel\n");
 
-	unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops);
+	unregister_pernet_device(&ipip_net_ops);
 }
 
 module_init(ipip_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/ipmr.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ipmr.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/ipmr.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/ipmr.c	2015-01-21 12:02:51.165015558 +0300
@@ -239,13 +239,9 @@ static int __net_init ipmr_rules_init(st
 	struct mr_table *mrt;
 	int err;
 
-	ops = kmemdup(&ipmr_rules_ops_template, sizeof(*ops), GFP_KERNEL);
-	if (ops == NULL)
-		return -ENOMEM;
-	INIT_LIST_HEAD(&ops->rules_list);
-	ops->fro_net = net;
-
-	fib_rules_register(ops);
+	ops = fib_rules_register(&ipmr_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
 
 	INIT_LIST_HEAD(&net->ipv4.mr_tables);
 
@@ -266,7 +262,6 @@ err2:
 	kfree(mrt);
 err1:
 	fib_rules_unregister(ops);
-	kfree(ops);
 	return err;
 }
 
@@ -279,7 +274,6 @@ static void __net_exit ipmr_rules_exit(s
 		ipmr_free_table(mrt);
 	}
 	fib_rules_unregister(net->ipv4.mr_rules_ops);
-	kfree(net->ipv4.mr_rules_ops);
 }
 #else
 #define ipmr_for_each_table(mrt, net) \
@@ -543,7 +537,8 @@ failure:
  *	@notify: Set to 1, if the caller is a notifier_call
  */
 
-static int vif_delete(struct mr_table *mrt, int vifi, int notify)
+static int vif_delete(struct mr_table *mrt, int vifi, int notify,
+		      struct list_head *head)
 {
 	struct vif_device *v;
 	struct net_device *dev;
@@ -587,7 +582,7 @@ static int vif_delete(struct mr_table *m
 	}
 
 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
-		unregister_netdevice(dev);
+		unregister_netdevice_queue(dev, head);
 
 	dev_put(dev);
 	return 0;
@@ -1131,14 +1126,16 @@ static void mroute_clean_tables(struct m
 {
 	int i;
 	struct mfc_cache *c, *next;
+	LIST_HEAD(list);
 
 	/*
 	 *	Shut down all active vif entries
 	 */
 	for (i = 0; i < mrt->maxvif; i++) {
 		if (!(mrt->vif_table[i].flags&VIFF_STATIC))
-			vif_delete(mrt, i, 0);
+			vif_delete(mrt, i, 0, &list);
 	}
+	unregister_netdevice_many(&list);
 
 	/*
 	 *	Wipe the cache
@@ -1249,7 +1246,7 @@ int ip_mroute_setsockopt(struct sock *sk
 		if (optname == MRT_ADD_VIF) {
 			ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
 		} else {
-			ret = vif_delete(mrt, vif.vifc_vifi, 0);
+			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
 		}
 		rtnl_unlock();
 		return ret;
@@ -1444,6 +1441,7 @@ static int ipmr_device_event(struct noti
 	struct mr_table *mrt;
 	struct vif_device *v;
 	int ct;
+	LIST_HEAD(list);
 
 	if (!net_eq(dev_net(dev), net))
 		return NOTIFY_DONE;
@@ -1455,9 +1453,10 @@ static int ipmr_device_event(struct noti
 		v = &mrt->vif_table[0];
 		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
 			if (v->dev == dev)
-				vif_delete(mrt, ct, 1);
+				vif_delete(mrt, ct, 1, &list);
 		}
 	}
+	unregister_netdevice_many(&list);
 	return NOTIFY_DONE;
 }
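The ipmr.c hunks above (like the ipip ones earlier) stop calling unregister_netdevice() per device and instead queue victims with unregister_netdevice_queue(), flushing the whole list in one unregister_netdevice_many() call so the expensive RTNL/RCU synchronization is paid once per batch rather than per device. A small C model of that queue-then-flush shape, with netdev_sk as a stand-in type:

#include <stdio.h>
#include <stdlib.h>

struct netdev_sk { const char *name; struct netdev_sk *next; };

/* Queue a device for later teardown instead of tearing it down inline. */
static void unregister_queue(struct netdev_sk *dev, struct netdev_sk **head)
{
	dev->next = *head;
	*head = dev;
}

/* One pass over the whole batch (LIFO order); in the kernel this is where
 * the shared synchronization cost is amortized across all queued devices. */
static void unregister_many(struct netdev_sk **head)
{
	struct netdev_sk *dev = *head;

	while (dev) {
		struct netdev_sk *next = dev->next;

		printf("tearing down %s\n", dev->name);
		free(dev);
		dev = next;
	}
	*head = NULL;
}

int main(void)
{
	struct netdev_sk *list = NULL;
	const char *names[] = { "tunl1", "tunl2", "tunl3" };
	int i;

	for (i = 0; i < 3; i++) {
		struct netdev_sk *d = malloc(sizeof(*d));

		d->name = names[i];
		unregister_queue(d, &list);
	}
	unregister_many(&list);
	return 0;
}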
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/arp_tables.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/arp_tables.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/arp_tables.c	2014-12-12 23:29:05.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/arp_tables.c	2015-01-21 12:02:58.555819374 +0300
@@ -24,6 +24,7 @@
 #include <net/compat.h>
 #include <net/sock.h>
 #include <asm/uaccess.h>
+#include <linux/fence-watchdog.h>
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp/arp_tables.h>
@@ -109,6 +110,14 @@ static inline int arp_packet_match(const
 
 #define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	if (FWINV((arpinfo->flags & ARPT_WDOGTMO) && !fence_wdog_tmo_match(),
+		  ARPT_INV_WDOGTMO)) {
+		dprintf("Watchdog timeout mismatch.\n");
+		return 0;
+	}
+#endif
+
 	if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
 		  ARPT_INV_ARPOP)) {
 		dprintf("ARP operation field mismatch.\n");
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ip_queue.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ip_queue.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ip_queue.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ip_queue.c	2015-01-21 12:02:45.436167647 +0300
@@ -409,7 +409,8 @@ ipq_dev_drop(int ifindex)
 static inline void
 __ipq_rcv_skb(struct sk_buff *skb)
 {
-	int status, type, pid, flags, nlmsglen, skblen;
+	int status, type, pid, flags;
+	unsigned int nlmsglen, skblen;
 	struct nlmsghdr *nlh;
 
 	skblen = skb->len;
@@ -437,7 +438,8 @@ __ipq_rcv_skb(struct sk_buff *skb)
 	if (type <= IPQM_BASE)
 		return;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!netlink_capable(skb, CAP_NET_ADMIN) &&
+	    !netlink_capable(skb, CAP_VE_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
 
 	write_lock_bh(&queue_lock);
@@ -467,8 +469,12 @@ __ipq_rcv_skb(struct sk_buff *skb)
 static void
 ipq_rcv_skb(struct sk_buff *skb)
 {
+	struct ve_struct *old_ve;
+
 	mutex_lock(&ipqnl_mutex);
+	old_ve = set_exec_env(skb->owner_env);
 	__ipq_rcv_skb(skb);
+	(void)set_exec_env(old_ve);
 	mutex_unlock(&ipqnl_mutex);
 }
 
@@ -478,9 +484,6 @@ ipq_rcv_dev_event(struct notifier_block 
 {
 	struct net_device *dev = ptr;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		return NOTIFY_DONE;
-
 	/* Drop any packets associated with the downed device */
 	if (event == NETDEV_DOWN)
 		ipq_dev_drop(dev->ifindex);
@@ -500,7 +503,7 @@ ipq_rcv_nl_event(struct notifier_block *
 	if (event == NETLINK_URELEASE &&
 	    n->protocol == NETLINK_FIREWALL && n->pid) {
 		write_lock_bh(&queue_lock);
-		if ((n->net == &init_net) && (n->pid == peer_pid))
+		if (n->pid == peer_pid)
 			__ipq_reset();
 		write_unlock_bh(&queue_lock);
 	}
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ip_tables.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ip_tables.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ip_tables.c	2014-12-12 23:29:05.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ip_tables.c	2015-01-21 12:02:47.740106481 +0300
@@ -321,6 +321,9 @@ ipt_do_table(struct sk_buff *skb,
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
 
+	if (ve_xt_table_forbidden(table))
+		return NF_ACCEPT;
+
 	/* Initialization */
 	ip = ip_hdr(skb);
 	indev = in ? in->name : nulldevname;
@@ -466,8 +469,8 @@ mark_source_chains(struct xt_table_info 
 			int visited = e->comefrom & (1 << hook);
 
 			if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
-				printk("iptables: loop hook %u pos %u %08X.\n",
-				       hook, pos, e->comefrom);
+				ve_printk(VE_LOG, "iptables: loop hook %u pos "
+					"%u %08X.\n", hook, pos, e->comefrom);
 				return 0;
 			}
 			e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
@@ -618,9 +621,10 @@ find_check_match(struct ipt_entry_match 
 	struct xt_match *match;
 	int ret;
 
-	match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
-						      m->u.user.revision),
-					"ipt_%s", m->u.user.name);
+	match = ve0_try_then_request_module(xt_find_match(AF_INET,
+							  m->u.user.name,
+							  m->u.user.revision),
+					    "ipt_%s", m->u.user.name);
 	if (IS_ERR(match) || !match) {
 		duprintf("find_check_match: `%s' not found\n", m->u.user.name);
 		return match ? PTR_ERR(match) : -ENOENT;
@@ -684,10 +688,10 @@ find_check_entry(struct ipt_entry *e, co
 		goto cleanup_matches;
 
 	t = ipt_get_target(e);
-	target = try_then_request_module(xt_find_target(AF_INET,
-							t->u.user.name,
-							t->u.user.revision),
-					 "ipt_%s", t->u.user.name);
+	target = ve0_try_then_request_module(xt_find_target(AF_INET,
+							    t->u.user.name,
+							    t->u.user.revision),
+					     "ipt_%s", t->u.user.name);
 	if (IS_ERR(target) || !target) {
 		duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
 		ret = target ? PTR_ERR(target) : -ENOENT;
@@ -950,7 +954,7 @@ static struct xt_counters * alloc_counte
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc_node(countersize, numa_node_id());
+	counters = ub_vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1127,8 +1131,8 @@ static int get_info(struct net *net, voi
 	if (compat)
 		xt_compat_lock(AF_INET);
 #endif
-	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
-				    "iptable_%s", name);
+	t = ve0_try_then_request_module(xt_find_table_lock(net, AF_INET, name),
+					"iptable_%s", name);
 	if (t && !IS_ERR(t)) {
 		struct ipt_getinfo info;
 		const struct xt_table_info *private = t->private;
@@ -1217,14 +1221,14 @@ __do_replace(struct net *net, const char
 	void *loc_cpu_old_entry;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
-				    "iptable_%s", name);
+	t = ve0_try_then_request_module(xt_find_table_lock(net, AF_INET, name),
+					"iptable_%s", name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free_newinfo_counters_untrans;
@@ -1382,7 +1386,7 @@ do_add_counters(struct net *net, void __
 	if (len != size + num_counters * sizeof(struct xt_counters))
 		return -EINVAL;
 
-	paddc = vmalloc_node(len - size, numa_node_id());
+	paddc = ub_vmalloc_node(len - size, numa_node_id());
 	if (!paddc)
 		return -ENOMEM;
 
@@ -1491,9 +1495,10 @@ compat_find_calc_match(struct ipt_entry_
 {
 	struct xt_match *match;
 
-	match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
-						      m->u.user.revision),
-					"ipt_%s", m->u.user.name);
+	match = ve0_try_then_request_module(xt_find_match(AF_INET,
+							  m->u.user.name,
+							  m->u.user.revision),
+					    "ipt_%s", m->u.user.name);
 	if (IS_ERR(match) || !match) {
 		duprintf("compat_check_calc_match: `%s' not found\n",
 			 m->u.user.name);
@@ -1576,10 +1581,10 @@ check_compat_entry_size_and_hooks(struct
 		goto release_matches;
 
 	t = compat_ipt_get_target(e);
-	target = try_then_request_module(xt_find_target(AF_INET,
-							t->u.user.name,
-							t->u.user.revision),
-					 "ipt_%s", t->u.user.name);
+	target = ve0_try_then_request_module(xt_find_target(AF_INET,
+							    t->u.user.name,
+							    t->u.user.revision),
+					     "ipt_%s", t->u.user.name);
 	if (IS_ERR(target) || !target) {
 		duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
 			 t->u.user.name);
@@ -1612,7 +1617,7 @@ check_compat_entry_size_and_hooks(struct
 out:
 	module_put(t->u.kernel.target->me);
 release_matches:
-	IPT_MATCH_ITERATE(e, compat_release_match, &j);
+	COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, &j);
 	return ret;
 }
 
@@ -1857,13 +1862,15 @@ compat_do_replace(struct net *net, void 
 	return ret;
 }
 
+static int do_ipt_set_ctl(struct sock *, int, void __user *, unsigned int);
+
 static int
 compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
 		      unsigned int len)
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1876,8 +1883,7 @@ compat_do_ipt_set_ctl(struct sock *sk,	i
 		break;
 
 	default:
-		duprintf("do_ipt_set_ctl:  unknown request %i\n", cmd);
-		ret = -EINVAL;
+		ret = do_ipt_set_ctl(sk, cmd, user, len);
 	}
 
 	return ret;
@@ -1974,7 +1980,7 @@ compat_do_ipt_get_ctl(struct sock *sk, i
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1996,7 +2002,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd,
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2021,7 +2027,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd,
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2053,10 +2059,10 @@ do_ipt_get_ctl(struct sock *sk, int cmd,
 		else
 			target = 0;
 
-		try_then_request_module(xt_find_revision(AF_INET, rev.name,
-							 rev.revision,
-							 target, &ret),
-					"ipt_%s", rev.name);
+		ve0_try_then_request_module(xt_find_revision(AF_INET, rev.name,
+							     rev.revision,
+							     target, &ret),
+					    "ipt_%s", rev.name);
 		break;
 	}
 
@@ -2075,7 +2081,7 @@ struct xt_table *ipt_register_table(stru
 	int ret;
 	struct xt_table_info *newinfo;
 	struct xt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+		= { 0, 0, 0, 0, { 0 }, { 0 }, { } };
 	void *loc_cpu_entry;
 	struct xt_table *new_table;
 
@@ -2221,12 +2227,25 @@ static struct xt_match icmp_matchstruct 
 
 static int __net_init ip_tables_net_init(struct net *net)
 {
-	return xt_proto_init(net, NFPROTO_IPV4);
+	int res;
+
+	if (!net_ipt_permitted(net, VE_IP_IPTABLES))
+		return 0;
+
+	res = xt_proto_init(net, NFPROTO_IPV4);
+	if (!res)
+		net_ipt_module_set(net, VE_IP_IPTABLES);
+	return res;
 }
 
 static void __net_exit ip_tables_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_IPTABLES))
+		return;
+
 	xt_proto_fini(net, NFPROTO_IPV4);
+
+	net_ipt_module_clear(net, VE_IP_IPTABLES);
 }
 
 static struct pernet_operations ip_tables_net_ops = {
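The three ip_tables hunks above share one pattern: per-namespace init is skipped unless the owning VE is permitted to use the module, and exit tears down only what init actually marked as set up. A minimal sketch of that gating discipline, assuming the net_ipt_permitted()/net_ipt_module_set()/net_is_ipt_module_set()/net_ipt_module_clear() helpers and VE_IP_* bits this patchset introduces (do_real_init()/do_real_fini() are placeholders):

/* Sketch only: VE_IP_FOO and the net_ipt_* helpers are OpenVZ
 * additions from this patchset, not mainline interfaces. */
static int __net_init foo_net_init(struct net *net)
{
	int err;

	if (!net_ipt_permitted(net, VE_IP_FOO))
		return 0;	/* module is simply absent in restricted VEs */

	err = do_real_init(net);	/* e.g. xt_proto_init(net, NFPROTO_IPV4) */
	if (!err)
		net_ipt_module_set(net, VE_IP_FOO);
	return err;
}

static void __net_exit foo_net_exit(struct net *net)
{
	if (!net_is_ipt_module_set(net, VE_IP_FOO))
		return;		/* init never ran for this namespace */

	do_real_fini(net);	/* e.g. xt_proto_fini(net, NFPROTO_IPV4) */
	net_ipt_module_clear(net, VE_IP_FOO);
}

The same init/exit shape recurs below for iptable_filter, iptable_mangle, iptable_raw, nf_conntrack and nf_nat, each keyed on its own VE_IP_* bit.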
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_CLUSTERIP.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_CLUSTERIP.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_CLUSTERIP.c	2014-12-12 23:29:05.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_CLUSTERIP.c	2015-01-21 12:02:45.408168391 +0300
@@ -20,6 +20,7 @@
 #include <linux/icmp.h>
 #include <linux/if_arp.h>
 #include <linux/seq_file.h>
+#include <linux/nsproxy.h>
 #include <linux/netfilter_arp.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
@@ -383,7 +384,8 @@ static bool clusterip_tg_check(const str
 				return false;
 			}
 
-			dev = dev_get_by_name(&init_net, e->ip.iniface);
+			dev = dev_get_by_name(get_exec_env()->ve_netns,
+						e->ip.iniface);
 			if (!dev) {
 				printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
 				return false;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_LOG.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_LOG.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_LOG.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_LOG.c	2015-01-21 12:02:45.409168364 +0300
@@ -47,32 +47,32 @@ static void dump_packet(const struct nf_
 
 	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
 	if (ih == NULL) {
-		printk("TRUNCATED");
+		ve_printk(VE_LOG, "TRUNCATED");
 		return;
 	}
 
 	/* Important fields:
 	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
 	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
-	printk("SRC=%pI4 DST=%pI4 ",
+	ve_printk(VE_LOG, "SRC=%pI4 DST=%pI4 ",
 	       &ih->saddr, &ih->daddr);
 
 	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
-	printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+	ve_printk(VE_LOG, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
 	       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
 	       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
 
 	/* Max length: 6 "CE DF MF " */
 	if (ntohs(ih->frag_off) & IP_CE)
-		printk("CE ");
+		ve_printk(VE_LOG, "CE ");
 	if (ntohs(ih->frag_off) & IP_DF)
-		printk("DF ");
+		ve_printk(VE_LOG, "DF ");
 	if (ntohs(ih->frag_off) & IP_MF)
-		printk("MF ");
+		ve_printk(VE_LOG, "MF ");
 
 	/* Max length: 11 "FRAG:65535 " */
 	if (ntohs(ih->frag_off) & IP_OFFSET)
-		printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+		ve_printk(VE_LOG, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
 
 	if ((logflags & IPT_LOG_IPOPT)
 	    && ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -84,15 +84,15 @@ static void dump_packet(const struct nf_
 		op = skb_header_pointer(skb, iphoff+sizeof(_iph),
 					optsize, _opt);
 		if (op == NULL) {
-			printk("TRUNCATED");
+			ve_printk(VE_LOG, "TRUNCATED");
 			return;
 		}
 
 		/* Max length: 127 "OPT (" 15*4*2chars ") " */
-		printk("OPT (");
+		ve_printk(VE_LOG, "OPT (");
 		for (i = 0; i < optsize; i++)
-			printk("%02X", op[i]);
-		printk(") ");
+			ve_printk(VE_LOG, "%02X", op[i]);
+		ve_printk(VE_LOG, ") ");
 	}
 
 	switch (ih->protocol) {
@@ -101,7 +101,7 @@ static void dump_packet(const struct nf_
 		const struct tcphdr *th;
 
 		/* Max length: 10 "PROTO=TCP " */
-		printk("PROTO=TCP ");
+		ve_printk(VE_LOG, "PROTO=TCP ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -110,41 +110,41 @@ static void dump_packet(const struct nf_
 		th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
 					sizeof(_tcph), &_tcph);
 		if (th == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u ",
+		ve_printk(VE_LOG, "SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
 		if (logflags & IPT_LOG_TCPSEQ)
-			printk("SEQ=%u ACK=%u ",
+			ve_printk(VE_LOG, "SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 		/* Max length: 13 "WINDOW=65535 " */
-		printk("WINDOW=%u ", ntohs(th->window));
+		ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window));
 		/* Max length: 9 "RES=0x3F " */
-		printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+		ve_printk(VE_LOG, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
 		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
 		if (th->cwr)
-			printk("CWR ");
+			ve_printk(VE_LOG, "CWR ");
 		if (th->ece)
-			printk("ECE ");
+			ve_printk(VE_LOG, "ECE ");
 		if (th->urg)
-			printk("URG ");
+			ve_printk(VE_LOG, "URG ");
 		if (th->ack)
-			printk("ACK ");
+			ve_printk(VE_LOG, "ACK ");
 		if (th->psh)
-			printk("PSH ");
+			ve_printk(VE_LOG, "PSH ");
 		if (th->rst)
-			printk("RST ");
+			ve_printk(VE_LOG, "RST ");
 		if (th->syn)
-			printk("SYN ");
+			ve_printk(VE_LOG, "SYN ");
 		if (th->fin)
-			printk("FIN ");
+			ve_printk(VE_LOG, "FIN ");
 		/* Max length: 11 "URGP=65535 " */
-		printk("URGP=%u ", ntohs(th->urg_ptr));
+		ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr));
 
 		if ((logflags & IPT_LOG_TCPOPT)
 		    && th->doff * 4 > sizeof(struct tcphdr)) {
@@ -157,15 +157,15 @@ static void dump_packet(const struct nf_
 						iphoff+ih->ihl*4+sizeof(_tcph),
 						optsize, _opt);
 			if (op == NULL) {
-				printk("TRUNCATED");
+				ve_printk(VE_LOG, "TRUNCATED");
 				return;
 			}
 
 			/* Max length: 127 "OPT (" 15*4*2chars ") " */
-			printk("OPT (");
+			ve_printk(VE_LOG, "OPT (");
 			for (i = 0; i < optsize; i++)
-				printk("%02X", op[i]);
-			printk(") ");
+				ve_printk(VE_LOG, "%02X", op[i]);
+			ve_printk(VE_LOG, ") ");
 		}
 		break;
 	}
@@ -176,9 +176,9 @@ static void dump_packet(const struct nf_
 
 		if (ih->protocol == IPPROTO_UDP)
 			/* Max length: 10 "PROTO=UDP "     */
-			printk("PROTO=UDP " );
+			ve_printk(VE_LOG, "PROTO=UDP " );
 		else	/* Max length: 14 "PROTO=UDPLITE " */
-			printk("PROTO=UDPLITE ");
+			ve_printk(VE_LOG, "PROTO=UDPLITE ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -187,13 +187,13 @@ static void dump_packet(const struct nf_
 		uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
 					sizeof(_udph), &_udph);
 		if (uh == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u LEN=%u ",
+		ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ",
 		       ntohs(uh->source), ntohs(uh->dest),
 		       ntohs(uh->len));
 		break;
@@ -220,7 +220,7 @@ static void dump_packet(const struct nf_
 			    [ICMP_ADDRESSREPLY] = 12 };
 
 		/* Max length: 11 "PROTO=ICMP " */
-		printk("PROTO=ICMP ");
+		ve_printk(VE_LOG, "PROTO=ICMP ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -229,19 +229,19 @@ static void dump_packet(const struct nf_
 		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
 					 sizeof(_icmph), &_icmph);
 		if (ich == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Max length: 18 "TYPE=255 CODE=255 " */
-		printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+		ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ich->type, ich->code);
 
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		if (ich->type <= NR_ICMP_TYPES
 		    && required_len[ich->type]
 		    && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
-			printk("INCOMPLETE [%u bytes] ",
+			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
@@ -250,35 +250,35 @@ static void dump_packet(const struct nf_
 		case ICMP_ECHOREPLY:
 		case ICMP_ECHO:
 			/* Max length: 19 "ID=65535 SEQ=65535 " */
-			printk("ID=%u SEQ=%u ",
+			ve_printk(VE_LOG, "ID=%u SEQ=%u ",
 			       ntohs(ich->un.echo.id),
 			       ntohs(ich->un.echo.sequence));
 			break;
 
 		case ICMP_PARAMETERPROB:
 			/* Max length: 14 "PARAMETER=255 " */
-			printk("PARAMETER=%u ",
+			ve_printk(VE_LOG, "PARAMETER=%u ",
 			       ntohl(ich->un.gateway) >> 24);
 			break;
 		case ICMP_REDIRECT:
 			/* Max length: 24 "GATEWAY=255.255.255.255 " */
-			printk("GATEWAY=%pI4 ", &ich->un.gateway);
+			ve_printk(VE_LOG, "GATEWAY=%pI4 ", &ich->un.gateway);
 			/* Fall through */
 		case ICMP_DEST_UNREACH:
 		case ICMP_SOURCE_QUENCH:
 		case ICMP_TIME_EXCEEDED:
 			/* Max length: 3+maxlen */
 			if (!iphoff) { /* Only recurse once. */
-				printk("[");
+				ve_printk(VE_LOG, "[");
 				dump_packet(info, skb,
 					    iphoff + ih->ihl*4+sizeof(_icmph));
-				printk("] ");
+				ve_printk(VE_LOG, "] ");
 			}
 
 			/* Max length: 10 "MTU=65535 " */
 			if (ich->type == ICMP_DEST_UNREACH
 			    && ich->code == ICMP_FRAG_NEEDED)
-				printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+				ve_printk(VE_LOG, "MTU=%u ", ntohs(ich->un.frag.mtu));
 		}
 		break;
 	}
@@ -291,19 +291,19 @@ static void dump_packet(const struct nf_
 			break;
 
 		/* Max length: 9 "PROTO=AH " */
-		printk("PROTO=AH ");
+		ve_printk(VE_LOG, "PROTO=AH ");
 
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
 					sizeof(_ahdr), &_ahdr);
 		if (ah == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Length: 15 "SPI=0xF1234567 " */
-		printk("SPI=0x%x ", ntohl(ah->spi));
+		ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi));
 		break;
 	}
 	case IPPROTO_ESP: {
@@ -311,7 +311,7 @@ static void dump_packet(const struct nf_
 		const struct ip_esp_hdr *eh;
 
 		/* Max length: 10 "PROTO=ESP " */
-		printk("PROTO=ESP ");
+		ve_printk(VE_LOG, "PROTO=ESP ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -320,25 +320,25 @@ static void dump_packet(const struct nf_
 		eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
 					sizeof(_esph), &_esph);
 		if (eh == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Length: 15 "SPI=0xF1234567 " */
-		printk("SPI=0x%x ", ntohl(eh->spi));
+		ve_printk(VE_LOG, "SPI=0x%x ", ntohl(eh->spi));
 		break;
 	}
 	/* Max length: 10 "PROTO 255 " */
 	default:
-		printk("PROTO=%u ", ih->protocol);
+		ve_printk(VE_LOG, "PROTO=%u ", ih->protocol);
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
 	if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-			printk("UID=%u GID=%u ",
+			ve_printk(VE_LOG, "UID=%u GID=%u ",
 				skb->sk->sk_socket->file->f_cred->fsuid,
 				skb->sk->sk_socket->file->f_cred->fsgid);
 		read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -386,7 +386,7 @@ ipt_log_packet(u_int8_t pf,
 		loginfo = &default_loginfo;
 
 	spin_lock_bh(&log_lock);
-	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
 	       prefix,
 	       in ? in->name : "",
 	       out ? out->name : "");
@@ -397,30 +397,30 @@ ipt_log_packet(u_int8_t pf,
 
 		physindev = skb->nf_bridge->physindev;
 		if (physindev && in != physindev)
-			printk("PHYSIN=%s ", physindev->name);
+			ve_printk(VE_LOG, "PHYSIN=%s ", physindev->name);
 		physoutdev = skb->nf_bridge->physoutdev;
 		if (physoutdev && out != physoutdev)
-			printk("PHYSOUT=%s ", physoutdev->name);
+			ve_printk(VE_LOG, "PHYSOUT=%s ", physoutdev->name);
 	}
 #endif
 
 	if (in && !out) {
 		/* MAC logging for input chain only. */
-		printk("MAC=");
+		ve_printk(VE_LOG, "MAC=");
 		if (skb->dev && skb->dev->hard_header_len
 		    && skb->mac_header != skb->network_header) {
 			int i;
 			const unsigned char *p = skb_mac_header(skb);
 			for (i = 0; i < skb->dev->hard_header_len; i++,p++)
-				printk("%02x%c", *p,
+				ve_printk(VE_LOG, "%02x%c", *p,
 				       i==skb->dev->hard_header_len - 1
 				       ? ' ':':');
 		} else
-			printk(" ");
+			ve_printk(VE_LOG, " ");
 	}
 
 	dump_packet(loginfo, skb, 0);
-	printk("\n");
+	ve_printk(VE_LOG, "\n");
 	spin_unlock_bh(&log_lock);
 }
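Every printk() in the LOG target is rerouted through ve_printk(VE_LOG, ...) so packet-log lines land in the calling container's message buffer instead of the host's kernel log. A hedged sketch of what such a front end might look like — ve_vprintk() and ve_is_super() are assumptions here, not shown in this hunk:

/* Sketch, not the actual implementation from the VE core. */
void ve_printk(int dst, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	if (ve_is_super(get_exec_env()))
		vprintk(fmt, args);		/* host: normal kernel log */
	else
		ve_vprintk(dst, fmt, args);	/* per-container buffer */
	va_end(args);
}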
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_MASQUERADE.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_MASQUERADE.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_MASQUERADE.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_MASQUERADE.c	2015-01-21 12:02:45.409168364 +0300
@@ -88,6 +88,7 @@ masquerade_tg(struct sk_buff *skb, const
 	return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC);
 }
 
+#if 0
 static int
 device_cmp(struct nf_conn *i, void *ifindex)
 {
@@ -134,6 +135,7 @@ static struct notifier_block masq_dev_no
 static struct notifier_block masq_inet_notifier = {
 	.notifier_call	= masq_inet_event,
 };
+#endif
 
 static struct xt_target masquerade_tg_reg __read_mostly = {
 	.name		= "MASQUERADE",
@@ -152,12 +154,16 @@ static int __init masquerade_tg_init(voi
 
 	ret = xt_register_target(&masquerade_tg_reg);
 
+#if 0
+/*	These notifiers are unnecessary and may
+	lead to an oops in virtual environments */
 	if (ret == 0) {
 		/* Register for device down reports */
 		register_netdevice_notifier(&masq_dev_notifier);
 		/* Register IP address change reports */
 		register_inetaddr_notifier(&masq_inet_notifier);
 	}
+#endif
 
 	return ret;
 }
@@ -165,8 +171,8 @@ static int __init masquerade_tg_init(voi
 static void __exit masquerade_tg_exit(void)
 {
 	xt_unregister_target(&masquerade_tg_reg);
-	unregister_netdevice_notifier(&masq_dev_notifier);
-	unregister_inetaddr_notifier(&masq_inet_notifier);
+/*	unregister_netdevice_notifier(&masq_dev_notifier);
+	unregister_inetaddr_notifier(&masq_inet_notifier);*/
 }
 
 module_init(masquerade_tg_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_REDIRECT.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_REDIRECT.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_REDIRECT.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_REDIRECT.c	2015-01-21 12:02:45.409168364 +0300
@@ -67,8 +67,13 @@ redirect_tg(struct sk_buff *skb, const s
 
 		rcu_read_lock();
 		indev = __in_dev_get_rcu(skb->dev);
-		if (indev && (ifa = indev->ifa_list))
+		if (indev && (ifa = indev->ifa_list)) {
+			/* because of venet device specifics, we should use
+			 * the second ifa in the list */
+			if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && ifa->ifa_next)
+				ifa = ifa->ifa_next;
 			newdst = ifa->ifa_local;
+		}
 		rcu_read_unlock();
 
 		if (!newdst)
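On a venet interface the first entry in ifa_list is the 127.0.0.x alias, so an unpatched REDIRECT would rewrite packets toward loopback. A condensed sketch of the selection logic the hunk above adds:

/* Skip a leading loopback alias (venet layout) when picking the
 * local address to redirect to; mirrors the hunk above. */
static __be32 redirect_pick_addr(struct in_device *indev)
{
	struct in_ifaddr *ifa = indev->ifa_list;

	if (ifa && IN_LOOPBACK(ntohl(ifa->ifa_local)) && ifa->ifa_next)
		ifa = ifa->ifa_next;
	return ifa ? ifa->ifa_local : 0;
}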
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_REJECT.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_REJECT.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/ipt_REJECT.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/ipt_REJECT.c	2015-01-21 12:02:45.409168364 +0300
@@ -174,13 +174,13 @@ static bool reject_tg_check(const struct
 	const struct ipt_entry *e = par->entryinfo;
 
 	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
-		printk("ipt_REJECT: ECHOREPLY no longer supported.\n");
+		ve_printk(VE_LOG, "ipt_REJECT: ECHOREPLY no longer supported.\n");
 		return false;
 	} else if (rejinfo->with == IPT_TCP_RESET) {
 		/* Must specify that it's a TCP packet */
 		if (e->ip.proto != IPPROTO_TCP
 		    || (e->ip.invflags & XT_INV_PROTO)) {
-			printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n");
+			ve_printk(VE_LOG, "ipt_REJECT: TCP_RESET invalid for non-tcp\n");
 			return false;
 		}
 	}
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/iptable_filter.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/iptable_filter.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/iptable_filter.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/iptable_filter.c	2015-01-21 12:02:47.740106481 +0300
@@ -128,17 +128,27 @@ module_param(forward, bool, 0000);
 
 static int __net_init iptable_filter_net_init(struct net *net)
 {
+	if (!net_ipt_permitted(net, VE_IP_FILTER))
+		return 0;
+
 	/* Register table */
 	net->ipv4.iptable_filter =
 		ipt_register_table(net, &packet_filter, &initial_table.repl);
 	if (IS_ERR(net->ipv4.iptable_filter))
 		return PTR_ERR(net->ipv4.iptable_filter);
+
+	net_ipt_module_set(net, VE_IP_FILTER);
 	return 0;
 }
 
 static void __net_exit iptable_filter_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_FILTER))
+		return;
+
 	ipt_unregister_table(net->ipv4.iptable_filter);
+
+	net_ipt_module_clear(net, VE_IP_FILTER);
 }
 
 static struct pernet_operations iptable_filter_net_ops = {
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/iptable_mangle.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/iptable_mangle.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/iptable_mangle.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/iptable_mangle.c	2015-01-21 12:02:47.740106481 +0300
@@ -198,17 +198,27 @@ static struct nf_hook_ops ipt_ops[] __re
 
 static int __net_init iptable_mangle_net_init(struct net *net)
 {
+	if (!net_ipt_permitted(net, VE_IP_MANGLE))
+		return 0;
+
 	/* Register table */
 	net->ipv4.iptable_mangle =
 		ipt_register_table(net, &packet_mangler, &initial_table.repl);
 	if (IS_ERR(net->ipv4.iptable_mangle))
 		return PTR_ERR(net->ipv4.iptable_mangle);
+
+	net_ipt_module_set(net, VE_IP_MANGLE);
 	return 0;
 }
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_MANGLE))
+		return;
+
 	ipt_unregister_table(net->ipv4.iptable_mangle);
+
+	net_ipt_module_clear(net, VE_IP_MANGLE);
 }
 
 static struct pernet_operations iptable_mangle_net_ops = {
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/iptable_raw.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/iptable_raw.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/iptable_raw.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/iptable_raw.c	2015-01-21 12:02:45.533165072 +0300
@@ -90,17 +90,28 @@ static struct nf_hook_ops ipt_ops[] __re
 
 static int __net_init iptable_raw_net_init(struct net *net)
 {
+	if (!net_ipt_permitted(net, VE_IP_IPTABLES))
+		return 0;
+
 	/* Register table */
 	net->ipv4.iptable_raw =
 		ipt_register_table(net, &packet_raw, &initial_table.repl);
 	if (IS_ERR(net->ipv4.iptable_raw))
 		return PTR_ERR(net->ipv4.iptable_raw);
+
+	net_ipt_module_set(net, VE_IP_IPTABLES);
+
 	return 0;
 }
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_IPTABLES))
+		return;
+
 	ipt_unregister_table(net->ipv4.iptable_raw);
+
+	net_ipt_module_clear(net, VE_IP_IPTABLES);
 }
 
 static struct pernet_operations iptable_raw_net_ops = {
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c	2015-01-21 12:02:47.740106481 +0300
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/ip.h>
 #include <linux/netfilter.h>
+#include <net/net_namespace.h>
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/icmp.h>
@@ -197,7 +198,7 @@ static ctl_table ip_ct_sysctl_table[] = 
 	{
 		.ctl_name	= NET_IPV4_NF_CONNTRACK_MAX,
 		.procname	= "ip_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -367,6 +368,33 @@ struct nf_conntrack_l3proto nf_conntrack
 	.me		 = THIS_MODULE,
 };
 
+static int nf_conntrack_l3proto_ipv4_init_net(struct net *net)
+{
+	if (!net_ipt_permitted(net, VE_IP_CONNTRACK))
+		return 0;
+	/*
+	 * FIXME:
+	 * Need virtualize per-net sysctls
+	 */
+
+	net_ipt_module_set(net, VE_IP_CONNTRACK);
+	return 0;
+}
+
+static void nf_conntrack_l3proto_ipv4_fini_net(struct net *net)
+{
+	/* A dummy call for the sake of consistency */
+	if (net_is_ipt_module_set(net, VE_IP_CONNTRACK))
+		net_ipt_module_clear(net, VE_IP_CONNTRACK);
+
+	return;
+}
+
+static struct pernet_operations nf_conntrack_ipv4_net_ops = {
+	.init = nf_conntrack_l3proto_ipv4_init_net,
+	.exit = nf_conntrack_l3proto_ipv4_fini_net,
+};
+
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 		  &nf_conntrack_htable_size, 0600);
 
@@ -381,6 +409,12 @@ static int __init nf_conntrack_l3proto_i
 	need_conntrack();
 	nf_defrag_ipv4_enable();
 
+	ret = register_pernet_subsys(&nf_conntrack_ipv4_net_ops);
+	if (ret) {
+		printk(KERN_ERR "nf_conntrack_ipv4: Unable to register pernet operations\n");
+		return ret;
+	}
+
 	ret = nf_register_sockopt(&so_getorigdst);
 	if (ret < 0) {
 		printk(KERN_ERR "Unable to register netfilter socket option\n");
@@ -452,6 +486,7 @@ static void __exit nf_conntrack_l3proto_
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
 	nf_unregister_sockopt(&so_getorigdst);
+	unregister_pernet_subsys(&nf_conntrack_ipv4_net_ops);
 }
 
 module_init(nf_conntrack_l3proto_ipv4_init);
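The new pernet operations only record whether VE_IP_CONNTRACK is permitted; virtualizing the per-net sysctls is left as a FIXME. Worth noting is the registration ordering this imposes on module init — the hunk above registers the pernet subsys first but does not unregister it on the later error paths. A sketch with symmetric unwinding (the error handling is an assumption, not the patch's behavior):

/* Sketch of symmetric pernet registration; names match the hunk
 * above, the unwinding on failure is an assumption. */
static int __init l3proto_ipv4_init_sketch(void)
{
	int ret;

	ret = register_pernet_subsys(&nf_conntrack_ipv4_net_ops);
	if (ret)
		return ret;

	ret = nf_register_sockopt(&so_getorigdst);
	if (ret < 0)
		unregister_pernet_subsys(&nf_conntrack_ipv4_net_ops);
	return ret;
}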
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/nf_nat_core.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/nf_nat_core.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/nf_nat_core.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/nf_nat_core.c	2015-01-21 12:02:51.072018028 +0300
@@ -282,6 +282,22 @@ out:
 	rcu_read_unlock();
 }
 
+void nf_nat_hash_conntrack(struct net *net, struct nf_conn *ct)
+{
+	unsigned int srchash;
+	struct nf_conn_nat *nat;
+
+	srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	spin_lock_bh(&nf_nat_lock);
+	/* nf_conntrack_alter_reply might re-allocate the extension area */
+	nat = nfct_nat(ct);
+	nat->ct = ct;
+	hlist_add_head_rcu(&nat->bysource,
+			   &net->ipv4.nat_bysource[srchash]);
+	spin_unlock_bh(&nf_nat_lock);
+}
+EXPORT_SYMBOL_GPL(nf_nat_hash_conntrack);
+
 unsigned int
 nf_nat_setup_info(struct nf_conn *ct,
 		  const struct nf_nat_range *range,
@@ -329,18 +345,8 @@ nf_nat_setup_info(struct nf_conn *ct,
 			ct->status |= IPS_DST_NAT;
 	}
 
-	if (maniptype == IP_NAT_MANIP_SRC) {
-		unsigned int srchash;
-
-		srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-		spin_lock_bh(&nf_nat_lock);
-		/* nf_conntrack_alter_reply might re-allocate exntension aera */
-		nat = nfct_nat(ct);
-		nat->ct = ct;
-		hlist_add_head_rcu(&nat->bysource,
-				   &net->ipv4.nat_bysource[srchash]);
-		spin_unlock_bh(&nf_nat_lock);
-	}
+	if (maniptype == IP_NAT_MANIP_SRC)
+		nf_nat_hash_conntrack(net, ct);
 
 	/* It's done. */
 	if (maniptype == IP_NAT_MANIP_DST)
@@ -579,7 +585,7 @@ static void nf_nat_cleanup_conntrack(str
 	if (nat == NULL || nat->ct == NULL)
 		return;
 
-	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
+	NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK);
 
 	spin_lock_bh(&nf_nat_lock);
 	hlist_del_rcu(&nat->bysource);
@@ -706,6 +712,9 @@ nfnetlink_parse_nat_setup(struct nf_conn
 
 static int __net_init nf_nat_net_init(struct net *net)
 {
+	if (net_ipt_permitted(net, VE_IP_NAT))
+		net_ipt_module_set(net, VE_IP_NAT);
+
 	/* Leave them the same for the moment. */
 	net->ipv4.nat_htable_size = net->ct.htable_size;
 	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
@@ -733,6 +742,8 @@ static void __net_exit nf_nat_net_exit(s
 	synchronize_rcu();
 	nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
 			     net->ipv4.nat_htable_size);
+
+	net_ipt_module_clear(net, VE_IP_NAT);
 }
 
 static struct pernet_operations nf_nat_net_ops = {
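Factoring the bysource-hash insertion out of nf_nat_setup_info() into an exported helper lets other code re-insert a conntrack whose SRC-NAT state already exists — presumably for the checkpoint/restore paths elsewhere in this patchset. A hypothetical caller:

/* Hypothetical restore-side use of the new export: a conntrack
 * recreated with its NAT extension intact must reappear in the
 * per-netns bysource hash for reverse lookups to work. */
static void restore_nat_hash(struct net *net, struct nf_conn *ct)
{
	if (ct->status & IPS_SRC_NAT_DONE)
		nf_nat_hash_conntrack(net, ct);
}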
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/nf_nat_rule.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/nf_nat_rule.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter/nf_nat_rule.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter/nf_nat_rule.c	2015-01-21 12:02:47.740106481 +0300
@@ -186,16 +186,27 @@ static struct xt_target ipt_dnat_reg __r
 
 static int __net_init nf_nat_rule_net_init(struct net *net)
 {
+	if (!net_ipt_permitted(net, VE_IP_IPTABLE_NAT))
+		return 0;
+
 	net->ipv4.nat_table = ipt_register_table(net, &nat_table,
 						 &nat_initial_table.repl);
 	if (IS_ERR(net->ipv4.nat_table))
 		return PTR_ERR(net->ipv4.nat_table);
+
+	net_ipt_module_set(net, VE_IP_IPTABLE_NAT);
+
 	return 0;
 }
 
 static void __net_exit nf_nat_rule_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_IPTABLE_NAT))
+		return;
+
 	ipt_unregister_table(net->ipv4.nat_table);
+
+	net_ipt_module_clear(net, VE_IP_IPTABLE_NAT);
 }
 
 static struct pernet_operations nf_nat_rule_net_ops = {
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/netfilter.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/netfilter.c	2015-01-21 12:02:42.560244001 +0300
@@ -201,9 +201,9 @@ static __sum16 nf_ip_checksum_partial(st
 	return csum;
 }
 
-static int nf_ip_route(struct dst_entry **dst, struct flowi *fl)
+static int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl)
 {
-	return ip_route_output_key(&init_net, (struct rtable **)dst, fl);
+	return ip_route_output_key(net, (struct rtable **)dst, fl);
 }
 
 static const struct nf_afinfo nf_ip_afinfo = {
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/proc.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/proc.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/proc.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/proc.c	2015-01-21 12:02:43.288224674 +0300
@@ -54,7 +54,7 @@ static int sockstat_seq_show(struct seq_
 	int orphans, sockets;
 
 	local_bh_disable();
-	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+	orphans = percpu_counter_sum_positive(&get_exec_ub()->ub_orphan_count);
 	sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
 	local_bh_enable();
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/raw.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/raw.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/raw.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/raw.c	2015-01-21 12:02:47.558111312 +0300
@@ -886,7 +886,7 @@ static struct sock *raw_get_first(struct
 		struct hlist_node *node;
 
 		sk_for_each(sk, node, &state->h->ht[state->bucket])
-			if (sock_net(sk) == seq_file_net(seq))
+			if (net_access_allowed(sock_net(sk), seq_file_net(seq)))
 				goto found;
 	}
 	sk = NULL;
@@ -902,7 +902,7 @@ static struct sock *raw_get_next(struct 
 		sk = sk_next(sk);
 try_again:
 		;
-	} while (sk && sock_net(sk) != seq_file_net(seq));
+	} while (sk && !net_access_allowed(sock_net(sk), seq_file_net(seq)));
 
 	if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
 		sk = sk_head(&state->h->ht[state->bucket]);
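raw_get_first()/raw_get_next() relax the strict net_eq() test so the host environment can also see container sockets in /proc. A hypothetical shape of the check — the real helper comes from the VE core and is not shown in this hunk, and ve_is_super() is likewise an assumption:

/* Assumption: identical namespaces always match, and only a
 * privileged host-side viewer may look into foreign ones. */
static inline int net_access_allowed(struct net *ns, struct net *viewer)
{
	if (net_eq(ns, viewer))
		return 1;
	return ve_is_super(get_exec_env());	/* host-side viewer */
}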
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/route.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/route.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/route.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/route.c	2015-01-21 12:02:57.856837925 +0300
@@ -69,6 +69,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/nsproxy.h>
 #include <linux/bootmem.h>
 #include <linux/string.h>
 #include <linux/socket.h>
@@ -116,6 +117,7 @@
 
 #define RT_GC_TIMEOUT (300*HZ)
 
+int ip_rt_src_check		= 1;
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
@@ -914,6 +916,7 @@ void rt_cache_flush(struct net *net, int
 	if (delay >= 0)
 		rt_do_flush(!in_softirq());
 }
+EXPORT_SYMBOL(rt_cache_flush);
 
 /* Flush previous cache invalidated entries from the cache */
 void rt_cache_flush_batch(void)
@@ -1477,6 +1480,9 @@ void ip_rt_redirect(__be32 old_gw, __be3
 				rt->u.dst.xfrm		= NULL;
 #endif
 				rt->rt_genid		= rt_genid(net);
+#ifdef CONFIG_VE
+				rt->fl.owner_env = get_exec_env();
+#endif
 				rt->rt_flags		|= RTCF_REDIRECTED;
 
 				/* Gateway is different ... */
@@ -1940,9 +1946,12 @@ static int ip_route_input_mc(struct sk_b
 #ifdef CONFIG_NET_CLS_ROUTE
 	rth->u.dst.tclassid = itag;
 #endif
+#ifdef CONFIG_VE
+	rth->fl.owner_env = get_exec_env();
+#endif
 	rth->rt_iif	=
 	rth->fl.iif	= dev->ifindex;
-	rth->u.dst.dev	= init_net.loopback_dev;
+	rth->u.dst.dev	= get_exec_env()->ve_netns->loopback_dev;
 	dev_hold(rth->u.dst.dev);
 	rth->idev	= in_dev_get(rth->u.dst.dev);
 	rth->fl.oif	= 0;
@@ -2083,6 +2092,9 @@ static int __mkroute_input(struct sk_buf
 	rth->fl.fl4_src	= saddr;
 	rth->rt_src	= saddr;
 	rth->rt_gateway	= daddr;
+#ifdef CONFIG_VE
+	rth->fl.owner_env = get_exec_env();
+#endif
 	rth->rt_iif 	=
 		rth->fl.iif	= in_dev->dev->ifindex;
 	rth->u.dst.dev	= (out_dev)->dev;
@@ -2285,6 +2297,9 @@ local_input:
 	rth->idev	= in_dev_get(rth->u.dst.dev);
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
+#ifdef CONFIG_VE
+	rth->fl.owner_env = get_exec_env();
+#endif
 	rth->u.dst.input= ip_local_deliver;
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	if (res.type == RTN_UNREACHABLE) {
@@ -2482,6 +2497,9 @@ static int __mkroute_output(struct rtabl
 	rth->fl.mark    = oldflp->mark;
 	rth->rt_dst	= fl->fl4_dst;
 	rth->rt_src	= fl->fl4_src;
+#ifdef CONFIG_VE
+	rth->fl.owner_env = get_exec_env();
+#endif
 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
 	/* get references to the devices that are to be hold by the routing
 	   cache entry */
@@ -2623,7 +2641,7 @@ static int ip_route_output_slow(struct n
 			goto make_route;
 		}
 
-		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
+		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC) && ip_rt_src_check) {
 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
 			dev_out = ip_dev_find(net, oldflp->fl4_src);
 			if (dev_out == NULL)
@@ -3554,3 +3572,25 @@ void __init ip_static_sysctl_init(void)
 EXPORT_SYMBOL(__ip_select_ident);
 EXPORT_SYMBOL(ip_route_input);
 EXPORT_SYMBOL(ip_route_output_key);
+
+static void ip_rt_dump_dst(void *o)
+{
+	struct rtable *rt = (struct rtable *)o;
+
+	if (rt->u.dst.flags & DST_FREE)
+		return;
+
+	printk("=== %p\n", o);
+	dst_dump_one(&rt->u.dst);
+	printk("\tidev %p gen %x flags %x type %d\n", rt->idev,
+			rt->rt_genid, rt->rt_flags, (int)rt->rt_type);
+}
+
+void ip_rt_dump_dsts(void)
+{
+	printk("IPv4 dst cache (%d entries):\n", atomic_read(&ipv4_dst_ops.entries));
+	slab_obj_walk(ipv4_dst_ops.kmem_cachep, ip_rt_dump_dst);
+}
+
+void (*ip6_rt_dump_dsts)(void);
+EXPORT_SYMBOL_GPL(ip6_rt_dump_dsts);
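Each cached route now records the VE that created it in fl.owner_env, which makes per-container cleanup possible: when a VE stops, its entries can be identified and flushed without dropping the whole cache. A hypothetical predicate built on that tag:

#ifdef CONFIG_VE
/* Hypothetical per-VE flush filter using the new owner tag. */
static int rt_owned_by_ve(struct rtable *rt, struct ve_struct *ve)
{
	return rt->fl.owner_env == ve;
}
#endif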
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/syncookies.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/syncookies.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/syncookies.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/syncookies.c	2015-01-21 12:02:42.595243071 +0300
@@ -337,7 +337,7 @@ struct sock *cookie_v4_check(struct sock
 					       { .sport = th->dest,
 						 .dport = th->source } } };
 		security_req_classify_flow(req, &fl);
-		if (ip_route_output_key(&init_net, &rt, &fl)) {
+		if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
 			reqsk_free(req);
 			goto out;
 		}
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/sysctl_net_ipv4.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/sysctl_net_ipv4.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/sysctl_net_ipv4.c	2015-01-21 12:02:46.973126842 +0300
@@ -33,6 +33,9 @@ static int tcp_syn_retries_max = MAX_TCP
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 
+int sysctl_tcp_use_sg = 1;
+EXPORT_SYMBOL(sysctl_tcp_use_sg);
+
 /* Update system visible IP port range */
 static void set_local_port_range(int range[2])
 {
@@ -402,7 +405,7 @@ static struct ctl_table ipv4_table[] = {
 		.procname	= "tcp_syncookies",
 		.data		= &sysctl_tcp_syncookies,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_dointvec
 	},
 #endif
@@ -844,6 +847,27 @@ static struct ctl_table ipv4_table[] = {
 		.strategy	= sysctl_intvec,
 		.extra1		= &zero
 	},
+	{
+		.procname       = "tcp_max_tw_kmem_fraction",
+		.data           = &sysctl_tcp_max_tw_kmem_fraction,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname       = "tcp_max_tw_buckets_ub",
+		.data           = &sysctl_tcp_max_tw_buckets_ub,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname	= "tcp_use_sg",
+		.data		= &sysctl_tcp_use_sg,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
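Marking tcp_syncookies with S_ISVTX reuses the sticky bit in ctl_table.mode; by the conventions of this patchset that appears to flag a sysctl a container is allowed to write to its own virtualized copy. A hedged sketch of such a permission test (ve_is_super() is an assumption):

/* Assumption drawn from the hunk above: inside a container,
 * only sticky-marked entries accept writes. */
static int ve_sysctl_writable(struct ctl_table *table)
{
	if (ve_is_super(get_exec_env()))
		return 1;			/* host: normal rules */
	return (table->mode & S_ISVTX) != 0;
}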
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp.c	2015-01-21 12:02:45.367169478 +0300
@@ -272,6 +272,10 @@
 #include <net/netdma.h>
 #include <net/sock.h>
 
+#include <bc/sock_orphan.h>
+#include <bc/net.h>
+#include <bc/tcp.h>
+
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
@@ -378,6 +382,7 @@ unsigned int tcp_poll(struct file *file,
 	unsigned int mask;
 	struct sock *sk = sock->sk;
 	struct tcp_sock *tp = tcp_sk(sk);
+	int check_send_space;
 
 	inet_rps_record_flow(sk);
 
@@ -392,6 +397,19 @@ unsigned int tcp_poll(struct file *file,
 
 	mask = 0;
 
+	check_send_space = 1;
+#ifdef CONFIG_BEANCOUNTERS
+	if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) {
+		unsigned long size;
+		size = MAX_TCP_HEADER + tp->mss_cache;
+		if (size > SOCK_MIN_UBCSPACE)
+			size = SOCK_MIN_UBCSPACE;
+		size = skb_charge_size(size);
+		if (ub_sock_makewres_poll(sk, size))
+			check_send_space = 0;
+	}
+#endif
+
 	/*
 	 * POLLHUP is certainly not done right. But poll() doesn't
 	 * have a notion of HUP in just one direction, and for a
@@ -439,7 +457,7 @@ unsigned int tcp_poll(struct file *file,
 		if (tp->rcv_nxt - tp->copied_seq >= target)
 			mask |= POLLIN | POLLRDNORM;
 
-		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+		if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) {
 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 				mask |= POLLOUT | POLLWRNORM;
 			} else {  /* send SIGIO later */
@@ -792,15 +810,23 @@ static ssize_t do_tcp_sendpages(struct s
 		int copy, i, can_coalesce;
 		int offset = poffset % PAGE_SIZE;
 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
+		unsigned long chargesize = 0;
 
 		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
 new_segment:
+			chargesize = 0;
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
+			chargesize = skb_charge_size(MAX_TCP_HEADER +
+					tp->mss_cache);
+			if (ub_sock_getwres_tcp(sk, chargesize) < 0)
+				goto wait_for_ubspace;
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
 			if (!skb)
 				goto wait_for_memory;
+			ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
+			chargesize = 0;
 
 			skb_entail(sk, skb);
 			copy = size_goal;
@@ -857,10 +883,15 @@ new_segment:
 wait_for_sndbuf:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
+		ub_sock_retwres_tcp(sk, chargesize,
+			skb_charge_size(MAX_TCP_HEADER + tp->mss_cache));
+		chargesize = 0;
+wait_for_ubspace:
 		if (copied)
 			tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 
-		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+		err = __sk_stream_wait_memory(sk, &timeo, chargesize);
+		if (err != 0)
 			goto do_error;
 
 		mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -896,12 +927,8 @@ ssize_t tcp_sendpage(struct socket *sock
 	return res;
 }
 
-#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
-#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
-
-static inline int select_size(struct sock *sk)
+static inline int select_size(struct sock *sk, struct tcp_sock *tp)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	int tmp = tp->mss_cache;
 
 	if (sk->sk_route_caps & NETIF_F_SG) {
@@ -959,6 +986,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru
 	while (--iovlen >= 0) {
 		size_t seglen = iov->iov_len;
 		unsigned char __user *from = iov->iov_base;
+		unsigned long chargesize = 0;
 
 		iov++;
 
@@ -974,17 +1002,27 @@ int tcp_sendmsg(struct kiocb *iocb, stru
 			}
 
 			if (copy <= 0) {
+				unsigned long size;
 new_segment:
 				/* Allocate new segment. If the interface is SG,
 				 * allocate skb fitting to single page.
 				 */
+				chargesize = 0;
 				if (!sk_stream_memory_free(sk))
 					goto wait_for_sndbuf;
 
-				skb = sk_stream_alloc_skb(sk, select_size(sk),
+				size = select_size(sk, tp);
+				chargesize = skb_charge_size(MAX_TCP_HEADER +
+						size);
+				if (ub_sock_getwres_tcp(sk, chargesize) < 0)
+					goto wait_for_ubspace;
+				skb = sk_stream_alloc_skb(sk, size,
 						sk->sk_allocation);
 				if (!skb)
 					goto wait_for_memory;
+				ub_skb_set_charge(skb, sk, chargesize,
+						UB_TCPSNDBUF);
+				chargesize = 0;
 
 				/*
 				 * Check whether we can use HW checksum.
@@ -1031,6 +1069,7 @@ new_segment:
 				} else if (page) {
 					if (off == PAGE_SIZE) {
 						put_page(page);
+						ub_sock_tcp_detachpage(sk);
 						TCP_PAGE(sk) = page = NULL;
 						off = 0;
 					}
@@ -1044,6 +1083,9 @@ new_segment:
 					goto wait_for_memory;
 
 				if (!page) {
+					chargesize = PAGE_SIZE;
+					if (ub_sock_tcp_chargepage(sk) < 0)
+						goto wait_for_ubspace;
 					/* Allocate new cache page. */
 					if (!(page = sk_stream_alloc_page(sk)))
 						goto wait_for_memory;
@@ -1075,7 +1117,8 @@ new_segment:
 					} else if (off + copy < PAGE_SIZE) {
 						get_page(page);
 						TCP_PAGE(sk) = page;
-					}
+					} else
+						ub_sock_tcp_detachpage(sk);
 				}
 
 				TCP_OFF(sk) = off + copy;
@@ -1106,10 +1149,15 @@ new_segment:
 wait_for_sndbuf:
 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
+			ub_sock_retwres_tcp(sk, chargesize,
+				skb_charge_size(MAX_TCP_HEADER+tp->mss_cache));
+			chargesize = 0;
+wait_for_ubspace:
 			if (copied)
 				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 
-			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+			err = __sk_stream_wait_memory(sk, &timeo, chargesize);
+			if (err != 0)
 				goto do_error;
 
 			mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1207,8 +1255,10 @@ void tcp_cleanup_rbuf(struct sock *sk, i
 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 
 	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
-	     KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
-	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+	     KERN_INFO "cleanup rbuf bug (%d/%s): copied %X seq %X/%X rcvnxt %X\n",
+	     VEID(get_exec_env()), current->comm,
+	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq,
+	     TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
 #endif
 
 	if (inet_csk_ack_scheduled(sk)) {
@@ -1474,8 +1524,9 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 				goto found_ok_skb;
 			if (tcp_hdr(skb)->fin)
 				goto found_fin_ok;
-			WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
+			WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2 (%d/%s): "
 					"copied %X seq %X rcvnxt %X fl %X\n",
+					VEID(get_exec_env()), current->comm,
 					*seq, TCP_SKB_CB(skb)->seq,
 					tp->rcv_nxt, flags);
 		}
@@ -1538,8 +1589,19 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 
 			tp->ucopy.len = len;
 
-			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
-				!(flags & (MSG_PEEK | MSG_TRUNC)));
+			if (WARN_ON(tp->copied_seq != tp->rcv_nxt &&
+				!(flags & (MSG_PEEK | MSG_TRUNC)))) {
+				printk("KERNEL: assertion: tp->copied_seq == "
+						"tp->rcv_nxt || ...\n");
+				printk("VE%u pid %d comm %.16s\n", 
+						(get_exec_env() ?
+						 VEID(get_exec_env()) : 0),
+						current->pid, current->comm);
+				printk("flags=0x%x, len=%d, copied_seq=%d, "
+						"rcv_nxt=%d\n", flags,
+						(int)len, tp->copied_seq,
+						tp->rcv_nxt);
+			}
 
 			/* Ugly... If prequeue is not empty, we have to
 			 * process it before releasing socket, otherwise
@@ -1963,7 +2025,7 @@ adjudge_to_death:
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	ub_inc_orphan_count(sk);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -2003,14 +2065,19 @@ adjudge_to_death:
 		}
 	}
 	if (sk->sk_state != TCP_CLOSE) {
-		int orphan_count = percpu_counter_read_positive(
-						sk->sk_prot->orphan_count);
+		int orphans = ub_get_orphan_count(sk);
 
 		sk_mem_reclaim(sk);
-		if (tcp_too_many_orphans(sk, orphan_count)) {
-			if (net_ratelimit())
+		if (ub_too_many_orphans(sk, orphans)) {
+			if (net_ratelimit()) {
+				int ubid = 0;
+#ifdef CONFIG_BEANCOUNTERS
+				ubid = sock_has_ubc(sk) ?
+					   sock_bc(sk)->ub->ub_uid : 0;
+#endif
 				printk(KERN_INFO "TCP: too many of orphaned "
-				       "sockets\n");
+				       "sockets (%d in CT%d)\n", orphans, ubid);
+			}
 			tcp_set_state(sk, TCP_CLOSE);
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			NET_INC_STATS_BH(sock_net(sk),
@@ -2087,6 +2154,7 @@ int tcp_disconnect(struct sock *sk, int 
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_cnt = 0;
 	tp->bytes_acked = 0;
+	tp->advmss = 65535;
 	tcp_set_ca_state(sk, TCP_CA_Open);
 	tcp_clear_retrans(tp);
 	inet_csk_delack_init(sk);
@@ -2946,10 +3014,11 @@ void __init tcp_init(void)
 
 	percpu_counter_init(&tcp_sockets_allocated, 0);
 	percpu_counter_init(&tcp_orphan_count, 0);
+	percpu_counter_init(&get_ub0()->ub_orphan_count, 0);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL);
 
 	/* Size and allocate the main established and bind bucket
 	 * hash tables.
@@ -3010,6 +3079,11 @@ void __init tcp_init(void)
 	sysctl_tcp_mem[1] = limit;
 	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
 
+	if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096)
+		sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096;
+	if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096)
+		sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096;
+
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
 	max_share = min(4UL*1024*1024, limit);
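Both do_tcp_sendpages() and tcp_sendmsg() now follow a charge-then-allocate discipline: reserve UB_TCPSNDBUF space, allocate the skb, attach the charge to it, and return the reservation on any failure — the new wait_for_ubspace label waits for beancounter space the way sk_stream_wait_memory() waits for socket memory. A minimal sketch of the sequence, using the ub_* helpers this patchset adds (the error codes are illustrative only):

/* Sketch of the charge/alloc/attach sequence from the hunks above. */
static int ub_charged_alloc(struct sock *sk, struct sk_buff **skbp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long charge;
	struct sk_buff *skb;

	charge = skb_charge_size(MAX_TCP_HEADER + tp->mss_cache);
	if (ub_sock_getwres_tcp(sk, charge) < 0)
		return -ENOBUFS;	/* caller waits for UB space */

	skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
	if (!skb) {
		ub_sock_retwres_tcp(sk, charge, charge);
		return -ENOMEM;		/* caller waits for memory */
	}
	ub_skb_set_charge(skb, sk, charge, UB_TCPSNDBUF);
	*skbp = skb;
	return 0;
}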
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_input.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_input.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_input.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_input.c	2015-01-21 12:02:43.289224647 +0300
@@ -72,6 +72,8 @@
 #include <asm/unaligned.h>
 #include <net/netdma.h>
 
+#include <bc/tcp.h>
+
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
@@ -323,7 +325,7 @@ static void tcp_grow_window(struct sock 
 	/* Check #1 */
 	if (tp->rcv_ssthresh < tp->window_clamp &&
 	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
-	    !tcp_memory_pressure) {
+	    ub_tcp_rmem_allows_expand(sk)) {
 		int incr;
 
 		/* Check #2. Increase window, if skb with such overhead
@@ -400,6 +402,8 @@ static void tcp_init_buffer_space(struct
 
 	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
+
+	ub_tcp_update_maxadvmss(sk);
 }
 
 /* 5. Recalculate window clamp after socket hit its memory bounds. */
@@ -412,7 +416,7 @@ static void tcp_clamp_window(struct sock
 
 	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-	    !tcp_memory_pressure &&
+	    !ub_tcp_memory_pressure(sk) &&
 	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
 				    sysctl_tcp_rmem[2]);
@@ -4409,19 +4413,19 @@ static void tcp_ofo_queue(struct sock *s
 static int tcp_prune_ofo_queue(struct sock *sk);
 static int tcp_prune_queue(struct sock *sk);
 
-static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static inline int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb)
 {
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    !sk_rmem_schedule(sk, size)) {
+	    !sk_rmem_schedule(sk, skb)) {
 
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
 
-		if (!sk_rmem_schedule(sk, size)) {
+		if (!sk_rmem_schedule(sk, skb)) {
 			if (!tcp_prune_ofo_queue(sk))
 				return -1;
 
-			if (!sk_rmem_schedule(sk, size))
+			if (!sk_rmem_schedule(sk, skb))
 				return -1;
 		}
 	}
@@ -4473,8 +4477,8 @@ static void tcp_data_queue(struct sock *
 		if (eaten <= 0) {
 queue_and_out:
 			if (eaten < 0 &&
-			    tcp_try_rmem_schedule(sk, skb->truesize))
-				goto drop;
+			    tcp_try_rmem_schedule(sk, skb))
+				goto drop_part;
 
 			skb_set_owner_r(skb, sk);
 			__skb_queue_tail(&sk->sk_receive_queue, skb);
@@ -4518,6 +4522,12 @@ out_of_window:
 drop:
 		__kfree_skb(skb);
 		return;
+
+drop_part:
+		if (after(tp->copied_seq, tp->rcv_nxt))
+			tp->rcv_nxt = tp->copied_seq;
+		__kfree_skb(skb);
+		return;
 	}
 
 	/* Out of window. F.e. zero window probe. */
@@ -4544,7 +4554,7 @@ drop:
 
 	TCP_ECN_check_ce(tp, skb);
 
-	if (tcp_try_rmem_schedule(sk, skb->truesize))
+	if (tcp_try_rmem_schedule(sk, skb))
 		goto drop;
 
 	/* Disable header prediction. */
@@ -4730,6 +4740,10 @@ restart:
 		nskb = alloc_skb(copy + header, GFP_ATOMIC);
 		if (!nskb)
 			return;
+		if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) {
+			kfree_skb(nskb);
+			return;
+		}
 
 		skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
 		skb_set_network_header(nskb, (skb_network_header(skb) -
@@ -4858,7 +4872,7 @@ static int tcp_prune_queue(struct sock *
 
 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		tcp_clamp_window(sk);
-	else if (tcp_memory_pressure)
+	else if (ub_tcp_memory_pressure(sk))
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
 	tcp_collapse_ofo_queue(sk);
@@ -4924,7 +4938,7 @@ static int tcp_should_expand_sndbuf(stru
 		return 0;
 
 	/* If we are under global TCP memory pressure, do not expand.  */
-	if (tcp_memory_pressure)
+	if (ub_tcp_memory_pressure(sk))
 		return 0;
 
 	/* If we are under soft global TCP memory pressure, do not expand.  */
@@ -5436,6 +5450,10 @@ int tcp_rcv_established(struct sock *sk,
 
 				if ((int)skb->truesize > sk->sk_forward_alloc)
 					goto step5;
+				/* It is OK not to try to free memory here.
+				 * We do this below on the slow path. Den */
+				if (ub_tcprcvbuf_charge(sk, skb) < 0)
+					goto step5;
 
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
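Throughout tcp_input.c the global tcp_memory_pressure flag is swapped for a per-beancounter test, so one container under receive-buffer pressure no longer throttles every other one. A hypothetical shape of that predicate — the real one lives in bc/tcp.h, and ub_tcp_rmem_pressure() is assumed:

/* Hedged sketch: fall back to the global flag for sockets
 * without a beancounter attached. */
static inline int ub_tcp_memory_pressure(struct sock *sk)
{
	if (!sock_has_ubc(sk))
		return tcp_memory_pressure;
	return ub_tcp_rmem_pressure(sock_bc(sk)->ub);
}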
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_ipv4.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_ipv4.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_ipv4.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_ipv4.c	2015-01-21 12:02:58.105831318 +0300
@@ -74,6 +74,8 @@
 #include <net/secure_seq.h>
 #include <net/busy_poll.h>
 
+#include <bc/tcp.h>
+
 #include <linux/inet.h>
 #include <linux/ipv6.h>
 #include <linux/stddef.h>
@@ -717,7 +719,8 @@ static void tcp_v4_timewait_ack(struct s
 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 
 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
-			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+			tcptw->tw_rcv_wnd >>
+				(tw->tw_rcv_wscale & TW_WSCALE_MASK),
 			tcptw->tw_ts_recent,
 			tw->tw_bound_dev_if,
 			tcp_twsk_md5_key(tcptw),
@@ -796,8 +799,9 @@ static void syn_flood_warning(struct sk_
 	if (time_after(jiffies, (warntime + HZ * 60))) {
 		warntime = jiffies;
 		printk(KERN_INFO
-		       "possible SYN flooding on port %d. Sending cookies.\n",
-		       ntohs(tcp_hdr(skb)->dest));
+		       "possible SYN flooding on ctid %u, port %d. "
+		       "Sending cookies.\n",
+		       skb->owner_env->veid, ntohs(tcp_hdr(skb)->dest));
 	}
 }
 #endif
@@ -1197,12 +1201,14 @@ struct request_sock_ops tcp_request_sock
 	.destructor	=	tcp_v4_reqsk_destructor,
 	.send_reset	=	tcp_v4_send_reset,
 };
+EXPORT_SYMBOL(tcp_request_sock_ops);
 
 #ifdef CONFIG_TCP_MD5SIG
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
 };
+EXPORT_SYMBOL(tcp_request_sock_ipv4_ops);
 #endif
 
 static struct timewait_sock_ops tcp_timewait_sock_ops = {
@@ -1510,6 +1516,10 @@ static __sum16 tcp_v4_checksum_init(stru
 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	struct sock *rsk;
+	struct user_beancounter *ub;
+
+	ub = set_exec_ub(sock_bc(sk)->ub);
+
 #ifdef CONFIG_TCP_MD5SIG
 	/*
 	 * We really want to reject the packet as early as possible
@@ -1528,7 +1538,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc
 			goto reset;
 		}
 		TCP_CHECK_TIMER(sk);
-		return 0;
+		goto restore_context;
 	}
 
 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
@@ -1545,7 +1555,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc
 				rsk = nsk;
 				goto reset;
 			}
-			return 0;
+			goto restore_context;
 		}
 	}
 
@@ -1555,6 +1565,9 @@ int tcp_v4_do_rcv(struct sock *sk, struc
 		goto reset;
 	}
 	TCP_CHECK_TIMER(sk);
+
+restore_context:
+	(void)set_exec_ub(ub);
 	return 0;
 
 reset:
@@ -1566,7 +1579,7 @@ discard:
 	 * might be destroyed here. This current version compiles correctly,
 	 * but you have been warned.
 	 */
-	return 0;
+	goto restore_context;
 
 csum_err:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
@@ -1842,6 +1855,8 @@ static int tcp_v4_init_sock(struct sock 
 	tp->snd_cwnd_clamp = ~0;
 	tp->mss_cache = 536;
 
+	tp->advmss = 65535; /* max value */
+
 	tp->reordering = sysctl_tcp_reordering;
 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
 
@@ -1905,6 +1920,8 @@ void tcp_v4_destroy_sock(struct sock *sk
 	 * If sendmsg cached page exists, toss it.
 	 */
 	if (sk->sk_sndmsg_page) {
+		/* queue is empty, uncharge */
+		ub_sock_tcp_detachpage(sk);
 		__free_page(sk->sk_sndmsg_page);
 		sk->sk_sndmsg_page = NULL;
 	}
@@ -1979,7 +1996,9 @@ get_req:
 	}
 get_sk:
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
+		if (!net_access_allowed(sock_net(sk), net))
+			continue;
+		if (sk->sk_family == st->family) {
 			cur = sk;
 			goto out;
 		}
@@ -2043,7 +2062,7 @@ static void *established_get_first(struc
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family ||
-			    !net_eq(sock_net(sk), net)) {
+			    !net_access_allowed(sock_net(sk), net)) {
 				continue;
 			}
 			rc = sk;
@@ -2053,7 +2072,7 @@ static void *established_get_first(struc
 		inet_twsk_for_each(tw, node,
 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
 			if (tw->tw_family != st->family ||
-			    !net_eq(twsk_net(tw), net)) {
+			    !net_access_allowed(twsk_net(tw), net)) {
 				continue;
 			}
 			rc = tw;
@@ -2080,7 +2099,8 @@ static void *established_get_next(struct
 		tw = cur;
 		tw = tw_next(tw);
 get_tw:
-		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
+		while (tw && (tw->tw_family != st->family ||
+		       !net_access_allowed(twsk_net(tw), net))) {
 			tw = tw_next(tw);
 		}
 		if (tw) {
@@ -2103,7 +2123,8 @@ get_tw:
 		sk = sk_nulls_next(sk);
 
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
+		if (sk->sk_family == st->family &&
+		    net_access_allowed(sock_net(sk), net))
 			goto found;
 	}
 
@@ -2508,12 +2529,17 @@ static int __net_init tcp_sk_init(struct
 static void __net_exit tcp_sk_exit(struct net *net)
 {
 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
-	inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
+}
+
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
 }
 
 static struct pernet_operations __net_initdata tcp_sk_ops = {
-       .init = tcp_sk_init,
-       .exit = tcp_sk_exit,
+       .init	   = tcp_sk_init,
+       .exit	   = tcp_sk_exit,
+       .exit_batch = tcp_sk_exit_batch,
 };
 
 void __init tcp_v4_init(void)
@@ -2523,6 +2549,93 @@ void __init tcp_v4_init(void)
 		panic("Failed to create the TCP control socket.\n");
 }
 
+#ifdef CONFIG_VE
+static void tcp_kill_ve_onesk(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Check the assumed state of the socket. */
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		printk(KERN_WARNING "Killing sk: dead %d, state %d, "
+			"wrseq %u unseq %u, wrqu %d.\n",
+			sock_flag(sk, SOCK_DEAD), sk->sk_state,
+			tp->write_seq, tp->snd_una,
+			!skb_queue_empty(&sk->sk_write_queue));
+		sk->sk_err = ECONNRESET;
+		sk->sk_error_report(sk);
+	}
+
+	tcp_send_active_reset(sk, GFP_ATOMIC);
+	switch (sk->sk_state) {
+		case TCP_FIN_WAIT1:
+		case TCP_CLOSING:
+			/* In these 2 states the peer may want us to retransmit
+			 * some data and/or FIN.  Entering "resetting mode"
+			 * instead.
+			 */
+			tcp_time_wait(sk, TCP_CLOSE, 0);
+			break;
+		case TCP_FIN_WAIT2:
+			/* For some reason the socket may stay in this state
+			 * without turning into a TW bucket.  Fix it.
+			 */
+			tcp_time_wait(sk, TCP_FIN_WAIT2, 0);
+			break;
+		default:
+			/* Just jump into CLOSED state. */
+			tcp_done(sk);
+			break;
+	}
+}
+
+void tcp_v4_kill_ve_sockets(struct ve_struct *envid)
+{
+	struct inet_ehash_bucket *head;
+	int i, retry;
+
+	/* alive */
+again:
+	retry = 0;
+	local_bh_disable();
+	head = tcp_hashinfo.ehash;
+	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+		struct sock *sk;
+		struct hlist_nulls_node *node;
+		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i);
+more_work:
+		spin_lock(lock);
+		sk_nulls_for_each(sk, node, &head[i].chain) {
+			if (ve_accessible_strict(sk->owner_env, envid)) {
+				sock_hold(sk);
+				spin_unlock(lock);
+
+				bh_lock_sock(sk);
+				if (sock_owned_by_user(sk)) {
+					retry = 1;
+					bh_unlock_sock(sk);
+					sock_put(sk);
+					break;
+				}
+				/* sk might have disappeared from the hash before
+				 * we got the lock */
+				if (sk->sk_state != TCP_CLOSE)
+					tcp_kill_ve_onesk(sk);
+				bh_unlock_sock(sk);
+				sock_put(sk);
+				goto more_work;
+			}
+		}
+		spin_unlock(lock);
+	}
+	local_bh_enable();
+	if (retry) {
+		schedule_timeout_interruptible(HZ);
+		goto again;
+	}
+}
+EXPORT_SYMBOL(tcp_v4_kill_ve_sockets);
+#endif
+
 EXPORT_SYMBOL(ipv4_specific);
 EXPORT_SYMBOL(tcp_hashinfo);
 EXPORT_SYMBOL(tcp_prot);
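
A hypothetical caller sketch for the export above (the real call site is outside this section; ve_stop_net() is an invented name): VE teardown resets every remaining container-owned TCP socket before dismantling per-VE network state, relying on the retry loop in tcp_v4_kill_ve_sockets() to wait out sockets currently locked by user context.

#ifdef CONFIG_VE
/* Sketch only, not a function from this patch. */
static void ve_stop_net(struct ve_struct *ve)
{
	/* Send RSTs and push sockets to CLOSE or TIME-WAIT so the
	 * established hash holds no sockets of this container. */
	tcp_v4_kill_ve_sockets(ve);
}
#endif
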
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_minisocks.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_minisocks.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_minisocks.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_minisocks.c	2015-01-21 12:02:45.368169451 +0300
@@ -26,6 +26,9 @@
 #include <net/inet_common.h>
 #include <net/xfrm.h>
 
+#include <bc/net.h>
+#include <bc/sock_orphan.h>
+
 #ifdef CONFIG_SYSCTL
 #define SYNC_INIT 0 /* let the user enable it */
 #else
@@ -36,6 +39,11 @@ int sysctl_tcp_syncookies __read_mostly 
 EXPORT_SYMBOL(sysctl_tcp_syncookies);
 
 int sysctl_tcp_abort_on_overflow __read_mostly;
+int sysctl_tcp_max_tw_kmem_fraction __read_mostly = 384;
+int sysctl_tcp_max_tw_buckets_ub __read_mostly = 16536;
+
+EXPORT_SYMBOL(sysctl_tcp_max_tw_kmem_fraction);
+EXPORT_SYMBOL(sysctl_tcp_max_tw_buckets_ub);
 
 struct inet_timewait_death_row tcp_death_row = {
 	.sysctl_max_tw_buckets = NR_FILE * 2,
@@ -51,6 +59,7 @@ struct inet_timewait_death_row tcp_death
 	.twcal_hand	= -1,
 	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
 					    (unsigned long)&tcp_death_row),
+	.ub_managed	= 1,
 };
 
 EXPORT_SYMBOL_GPL(tcp_death_row);
@@ -280,7 +289,8 @@ void tcp_time_wait(struct sock *sk, int 
 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
 
-	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
+	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets &&
+			ub_timewait_check(sk, &tcp_death_row))
 		tw = inet_twsk_alloc(sk, state);
 
 	if (tw != NULL) {
@@ -293,6 +303,8 @@ void tcp_time_wait(struct sock *sk, int 
 		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
 		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+		if (sk->sk_user_data != NULL)
+			tw->tw_rcv_wscale |= TW_WSCALE_SPEC;
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		if (tw->tw_family == PF_INET6) {
@@ -327,6 +339,7 @@ void tcp_time_wait(struct sock *sk, int 
 			}
 		} while (0);
 #endif
+		tw->tw_owner_env = VEID(sk->owner_env);
 
 		/* Linkage updates. */
 		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
@@ -347,11 +360,16 @@ void tcp_time_wait(struct sock *sk, int 
 				   TCP_TIMEWAIT_LEN);
 		inet_twsk_put(tw);
 	} else {
+		int ubid = 0;
 		/* Sorry, if we're out of memory, just CLOSE this
 		 * socket up.  We've got bigger problems than
 		 * non-graceful socket closings.
 		 */
-		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
+#ifdef CONFIG_BEANCOUNTERS
+		if (sock_has_ubc(sk))
+			ubid = sock_bc(sk)->ub->ub_uid;
+#endif
+		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow (CT%d)\n", ubid);
 	}
 
 	tcp_update_metrics(sk);
@@ -392,6 +410,8 @@ struct sock *tcp_create_openreq_child(st
 		struct tcp_sock *newtp;
 
 		/* Now setup tcp_sock */
+		newsk->owner_env = sk->owner_env;
+
 		newtp = tcp_sk(newsk);
 		newtp->pred_flags = 0;
 		newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
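
Taken in isolation, the gate added to tcp_time_wait() above reads as follows; ub_timewait_check() presumably consults the two per-beancounter knobs exported at the top of this file (sysctl_tcp_max_tw_buckets_ub and sysctl_tcp_max_tw_kmem_fraction), so a time-wait bucket is allocated only when both the global and the per-container limits pass:

	/* Both limits must hold before a TW bucket is allocated; on
	 * failure the socket is closed non-gracefully and the overflow
	 * message names the container (CT id). */
	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets &&
	    ub_timewait_check(sk, &tcp_death_row))
		tw = inet_twsk_alloc(sk, state);
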
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_output.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_output.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_output.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_output.c	2015-01-21 12:02:47.489113145 +0300
@@ -39,6 +39,9 @@
 #include <linux/compiler.h>
 #include <linux/module.h>
 
+#include <bc/net.h>
+#include <bc/tcp.h>
+
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse __read_mostly = 1;
 
@@ -374,11 +377,6 @@ static void tcp_init_nondata_skb(struct 
 	TCP_SKB_CB(skb)->end_seq = seq;
 }
 
-static inline int tcp_urg_mode(const struct tcp_sock *tp)
-{
-	return tp->snd_una != tp->snd_up;
-}
-
 #define OPTION_SACK_ADVERTISE	(1 << 0)
 #define OPTION_TS		(1 << 1)
 #define OPTION_MD5		(1 << 2)
@@ -669,11 +667,17 @@ static void tcp_tasklet_func(unsigned lo
 		if (!sock_owned_by_user(sk)) {
 			if ((1 << sk->sk_state) &
 			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
-			     TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
+			     TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
+				struct ve_struct *ve;
+
+				/* tcp_time_stamp depends on exec_env */
+				ve = set_exec_env(sk->owner_env);
 				tcp_write_xmit(sk,
 					       tcp_current_mss(sk),
 					       tcp_sk(sk)->nonagle, 0,
 					       GFP_ATOMIC);
+				set_exec_env(ve);
+			}
 		} else {
 			/* defer the work to tcp_release_cb() */
 			set_bit(TSQ_OWNED, &tp->tsq_flags);
@@ -753,6 +757,13 @@ void tcp_wfree(struct sk_buff *skb)
 	}
 }
 
+static int skb_header_size(struct sock *sk, int tcp_hlen)
+{
+	struct ip_options *opt = inet_sk(sk)->opt;
+	return tcp_hlen + sizeof(struct iphdr) +
+		(opt ? opt->optlen : 0)	+ ETH_HLEN /* For hard header */;
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -777,6 +788,7 @@ static int tcp_transmit_skb(struct sock 
 	__u8 *md5_hash_location;
 	struct tcphdr *th;
 	int err;
+	int header_size;
 
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
 
@@ -807,6 +819,20 @@ static int tcp_transmit_skb(struct sock 
 							   &md5);
 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
 
+	/* Unfortunately, we can get an skb from the outside world here
+	 * whose headroom is insufficient for the headers.  It is impossible
+	 * to guess that at the time the skb is queued, so the decision has
+	 * to be made here. Den
+	 */
+	header_size = skb_header_size(sk, tcp_header_size);
+	if (skb->data - header_size < skb->head) {
+		int delta = header_size - skb_headroom(skb);
+		err = pskb_expand_head(skb, SKB_DATA_ALIGN(delta),
+				0, GFP_ATOMIC);
+		if (err)
+			return err;
+	}
+
 	if (tcp_packets_in_flight(tp) == 0) {
 		tcp_ca_event(sk, CA_EVENT_TX_START);
 		skb->ooo_okay = 1;
@@ -986,15 +1012,21 @@ int tcp_fragment(struct sock *sk, struct
 	if (nsize < 0)
 		nsize = 0;
 
-	if (skb_cloned(skb) &&
-	    skb_is_nonlinear(skb) &&
-	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-		return -ENOMEM;
+	if (skb_cloned(skb) && skb_is_nonlinear(skb)) {
+		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+			return -ENOMEM;
+		ub_skb_uncharge(skb);
+		ub_tcpsndbuf_charge_forced(sk, skb);
+	}
 
 	/* Get a new skb... force flag on. */
 	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
 	if (buff == NULL)
 		return -ENOMEM; /* We'll just try again later. */
+	if (ub_tcpsndbuf_charge(sk, buff) < 0) {
+		kfree_skb(buff);
+		return -ENOMEM;
+	}
 
 	sk->sk_wmem_queued += buff->truesize;
 	sk_mem_charge(sk, buff->truesize);
@@ -1459,6 +1491,11 @@ static int tso_fragment(struct sock *sk,
 	if (unlikely(buff == NULL))
 		return -ENOMEM;
 
+	if (ub_tcpsndbuf_charge(sk, buff) < 0) {
+		kfree_skb(buff);
+		return -ENOMEM;
+	}
+
 	sk->sk_wmem_queued += buff->truesize;
 	sk_mem_charge(sk, buff->truesize);
 	buff->truesize += nlen;
@@ -1923,7 +1960,7 @@ u32 __tcp_select_window(struct sock *sk)
 	if (free_space < (full_space >> 1)) {
 		icsk->icsk_ack.quick = 0;
 
-		if (tcp_memory_pressure)
+		if (ub_tcp_shrink_rcvbuf(sk))
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
 					       4U * tp->advmss);
 
@@ -2367,6 +2404,7 @@ void tcp_send_fin(struct sock *sk)
 				break;
 			yield();
 		}
+		ub_tcpsndbuf_charge_forced(sk, skb);
 
 		/* Reserve space for headers and prepare control bits. */
 		skb_reserve(skb, MAX_TCP_HEADER);
@@ -2426,6 +2464,10 @@ int tcp_send_synack(struct sock *sk)
 			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 			if (nskb == NULL)
 				return -ENOMEM;
+			if (ub_tcpsndbuf_charge(sk, nskb) < 0) {
+				kfree_skb(nskb);
+				return -ENOMEM;
+			}
 			tcp_unlink_write_queue(skb, sk);
 			skb_header_release(nskb);
 			__tcp_add_write_queue_head(sk, nskb);
@@ -2536,6 +2578,7 @@ static void tcp_connect_init(struct sock
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u8 rcv_wscale;
+	static int once = 0;
 
 	/* We'll fix this up when we get a response from the other end.
 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -2555,11 +2598,25 @@ static void tcp_connect_init(struct sock
 	tcp_mtup_init(sk);
 	tcp_sync_mss(sk, dst_mtu(dst));
 
+	if (!once && dst_metric(dst, RTAX_ADVMSS) == 0) {
+		once = 1;
+
+		printk("Oops in connect_init! dst->advmss=%d\n",
+						dst_metric(dst, RTAX_ADVMSS));
+		printk("dst: pmtu=%u\n", dst_metric(dst, RTAX_MTU));
+		printk("sk->state=%d, tp: ack.rcv_mss=%d, mss_cache=%d, "
+				"advmss=%d, user_mss=%d\n",
+				sk->sk_state, inet_csk(sk)->icsk_ack.rcv_mss,
+				tp->mss_cache, tp->advmss, tp->rx_opt.user_mss);
+	}
+
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
 	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
 		tp->advmss = tp->rx_opt.user_mss;
+	if (tp->advmss == 0)
+		tp->advmss = 1460;
 
 	tcp_initialize_rcv_mss(sk);
 
@@ -2602,6 +2659,10 @@ int tcp_connect(struct sock *sk)
 	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
 	if (unlikely(buff == NULL))
 		return -ENOBUFS;
+	if (ub_tcpsndbuf_charge(sk, buff) < 0) {
+		kfree_skb(buff);
+		return -ENOBUFS;
+	}
 
 	/* Reserve space for headers. */
 	skb_reserve(buff, MAX_TCP_HEADER);
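
The tcp_output.c changes above share one shape, shown here in isolation (size stands in for the caller's length argument): every skb headed for the send queue is charged to the container's TCP send-buffer resource immediately after allocation, and a failed charge is handled exactly like a failed allocation. Paths that must not fail (sending a FIN, re-charging a fragmented skb) use the _forced variant instead.

	buff = sk_stream_alloc_skb(sk, size, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM;
	if (ub_tcpsndbuf_charge(sk, buff) < 0) {
		kfree_skb(buff);	/* never queue an uncharged skb */
		return -ENOMEM;
	}
	/* ...queue buff exactly as before... */
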
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_timer.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_timer.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/tcp_timer.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/tcp_timer.c	2015-01-21 12:02:45.369169424 +0300
@@ -20,6 +20,8 @@
 
 #include <linux/module.h>
 #include <net/tcp.h>
+#include <bc/sock_orphan.h>
+#include <bc/tcp.h>
 
 int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
 int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
@@ -66,7 +68,8 @@ static void tcp_write_err(struct sock *s
 static int tcp_out_of_resources(struct sock *sk, int do_reset)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int orphans = percpu_counter_read_positive(&tcp_orphan_count);
+	int orphans = ub_get_orphan_count(sk);
+	int orph = orphans;
 
 	/* If peer does not open window for long time, or did not transmit
 	 * anything for long time, penalize it. */
@@ -77,10 +80,16 @@ static int tcp_out_of_resources(struct s
 	if (sk->sk_err_soft)
 		orphans <<= 1;
 
-	if (tcp_too_many_orphans(sk, orphans)) {
-		if (net_ratelimit())
-			printk(KERN_INFO "Out of socket memory\n");
-
+	if (ub_too_many_orphans(sk, orphans)) {
+		if (net_ratelimit()) {
+			int ubid = 0;
+#ifdef CONFIG_BEANCOUNTERS
+			ubid = sock_has_ubc(sk) ?
+					sock_bc(sk)->ub->ub_uid : 0;
+#endif
+			printk(KERN_INFO "Orphaned socket dropped "
+			       "(%d,%d in CT%d)\n", orph, orphans, ubid);
+		}
 		/* Catch exceptional cases, when connection requires reset.
 		 *      1. Last segment was sent recently. */
 		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
@@ -180,6 +189,9 @@ static void tcp_delack_timer(unsigned lo
 	struct sock *sk = (struct sock *)data;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct ve_struct *ve;
+
+	ve = set_exec_env(sk->owner_env);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
@@ -234,6 +246,8 @@ out:
 out_unlock:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+
+	(void)set_exec_env(ve);
 }
 
 static void tcp_probe_timer(struct sock *sk)
@@ -241,10 +255,13 @@ static void tcp_probe_timer(struct sock 
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_probes;
+	struct ve_struct *ve;
+
+	ve = set_exec_env(sk->owner_env);
 
 	if (tp->packets_out || !tcp_send_head(sk)) {
 		icsk->icsk_probes_out = 0;
-		return;
+		goto out;
 	}
 
 	/* *WARNING* RFC 1122 forbids this
@@ -270,7 +287,7 @@ static void tcp_probe_timer(struct sock 
 		max_probes = tcp_orphan_retries(sk, alive);
 
 		if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
-			return;
+			goto out;
 	}
 
 	if (icsk->icsk_probes_out > max_probes) {
@@ -279,6 +296,9 @@ static void tcp_probe_timer(struct sock 
 		/* Only send another probe if we didn't close things up. */
 		tcp_send_probe0(sk);
 	}
+
+out:
+	(void)set_exec_env(ve);
 }
 
 /*
@@ -289,6 +309,9 @@ void tcp_retransmit_timer(struct sock *s
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct ve_struct *ve;
+
+	ve = set_exec_env(sk->owner_env);
 
 	if (!tp->packets_out)
 		goto out;
@@ -412,7 +435,8 @@ out_reset_timer:
 	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
 		__sk_dst_reset(sk);
 
-out:;
+out:
+	(void)set_exec_env(ve);
 }
 
 static void tcp_write_timer(unsigned long data)
@@ -420,6 +444,9 @@ static void tcp_write_timer(unsigned lon
 	struct sock *sk = (struct sock *)data;
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int event;
+	struct ve_struct *ve;
+
+	ve = set_exec_env(sk->owner_env);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
@@ -454,6 +481,8 @@ out:
 out_unlock:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+
+	(void)set_exec_env(ve);
 }
 
 /*
@@ -484,6 +513,9 @@ static void tcp_keepalive_timer (unsigne
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 elapsed;
+	struct ve_struct *ve;
+
+	ve = set_exec_env(sk->owner_env);
 
 	/* Only process if socket is not in use. */
 	bh_lock_sock(sk);
@@ -564,4 +596,5 @@ death:
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	(void)set_exec_env(ve);
 }
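
Every timer handler in this file receives the same wrapper, isolated below: switch the execution environment to the socket owner's VE on entry and restore it on every exit path, since (as the tcp_output.c hunk notes) tcp_time_stamp is resolved against the current exec_env. The early returns become gotos precisely so the restore cannot be skipped.

/* Sketch of the wrapper; example_tcp_timer is an invented name. */
static void example_tcp_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct ve_struct *ve = set_exec_env(sk->owner_env);

	/* ...original timer body, with every early return rewritten
	 * as a goto that falls through to the restore below... */

	(void)set_exec_env(ve);
}
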
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv4/udp.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/udp.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv4/udp.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv4/udp.c	2015-01-21 12:02:47.558111312 +0300
@@ -142,6 +142,7 @@ static int udp_lib_lport_inuse(struct ne
 		    sk2 != sk					&&
 		    (bitmap || sk2->sk_hash == num)		&&
 		    (!sk2->sk_reuse || !sk->sk_reuse)		&&
+		    sk->sk_reuse != 2				&&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
 			|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
@@ -1696,7 +1697,7 @@ static struct sock *udp_get_first(struct
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
 		spin_lock_bh(&hslot->lock);
 		sk_nulls_for_each(sk, node, &hslot->head) {
-			if (!net_eq(sock_net(sk), net))
+			if (!net_access_allowed(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
 				goto found;
@@ -1715,7 +1716,7 @@ static struct sock *udp_get_next(struct 
 
 	do {
 		sk = sk_nulls_next(sk);
-	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
+	} while (sk && (!net_access_allowed(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
 		if (state->bucket < UDP_HTABLE_SIZE)
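
The net_eq() -> net_access_allowed() substitution in the /proc socket walkers (here and in tcp_ipv4.c above) relaxes strict namespace identity to a visibility check. Conceptually (assumed semantics -- the helper itself is defined elsewhere in the patch):

static inline int net_access_allowed_sketch(struct net *sk_net,
					    struct net *viewer_net)
{
	/* identical namespaces always match; a privileged viewer such
	 * as the host may additionally be allowed to see container
	 * sockets */
	return net_eq(sk_net, viewer_net) /* || viewer-is-privileged */;
}
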
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/addrconf.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/addrconf.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/addrconf.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/addrconf.c	2015-01-21 12:02:57.979834660 +0300
@@ -407,9 +407,8 @@ static struct inet6_dev * ipv6_add_dev(s
 	    dev->type == ARPHRD_TUNNEL6 ||
 	    dev->type == ARPHRD_SIT ||
 	    dev->type == ARPHRD_NONE) {
-		printk(KERN_INFO
-		       "%s: Disabled Privacy Extensions\n",
-		       dev->name);
+		ADBG((KERN_INFO "%s: Disabled Privacy Extensions\n",
+			dev->name));
 		ndev->cnf.use_tempaddr = -1;
 	} else {
 		in6_dev_hold(ndev);
@@ -629,7 +628,7 @@ ipv6_add_addr(struct inet6_dev *idev, co
 		goto out;
 	}
 
-	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
+	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC_UBC);
 
 	if (ifa == NULL) {
 		ADBG(("ipv6_add_addr: malloc failed\n"));
@@ -2135,7 +2134,7 @@ err_exit:
 /*
  *	Manual configuration of address on an interface
  */
-static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
+int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
 			  unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
 			  __u32 valid_lft)
 {
@@ -2202,6 +2201,7 @@ static int inet6_addr_add(struct net *ne
 
 	return PTR_ERR(ifp);
 }
+EXPORT_SYMBOL(inet6_addr_add);
 
 static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx,
 			  unsigned int plen)
@@ -2233,7 +2233,8 @@ static int inet6_addr_del(struct net *ne
 			   disable IPv6 on this interface.
 			 */
 			if (idev->addr_list == NULL)
-				addrconf_ifdown(idev->dev, 1);
+				addrconf_ifdown(idev->dev,
+						!(idev->dev->flags & IFF_LOOPBACK));
 			return 0;
 		}
 	}
@@ -2247,7 +2248,7 @@ int addrconf_add_ifaddr(struct net *net,
 	struct in6_ifreq ireq;
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
@@ -2266,7 +2267,7 @@ int addrconf_del_ifaddr(struct net *net,
 	struct in6_ifreq ireq;
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
@@ -2706,7 +2707,6 @@ static int addrconf_ifdown(struct net_de
 	ASSERT_RTNL();
 
 	rt6_ifdown(net, dev);
-	neigh_ifdown(&nd_tbl, dev);
 
 	idev = __in6_dev_get(dev);
 	if (idev == NULL)
@@ -2818,6 +2818,9 @@ put_ifa:
 static void addrconf_rs_timer(unsigned long data)
 {
 	struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
+	struct ve_struct *old_env;
+	
+	old_env = set_exec_env(ifp->idev->dev->owner_env);
 
 	if (ifp->idev->cnf.forwarding)
 		goto out;
@@ -2852,6 +2855,7 @@ static void addrconf_rs_timer(unsigned l
 
 out:
 	in6_ifa_put(ifp);
+	(void)set_exec_env(old_env);
 }
 
 /*
@@ -2888,6 +2892,7 @@ static void addrconf_dad_start(struct in
 	if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
 	    idev->cnf.accept_dad < 1 ||
 	    !(ifp->flags&IFA_F_TENTATIVE) ||
+	    dev->owner_env->disable_net ||
 	    ifp->flags & IFA_F_NODAD) {
 		ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
 		spin_unlock_bh(&ifp->lock);
@@ -2928,7 +2933,9 @@ static void addrconf_dad_timer(unsigned 
 	struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
 	struct inet6_dev *idev = ifp->idev;
 	struct in6_addr mcaddr;
+	struct ve_struct *old_env;
 
+	old_env = set_exec_env(ifp->idev->dev->owner_env);
 	if (!ifp->probes && addrconf_dad_end(ifp))
 		goto out;
 
@@ -2962,6 +2969,7 @@ static void addrconf_dad_timer(unsigned 
 	ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any);
 out:
 	in6_ifa_put(ifp);
+	(void)set_exec_env(old_env);
 }
 
 static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
@@ -3181,6 +3189,7 @@ static void addrconf_verify(unsigned lon
 	struct inet6_ifaddr *ifp;
 	unsigned long now, next;
 	int i;
+	struct ve_struct *old_env;
 
 	spin_lock_bh(&addrconf_verify_lock);
 	now = jiffies;
@@ -3201,6 +3210,8 @@ restart:
 			if (ifp->flags & IFA_F_PERMANENT)
 				continue;
 
+			old_env = set_exec_env(ifp->idev->dev->owner_env);
+
 			spin_lock(&ifp->lock);
 			age = (now - ifp->tstamp) / HZ;
 
@@ -3216,9 +3227,11 @@ restart:
 				in6_ifa_hold(ifp);
 				read_unlock(&addrconf_hash_lock);
 				ipv6_del_addr(ifp);
+				(void)set_exec_env(old_env);
 				goto restart;
 			} else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
 				spin_unlock(&ifp->lock);
+				set_exec_env(old_env);
 				continue;
 			} else if (age >= ifp->prefered_lft) {
 				/* jiffies - ifp->tstamp > age >= ifp->prefered_lft */
@@ -3240,6 +3253,7 @@ restart:
 
 					ipv6_ifa_notify(0, ifp);
 					in6_ifa_put(ifp);
+					(void)set_exec_env(old_env);
 					goto restart;
 				}
 #ifdef CONFIG_IPV6_PRIVACY
@@ -3261,6 +3275,7 @@ restart:
 						ipv6_create_tempaddr(ifpub, ifp);
 						in6_ifa_put(ifpub);
 						in6_ifa_put(ifp);
+						(void)set_exec_env(old_env);
 						goto restart;
 					}
 				} else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
@@ -3273,6 +3288,7 @@ restart:
 					next = ifp->tstamp + ifp->prefered_lft * HZ;
 				spin_unlock(&ifp->lock);
 			}
+			(void)set_exec_env(old_env);
 		}
 		read_unlock(&addrconf_hash_lock);
 	}
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/af_inet6.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/af_inet6.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/af_inet6.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/af_inet6.c	2015-01-21 12:02:50.572031301 +0300
@@ -56,6 +56,10 @@
 #ifdef CONFIG_IPV6_TUNNEL
 #include <net/ip6_tunnel.h>
 #endif
+#ifdef CONFIG_IPV6_MIP6
+#include <net/mip6.h>
+#endif
+#include <bc/net.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -78,6 +82,12 @@ struct ipv6_params ipv6_defaults = {
 
 static int disable_ipv6_mod = 0;
 
+bool ipv6_is_enabled(void)
+{
+	return !disable_ipv6_mod;
+}
+EXPORT_SYMBOL(ipv6_is_enabled);
+
 module_param_named(disable, disable_ipv6_mod, int, 0444);
 MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional");
 
@@ -157,6 +167,10 @@ lookup_protocol:
 			goto out_rcu_unlock;
 	}
 
+	err = vz_security_protocol_check(answer->protocol);
+	if (err < 0)
+		goto out_rcu_unlock;
+
 	err = -EPERM;
 	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
 		goto out_rcu_unlock;
@@ -174,6 +188,13 @@ lookup_protocol:
 	if (sk == NULL)
 		goto out;
 
+	err = -ENOBUFS;
+	if (ub_sock_charge(sk, PF_INET6, sock->type, kern))
+		goto out_sk_free;
+	/* If the charge was successful, sock_init_data() MUST be called to set
+	 * sk->sk_type; otherwise sk would be uncharged against the wrong resource.
+	 */
+
 	sock_init_data(sock, sk);
 
 	err = 0;
@@ -249,6 +270,9 @@ out:
 out_rcu_unlock:
 	rcu_read_unlock();
 	goto out;
+out_sk_free:
+	sk_free(sk);
+	return err;
 }
 
 
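The ordering constraint the inet6_create() hunk documents, shown linearly (this mirrors the code above): a later uncharge keys off sk->sk_type, so sock_init_data() must run right after a successful charge, while a failed charge frees the still-uninitialized sock with sk_free().

	sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot);
	if (sk == NULL)
		goto out;

	err = -ENOBUFS;
	if (ub_sock_charge(sk, PF_INET6, sock->type, kern))
		goto out_sk_free;	/* sk_free(sk); return err; */

	/* only now: sets sk->sk_type, which the uncharge keys off */
	sock_init_data(sock, sk);
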
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/exthdrs_core.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/exthdrs_core.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/exthdrs_core.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/exthdrs_core.c	2015-01-21 12:02:58.108831237 +0300
@@ -162,7 +162,7 @@ int ipv6_find_tlv(struct sk_buff *skb, i
 EXPORT_SYMBOL_GPL(ipv6_find_tlv);
 EXPORT_SYMBOL(ipv6_ext_hdr);
 EXPORT_SYMBOL(ipv6_skip_exthdr);
-EXPORT_SYMBOL_GPL(ipv6_skip_exthdr_fragoff);
+EXPORT_SYMBOL(ipv6_skip_exthdr_fragoff);
 
 /*
  * find the offset to specified header or the protocol number of last header
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/fib6_rules.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/fib6_rules.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/fib6_rules.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/fib6_rules.c	2015-01-21 12:02:42.446247027 +0300
@@ -263,16 +263,14 @@ static struct fib_rules_ops fib6_rules_o
 
 static int fib6_rules_net_init(struct net *net)
 {
+	struct fib_rules_ops *ops;
 	int err = -ENOMEM;
 
-	net->ipv6.fib6_rules_ops = kmemdup(&fib6_rules_ops_template,
-					   sizeof(*net->ipv6.fib6_rules_ops),
-					   GFP_KERNEL);
-	if (!net->ipv6.fib6_rules_ops)
-		goto out;
+	ops = fib_rules_register(&fib6_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+	net->ipv6.fib6_rules_ops = ops;
 
-	net->ipv6.fib6_rules_ops->fro_net = net;
-	INIT_LIST_HEAD(&net->ipv6.fib6_rules_ops->rules_list);
 
 	err = fib_default_rule_add(net->ipv6.fib6_rules_ops, 0,
 				   RT6_TABLE_LOCAL, FIB_RULE_PERMANENT);
@@ -282,25 +280,19 @@ static int fib6_rules_net_init(struct ne
 	err = fib_default_rule_add(net->ipv6.fib6_rules_ops,
 				   0x7FFE, RT6_TABLE_MAIN, 0);
 	if (err)
-		goto out_fib6_default_rule_add;
+		goto out_fib6_rules_ops;
 
-	err = fib_rules_register(net->ipv6.fib6_rules_ops);
-	if (err)
-		goto out_fib6_default_rule_add;
 out:
 	return err;
 
-out_fib6_default_rule_add:
-	fib_rules_cleanup_ops(net->ipv6.fib6_rules_ops);
 out_fib6_rules_ops:
-	kfree(net->ipv6.fib6_rules_ops);
+	fib_rules_unregister(ops);
 	goto out;
 }
 
 static void fib6_rules_net_exit(struct net *net)
 {
 	fib_rules_unregister(net->ipv6.fib6_rules_ops);
-	kfree(net->ipv6.fib6_rules_ops);
 }
 
 static struct pernet_operations fib6_rules_net_ops = {
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/ip6_fib.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ip6_fib.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/ip6_fib.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ip6_fib.c	2015-01-21 12:02:45.371169372 +0300
@@ -179,11 +179,9 @@ static void fib6_link_table(struct net *
 
 	h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
 
-	/*
-	 * No protection necessary, this is the only list mutatation
-	 * operation, tables never disappear once they exist.
-	 */
+	write_lock_bh(&tb->tb6_lock);
 	hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
+	write_unlock_bh(&tb->tb6_lock);
 }
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -1446,10 +1444,14 @@ void fib6_clean_all(struct net *net, int
 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
 		head = &net->ipv6.fib_table_hash[h];
 		hlist_for_each_entry_rcu(table, node, head, tb6_hlist) {
+			struct ve_struct *old_env;
+			
+			old_env = set_exec_env(table->owner_env);
 			write_lock_bh(&table->tb6_lock);
 			fib6_clean_tree(net, &table->tb6_root,
 					func, prune, arg);
 			write_unlock_bh(&table->tb6_lock);
+			(void)set_exec_env(old_env);
 		}
 	}
 	rcu_read_unlock();
@@ -1566,6 +1568,9 @@ static int fib6_net_init(struct net *net
 	if (!net->ipv6.fib6_main_tbl)
 		goto out_fib_table_hash;
 
+#ifdef CONFIG_VE
+	net->ipv6.fib6_main_tbl->owner_env = get_exec_env();
+#endif
 	net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
 	net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
 	net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
@@ -1576,6 +1581,10 @@ static int fib6_net_init(struct net *net
 					   GFP_KERNEL);
 	if (!net->ipv6.fib6_local_tbl)
 		goto out_fib6_main_tbl;
+
+#ifdef CONFIG_VE
+	net->ipv6.fib6_local_tbl->owner_env = get_exec_env();
+#endif
 	net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
 	net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
 	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
@@ -1621,7 +1630,7 @@ int __init fib6_init(void)
 
 	fib6_node_kmem = kmem_cache_create("fib6_nodes",
 					   sizeof(struct fib6_node),
-					   0, SLAB_HWCACHE_ALIGN,
+					   0, SLAB_HWCACHE_ALIGN|SLAB_UBC,
 					   NULL);
 	if (!fib6_node_kmem)
 		goto out;
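
The SLAB_UBC flag here (like GFP_ATOMIC_UBC in the addrconf.c hunk above) routes the allocation through user-beancounter accounting, so fib6 nodes created on behalf of a container are presumably charged to that container's memory resource:

	fib6_node_kmem = kmem_cache_create("fib6_nodes",
					   sizeof(struct fib6_node),
					   0, SLAB_HWCACHE_ALIGN | SLAB_UBC,
					   NULL);
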
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/ip6_output.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ip6_output.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/ip6_output.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ip6_output.c	2015-01-21 12:02:58.044832935 +0300
@@ -172,6 +172,7 @@ int ip6_output(struct sk_buff *skb)
 	else
 		return ip6_output2(skb);
 }
+EXPORT_SYMBOL(ip6_output);
 
 /*
  *	xmit an sk_buff (used by TCP)
@@ -408,6 +409,9 @@ int ip6_forward(struct sk_buff *skb)
 		goto drop;
 	}
 
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
 	skb_forward_csum(skb);
 
 	/*
@@ -516,6 +520,20 @@ int ip6_forward(struct sk_buff *skb)
 		return -EMSGSIZE;
 	}
 
+	/*
+	 * We try to optimize forwarding of VE packets: do not
+	 * decrement the TTL (and thereby save the skb_cow) when
+	 * forwarding outgoing packets that originate from a VE.
+	 * For incoming packets we still decrement the TTL, since
+	 * such an skb is not cloned and does not require an actual
+	 * cow.  So there is at least one place in a packet's path
+	 * with a mandatory TTL decrement, which is sufficient to
+	 * prevent routing loops.
+	 */
+	hdr = ipv6_hdr(skb);
+	if (skb->dev->vz_features & NETIF_F_VENET) /* src is VENET device */
+		goto no_ttl_decr;
+
 	if (skb_cow(skb, dst->dev->hard_header_len)) {
 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 		goto drop;
@@ -527,6 +545,7 @@ int ip6_forward(struct sk_buff *skb)
 
 	hdr->hop_limit--;
 
+no_ttl_decr:
 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 		       ip6_forward_finish);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/ip6_tunnel.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ip6_tunnel.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/ip6_tunnel.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ip6_tunnel.c	2015-01-21 12:02:51.221014072 +0300
@@ -1221,7 +1221,7 @@ ip6_tnl_ioctl(struct net_device *dev, st
 	case SIOCADDTUNNEL:
 	case SIOCCHGTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			break;
 		err = -EFAULT;
 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
@@ -1255,7 +1255,7 @@ ip6_tnl_ioctl(struct net_device *dev, st
 		break;
 	case SIOCDELTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			break;
 
 		if (dev == ip6n->fb_tnl_dev) {
@@ -1389,29 +1389,25 @@ static void ip6_tnl_destroy_tunnels(stru
 {
 	int h;
 	struct ip6_tnl *t;
+	LIST_HEAD(list);
 
 	for (h = 0; h < HASH_SIZE; h++) {
-		while ((t = ip6n->tnls_r_l[h]) != NULL)
-			unregister_netdevice(t->dev);
+		t = ip6n->tnls_r_l[h];
+		while (t != NULL) {
+			unregister_netdevice_queue(t->dev, &list);
+			t = t->next;
+		}
 	}
 
 	t = ip6n->tnls_wc[0];
-	unregister_netdevice(t->dev);
+	unregister_netdevice_queue(t->dev, &list);
+	unregister_netdevice_many(&list);
 }
 
 static int ip6_tnl_init_net(struct net *net)
 {
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 	int err;
-	struct ip6_tnl_net *ip6n;
-
-	err = -ENOMEM;
-	ip6n = kzalloc(sizeof(struct ip6_tnl_net), GFP_KERNEL);
-	if (ip6n == NULL)
-		goto err_alloc;
-
-	err = net_assign_generic(net, ip6_tnl_net_id, ip6n);
-	if (err < 0)
-		goto err_assign;
 
 	ip6n->tnls[0] = ip6n->tnls_wc;
 	ip6n->tnls[1] = ip6n->tnls_r_l;
@@ -1434,27 +1430,23 @@ static int ip6_tnl_init_net(struct net *
 err_register:
 	free_netdev(ip6n->fb_tnl_dev);
 err_alloc_dev:
-	/* nothing */
-err_assign:
-	kfree(ip6n);
-err_alloc:
 	return err;
 }
 
 static void ip6_tnl_exit_net(struct net *net)
 {
-	struct ip6_tnl_net *ip6n;
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 
-	ip6n = net_generic(net, ip6_tnl_net_id);
 	rtnl_lock();
 	ip6_tnl_destroy_tunnels(ip6n);
 	rtnl_unlock();
-	kfree(ip6n);
 }
 
 static struct pernet_operations ip6_tnl_net_ops = {
 	.init = ip6_tnl_init_net,
 	.exit = ip6_tnl_exit_net,
+	.id   = &ip6_tnl_net_id,
+	.size = sizeof(struct ip6_tnl_net),
 };
 
 /**
@@ -1467,7 +1459,7 @@ static int __init ip6_tunnel_init(void)
 {
 	int  err;
 
-	err = register_pernet_gen_device(&ip6_tnl_net_id, &ip6_tnl_net_ops);
+	err = register_pernet_device(&ip6_tnl_net_ops);
 	if (err < 0)
 		goto out;
 
@@ -1488,7 +1480,7 @@ static int __init ip6_tunnel_init(void)
 unreg_ip4ip6:
 	xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
 unreg_pernet_dev:
-	unregister_pernet_gen_device(ip6_tnl_net_id, &ip6_tnl_net_ops);
+	unregister_pernet_device(&ip6_tnl_net_ops);
 out:
 	return err;
 }
@@ -1505,7 +1497,7 @@ static void __exit ip6_tunnel_cleanup(vo
 	if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6))
 		printk(KERN_INFO "ip6_tunnel close: can't deregister ip6ip6\n");
 
-	unregister_pernet_gen_device(ip6_tnl_net_id, &ip6_tnl_net_ops);
+	unregister_pernet_device(&ip6_tnl_net_ops);
 }
 
 module_init(ip6_tunnel_init);
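
For reference, the registration shape this file converts to: with .id and .size set in pernet_operations, register_pernet_device() itself allocates and zeroes one struct ip6_tnl_net per namespace, and the callbacks simply look it up (example_init_net is an invented name):

static int __net_init example_init_net(struct net *net)
{
	/* already allocated and zeroed by the pernet core, sized per
	 * .size -- no kzalloc()/net_assign_generic() needed */
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);

	(void)ip6n;
	return 0;
}
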
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/ip6mr.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ip6mr.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/ip6mr.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ip6mr.c	2015-01-21 12:02:51.171015400 +0300
@@ -477,7 +477,7 @@ failure:
  *	Delete a VIF entry
  */
 
-static int mif6_delete(struct net *net, int vifi)
+static int mif6_delete(struct net *net, int vifi, struct list_head *head)
 {
 	struct mif_device *v;
 	struct net_device *dev;
@@ -519,7 +519,7 @@ static int mif6_delete(struct net *net, 
 		in6_dev->cnf.mc_forwarding--;
 
 	if (v->flags & MIFF_REGISTER)
-		unregister_netdevice(dev);
+		unregister_netdevice_queue(dev, head);
 
 	dev_put(dev);
 	return 0;
@@ -976,6 +976,7 @@ static int ip6mr_device_event(struct not
 	struct net *net = dev_net(dev);
 	struct mif_device *v;
 	int ct;
+	LIST_HEAD(list);
 
 	if (event != NETDEV_UNREGISTER)
 		return NOTIFY_DONE;
@@ -983,8 +984,10 @@ static int ip6mr_device_event(struct not
 	v = &net->ipv6.vif6_table[0];
 	for (ct = 0; ct < net->ipv6.maxvif; ct++, v++) {
 		if (v->dev == dev)
-			mif6_delete(net, ct);
+			mif6_delete(net, ct, &list);
 	}
+	unregister_netdevice_many(&list);
+
 	return NOTIFY_DONE;
 }
 
@@ -1188,14 +1191,16 @@ static int ip6mr_mfc_add(struct net *net
 static void mroute_clean_tables(struct net *net)
 {
 	int i;
+	LIST_HEAD(list);
 
 	/*
 	 *	Shut down all active vif entries
 	 */
 	for (i = 0; i < net->ipv6.maxvif; i++) {
 		if (!(net->ipv6.vif6_table[i].flags & VIFF_STATIC))
-			mif6_delete(net, i);
+			mif6_delete(net, i, &list);
 	}
+	unregister_netdevice_many(&list);
 
 	/*
 	 *	Wipe the cache
@@ -1325,7 +1330,7 @@ int ip6_mroute_setsockopt(struct sock *s
 		if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
 			return -EFAULT;
 		rtnl_lock();
-		ret = mif6_delete(net, mifi);
+		ret = mif6_delete(net, mifi, NULL);
 		rtnl_unlock();
 		return ret;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/mcast.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/mcast.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/mcast.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/mcast.c	2015-01-21 12:02:57.980834634 +0300
@@ -196,6 +196,7 @@ int ipv6_sock_mc_join(struct sock *sk, i
 
 	return 0;
 }
+EXPORT_SYMBOL(ipv6_sock_mc_join);
 
 /*
  *	socket leave on multicast group
@@ -2308,15 +2309,18 @@ static void igmp6_leave_group(struct ifm
 static void mld_gq_timer_expire(unsigned long data)
 {
 	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct ve_struct *old_env = set_exec_env(idev->dev->owner_env);
 
 	idev->mc_gq_running = 0;
 	mld_send_report(idev, NULL);
 	in6_dev_put(idev);
+	set_exec_env(old_env);
 }
 
 static void mld_ifc_timer_expire(unsigned long data)
 {
 	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct ve_struct *old_env = set_exec_env(idev->dev->owner_env);
 
 	mld_send_cr(idev);
 	if (idev->mc_ifc_count) {
@@ -2325,6 +2329,7 @@ static void mld_ifc_timer_expire(unsigne
 			mld_ifc_start_timer(idev, idev->mc_maxdelay);
 	}
 	in6_dev_put(idev);
+	set_exec_env(old_env);
 }
 
 static void mld_ifc_event(struct inet6_dev *idev)
@@ -2339,6 +2344,7 @@ static void mld_ifc_event(struct inet6_d
 static void igmp6_timer_handler(unsigned long data)
 {
 	struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
+	struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env);
 
 	if (mld_in_v1_mode(ma->idev))
 		igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
@@ -2350,6 +2356,7 @@ static void igmp6_timer_handler(unsigned
 	ma->mca_flags &= ~MAF_TIMER_RUNNING;
 	spin_unlock(&ma->mca_lock);
 	ma_put(ma);
+	set_exec_env(old_env);
 }
 
 /* Device changing type */
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/ndisc.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ndisc.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/ndisc.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/ndisc.c	2015-01-21 12:02:42.498245647 +0300
@@ -351,14 +351,7 @@ EXPORT_SYMBOL(ndisc_mc_map);
 
 static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
 {
-	const u32 *p32 = pkey;
-	u32 addr_hash, i;
-
-	addr_hash = 0;
-	for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++)
-		addr_hash ^= *p32++;
-
-	return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd);
+	return ndisc_hashfn(pkey, dev, nd_tbl.hash_rnd);
 }
 
 static int ndisc_constructor(struct neighbour *neigh)
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6_queue.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6_queue.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6_queue.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6_queue.c	2015-01-21 12:02:45.437167621 +0300
@@ -411,7 +411,8 @@ ipq_dev_drop(int ifindex)
 static inline void
 __ipq_rcv_skb(struct sk_buff *skb)
 {
-	int status, type, pid, flags, nlmsglen, skblen;
+	int status, type, pid, flags;
+	unsigned int nlmsglen, skblen;
 	struct nlmsghdr *nlh;
 
 	skblen = skb->len;
@@ -439,7 +440,8 @@ __ipq_rcv_skb(struct sk_buff *skb)
 	if (type <= IPQM_BASE)
 		return;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!netlink_capable(skb, CAP_NET_ADMIN) &&
+	    !netlink_capable(skb, CAP_VE_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
 
 	write_lock_bh(&queue_lock);
@@ -469,8 +471,12 @@ __ipq_rcv_skb(struct sk_buff *skb)
 static void
 ipq_rcv_skb(struct sk_buff *skb)
 {
+	struct ve_struct *old_ve;
+
 	mutex_lock(&ipqnl_mutex);
+	old_ve = set_exec_env(skb->owner_env);
 	__ipq_rcv_skb(skb);
+	(void)set_exec_env(old_ve);
 	mutex_unlock(&ipqnl_mutex);
 }
 
@@ -480,9 +486,6 @@ ipq_rcv_dev_event(struct notifier_block 
 {
 	struct net_device *dev = ptr;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		return NOTIFY_DONE;
-
 	/* Drop any packets associated with the downed device */
 	if (event == NETDEV_DOWN)
 		ipq_dev_drop(dev->ifindex);
@@ -502,7 +505,7 @@ ipq_rcv_nl_event(struct notifier_block *
 	if (event == NETLINK_URELEASE &&
 	    n->protocol == NETLINK_IP6_FW && n->pid) {
 		write_lock_bh(&queue_lock);
-		if ((n->net == &init_net) && (n->pid == peer_pid))
+		if (n->pid == peer_pid)
 			__ipq_reset();
 		write_unlock_bh(&queue_lock);
 	}
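
Two things change in the permission check above: the capability is evaluated through netlink_capable() on the request skb (presumably against the sending socket rather than against current), and the container-scoped CAP_VE_NET_ADMIN is accepted alongside the global CAP_NET_ADMIN:

	if (!netlink_capable(skb, CAP_NET_ADMIN) &&
	    !netlink_capable(skb, CAP_VE_NET_ADMIN))
		RCV_SKB_FAIL(-EPERM);
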
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6_tables.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6_tables.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6_tables.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6_tables.c	2015-01-21 12:02:47.741106455 +0300
@@ -351,6 +351,9 @@ ip6t_do_table(struct sk_buff *skb,
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
 
+	if (ve_xt_table_forbidden(table))
+		return NF_ACCEPT;
+
 	/* Initialization */
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
@@ -650,9 +653,10 @@ find_check_match(struct ip6t_entry_match
 	struct xt_match *match;
 	int ret;
 
-	match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name,
-						      m->u.user.revision),
-					"ip6t_%s", m->u.user.name);
+	match = ve0_try_then_request_module(xt_find_match(AF_INET6,
+							  m->u.user.name,
+							  m->u.user.revision),
+					    "ip6t_%s", m->u.user.name);
 	if (IS_ERR(match) || !match) {
 		duprintf("find_check_match: `%s' not found\n", m->u.user.name);
 		return match ? PTR_ERR(match) : -ENOENT;
@@ -717,10 +721,10 @@ find_check_entry(struct ip6t_entry *e, c
 		goto cleanup_matches;
 
 	t = ip6t_get_target(e);
-	target = try_then_request_module(xt_find_target(AF_INET6,
-							t->u.user.name,
-							t->u.user.revision),
-					 "ip6t_%s", t->u.user.name);
+	target = ve0_try_then_request_module(xt_find_target(AF_INET6,
+							    t->u.user.name,
+							    t->u.user.revision),
+					     "ip6t_%s", t->u.user.name);
 	if (IS_ERR(target) || !target) {
 		duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
 		ret = target ? PTR_ERR(target) : -ENOENT;
@@ -1160,8 +1164,8 @@ static int get_info(struct net *net, voi
 	if (compat)
 		xt_compat_lock(AF_INET6);
 #endif
-	t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
-				    "ip6table_%s", name);
+	t = ve0_try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
+					"ip6table_%s", name);
 	if (t && !IS_ERR(t)) {
 		struct ip6t_getinfo info;
 		const struct xt_table_info *private = t->private;
@@ -1257,8 +1261,8 @@ __do_replace(struct net *net, const char
 		goto out;
 	}
 
-	t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
-				    "ip6table_%s", name);
+	t = ve0_try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
+					"ip6table_%s", name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free_newinfo_counters_untrans;
@@ -1528,9 +1532,10 @@ compat_find_calc_match(struct ip6t_entry
 {
 	struct xt_match *match;
 
-	match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name,
-						      m->u.user.revision),
-					"ip6t_%s", m->u.user.name);
+	match = ve0_try_then_request_module(xt_find_match(AF_INET6,
+							  m->u.user.name,
+							  m->u.user.revision),
+					    "ip6t_%s", m->u.user.name);
 	if (IS_ERR(match) || !match) {
 		duprintf("compat_check_calc_match: `%s' not found\n",
 			 m->u.user.name);
@@ -1613,10 +1618,10 @@ check_compat_entry_size_and_hooks(struct
 		goto release_matches;
 
 	t = compat_ip6t_get_target(e);
-	target = try_then_request_module(xt_find_target(AF_INET6,
-							t->u.user.name,
-							t->u.user.revision),
-					 "ip6t_%s", t->u.user.name);
+	target = ve0_try_then_request_module(xt_find_target(AF_INET6,
+							    t->u.user.name,
+							    t->u.user.revision),
+					     "ip6t_%s", t->u.user.name);
 	if (IS_ERR(target) || !target) {
 		duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
 			 t->u.user.name);
@@ -1649,7 +1654,7 @@ check_compat_entry_size_and_hooks(struct
 out:
 	module_put(t->u.kernel.target->me);
 release_matches:
-	IP6T_MATCH_ITERATE(e, compat_release_match, &j);
+	COMPAT_IP6T_MATCH_ITERATE(e, compat_release_match, &j);
 	return ret;
 }
 
@@ -1899,7 +1904,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, 
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2010,7 +2015,7 @@ compat_do_ip6t_get_ctl(struct sock *sk, 
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2032,7 +2037,7 @@ do_ip6t_set_ctl(struct sock *sk, int cmd
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2057,7 +2062,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2089,10 +2094,11 @@ do_ip6t_get_ctl(struct sock *sk, int cmd
 		else
 			target = 0;
 
-		try_then_request_module(xt_find_revision(AF_INET6, rev.name,
-							 rev.revision,
-							 target, &ret),
-					"ip6t_%s", rev.name);
+		ve0_try_then_request_module(xt_find_revision(AF_INET6,
+							     rev.name,
+							     rev.revision,
+							     target, &ret),
+					    "ip6t_%s", rev.name);
 		break;
 	}
 
@@ -2111,7 +2117,7 @@ struct xt_table *ip6t_register_table(str
 	int ret;
 	struct xt_table_info *newinfo;
 	struct xt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+		= { 0, 0, 0, 0, { 0 }, { 0 }, { } };
 	void *loc_cpu_entry;
 	struct xt_table *new_table;
 
@@ -2256,12 +2262,25 @@ static struct xt_match icmp6_matchstruct
 
 static int __net_init ip6_tables_net_init(struct net *net)
 {
-	return xt_proto_init(net, NFPROTO_IPV6);
+	int res;
+
+	if (!net_ipt_permitted(net, VE_IP_IPTABLES6))
+		return 0;
+
+	res = xt_proto_init(net, NFPROTO_IPV6);
+	if (!res)
+		net_ipt_module_set(net, VE_IP_IPTABLES6);
+	return res;
 }
 
 static void __net_exit ip6_tables_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_IPTABLES6))
+		return;
+
 	xt_proto_fini(net, NFPROTO_IPV6);
+
+	net_ipt_module_clear(net, VE_IP_IPTABLES6);
 }
 
 static struct pernet_operations ip6_tables_net_ops = {
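
The same gating pattern is applied to ip6table_filter and ip6table_mangle below; its shape, isolated (example_net_init is an invented name): a container initializes the module only if its per-VE iptables mask permits it, records success with net_ipt_module_set(), and the exit path undoes only what init recorded.

static int __net_init example_net_init(struct net *net)
{
	int res;

	if (!net_ipt_permitted(net, VE_IP_IPTABLES6))
		return 0;	/* silently absent in restricted VEs */

	res = xt_proto_init(net, NFPROTO_IPV6);
	if (!res)
		net_ipt_module_set(net, VE_IP_IPTABLES6);
	return res;
}
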
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6t_LOG.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6t_LOG.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6t_LOG.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6t_LOG.c	2015-01-21 12:02:45.542164833 +0300
@@ -56,15 +56,15 @@ static void dump_packet(const struct nf_
 
 	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
 	if (ih == NULL) {
-		printk("TRUNCATED");
+		ve_printk(VE_LOG, "TRUNCATED");
 		return;
 	}
 
 	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
-	printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+	ve_printk(VE_LOG, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
 
 	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
-	printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+	ve_printk(VE_LOG, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
 	       ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
 	       (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
 	       ih->hop_limit,
@@ -79,35 +79,35 @@ static void dump_packet(const struct nf_
 
 		hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
 		if (hp == NULL) {
-			printk("TRUNCATED");
+			ve_printk(VE_LOG, "TRUNCATED");
 			return;
 		}
 
 		/* Max length: 48 "OPT (...) " */
 		if (logflags & IP6T_LOG_IPOPT)
-			printk("OPT ( ");
+			ve_printk(VE_LOG, "OPT ( ");
 
 		switch (currenthdr) {
 		case IPPROTO_FRAGMENT: {
 			struct frag_hdr _fhdr;
 			const struct frag_hdr *fh;
 
-			printk("FRAG:");
+			ve_printk(VE_LOG, "FRAG:");
 			fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
 						&_fhdr);
 			if (fh == NULL) {
-				printk("TRUNCATED ");
+				ve_printk(VE_LOG, "TRUNCATED ");
 				return;
 			}
 
 			/* Max length: 6 "65535 " */
-			printk("%u ", ntohs(fh->frag_off) & 0xFFF8);
+			ve_printk(VE_LOG, "%u ", ntohs(fh->frag_off) & 0xFFF8);
 
 			/* Max length: 11 "INCOMPLETE " */
 			if (fh->frag_off & htons(0x0001))
-				printk("INCOMPLETE ");
+				ve_printk(VE_LOG, "INCOMPLETE ");
 
-			printk("ID:%08x ", ntohl(fh->identification));
+			ve_printk(VE_LOG, "ID:%08x ", ntohl(fh->identification));
 
 			if (ntohs(fh->frag_off) & 0xFFF8)
 				fragment = 1;
@@ -121,7 +121,7 @@ static void dump_packet(const struct nf_
 		case IPPROTO_HOPOPTS:
 			if (fragment) {
 				if (logflags & IP6T_LOG_IPOPT)
-					printk(")");
+					ve_printk(VE_LOG, ")");
 				return;
 			}
 			hdrlen = ipv6_optlen(hp);
@@ -133,10 +133,10 @@ static void dump_packet(const struct nf_
 				const struct ip_auth_hdr *ah;
 
 				/* Max length: 3 "AH " */
-				printk("AH ");
+				ve_printk(VE_LOG, "AH ");
 
 				if (fragment) {
-					printk(")");
+					ve_printk(VE_LOG, ")");
 					return;
 				}
 
@@ -147,13 +147,13 @@ static void dump_packet(const struct nf_
 					 * Max length: 26 "INCOMPLETE [65535
 					 *  bytes] )"
 					 */
-					printk("INCOMPLETE [%u bytes] )",
+					ve_printk(VE_LOG, "INCOMPLETE [%u bytes] )",
 					       skb->len - ptr);
 					return;
 				}
 
 				/* Length: 15 "SPI=0xF1234567 */
-				printk("SPI=0x%x ", ntohl(ah->spi));
+				ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi));
 
 			}
 
@@ -165,10 +165,10 @@ static void dump_packet(const struct nf_
 				const struct ip_esp_hdr *eh;
 
 				/* Max length: 4 "ESP " */
-				printk("ESP ");
+				ve_printk(VE_LOG, "ESP ");
 
 				if (fragment) {
-					printk(")");
+					ve_printk(VE_LOG, ")");
 					return;
 				}
 
@@ -178,23 +178,23 @@ static void dump_packet(const struct nf_
 				eh = skb_header_pointer(skb, ptr, sizeof(_esph),
 							&_esph);
 				if (eh == NULL) {
-					printk("INCOMPLETE [%u bytes] )",
+					ve_printk(VE_LOG, "INCOMPLETE [%u bytes] )",
 					       skb->len - ptr);
 					return;
 				}
 
 				/* Length: 16 "SPI=0xF1234567 )" */
-				printk("SPI=0x%x )", ntohl(eh->spi) );
+				ve_printk(VE_LOG, "SPI=0x%x )", ntohl(eh->spi) );
 
 			}
 			return;
 		default:
 			/* Max length: 20 "Unknown Ext Hdr 255" */
-			printk("Unknown Ext Hdr %u", currenthdr);
+			ve_printk(VE_LOG, "Unknown Ext Hdr %u", currenthdr);
 			return;
 		}
 		if (logflags & IP6T_LOG_IPOPT)
-			printk(") ");
+			ve_printk(VE_LOG, ") ");
 
 		currenthdr = hp->nexthdr;
 		ptr += hdrlen;
@@ -206,7 +206,7 @@ static void dump_packet(const struct nf_
 		const struct tcphdr *th;
 
 		/* Max length: 10 "PROTO=TCP " */
-		printk("PROTO=TCP ");
+		ve_printk(VE_LOG, "PROTO=TCP ");
 
 		if (fragment)
 			break;
@@ -214,40 +214,40 @@ static void dump_packet(const struct nf_
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph);
 		if (th == NULL) {
-			printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - ptr);
 			return;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u ",
+		ve_printk(VE_LOG, "SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
 		if (logflags & IP6T_LOG_TCPSEQ)
-			printk("SEQ=%u ACK=%u ",
+			ve_printk(VE_LOG, "SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 		/* Max length: 13 "WINDOW=65535 " */
-		printk("WINDOW=%u ", ntohs(th->window));
+		ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window));
 		/* Max length: 9 "RES=0x3C " */
-		printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+		ve_printk(VE_LOG, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
 		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
 		if (th->cwr)
-			printk("CWR ");
+			ve_printk(VE_LOG, "CWR ");
 		if (th->ece)
-			printk("ECE ");
+			ve_printk(VE_LOG, "ECE ");
 		if (th->urg)
-			printk("URG ");
+			ve_printk(VE_LOG, "URG ");
 		if (th->ack)
-			printk("ACK ");
+			ve_printk(VE_LOG, "ACK ");
 		if (th->psh)
-			printk("PSH ");
+			ve_printk(VE_LOG, "PSH ");
 		if (th->rst)
-			printk("RST ");
+			ve_printk(VE_LOG, "RST ");
 		if (th->syn)
-			printk("SYN ");
+			ve_printk(VE_LOG, "SYN ");
 		if (th->fin)
-			printk("FIN ");
+			ve_printk(VE_LOG, "FIN ");
 		/* Max length: 11 "URGP=65535 " */
-		printk("URGP=%u ", ntohs(th->urg_ptr));
+		ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr));
 
 		if ((logflags & IP6T_LOG_TCPOPT)
 		    && th->doff * 4 > sizeof(struct tcphdr)) {
@@ -261,15 +261,15 @@ static void dump_packet(const struct nf_
 						ptr + sizeof(struct tcphdr),
 						optsize, _opt);
 			if (op == NULL) {
-				printk("OPT (TRUNCATED)");
+				ve_printk(VE_LOG, "OPT (TRUNCATED)");
 				return;
 			}
 
 			/* Max length: 127 "OPT (" 15*4*2chars ") " */
-			printk("OPT (");
+			ve_printk(VE_LOG, "OPT (");
 			for (i =0; i < optsize; i++)
-				printk("%02X", op[i]);
-			printk(") ");
+				ve_printk(VE_LOG, "%02X", op[i]);
+			ve_printk(VE_LOG, ") ");
 		}
 		break;
 	}
@@ -280,9 +280,9 @@ static void dump_packet(const struct nf_
 
 		if (currenthdr == IPPROTO_UDP)
 			/* Max length: 10 "PROTO=UDP "     */
-			printk("PROTO=UDP " );
+			ve_printk(VE_LOG, "PROTO=UDP " );
 		else	/* Max length: 14 "PROTO=UDPLITE " */
-			printk("PROTO=UDPLITE ");
+			ve_printk(VE_LOG, "PROTO=UDPLITE ");
 
 		if (fragment)
 			break;
@@ -290,12 +290,12 @@ static void dump_packet(const struct nf_
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph);
 		if (uh == NULL) {
-			printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - ptr);
 			return;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u LEN=%u ",
+		ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ",
 		       ntohs(uh->source), ntohs(uh->dest),
 		       ntohs(uh->len));
 		break;
@@ -305,7 +305,7 @@ static void dump_packet(const struct nf_
 		const struct icmp6hdr *ic;
 
 		/* Max length: 13 "PROTO=ICMPv6 " */
-		printk("PROTO=ICMPv6 ");
+		ve_printk(VE_LOG, "PROTO=ICMPv6 ");
 
 		if (fragment)
 			break;
@@ -313,18 +313,18 @@ static void dump_packet(const struct nf_
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
 		if (ic == NULL) {
-			printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - ptr);
 			return;
 		}
 
 		/* Max length: 18 "TYPE=255 CODE=255 " */
-		printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
+		ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
 
 		switch (ic->icmp6_type) {
 		case ICMPV6_ECHO_REQUEST:
 		case ICMPV6_ECHO_REPLY:
 			/* Max length: 19 "ID=65535 SEQ=65535 " */
-			printk("ID=%u SEQ=%u ",
+			ve_printk(VE_LOG, "ID=%u SEQ=%u ",
 				ntohs(ic->icmp6_identifier),
 				ntohs(ic->icmp6_sequence));
 			break;
@@ -335,35 +335,35 @@ static void dump_packet(const struct nf_
 
 		case ICMPV6_PARAMPROB:
 			/* Max length: 17 "POINTER=ffffffff " */
-			printk("POINTER=%08x ", ntohl(ic->icmp6_pointer));
+			ve_printk(VE_LOG, "POINTER=%08x ", ntohl(ic->icmp6_pointer));
 			/* Fall through */
 		case ICMPV6_DEST_UNREACH:
 		case ICMPV6_PKT_TOOBIG:
 		case ICMPV6_TIME_EXCEED:
 			/* Max length: 3+maxlen */
 			if (recurse) {
-				printk("[");
+				ve_printk(VE_LOG, "[");
 				dump_packet(info, skb, ptr + sizeof(_icmp6h),
 					    0);
-				printk("] ");
+				ve_printk(VE_LOG, "] ");
 			}
 
 			/* Max length: 10 "MTU=65535 " */
 			if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
-				printk("MTU=%u ", ntohl(ic->icmp6_mtu));
+				ve_printk(VE_LOG, "MTU=%u ", ntohl(ic->icmp6_mtu));
 		}
 		break;
 	}
 	/* Max length: 10 "PROTO=255 " */
 	default:
-		printk("PROTO=%u ", currenthdr);
+		ve_printk(VE_LOG, "PROTO=%u ", currenthdr);
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
 	if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-			printk("UID=%u GID=%u ",
+			ve_printk(VE_LOG, "UID=%u GID=%u ",
 				skb->sk->sk_socket->file->f_cred->fsuid,
 				skb->sk->sk_socket->file->f_cred->fsgid);
 		read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -397,14 +397,14 @@ ip6t_log_packet(u_int8_t pf,
 		loginfo = &default_loginfo;
 
 	spin_lock_bh(&log_lock);
-	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
 		prefix,
 		in ? in->name : "",
 		out ? out->name : "");
 	if (in && !out) {
 		unsigned int len;
 		/* MAC logging for input chain only. */
-		printk("MAC=");
+		ve_printk(VE_LOG, "MAC=");
 		if (skb->dev && (len = skb->dev->hard_header_len) &&
 		    skb->mac_header != skb->network_header) {
 			const unsigned char *p = skb_mac_header(skb);
@@ -416,23 +416,23 @@ ip6t_log_packet(u_int8_t pf,
 
 			if (p != NULL) {
 				for (i = 0; i < len; i++)
-					printk("%02x%s", p[i],
+					ve_printk(VE_LOG, "%02x%s", p[i],
 					       i == len - 1 ? "" : ":");
 			}
-			printk(" ");
+			ve_printk(VE_LOG, " ");
 
 			if (skb->dev->type == ARPHRD_SIT) {
 				const struct iphdr *iph =
 					(struct iphdr *)skb_mac_header(skb);
-				printk("TUNNEL=%pI4->%pI4 ",
+				ve_printk(VE_LOG, "TUNNEL=%pI4->%pI4 ",
 				       &iph->saddr, &iph->daddr);
 			}
 		} else
-			printk(" ");
+			ve_printk(VE_LOG, " ");
 	}
 
 	dump_packet(loginfo, skb, skb_network_offset(skb), 1);
-	printk("\n");
+	ve_printk(VE_LOG, "\n");
 	spin_unlock_bh(&log_lock);
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6table_filter.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6table_filter.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6table_filter.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6table_filter.c	2015-01-21 12:02:47.741106455 +0300
@@ -121,17 +121,27 @@ module_param(forward, bool, 0000);
 
 static int __net_init ip6table_filter_net_init(struct net *net)
 {
+	if (!net_ipt_permitted(net, VE_IP_FILTER6))
+		return 0;
+
 	/* Register table */
 	net->ipv6.ip6table_filter =
 		ip6t_register_table(net, &packet_filter, &initial_table.repl);
 	if (IS_ERR(net->ipv6.ip6table_filter))
 		return PTR_ERR(net->ipv6.ip6table_filter);
+
+	net_ipt_module_set(net, VE_IP_FILTER6);
 	return 0;
 }
 
 static void __net_exit ip6table_filter_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_FILTER6))
+		return;
+
 	ip6t_unregister_table(net->ipv6.ip6table_filter);
+
+	net_ipt_module_clear(net, VE_IP_FILTER6);
 }
 
 static struct pernet_operations ip6table_filter_net_ops = {
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6table_mangle.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6table_mangle.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter/ip6table_mangle.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter/ip6table_mangle.c	2015-01-21 12:02:47.741106455 +0300
@@ -172,17 +172,27 @@ static struct nf_hook_ops ip6t_ops[] __r
 
 static int __net_init ip6table_mangle_net_init(struct net *net)
 {
+	if (!net_ipt_permitted(net, VE_IP_MANGLE6))
+		return 0;
+
 	/* Register table */
 	net->ipv6.ip6table_mangle =
 		ip6t_register_table(net, &packet_mangler, &initial_table.repl);
 	if (IS_ERR(net->ipv6.ip6table_mangle))
 		return PTR_ERR(net->ipv6.ip6table_mangle);
+
+	net_ipt_module_set(net, VE_IP_MANGLE6);
 	return 0;
 }
 
 static void __net_exit ip6table_mangle_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_MANGLE6))
+		return;
+
 	ip6t_unregister_table(net->ipv6.ip6table_mangle);
+
+	net_ipt_module_clear(net, VE_IP_MANGLE6);
 }
 
 static struct pernet_operations ip6table_mangle_net_ops = {
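Note: ip6table_filter and ip6table_mangle (above) get the same symmetric gate: init registers the table only when the VE is permitted to use it and records that fact in a per-net mask; exit undoes only what init recorded. Stripped to its skeleton (the helpers and flags are exactly the ones called in the two hunks):

static int __net_init gated_table_init(struct net *net)
{
	if (!net_ipt_permitted(net, VE_IP_FILTER6))
		return 0;		/* table disabled for this VE: skip, not fail */

	/* ... ip6t_register_table() ... */

	net_ipt_module_set(net, VE_IP_FILTER6);	/* remember registration */
	return 0;
}

static void __net_exit gated_table_exit(struct net *net)
{
	if (!net_is_ipt_module_set(net, VE_IP_FILTER6))
		return;			/* init skipped: nothing to undo */

	/* ... ip6t_unregister_table() ... */

	net_ipt_module_clear(net, VE_IP_FILTER6);
}

Returning 0 rather than an error from the skipped path matters: a denied table must not abort namespace creation.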
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/netfilter.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/netfilter.c	2015-01-21 12:02:42.560244001 +0300
@@ -94,9 +94,10 @@ static int nf_ip6_reroute(struct sk_buff
 	return 0;
 }
 
-static int nf_ip6_route(struct dst_entry **dst, struct flowi *fl)
+static int nf_ip6_route(struct net *net, struct dst_entry **dst,
+			struct flowi *fl)
 {
-	*dst = ip6_route_output(&init_net, NULL, fl);
+	*dst = ip6_route_output(net, NULL, fl);
 	return (*dst)->error;
 }
 
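Note: nf_ip6_route() now threads a struct net through to ip6_route_output() instead of hard-coding init_net, and the nf_afinfo ->route() hook grows the same parameter (its conntrack-helper caller is updated in nf_conntrack_h323_main.c further down). A caller typically derives the namespace from the conntrack entry; a minimal sketch:

/* Sketch of a namespace-correct ->route() call from a conntrack helper;
 * nf_ct_net() returns the netns the conntrack entry belongs to. */
static int helper_route_check(struct nf_conn *ct, const struct nf_afinfo *afinfo,
			      struct flowi *fl)
{
	struct net *net = nf_ct_net(ct);
	struct rt6_info *rt;

	if (afinfo->route(net, (struct dst_entry **)&rt, fl))
		return 0;		/* no route in this namespace */
	dst_release(&rt->u.dst);	/* lookup resolved inside the VE's netns */
	return 1;
}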
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/reassembly.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/reassembly.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/reassembly.c	2014-12-12 23:29:37.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/reassembly.c	2015-01-21 12:02:45.372169346 +0300
@@ -173,11 +173,14 @@ static void ip6_frag_expire(unsigned lon
 {
 	struct frag_queue *fq;
 	struct net *net;
+	struct ve_struct *old_ve;
 
 	fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, ipv6.frags);
+	old_ve = set_exec_env(fq->q.owner_ve);
 
 	ip6_expire_frag_queue(net, fq, &ip6_frags);
+	(void)set_exec_env(old_ve);
 }
 
 static __inline__ struct frag_queue *
@@ -432,6 +435,7 @@ static int ip6_frag_reasm(struct frag_qu
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
 		add_frag_mem_limit(&fq->q, clone->truesize);
+		clone->owner_env = head->owner_env;
 	}
 
 	/* We have to remove fragment header from datagram and to relocate
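Note: fragment-queue timers fire with no particular container context on the CPU, so ip6_frag_expire() above brackets its work with set_exec_env(): the ICMPv6 Time Exceeded it may emit is then generated, accounted and logged as the queue's owner VE. The same save/switch/restore idiom recurs throughout this patch set; schematically:

/* Schematic timer callback using the owner-VE bracket from above. */
static void frag_expire_cb(unsigned long data)
{
	struct frag_queue *fq = (struct frag_queue *)data;
	struct ve_struct *old_ve;

	old_ve = set_exec_env(fq->q.owner_ve);	/* act as the owning container */
	/* ... expire the queue, possibly sending ICMPv6 Time Exceeded ... */
	(void)set_exec_env(old_ve);		/* always restore before returning */
}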
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/route.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/route.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/route.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/route.c	2015-01-21 12:02:58.037833121 +0300
@@ -838,7 +838,7 @@ static struct rt6_info *ip6_pol_route_in
 	return ip6_pol_route(net, table, fl->iif, fl, flags, true);
 }
 
-void ip6_route_input(struct sk_buff *skb)
+void __ip6_route_input(struct sk_buff *skb, struct in6_addr *daddr)
 {
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct net *net = dev_net(skb->dev);
@@ -847,7 +847,7 @@ void ip6_route_input(struct sk_buff *skb
 		.iif = skb->dev->ifindex,
 		.nl_u = {
 			.ip6_u = {
-				.daddr = iph->daddr,
+				.daddr = *daddr,
 				.saddr = iph->saddr,
 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
 			},
@@ -856,11 +856,17 @@ void ip6_route_input(struct sk_buff *skb
 		.proto = iph->nexthdr,
 	};
 
-	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
+	if (rt6_need_strict(daddr) && skb->dev->type != ARPHRD_PIMREG)
 		flags |= RT6_LOOKUP_F_IFACE;
 
 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
 }
+EXPORT_SYMBOL(__ip6_route_input);
+
+void ip6_route_input(struct sk_buff *skb)
+{
+	__ip6_route_input(skb, &ipv6_hdr(skb)->daddr);
+}
 
 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
 					     struct flowi *fl, int flags)
@@ -1923,7 +1929,7 @@ int ipv6_route_ioctl(struct net *net, un
 	switch(cmd) {
 	case SIOCADDRT:		/* Add a route */
 	case SIOCDELRT:		/* Delete a route */
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_VE_NET_ADMIN))
 			return -EPERM;
 		err = copy_from_user(&rtmsg, arg,
 				     sizeof(struct in6_rtmsg));
@@ -2791,6 +2797,7 @@ struct ctl_table *ipv6_route_sysctl_init
 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
+		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
 	}
 
 	return table;
@@ -2899,6 +2906,26 @@ static struct notifier_block ip6_route_d
 	.priority = 0,
 };
 
+static void ip6_rt_dump_dst(void *o)
+{
+	struct rt6_info *r = (struct rt6_info *)o;
+
+	if (r->u.dst.flags & DST_FREE)
+		return;
+
+	printk("=== %p\n", o);
+	dst_dump_one(&r->u.dst);
+	printk("\tidev %p flags %x ref %d prot %d\n",
+			r->rt6i_dev, r->rt6i_flags, atomic_read(&r->rt6i_ref),
+			(int)r->rt6i_protocol);
+}
+
+static void _ip6_rt_dump_dsts(void)
+{
+	printk("IPv6 dst cache:\n");
+	slab_obj_walk(ip6_dst_ops_template.kmem_cachep, ip6_rt_dump_dst);
+}
+
 int __init ip6_route_init(void)
 {
 	int ret;
@@ -2953,6 +2980,7 @@ int __init ip6_route_init(void)
 	if (ret)
 		goto out_register_late_subsys;
 
+	ip6_rt_dump_dsts = _ip6_rt_dump_dsts;
 out:
 	return ret;
 
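Note: splitting ip6_route_input() into an exported __ip6_route_input(skb, daddr) keeps the common case a one-line wrapper while letting other modules route an skb on a destination other than the one in its header. Only the export itself is in the hunk; a hypothetical caller, for illustration, might route on a translated address before rewriting the header:

/* Hypothetical: route the packet as if its destination were 'xlated',
 * e.g. after DNAT, without touching the IPv6 header first. */
struct in6_addr xlated = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;

__ip6_route_input(skb, &xlated);	/* sets skb_dst() for the new daddr */

The same section also relaxes SIOCADDRT/SIOCDELRT to CAP_VE_NET_ADMIN so container root can manage its own IPv6 routes, and wires a slab-walking dst-cache dumper into the ip6_rt_dump_dsts hook for debugging.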
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/sit.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/sit.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/sit.c	2014-12-12 23:29:29.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/sit.c	2015-01-21 12:02:51.238013621 +0300
@@ -32,6 +32,7 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <linux/vzcalluser.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -53,6 +54,9 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
 /*
    This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c
 
@@ -87,6 +91,9 @@ static struct ip_tunnel * ipip6_tunnel_l
 	struct ip_tunnel *t;
 	struct sit_net *sitn = net_generic(net, sit_net_id);
 
+	if (sitn == NULL)
+		return NULL;
+
 	for (t = sitn->tunnels_r_l[h0^h1]; t; t = t->next) {
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr &&
@@ -251,7 +258,7 @@ static int ipip6_tunnel_get_prl(struct i
 	/* For simple GET or for root users,
 	 * we try harder to allocate.
 	 */
-	kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
+	kp = (cmax <= 1 || capable(CAP_NET_ADMIN) || capable(CAP_VE_NET_ADMIN)) ?
 		kcalloc(cmax, sizeof(*kp), GFP_KERNEL) :
 		NULL;
 
@@ -823,7 +830,7 @@ ipip6_tunnel_ioctl (struct net_device *d
 	case SIOCADDTUNNEL:
 	case SIOCCHGTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			goto done;
 
 		err = -EFAULT;
@@ -881,7 +888,7 @@ ipip6_tunnel_ioctl (struct net_device *d
 
 	case SIOCDELTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			goto done;
 
 		if (dev == sitn->fb_tunnel_dev) {
@@ -914,7 +921,7 @@ ipip6_tunnel_ioctl (struct net_device *d
 	case SIOCDELPRL:
 	case SIOCCHGPRL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 			goto done;
 		err = -EINVAL;
 		if (dev == sitn->fb_tunnel_dev)
@@ -954,11 +961,14 @@ static int ipip6_tunnel_change_mtu(struc
 	return 0;
 }
 
+static void sit_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx);
 static const struct net_device_ops ipip6_netdev_ops = {
 	.ndo_uninit	= ipip6_tunnel_uninit,
 	.ndo_start_xmit	= ipip6_tunnel_xmit,
 	.ndo_do_ioctl	= ipip6_tunnel_ioctl,
 	.ndo_change_mtu	= ipip6_tunnel_change_mtu,
+	.ndo_cpt	= sit_cpt,
 };
 
 static void ipip6_tunnel_setup(struct net_device *dev)
@@ -1014,25 +1024,133 @@ static struct xfrm_tunnel sit_handler = 
 	.priority	=	1,
 };
 
-static void sit_destroy_tunnels(struct sit_net *sitn)
+static void sit_destroy_tunnels(struct sit_net *sitn, struct list_head *head)
 {
 	int prio;
 
 	for (prio = 1; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t;
-			while ((t = sitn->tunnels[prio][h]) != NULL)
-				unregister_netdevice(t->dev);
+			struct ip_tunnel *t = sitn->tunnels[prio][h];
+
+			while (t != NULL) {
+				unregister_netdevice_queue(t->dev, head);
+				t = t->next;
+			}
+		}
+	}
+}
+
+static void sit_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context *ctx)
+{
+	struct cpt_tunnel_image v;
+	struct ip_tunnel *t;
+	struct sit_net *sitn;
+
+	t = netdev_priv(dev);
+	sitn = net_generic(get_exec_env()->ve_netns, sit_net_id);
+	BUG_ON(sitn == NULL);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_VOID;
+
+	/* mark fb dev */
+	v.cpt_tnl_flags = CPT_TUNNEL_SIT;
+	if (dev == sitn->fb_tunnel_dev)
+		v.cpt_tnl_flags |= CPT_TUNNEL_FBDEV;
+
+	v.cpt_i_flags = t->parms.i_flags;
+	v.cpt_o_flags = t->parms.o_flags;
+	v.cpt_i_key = t->parms.i_key;
+	v.cpt_o_key = t->parms.o_key;
+
+	BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+	memcpy(&v.cpt_iphdr, &t->parms.iph, sizeof(t->parms.iph));
+
+	ops->write(&v, sizeof(v), ctx);
+}
+
+static int sit_rst(loff_t start, struct cpt_netdev_image *di,
+		struct rst_ops *ops, struct cpt_context *ctx)
+{
+	int err = -ENODEV;
+	struct cpt_tunnel_image v;
+	struct net_device *dev;
+	struct ip_tunnel *t;
+	loff_t pos;
+	int fbdev;
+	struct sit_net *sitn;
+
+	sitn = net_generic(get_exec_env()->ve_netns, sit_net_id);
+	if (sitn == NULL)
+		return -EOPNOTSUPP;
+
+	pos = start + di->cpt_hdrlen;
+	err = ops->get_object(CPT_OBJ_NET_IPIP_TUNNEL,
+			pos, &v, sizeof(v), ctx);
+	if (err)
+		return err;
+
+	/* some sanity */
+	if (v.cpt_content != CPT_CONTENT_VOID)
+		return -EINVAL;
+
+	if (!(v.cpt_tnl_flags & CPT_TUNNEL_SIT))
+		return 1;
+
+	if (v.cpt_tnl_flags & CPT_TUNNEL_FBDEV) {
+		fbdev = 1;
+		err = 0;
+		dev = sitn->fb_tunnel_dev;
+	} else {
+		fbdev = 0;
+		err = -ENOMEM;
+		dev = alloc_netdev(sizeof(struct ip_tunnel), di->cpt_name,
+				ipip6_tunnel_setup);
+		if (!dev)
+			goto out;
+	}
+
+	t = netdev_priv(dev);
+	t->parms.i_flags = v.cpt_i_flags;
+	t->parms.o_flags = v.cpt_o_flags;
+	t->parms.i_key = v.cpt_i_key;
+	t->parms.o_key = v.cpt_o_key;
+
+	BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+	memcpy(&t->parms.iph, &v.cpt_iphdr, sizeof(t->parms.iph));
+
+	if (!fbdev) {
+		ipip6_tunnel_init(dev);
+		err = register_netdevice(dev);
+		if (err) {
+			free_netdev(dev);
+			goto out;
 		}
+
+		dev_hold(dev);
+		ipip6_tunnel_link(sitn, t);
 	}
+out:
+	return err;
 }
 
+static struct netdev_rst sit_netdev_rst = {
+	.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL,
+	.ndo_rst = sit_rst,
+};
+
 static int sit_init_net(struct net *net)
 {
 	int err;
 	struct sit_net *sitn;
 
+	if (!(get_exec_env()->features & VE_FEATURE_SIT))
+		return net_assign_generic(net, sit_net_id, NULL);
+
 	err = -ENOMEM;
 	sitn = kzalloc(sizeof(struct sit_net), GFP_KERNEL);
 	if (sitn == NULL)
@@ -1076,25 +1194,33 @@ err_alloc:
 static void sit_exit_net(struct net *net)
 {
 	struct sit_net *sitn;
+	LIST_HEAD(list);
 
 	sitn = net_generic(net, sit_net_id);
+	if (sitn == NULL) /* no VE_FEATURE_SIT */
+		return;
+
 	rtnl_lock();
-	sit_destroy_tunnels(sitn);
-	unregister_netdevice(sitn->fb_tunnel_dev);
+	sit_destroy_tunnels(sitn, &list);
+	unregister_netdevice_queue(sitn->fb_tunnel_dev, &list);
+	unregister_netdevice_many(&list);
 	rtnl_unlock();
+	net_assign_generic(net, sit_net_id, NULL);
 	kfree(sitn);
 }
 
 static struct pernet_operations sit_net_ops = {
 	.init = sit_init_net,
 	.exit = sit_exit_net,
+	.id = &sit_net_id,
 };
 
 static void __exit sit_cleanup(void)
 {
+	unregister_netdev_rst(&sit_netdev_rst);
 	xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
 
-	unregister_pernet_gen_device(sit_net_id, &sit_net_ops);
+	unregister_pernet_device(&sit_net_ops);
 }
 
 static int __init sit_init(void)
@@ -1103,16 +1229,17 @@ static int __init sit_init(void)
 
 	printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n");
 
-	err = register_pernet_gen_device(&sit_net_id, &sit_net_ops);
+	err = register_pernet_device(&sit_net_ops);
 	if (err < 0)
 		return err;
 
 	err = xfrm4_tunnel_register(&sit_handler, AF_INET6);
 	if (err < 0) {
-		unregister_pernet_gen_device(sit_net_id, &sit_net_ops);
+		unregister_pernet_device(&sit_net_ops);
 		printk(KERN_INFO "sit init: Can't add protocol\n");
 		return -EAGAIN;
-	}
+	} else
+		register_netdev_rst(&sit_netdev_rst);
 
 	return err;
 }
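Note: two things happen in the sit.c section. Tunnels learn checkpoint/restore: sit_cpt() serialises the tunnel parameters into a cpt_tunnel_image (marking the fallback device with CPT_TUNNEL_FBDEV), and sit_rst() replays the image, reusing the fallback device or allocating and registering a fresh netdev. Independently, sit_exit_net() switches to batched device teardown, which is the stock pattern in this kernel:

/* Batched teardown as used in sit_exit_net() above: queue every device,
 * then commit once, paying a single RCU grace period for the lot. */
LIST_HEAD(list);

rtnl_lock();
unregister_netdevice_queue(tunnel_dev, &list);	/* per tunnel */
unregister_netdevice_queue(fb_dev, &list);	/* fallback device */
unregister_netdevice_many(&list);		/* one synchronize_net() */
rtnl_unlock();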
diff -upr linux-2.6.32-504.3.3.el6.orig/net/ipv6/tcp_ipv6.c linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/tcp_ipv6.c
--- linux-2.6.32-504.3.3.el6.orig/net/ipv6/tcp_ipv6.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/ipv6/tcp_ipv6.c	2015-01-21 12:02:58.105831318 +0300
@@ -63,6 +63,8 @@
 #include <net/secure_seq.h>
 #include <net/busy_poll.h>
 
+#include <bc/tcp.h>
+
 #include <asm/uaccess.h>
 
 #include <linux/proc_fs.h>
@@ -78,7 +80,7 @@ static void	tcp_v6_reqsk_send_ack(struct
 
 static int	tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
 
-static const struct inet_connection_sock_af_ops ipv6_mapped;
+const struct inet_connection_sock_af_ops ipv6_mapped;
 static const struct inet_connection_sock_af_ops ipv6_specific;
 #ifdef CONFIG_TCP_MD5SIG
 static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
@@ -522,13 +524,15 @@ static inline void syn_flood_warning(str
 #ifdef CONFIG_SYN_COOKIES
 	if (sysctl_tcp_syncookies)
 		printk(KERN_INFO
-		       "TCPv6: Possible SYN flooding on port %d. "
-		       "Sending cookies.\n", ntohs(tcp_hdr(skb)->dest));
+		       "TCPv6: Possible SYN flooding on ctid %u, port %d. "
+		       "Sending cookies.\n",
+		       skb->owner_env->veid, ntohs(tcp_hdr(skb)->dest));
 	else
 #endif
 		printk(KERN_INFO
-		       "TCPv6: Possible SYN flooding on port %d. "
-		       "Dropping request.\n", ntohs(tcp_hdr(skb)->dest));
+		       "TCPv6: Possible SYN flooding on ctid %u, port %d. "
+		       "Dropping request.\n",
+		       skb->owner_env->veid, ntohs(tcp_hdr(skb)->dest));
 }
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
@@ -892,12 +896,14 @@ struct request_sock_ops tcp6_request_soc
 	.destructor	=	tcp_v6_reqsk_destructor,
 	.send_reset	=	tcp_v6_send_reset
 };
+EXPORT_SYMBOL(tcp6_request_sock_ops);
 
 #ifdef CONFIG_TCP_MD5SIG
-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.md5_lookup	=	tcp_v6_reqsk_md5_lookup,
 	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
 };
+EXPORT_SYMBOL(tcp_request_sock_ipv6_ops);
 #endif
 
 static struct timewait_sock_ops tcp6_timewait_sock_ops = {
@@ -1454,6 +1460,7 @@ static int tcp_v6_do_rcv(struct sock *sk
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_sock *tp;
 	struct sk_buff *opt_skb = NULL;
+	struct user_beancounter *ub;
 
 	/* Imagine: socket is IPv6. IPv4 packet arrives,
 	   goes to IPv4 receive handler and backlogged.
@@ -1466,6 +1473,8 @@ static int tcp_v6_do_rcv(struct sock *sk
 	if (skb->protocol == htons(ETH_P_IP))
 		return tcp_v4_do_rcv(sk, skb);
 
+	ub = set_exec_ub(sock_bc(sk)->ub);
+
 #ifdef CONFIG_TCP_MD5SIG
 	if (tcp_v6_inbound_md5_hash (sk, skb))
 		goto discard;
@@ -1503,7 +1512,7 @@ static int tcp_v6_do_rcv(struct sock *sk
 		TCP_CHECK_TIMER(sk);
 		if (opt_skb)
 			goto ipv6_pktoptions;
-		return 0;
+		goto restore_context;
 	}
 
 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
@@ -1525,7 +1534,7 @@ static int tcp_v6_do_rcv(struct sock *sk
 				goto reset;
 			if (opt_skb)
 				__kfree_skb(opt_skb);
-			return 0;
+			goto restore_context;
 		}
 	} else
 		sock_rps_save_rxhash(sk, skb->rxhash);
@@ -1536,6 +1545,9 @@ static int tcp_v6_do_rcv(struct sock *sk
 	TCP_CHECK_TIMER(sk);
 	if (opt_skb)
 		goto ipv6_pktoptions;
+
+restore_context:
+	(void)set_exec_ub(ub);
 	return 0;
 
 reset:
@@ -1544,7 +1556,7 @@ discard:
 	if (opt_skb)
 		__kfree_skb(opt_skb);
 	kfree_skb(skb);
-	return 0;
+	goto restore_context;
 csum_err:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 	goto discard;
@@ -1577,7 +1589,7 @@ ipv6_pktoptions:
 	}
 
 	kfree_skb(opt_skb);
-	return 0;
+	goto restore_context;
 }
 
 static int tcp_v6_rcv(struct sk_buff *skb)
@@ -1769,7 +1781,7 @@ static const struct tcp_sock_af_ops tcp_
  *	TCP over IPv4 via INET6 API
  */
 
-static const struct inet_connection_sock_af_ops ipv6_mapped = {
+const struct inet_connection_sock_af_ops ipv6_mapped = {
 	.queue_xmit	   = ip_queue_xmit,
 	.send_check	   = tcp_v4_send_check,
 	.rebuild_header	   = inet_sk_rebuild_header,
@@ -1787,6 +1799,7 @@ static const struct inet_connection_sock
 	.compat_getsockopt = compat_ipv6_getsockopt,
 #endif
 };
+EXPORT_SYMBOL(ipv6_mapped);
 
 #ifdef CONFIG_TCP_MD5SIG
 static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
@@ -2098,12 +2111,17 @@ static int tcpv6_net_init(struct net *ne
 static void tcpv6_net_exit(struct net *net)
 {
 	inet_ctl_sock_destroy(net->ipv6.tcp_sk);
-	inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET6);
+}
+
+static void tcpv6_net_exit_batch(struct list_head *net_exit_list)
+{
+	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6);
 }
 
 static struct pernet_operations tcpv6_net_ops = {
-	.init = tcpv6_net_init,
-	.exit = tcpv6_net_exit,
+	.init	    = tcpv6_net_init,
+	.exit	    = tcpv6_net_exit,
+	.exit_batch = tcpv6_net_exit_batch,
 };
 
 int __init tcpv6_init(void)
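Note: tcp_v6_do_rcv() now pins the socket's beancounter for the whole receive path, and every successful exit is rewritten from "return 0" into "goto restore_context" so the previous beancounter is put back on all paths. Reduced to its shape:

/* Shape of the beancounter bracket introduced above. */
static int do_rcv_shape(struct sock *sk, struct sk_buff *skb)
{
	struct user_beancounter *ub;

	ub = set_exec_ub(sock_bc(sk)->ub);	/* charge work to this socket's UB */

	/* ... receive processing; success paths: goto restore_context ... */

restore_context:
	(void)set_exec_ub(ub);			/* single restore point */
	return 0;
}

The SYN-flood warning also gains the container ID (skb->owner_env->veid) so floods can be attributed to a VE, and the timewait purge moves to an exit_batch callback so it runs once per batch of dying namespaces.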
diff -upr linux-2.6.32-504.3.3.el6.orig/net/key/af_key.c linux-2.6.32-504.3.3.el6-042stab103_6/net/key/af_key.c
--- linux-2.6.32-504.3.3.el6.orig/net/key/af_key.c	2014-12-12 23:29:27.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/key/af_key.c	2015-01-21 12:02:51.204014524 +0300
@@ -186,7 +186,7 @@ static int pfkey_create(struct net *net,
 	struct sock *sk;
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
 		return -EPERM;
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
@@ -3798,28 +3798,14 @@ static struct xfrm_mgr pfkeyv2_mgr =
 
 static int __net_init pfkey_net_init(struct net *net)
 {
-	struct netns_pfkey *net_pfkey;
+	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
 	int rv;
 
-	net_pfkey = kmalloc(sizeof(struct netns_pfkey), GFP_KERNEL);
-	if (!net_pfkey) {
-		rv = -ENOMEM;
-		goto out_kmalloc;
-	}
 	INIT_HLIST_HEAD(&net_pfkey->table);
 	atomic_set(&net_pfkey->socks_nr, 0);
-	rv = net_assign_generic(net, pfkey_net_id, net_pfkey);
-	if (rv < 0)
-		goto out_assign;
+
 	rv = pfkey_init_proc(net);
-	if (rv < 0)
-		goto out_proc;
-	return 0;
 
-out_proc:
-out_assign:
-	kfree(net_pfkey);
-out_kmalloc:
 	return rv;
 }
 
@@ -3829,19 +3815,20 @@ static void __net_exit pfkey_net_exit(st
 
 	pfkey_exit_proc(net);
 	BUG_ON(!hlist_empty(&net_pfkey->table));
-	kfree(net_pfkey);
 }
 
 static struct pernet_operations pfkey_net_ops = {
 	.init = pfkey_net_init,
 	.exit = pfkey_net_exit,
+	.id   = &pfkey_net_id,
+	.size = sizeof(struct netns_pfkey),
 };
 
 static void __exit ipsec_pfkey_exit(void)
 {
 	xfrm_unregister_km(&pfkeyv2_mgr);
 	sock_unregister(PF_KEY);
-	unregister_pernet_gen_subsys(pfkey_net_id, &pfkey_net_ops);
+	unregister_pernet_subsys(&pfkey_net_ops);
 	proto_unregister(&key_proto);
 }
 
@@ -3852,7 +3839,7 @@ static int __init ipsec_pfkey_init(void)
 	if (err != 0)
 		goto out;
 
-	err = register_pernet_gen_subsys(&pfkey_net_id, &pfkey_net_ops);
+	err = register_pernet_subsys(&pfkey_net_ops);
 	if (err != 0)
 		goto out_unregister_key_proto;
 	err = sock_register(&pfkey_family_ops);
@@ -3867,7 +3854,7 @@ out:
 out_sock_unregister:
 	sock_unregister(PF_KEY);
 out_unregister_pernet:
-	unregister_pernet_gen_subsys(pfkey_net_id, &pfkey_net_ops);
+	unregister_pernet_subsys(&pfkey_net_ops);
 out_unregister_key_proto:
 	proto_unregister(&key_proto);
 	goto out;
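Note: the af_key conversion drops the hand-rolled kmalloc()/net_assign_generic() dance in favour of the .id/.size fields of struct pernet_operations: the pernet core allocates a zeroed struct netns_pfkey for every namespace and net_generic(net, pfkey_net_id) hands it back, so init/exit shrink to pure protocol setup. The general recipe:

/* General .id/.size pattern, as adopted by af_key above. */
static int demo_net_id;

struct demo_pernet {
	struct hlist_head table;
};

static int __net_init demo_net_init(struct net *net)
{
	struct demo_pernet *dp = net_generic(net, demo_net_id);

	INIT_HLIST_HEAD(&dp->table);	/* memory is pre-allocated and zeroed */
	return 0;
}

static struct pernet_operations demo_net_ops = {
	.init = demo_net_init,
	.id   = &demo_net_id,
	.size = sizeof(struct demo_pernet),
};
/* register_pernet_subsys(&demo_net_ops) performs the allocation. */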
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/Kconfig	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/Kconfig	2015-01-21 12:02:58.548819559 +0300
@@ -966,6 +966,12 @@ config NETFILTER_XT_MATCH_OSF
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_WDOG_TMO
+	tristate '"wdog_tmo" watchdog timer match'
+	depends on NETFILTER_ADVANCED && NETFILTER_NETLINK && FENCE_WATCHDOG
+	help
+	  This option selects the watchdog timer match module.
+
 endif # NETFILTER_XTABLES
 
 endmenu
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/Makefile
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/Makefile	2014-12-12 23:29:06.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/Makefile	2015-01-21 12:02:58.548819559 +0300
@@ -101,6 +101,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) 
 obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_WDOG_TMO) += xt_wdog_tmo.o
 
 # ipset
 obj-$(CONFIG_IP_SET) += ipset/
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/core.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/core.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/core.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/core.c	2015-01-21 12:02:45.411168310 +0300
@@ -60,6 +60,8 @@ int nf_register_hook(struct nf_hook_ops 
 	struct nf_hook_ops *elem;
 	int err;
 
+	BUG_ON(!ve_is_super(get_exec_env()));
+
 	err = mutex_lock_interruptible(&nf_hook_mutex);
 	if (err < 0)
 		return err;
@@ -75,6 +77,8 @@ EXPORT_SYMBOL(nf_register_hook);
 
 void nf_unregister_hook(struct nf_hook_ops *reg)
 {
+	BUG_ON(!ve_is_super(get_exec_env()));
+
 	mutex_lock(&nf_hook_mutex);
 	list_del_rcu(&reg->list);
 	mutex_unlock(&nf_hook_mutex);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/ipset/ip_set_core.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/ipset/ip_set_core.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/ipset/ip_set_core.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/ipset/ip_set_core.c	2015-01-21 12:02:45.536164991 +0300
@@ -638,6 +638,9 @@ ip_set_create(struct sock *ctnl, struct 
 	u32 flags = flag_exist(nlh);
 	int ret = 0;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL ||
 		     attr[IPSET_ATTR_TYPENAME] == NULL ||
@@ -758,6 +761,9 @@ ip_set_destroy(struct sock *ctnl, struct
 	ip_set_id_t i;
 	int ret = 0;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
@@ -822,6 +828,9 @@ ip_set_flush(struct sock *ctnl, struct s
 {
 	ip_set_id_t i;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
@@ -861,6 +870,9 @@ ip_set_rename(struct sock *ctnl, struct 
 	ip_set_id_t i;
 	int ret = 0;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL ||
 		     attr[IPSET_ATTR_SETNAME2] == NULL))
@@ -909,6 +921,9 @@ ip_set_swap(struct sock *ctnl, struct sk
 	ip_set_id_t from_id, to_id;
 	char from_name[IPSET_MAXNAMELEN];
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL ||
 		     attr[IPSET_ATTR_SETNAME2] == NULL))
@@ -1123,6 +1138,9 @@ ip_set_dump(struct sock *ctnl, struct sk
 	    const struct nlmsghdr *nlh,
 	    const struct nlattr * const attr[])
 {
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
@@ -1214,6 +1232,9 @@ ip_set_uadd(struct sock *ctnl, struct sk
 	bool use_lineno;
 	int ret = 0;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL ||
 		     !((attr[IPSET_ATTR_DATA] != NULL) ^
@@ -1268,6 +1289,9 @@ ip_set_udel(struct sock *ctnl, struct sk
 	bool use_lineno;
 	int ret = 0;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL ||
 		     !((attr[IPSET_ATTR_DATA] != NULL) ^
@@ -1319,6 +1343,9 @@ ip_set_utest(struct sock *ctnl, struct s
 	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
 	int ret = 0;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL ||
 		     attr[IPSET_ATTR_DATA] == NULL ||
@@ -1356,6 +1383,9 @@ ip_set_header(struct sock *ctnl, struct 
 	ip_set_id_t index;
 	int ret = 0;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL))
 		return -IPSET_ERR_PROTOCOL;
@@ -1413,6 +1443,9 @@ ip_set_type(struct sock *ctnl, struct sk
 	const char *typename;
 	int ret = 0;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_TYPENAME] == NULL ||
 		     attr[IPSET_ATTR_FAMILY] == NULL))
@@ -1469,6 +1502,9 @@ ip_set_protocol(struct sock *ctnl, struc
 	struct nlmsghdr *nlh2;
 	int ret = 0;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
 		return -IPSET_ERR_PROTOCOL;
 
@@ -1586,6 +1622,8 @@ ip_set_sockfn_get(struct sock *sk, int o
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
 	if (optval != SO_IP_SET)
 		return -EBADF;
 	if (*len < sizeof(unsigned))
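Note: every ipset netlink command above, plus the getsockopt path, receives the same two-line gate: set state is host-global in this kernel, so all operations are confined to ve0 instead of letting a container read or mutate the host's sets. The repeated guard could equally live in one helper (the name below is ours, not the patch's):

/* Hypothetical consolidation of the guard repeated above. */
static inline int ipset_ve0_only(void)
{
	return ve_is_super(get_exec_env()) ? 0 : -EPERM;
}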
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/ipvs/ip_vs_conn.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/ipvs/ip_vs_conn.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/ipvs/ip_vs_conn.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/ipvs/ip_vs_conn.c	2015-01-21 12:02:43.292224567 +0300
@@ -1253,7 +1253,7 @@ int __init ip_vs_conn_init(void)
 	/* Allocate ip_vs_conn slab cache */
 	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
 					      sizeof(struct ip_vs_conn), 0,
-					      SLAB_HWCACHE_ALIGN, NULL);
+					      SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL);
 	if (!ip_vs_conn_cachep) {
 		vfree(ip_vs_conn_tab);
 		return -ENOMEM;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/ipvs/ip_vs_sync.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/ipvs/ip_vs_sync.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/ipvs/ip_vs_sync.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/ipvs/ip_vs_sync.c	2015-01-21 12:02:45.412168283 +0300
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/inetdevice.h>
 #include <linux/net.h>
+#include <linux/nsproxy.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
 #include <linux/skbuff.h>
@@ -1295,7 +1296,8 @@ static int set_mcast_if(struct sock *sk,
 	struct net_device *dev;
 	struct inet_sock *inet = inet_sk(sk);
 
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname);
+	if (!dev)
 		return -ENODEV;
 
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -1316,11 +1318,12 @@ static int set_mcast_if(struct sock *sk,
  */
 static int set_sync_mesg_maxlen(int sync_state)
 {
+	struct net *net = get_exec_env()->ve_netns;
 	struct net_device *dev;
 	int num;
 
 	if (sync_state == IP_VS_STATE_MASTER) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+		if ((dev = __dev_get_by_name(net, ip_vs_master_mcast_ifn)) == NULL)
 			return -ENODEV;
 
 		num = (dev->mtu - sizeof(struct iphdr) -
@@ -1331,7 +1334,7 @@ static int set_sync_mesg_maxlen(int sync
 		IP_VS_DBG(7, "setting the maximum length of sync sending "
 			  "message %d.\n", sync_send_mesg_maxlen);
 	} else if (sync_state == IP_VS_STATE_BACKUP) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+		if ((dev = __dev_get_by_name(net, ip_vs_backup_mcast_ifn)) == NULL)
 			return -ENODEV;
 
 		sync_recv_mesg_maxlen = dev->mtu -
@@ -1359,7 +1362,8 @@ join_mcast_group(struct sock *sk, struct
 	memset(&mreq, 0, sizeof(mreq));
 	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
 
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname);
+	if (!dev)
 		return -ENODEV;
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
 		return -EINVAL;
@@ -1380,7 +1384,8 @@ static int bind_mcastif_addr(struct sock
 	__be32 addr;
 	struct sockaddr_in sin;
 
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname);
+	if (!dev)
 		return -ENODEV;
 
 	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
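Note: all four IPVS-sync helpers stop resolving interface names in init_net and use the current exec environment's namespace instead, so a container running the sync daemon binds to its own devices. The lookup idiom:

/* Namespace-correct device lookup used throughout ip_vs_sync.c above;
 * the caller must hold RTNL, as __dev_get_by_name() takes no reference. */
struct net *net = get_exec_env()->ve_netns;
struct net_device *dev = __dev_get_by_name(net, ifname);

if (!dev)
	return -ENODEV;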
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_core.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_core.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_core.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_core.c	2015-01-21 12:02:51.291012214 +0300
@@ -45,6 +45,9 @@
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
 
+#include <net/sock.h>
+#include <bc/sock.h>
+
 #define NF_CONNTRACK_VERSION	"0.5.0"
 
 int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
@@ -58,9 +61,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_lock);
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
-
 struct nf_conn nf_conntrack_untracked __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
 
@@ -179,6 +179,11 @@ destroy_conntrack(struct nf_conntrack *n
 	struct nf_conn *ct = (struct nf_conn *)nfct;
 	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_l4proto *l4proto;
+#ifdef CONFIG_VE_IPTABLES
+	struct ve_struct *old_ve;
+
+	old_ve = set_exec_env(ct->ct_net->owner_ve);
+#endif
 
 	pr_debug("destroy_conntrack(%p)\n", ct);
 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
@@ -215,6 +220,9 @@ destroy_conntrack(struct nf_conntrack *n
 
 	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
 	nf_conntrack_free(ct);
+#ifdef CONFIG_VE_IPTABLES
+	(void)set_exec_env(old_ve);
+#endif
 }
 
 void nf_ct_delete_from_lists(struct nf_conn *ct)
@@ -455,6 +463,16 @@ __nf_conntrack_confirm(struct sk_buff *s
 
 	spin_lock_bh(&nf_conntrack_lock);
 
+	/* We have to check the DYING flag inside the lock to prevent
+	   a race against nf_ct_get_next_corpse() possibly called from
+	   user context, else we insert an already 'dead' hash, blocking
+	   further use of that particular connection -JM */
+
+	if (unlikely(nf_ct_is_dying(ct))) {
+		spin_unlock_bh(&nf_conntrack_lock);
+		return NF_ACCEPT;
+	}
+
 	/* See if there's one in the list already, including reverse:
 	   NAT could have grabbed it without realizing, since we're
 	   not in the hash.  If there is, we lost race. */
@@ -580,9 +598,12 @@ static noinline int early_drop(struct ne
 struct nf_conn *nf_conntrack_alloc(struct net *net,
 				   const struct nf_conntrack_tuple *orig,
 				   const struct nf_conntrack_tuple *repl,
+				   struct user_beancounter *ub,
 				   gfp_t gfp)
 {
 	struct nf_conn *ct;
+	struct user_beancounter *old_ub;
+	unsigned int ct_max = net->ct.max ? net->ct.max : init_net.ct.max;
 
 	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
 		get_random_bytes(&nf_conntrack_hash_rnd,
@@ -593,8 +614,8 @@ struct nf_conn *nf_conntrack_alloc(struc
 	/* We don't want any race condition at early drop stage */
 	atomic_inc(&net->ct.count);
 
-	if (nf_conntrack_max &&
-	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
+	if (ct_max &&
+	    unlikely(atomic_read(&net->ct.count) > ct_max)) {
 		unsigned int hash = hash_conntrack(net, orig);
 		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
@@ -610,7 +631,9 @@ struct nf_conn *nf_conntrack_alloc(struc
 	 * Do not use kmem_cache_zalloc(), as this cache uses
 	 * SLAB_DESTROY_BY_RCU.
 	 */
+	old_ub = set_exec_ub(ub);
 	ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
+	(void)set_exec_ub(old_ub);
 	if (ct == NULL) {
 		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
 		atomic_dec(&net->ct.count);
@@ -672,13 +695,20 @@ init_conntrack(struct net *net,
 	struct nf_conn_help *help;
 	struct nf_conntrack_tuple repl_tuple;
 	struct nf_conntrack_expect *exp;
+	struct user_beancounter *ub = NULL;
 
 	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
 		pr_debug("Can't invert tuple.\n");
 		return NULL;
 	}
 
-	ct = nf_conntrack_alloc(net, tuple, &repl_tuple, GFP_ATOMIC);
+#ifdef CONFIG_BEANCOUNTERS
+	if (skb->dev != NULL)  /* received skb */
+		ub = netdev_bc(skb->dev)->exec_ub;
+	else if (skb->sk != NULL) /* sent skb */
+		ub = sock_bc(skb->sk)->ub;
+#endif
+	ct = nf_conntrack_alloc(net, tuple, &repl_tuple, ub, GFP_ATOMIC);
 	if (IS_ERR(ct)) {
 		pr_debug("Can't allocate conntrack.\n");
 		return (struct nf_conntrack_tuple_hash *)ct;
@@ -754,6 +784,9 @@ resolve_normal_ct(struct net *net,
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 
+	if (!mask_ipt_allow(get_exec_env()->ipt_mask, VE_NF_CONNTRACK))
+		return NULL;
+
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
 			     dataoff, l3num, protonum, &tuple, l3proto,
 			     l4proto)) {
@@ -1192,17 +1225,22 @@ static void nf_conntrack_cleanup_net(str
 
 /* Mishearing the voices in his head, our hero wonders how he's
    supposed to kill the mall. */
-void nf_conntrack_cleanup(struct net *net)
+void nf_conntrack_cleanup_list(struct list_head *net_exit_list)
 {
-	if (net_eq(net, &init_net))
-		rcu_assign_pointer(ip_ct_attach, NULL);
+	struct net *net;
+
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		if (net_eq(net, &init_net))
+			rcu_assign_pointer(ip_ct_attach, NULL);
+	}
 
 	/* This makes sure all current packets have passed through
 	   netfilter framework.  Roll on, two-stage module
 	   delete... */
 	synchronize_net();
 
-	nf_conntrack_cleanup_net(net);
+	list_for_each_entry(net, net_exit_list, exit_list)
+		nf_conntrack_cleanup_net(net);
 }
 
 void nf_conntrack_cleanup_end(void)
@@ -1222,12 +1260,12 @@ void *nf_ct_alloc_hashtable(unsigned int
 	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
 	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
 	sz = nr_slots * sizeof(struct hlist_nulls_head);
-	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+	hash = (void *)__get_free_pages(GFP_KERNEL_UBC | __GFP_NOWARN | __GFP_ZERO,
 					get_order(sz));
 	if (!hash) {
 		*vmalloced = 1;
 		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
-		hash = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+		hash = __vmalloc(sz, GFP_KERNEL_UBC | __GFP_ZERO, PAGE_KERNEL);
 	}
 
 	if (hash && nulls)
@@ -1315,11 +1353,11 @@ static int nf_conntrack_init_init_net(vo
 		 * entries. */
 		max_factor = 4;
 	}
-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+	init_net.ct.max = max_factor * nf_conntrack_htable_size;
 
 	printk("nf_conntrack version %s (%u buckets, %d max)\n",
 	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
-	       nf_conntrack_max);
+	       init_net.ct.max);
 
 	ret = nf_conntrack_proto_init();
 	if (ret < 0)
@@ -1356,6 +1394,7 @@ static int nf_conntrack_init_net(struct 
 	int ret;
 
 	atomic_set(&net->ct.count, 0);
+	net->ct.max = init_net.ct.max;
 	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
 	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
 	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
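Note: the global nf_conntrack_max disappears from nf_conntrack_core.c: each namespace now carries net->ct.max, seeded from the host's value at namespace init, and nf_conntrack_alloc() consults it with a fallback so an unset (zero) per-VE limit defers to ve0's. The limit check becomes:

/* Per-namespace conntrack ceiling with host fallback (as above). */
unsigned int ct_max = net->ct.max ? net->ct.max : init_net.ct.max;

if (ct_max && atomic_read(&net->ct.count) > ct_max) {
	/* this VE's table is full: attempt early_drop(), else refuse */
}

The allocation itself is additionally charged to a beancounter chosen from the skb: the receiving device's exec UB for ingress packets, or the sending socket's UB for locally originated ones.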
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_ecache.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_ecache.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_ecache.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_ecache.c	2015-01-21 12:02:42.536244638 +0300
@@ -25,22 +25,17 @@
 
 static DEFINE_MUTEX(nf_ct_ecache_mutex);
 
-struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
-
-struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly;
-EXPORT_SYMBOL_GPL(nf_expect_event_cb);
-
 /* deliver cached events and clear cache entry - must be called with locally
  * disabled softirqs */
 void nf_ct_deliver_cached_events(struct nf_conn *ct)
 {
+	struct net *net = nf_ct_net(ct);
 	unsigned long events;
 	struct nf_ct_event_notifier *notify;
 	struct nf_conntrack_ecache *e;
 
 	rcu_read_lock();
-	notify = rcu_dereference(nf_conntrack_event_cb);
+	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
 	if (notify == NULL)
 		goto out_unlock;
 
@@ -78,18 +73,19 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
 
-int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
+int nf_conntrack_register_notifier(struct net *net,
+				   struct nf_ct_event_notifier *new)
 {
 	int ret = 0;
 	struct nf_ct_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_conntrack_event_cb);
+	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
 	if (notify != NULL) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
-	rcu_assign_pointer(nf_conntrack_event_cb, new);
+	rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
 	mutex_unlock(&nf_ct_ecache_mutex);
 	return ret;
 
@@ -99,30 +95,32 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
 
-void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new)
+void nf_conntrack_unregister_notifier(struct net *net,
+				      struct nf_ct_event_notifier *new)
 {
 	struct nf_ct_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_conntrack_event_cb);
+	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
 	BUG_ON(notify != new);
-	rcu_assign_pointer(nf_conntrack_event_cb, NULL);
+	rcu_assign_pointer(net->ct.nf_conntrack_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
 
-int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new)
+int nf_ct_expect_register_notifier(struct net *net,
+				   struct nf_exp_event_notifier *new)
 {
 	int ret = 0;
 	struct nf_exp_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_expect_event_cb);
+	notify = rcu_dereference(net->ct.nf_expect_event_cb);
 	if (notify != NULL) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
-	rcu_assign_pointer(nf_expect_event_cb, new);
+	rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
 	mutex_unlock(&nf_ct_ecache_mutex);
 	return ret;
 
@@ -132,14 +130,15 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
 
-void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
+void nf_ct_expect_unregister_notifier(struct net *net,
+				      struct nf_exp_event_notifier *new)
 {
 	struct nf_exp_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_expect_event_cb);
+	notify = rcu_dereference(net->ct.nf_expect_event_cb);
 	BUG_ON(notify != new);
-	rcu_assign_pointer(nf_expect_event_cb, NULL);
+	rcu_assign_pointer(net->ct.nf_expect_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
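Note: nf_conntrack_event_cb and nf_expect_event_cb move from file-scope globals into struct netns_ct, which is what allows ctnetlink (further down) to register one notifier per namespace. Registration keeps the publish-once-under-mutex, read-under-RCU discipline:

/* Shape of the per-net notifier registration above: writers serialise
 * on a mutex, readers use rcu_dereference() under rcu_read_lock(). */
mutex_lock(&nf_ct_ecache_mutex);
if (rcu_dereference(net->ct.nf_conntrack_event_cb)) {
	mutex_unlock(&nf_ct_ecache_mutex);
	return -EBUSY;			/* one notifier per namespace */
}
rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
mutex_unlock(&nf_ct_ecache_mutex);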
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_expect.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_expect.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_expect.c	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_expect.c	2015-01-21 12:02:50.360036929 +0300
@@ -32,7 +32,6 @@ unsigned int nf_ct_expect_hsize __read_m
 EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
 
 static unsigned int nf_ct_expect_hash_rnd __read_mostly;
-unsigned int nf_ct_expect_max __read_mostly;
 static int nf_ct_expect_hash_rnd_initted __read_mostly;
 
 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
@@ -402,7 +401,7 @@ static inline int __nf_ct_expect_check(s
 		}
 	}
 
-	if (net->ct.expect_count >= nf_ct_expect_max) {
+	if (net->ct.expect_count >= init_net.ct.expect_max) {
 		if (net_ratelimit())
 			printk(KERN_WARNING
 			       "nf_conntrack: expectation table full\n");
@@ -581,10 +580,11 @@ int nf_conntrack_expect_init(struct net 
 			if (!nf_ct_expect_hsize)
 				nf_ct_expect_hsize = 1;
 		}
-		nf_ct_expect_max = nf_ct_expect_hsize * 4;
+		init_net.ct.expect_max = nf_ct_expect_hsize * 4;
 	}
 
 	net->ct.expect_count = 0;
+	net->ct.expect_max = init_net.ct.expect_max;
 	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
 						  &net->ct.expect_vmalloc, 0);
 	if (net->ct.expect_hash == NULL)
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_h323_main.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_h323_main.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_h323_main.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_h323_main.c	2015-01-21 12:02:42.563243921 +0300
@@ -708,7 +708,8 @@ static int expect_h245(struct sk_buff *s
 
 /* If the calling party is on the same side of the forward-to party,
  * we don't need to track the second call */
-static int callforward_do_filter(const union nf_inet_addr *src,
+static int callforward_do_filter(struct net *net,
+				 const union nf_inet_addr *src,
 				 const union nf_inet_addr *dst,
 				 u_int8_t family)
 {
@@ -730,8 +731,9 @@ static int callforward_do_filter(const u
 
 		fl1.fl4_dst = src->ip;
 		fl2.fl4_dst = dst->ip;
-		if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) {
-			if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
+		if (!afinfo->route(net, (struct dst_entry **)&rt1, &fl1)) {
+			if (!afinfo->route(net, (struct dst_entry **)&rt2,
+					   &fl2)) {
 				if (rt1->rt_gateway == rt2->rt_gateway &&
 				    rt1->u.dst.dev  == rt2->u.dst.dev)
 					ret = 1;
@@ -748,8 +750,9 @@ static int callforward_do_filter(const u
 
 		memcpy(&fl1.fl6_dst, src, sizeof(fl1.fl6_dst));
 		memcpy(&fl2.fl6_dst, dst, sizeof(fl2.fl6_dst));
-		if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) {
-			if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
+		if (!afinfo->route(net, (struct dst_entry **)&rt1, &fl1)) {
+			if (!afinfo->route(net, (struct dst_entry **)&rt2,
+					   &fl2)) {
 				if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway,
 					    sizeof(rt1->rt6i_gateway)) &&
 				    rt1->u.dst.dev == rt2->u.dst.dev)
@@ -778,6 +781,7 @@ static int expect_callforwarding(struct 
 	__be16 port;
 	union nf_inet_addr addr;
 	struct nf_conntrack_expect *exp;
+	struct net *net = nf_ct_net(ct);
 	typeof(nat_callforwarding_hook) nat_callforwarding;
 
 	/* Read alternativeAddress */
@@ -787,7 +791,7 @@ static int expect_callforwarding(struct 
 	/* If the calling party is on the same side of the forward-to party,
 	 * we don't need to track the second call */
 	if (callforward_filter &&
-	    callforward_do_filter(&addr, &ct->tuplehash[!dir].tuple.src.u3,
+	    callforward_do_filter(net, &addr, &ct->tuplehash[!dir].tuple.src.u3,
 				  nf_ct_l3num(ct))) {
 		pr_debug("nf_ct_q931: Call Forwarding not tracked\n");
 		return 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_netlink.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_netlink.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_netlink.c	2014-12-12 23:29:36.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_netlink.c	2015-01-21 12:02:45.530165151 +0300
@@ -4,7 +4,7 @@
  * (C) 2001 by Jay Schulist <jschlst@samba.org>
  * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org>
  * (C) 2003 by Patrick Mchardy <kaber@trash.net>
- * (C) 2005-2008 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2005-2011 by Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * Initial connection tracking via netlink development funded and
  * generally made possible by Network Robots, Inc. (www.networkrobots.com)
@@ -46,6 +46,10 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
 
+#include <net/sock.h>
+#include <bc/beancounter.h>
+#include <bc/sock.h>
+
 MODULE_LICENSE("GPL");
 
 static char __initdata version[] = "0.93";
@@ -463,6 +467,7 @@ ctnetlink_nlmsg_size(const struct nf_con
 static int
 ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 {
+	struct net *net;
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
 	struct nlattr *nest_parms;
@@ -489,7 +494,8 @@ ctnetlink_conntrack_event(unsigned int e
 	} else
 		return 0;
 
-	if (!item->report && !nfnetlink_has_listeners(group))
+	net = nf_ct_net(ct);
+	if (!item->report && !nfnetlink_has_listeners(net, group))
 		return 0;
 
 	skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC);
@@ -566,7 +572,8 @@ ctnetlink_conntrack_event(unsigned int e
 	rcu_read_unlock();
 
 	nlmsg_end(skb, nlh);
-	err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+	err = nfnetlink_send(skb, net, item->pid, group, item->report,
+			     GFP_ATOMIC);
 	if (err == -ENOBUFS || err == -EAGAIN)
 		return -ENOBUFS;
 
@@ -578,7 +585,7 @@ nla_put_failure:
 nlmsg_failure:
 	kfree_skb(skb);
 errout:
-	nfnetlink_set_err(0, group, -ENOBUFS);
+	nfnetlink_set_err(net, 0, group, -ENOBUFS);
 	return 0;
 }
 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
@@ -593,6 +600,7 @@ static int ctnetlink_done(struct netlink
 static int
 ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = sock_net(skb->sk);
 	struct nf_conn *ct, *last;
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
@@ -603,7 +611,7 @@ ctnetlink_dump_table(struct sk_buff *skb
 	last = (struct nf_conn *)cb->args[1];
 	for (; cb->args[0] < init_net.ct.htable_size; cb->args[0]++) {
 restart:
-		hlist_nulls_for_each_entry_rcu(h, n, &init_net.ct.hash[cb->args[0]],
+		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[cb->args[0]],
 					 hnnode) {
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
 				continue;
@@ -775,6 +783,7 @@ ctnetlink_del_conntrack(struct sock *ctn
 			const struct nlmsghdr *nlh,
 			const struct nlattr * const cda[])
 {
+	struct net *net = sock_net(ctnl);
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conn *ct;
@@ -788,7 +797,7 @@ ctnetlink_del_conntrack(struct sock *ctn
 		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
 	else {
 		/* Flush the whole table */
-		nf_conntrack_flush_report(&init_net,
+		nf_conntrack_flush_report(net,
 					 NETLINK_CB(skb).pid,
 					 nlmsg_report(nlh));
 		return 0;
@@ -797,7 +806,7 @@ ctnetlink_del_conntrack(struct sock *ctn
 	if (err < 0)
 		return err;
 
-	h = nf_conntrack_find_get(&init_net, &tuple);
+	h = nf_conntrack_find_get(net, &tuple);
 	if (!h)
 		return -ENOENT;
 
@@ -835,6 +844,7 @@ ctnetlink_get_conntrack(struct sock *ctn
 			const struct nlmsghdr *nlh,
 			const struct nlattr * const cda[])
 {
+	struct net *net = sock_net(ctnl);
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conn *ct;
@@ -861,7 +871,7 @@ ctnetlink_get_conntrack(struct sock *ctn
 	if (err < 0)
 		return err;
 
-	h = nf_conntrack_find_get(&init_net, &tuple);
+	h = nf_conntrack_find_get(net, &tuple);
 	if (!h)
 		return -ENOENT;
 
@@ -907,7 +917,7 @@ ctnetlink_parse_nat_setup(struct nf_conn
 #ifdef CONFIG_MODULES
 		rcu_read_unlock();
 		nfnl_unlock();
-		if (request_module("nf-nat-ipv4") < 0) {
+		if (ve0_request_module("nf-nat-ipv4") < 0) {
 			nfnl_lock();
 			rcu_read_lock();
 			return -EOPNOTSUPP;
@@ -1026,9 +1036,8 @@ ctnetlink_change_helper(struct nf_conn *
 		/* need to zero data of old helper */
 		memset(&help->help, 0, sizeof(help->help));
 	} else {
-		help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
-		if (help == NULL)
-			return -ENOMEM;
+		/* we cannot set a helper for an existing conntrack */
+		return -EOPNOTSUPP;
 	}
 
 	rcu_assign_pointer(help->helper, helper);
@@ -1181,16 +1190,18 @@ ctnetlink_change_conntrack(struct nf_con
 }
 
 static struct nf_conn *
-ctnetlink_create_conntrack(const struct nlattr * const cda[],
+ctnetlink_create_conntrack(struct net *net,
+			   const struct nlattr * const cda[],
 			   struct nf_conntrack_tuple *otuple,
 			   struct nf_conntrack_tuple *rtuple,
-			   u8 u3)
+			   u8 u3,
+			   struct user_beancounter *ub)
 {
 	struct nf_conn *ct;
 	int err = -EINVAL;
 	struct nf_conntrack_helper *helper;
 
-	ct = nf_conntrack_alloc(&init_net, otuple, rtuple, GFP_ATOMIC);
+	ct = nf_conntrack_alloc(net, otuple, rtuple, ub, GFP_ATOMIC);
 	if (IS_ERR(ct))
 		return ERR_PTR(-ENOMEM);
 
@@ -1199,7 +1210,6 @@ ctnetlink_create_conntrack(const struct 
 	ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
 
 	ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
-	ct->status |= IPS_CONFIRMED;
 
 	rcu_read_lock();
  	if (cda[CTA_HELP]) {
@@ -1247,16 +1257,21 @@ ctnetlink_create_conntrack(const struct 
 			goto err2;
 	}
 
+	err = ctnetlink_setup_nat(ct, cda);
+	if (err < 0)
+		goto err2;
+
+	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+	nf_ct_ecache_ext_add(ct, GFP_ATOMIC);
+	/* we must add conntrack extensions before confirmation. */
+	ct->status |= IPS_CONFIRMED;
+
 	if (cda[CTA_STATUS]) {
 		err = ctnetlink_change_status(ct, cda);
 		if (err < 0)
 			goto err2;
 	}
 
-	err = ctnetlink_setup_nat(ct, cda);
-	if (err < 0)
-		goto err2;
-
 #ifdef CONFIG_NF_NAT_NEEDED
 	if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
 		err = ctnetlink_change_nat_seq_adj(ct, cda);
@@ -1271,9 +1286,6 @@ ctnetlink_create_conntrack(const struct 
 			goto err2;
 	}
 
-	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
-	nf_ct_ecache_ext_add(ct, GFP_ATOMIC);
-
 #if defined(CONFIG_NF_CONNTRACK_MARK)
 	if (cda[CTA_MARK])
 		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
@@ -1289,7 +1301,7 @@ ctnetlink_create_conntrack(const struct 
 		if (err < 0)
 			goto err2;
 
-		master_h = nf_conntrack_find_get(&init_net, &master);
+		master_h = nf_conntrack_find_get(net, &master);
 		if (master_h == NULL) {
 			err = -ENOENT;
 			goto err2;
@@ -1322,7 +1334,7 @@ ctnetlink_new_conntrack(struct sock *ctn
 	struct nf_conntrack_tuple otuple, rtuple;
 	struct nf_conntrack_tuple_hash *h = NULL;
 	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	struct net *net = &init_net;
+	struct net *net = sock_net(ctnl);
 	struct nf_conn *ct;
 	u_int8_t u3 = nfmsg->nfgen_family;
 	int err = 0;
@@ -1348,12 +1360,17 @@ ctnetlink_new_conntrack(struct sock *ctn
 		err = -ENOENT;
 		if (nlh->nlmsg_flags & NLM_F_CREATE) {
 			enum ip_conntrack_events events;
+			struct user_beancounter *ub = NULL;
 
+#ifdef CONFIG_BEANCOUNTERS
+			if (skb->sk)
+				ub = sock_bc(skb->sk)->ub;
+#endif
 			if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
 				return -EINVAL;
 
-			ct = ctnetlink_create_conntrack(cda, &otuple,
-							&rtuple, u3);
+			ct = ctnetlink_create_conntrack(net, cda, &otuple,
+							&rtuple, u3, ub);
 			if (IS_ERR(ct))
 				return PTR_ERR(ct);
 
@@ -1522,9 +1539,10 @@ nla_put_failure:
 static int
 ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 {
-	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	struct nf_conntrack_expect *exp = item->exp;
+	struct net *net = nf_ct_exp_net(exp);
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
 	struct sk_buff *skb;
 	unsigned int type;
 	int flags = 0;
@@ -1536,7 +1554,7 @@ ctnetlink_expect_event(unsigned int even
 		return 0;
 
 	if (!item->report &&
-	    !nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW))
+	    !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW))
 		return 0;
 
 	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
@@ -1559,7 +1577,7 @@ ctnetlink_expect_event(unsigned int even
 	rcu_read_unlock();
 
 	nlmsg_end(skb, nlh);
-	nfnetlink_send(skb, item->pid, NFNLGRP_CONNTRACK_EXP_NEW,
+	nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW,
 		       item->report, GFP_ATOMIC);
 	return 0;
 
@@ -1569,7 +1587,7 @@ nla_put_failure:
 nlmsg_failure:
 	kfree_skb(skb);
 errout:
-	nfnetlink_set_err(0, 0, -ENOBUFS);
+	nfnetlink_set_err(net, 0, 0, -ENOBUFS);
 	return 0;
 }
 #endif
@@ -1583,7 +1601,7 @@ static int ctnetlink_exp_done(struct net
 static int
 ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct net *net = &init_net;
+	struct net *net = sock_net(skb->sk);
 	struct nf_conntrack_expect *exp, *last;
 	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
 	struct hlist_node *n;
@@ -1636,6 +1654,7 @@ ctnetlink_get_expect(struct sock *ctnl, 
 		     const struct nlmsghdr *nlh,
 		     const struct nlattr * const cda[])
 {
+	struct net *net = sock_net(ctnl);
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_expect *exp;
 	struct sk_buff *skb2;
@@ -1659,7 +1678,7 @@ ctnetlink_get_expect(struct sock *ctnl, 
 	if (err < 0)
 		return err;
 
-	exp = nf_ct_expect_find_get(&init_net, &tuple);
+	exp = nf_ct_expect_find_get(net, &tuple);
 	if (!exp)
 		return -ENOENT;
 
@@ -1699,6 +1718,7 @@ ctnetlink_del_expect(struct sock *ctnl, 
 		     const struct nlmsghdr *nlh,
 		     const struct nlattr * const cda[])
 {
+	struct net *net = sock_net(ctnl);
 	struct nf_conntrack_expect *exp;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_helper *h;
@@ -1715,7 +1735,7 @@ ctnetlink_del_expect(struct sock *ctnl, 
 			return err;
 
 		/* bump usage count to 2 */
-		exp = nf_ct_expect_find_get(&init_net, &tuple);
+		exp = nf_ct_expect_find_get(net, &tuple);
 		if (!exp)
 			return -ENOENT;
 
@@ -1745,7 +1765,7 @@ ctnetlink_del_expect(struct sock *ctnl, 
 		}
 		for (i = 0; i < nf_ct_expect_hsize; i++) {
 			hlist_for_each_entry_safe(exp, n, next,
-						  &init_net.ct.expect_hash[i],
+						  &net->ct.expect_hash[i],
 						  hnode) {
 				m_help = nfct_help(exp->master);
 				if (m_help->helper == h
@@ -1761,7 +1781,7 @@ ctnetlink_del_expect(struct sock *ctnl, 
 		spin_lock_bh(&nf_conntrack_lock);
 		for (i = 0; i < nf_ct_expect_hsize; i++) {
 			hlist_for_each_entry_safe(exp, n, next,
-						  &init_net.ct.expect_hash[i],
+						  &net->ct.expect_hash[i],
 						  hnode) {
 				if (del_timer(&exp->timeout)) {
 					nf_ct_unlink_expect(exp);
@@ -1782,8 +1802,8 @@ ctnetlink_change_expect(struct nf_conntr
 }
 
 static int
-ctnetlink_create_expect(const struct nlattr * const cda[], u_int8_t u3,
-			u32 pid, int report)
+ctnetlink_create_expect(struct net *net, const struct nlattr * const cda[],
+			u_int8_t u3, u32 pid, int report)
 {
 	struct nf_conntrack_tuple tuple, mask, master_tuple;
 	struct nf_conntrack_tuple_hash *h = NULL;
@@ -1804,7 +1824,7 @@ ctnetlink_create_expect(const struct nla
 		return err;
 
 	/* Look for master conntrack of this expectation */
-	h = nf_conntrack_find_get(&init_net, &master_tuple);
+	h = nf_conntrack_find_get(net, &master_tuple);
 	if (!h)
 		return -ENOENT;
 	ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1844,6 +1864,7 @@ ctnetlink_new_expect(struct sock *ctnl, 
 		     const struct nlmsghdr *nlh,
 		     const struct nlattr * const cda[])
 {
+	struct net *net = sock_net(ctnl);
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_expect *exp;
 	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -1860,13 +1881,13 @@ ctnetlink_new_expect(struct sock *ctnl, 
 		return err;
 
 	spin_lock_bh(&nf_conntrack_lock);
-	exp = __nf_ct_expect_find(&init_net, &tuple);
+	exp = __nf_ct_expect_find(net, &tuple);
 
 	if (!exp) {
 		spin_unlock_bh(&nf_conntrack_lock);
 		err = -ENOENT;
 		if (nlh->nlmsg_flags & NLM_F_CREATE) {
-			err = ctnetlink_create_expect(cda,
+			err = ctnetlink_create_expect(net, cda,
 						      u3,
 						      NETLINK_CB(skb).pid,
 						      nlmsg_report(nlh));
@@ -1937,6 +1958,46 @@ MODULE_ALIAS("ip_conntrack_netlink");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
 
+static int __net_init ctnetlink_net_init(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	int ret;
+
+	ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
+	if (ret < 0) {
+		pr_err("ctnetlink_init: cannot register notifier.\n");
+		goto err_out;
+	}
+
+	ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
+	if (ret < 0) {
+		pr_err("ctnetlink_init: cannot expect register notifier.\n");
+		goto err_unreg_notifier;
+	}
+#endif
+	return 0;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+	nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+err_out:
+	return ret;
+#endif
+}
+
+static void ctnetlink_net_exit(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
+	nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+#endif
+}
+
+static struct pernet_operations ctnetlink_net_ops = {
+	.init		= ctnetlink_net_init,
+	.exit		= ctnetlink_net_exit,
+};
+
 static int __init ctnetlink_init(void)
 {
 	int ret;
@@ -1954,28 +2015,15 @@ static int __init ctnetlink_init(void)
 		goto err_unreg_subsys;
 	}
 
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-	ret = nf_conntrack_register_notifier(&ctnl_notifier);
-	if (ret < 0) {
-		printk("ctnetlink_init: cannot register notifier.\n");
+	if (register_pernet_subsys(&ctnetlink_net_ops)) {
+		pr_err("ctnetlink_init: cannot register pernet operations\n");
 		goto err_unreg_exp_subsys;
 	}
 
-	ret = nf_ct_expect_register_notifier(&ctnl_notifier_exp);
-	if (ret < 0) {
-		printk("ctnetlink_init: cannot expect register notifier.\n");
-		goto err_unreg_notifier;
-	}
-#endif
-
 	return 0;
 
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-err_unreg_notifier:
-	nf_conntrack_unregister_notifier(&ctnl_notifier);
 err_unreg_exp_subsys:
 	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
-#endif
 err_unreg_subsys:
 	nfnetlink_subsys_unregister(&ctnl_subsys);
 err_out:
@@ -1986,11 +2034,7 @@ static void __exit ctnetlink_exit(void)
 {
 	printk("ctnetlink: unregistering from nfnetlink.\n");
 
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-	nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
-	nf_conntrack_unregister_notifier(&ctnl_notifier);
-#endif
-
+	unregister_pernet_subsys(&ctnetlink_net_ops);
 	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
 	nfnetlink_subsys_unregister(&ctnl_subsys);
 	return;
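
The ctnetlink hunks above replace one-shot global notifier registration with per-namespace init/exit callbacks driven by register_pernet_subsys(). Below is a minimal userspace sketch of that pernet pattern; the names and the replay behavior are simplifying assumptions for illustration, not the kernel's actual implementation.

#include <stdio.h>

struct net { int id; };

struct pernet_operations {
	int  (*init)(struct net *net);
	void (*exit)(struct net *net);
};

/* three pretend namespaces standing in for the kernel's net list */
static struct net nets[] = { { 0 }, { 1 }, { 2 } };
#define NNETS (sizeof(nets) / sizeof(nets[0]))

static int register_pernet_subsys(struct pernet_operations *ops)
{
	/* the real core also runs init for namespaces created later */
	for (unsigned int i = 0; i < NNETS; i++) {
		int err = ops->init(&nets[i]);
		if (err)
			return err; /* real code unwinds the earlier nets */
	}
	return 0;
}

static void unregister_pernet_subsys(struct pernet_operations *ops)
{
	for (unsigned int i = 0; i < NNETS; i++)
		ops->exit(&nets[i]);
}

static int demo_init(struct net *net)
{
	printf("register conntrack notifiers in net %d\n", net->id);
	return 0;
}

static void demo_exit(struct net *net)
{
	printf("unregister conntrack notifiers in net %d\n", net->id);
}

static struct pernet_operations demo_ops = { demo_init, demo_exit };

int main(void)
{
	if (register_pernet_subsys(&demo_ops))
		return 1;
	unregister_pernet_subsys(&demo_ops);
	return 0;
}

This is why ctnetlink_init() shrinks to a single register_pernet_subsys() call: the per-net work moves into ctnetlink_net_init()/ctnetlink_net_exit() and runs once per namespace instead of once globally.
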
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_proto.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_proto.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_proto.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_proto.c	2015-01-21 12:02:45.531165125 +0300
@@ -103,7 +103,7 @@ nf_ct_l3proto_try_module_get(unsigned sh
 
 retry:	p = nf_ct_l3proto_find_get(l3proto);
 	if (p == &nf_conntrack_l3proto_generic) {
-		ret = request_module("nf_conntrack-%d", l3proto);
+		ret = ve0_request_module("nf_conntrack-%d", l3proto);
 		if (!ret)
 			goto retry;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_proto_dccp.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_proto_dccp.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_proto_dccp.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_proto_dccp.c	2015-01-21 12:02:51.206014470 +0300
@@ -810,12 +810,7 @@ static struct nf_conntrack_l4proto dccp_
 
 static __net_init int dccp_net_init(struct net *net)
 {
-	struct dccp_net *dn;
-	int err;
-
-	dn = kmalloc(sizeof(*dn), GFP_KERNEL);
-	if (!dn)
-		return -ENOMEM;
+	struct dccp_net *dn = dccp_pernet(net);
 
 	/* default values */
 	dn->dccp_loose = 1;
@@ -827,16 +822,11 @@ static __net_init int dccp_net_init(stru
 	dn->dccp_timeout[CT_DCCP_CLOSING]	= 64 * HZ;
 	dn->dccp_timeout[CT_DCCP_TIMEWAIT]	= 2 * DCCP_MSL;
 
-	err = net_assign_generic(net, dccp_net_id, dn);
-	if (err)
-		goto out;
-
 #ifdef CONFIG_SYSCTL
-	err = -ENOMEM;
 	dn->sysctl_table = kmemdup(dccp_sysctl_table,
 			sizeof(dccp_sysctl_table), GFP_KERNEL);
 	if (!dn->sysctl_table)
-		goto out;
+		return -ENOMEM;
 
 	dn->sysctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST];
 	dn->sysctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND];
@@ -851,15 +841,11 @@ static __net_init int dccp_net_init(stru
 			nf_net_netfilter_sysctl_path, dn->sysctl_table);
 	if (!dn->sysctl_header) {
 		kfree(dn->sysctl_table);
-		goto out;
+		return -ENOMEM;
 	}
 #endif
 
 	return 0;
-
-out:
-	kfree(dn);
-	return err;
 }
 
 static __net_exit void dccp_net_exit(struct net *net)
@@ -869,21 +855,20 @@ static __net_exit void dccp_net_exit(str
 	unregister_net_sysctl_table(dn->sysctl_header);
 	kfree(dn->sysctl_table);
 #endif
-	kfree(dn);
-
-	net_assign_generic(net, dccp_net_id, NULL);
 }
 
 static struct pernet_operations dccp_net_ops = {
 	.init = dccp_net_init,
 	.exit = dccp_net_exit,
+	.id   = &dccp_net_id,
+	.size = sizeof(struct dccp_net),
 };
 
 static int __init nf_conntrack_proto_dccp_init(void)
 {
 	int err;
 
-	err = register_pernet_gen_subsys(&dccp_net_id, &dccp_net_ops);
+	err = register_pernet_subsys(&dccp_net_ops);
 	if (err < 0)
 		goto err1;
 
@@ -899,14 +884,14 @@ static int __init nf_conntrack_proto_dcc
 err3:
 	nf_conntrack_l4proto_unregister(&dccp_proto4);
 err2:
-	unregister_pernet_gen_subsys(dccp_net_id, &dccp_net_ops);
+	unregister_pernet_subsys(&dccp_net_ops);
 err1:
 	return err;
 }
 
 static void __exit nf_conntrack_proto_dccp_fini(void)
 {
-	unregister_pernet_gen_subsys(dccp_net_id, &dccp_net_ops);
+	unregister_pernet_subsys(&dccp_net_ops);
 	nf_conntrack_l4proto_unregister(&dccp_proto6);
 	nf_conntrack_l4proto_unregister(&dccp_proto4);
 }
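
The dccp conversion above drops the hand-rolled kmalloc()/net_assign_generic() pair in favor of declaring .id and .size in pernet_operations, so the core allocates and frees the per-net struct itself and dccp_net_init() only fills in defaults. A self-contained userspace model of that slot mechanism follows (assumed names, not the real net_generic infrastructure):

#include <stdio.h>
#include <stdlib.h>

#define MAX_SLOTS 8

struct net { void *gen[MAX_SLOTS]; };	/* per-namespace generic storage */

static int next_id = 1;

/* core side: hand out an id once, then allocate 'size' bytes per net */
static int net_generic_alloc(struct net *net, int *id, size_t size)
{
	if (*id == 0)
		*id = next_id++;
	net->gen[*id] = calloc(1, size);
	return net->gen[*id] ? 0 : -1;
}

static void *net_generic(struct net *net, int id)
{
	return net->gen[id];
}

struct dccp_net { int dccp_loose; };	/* stand-in for the real struct */

static int dccp_net_id;	/* filled in by the core, as with .id above */

int main(void)
{
	struct net net = { { 0 } };

	if (net_generic_alloc(&net, &dccp_net_id, sizeof(struct dccp_net)))
		return 1;

	struct dccp_net *dn = net_generic(&net, dccp_net_id);
	dn->dccp_loose = 1;	/* the init callback now only sets defaults */
	printf("slot %d, loose=%d\n", dccp_net_id, dn->dccp_loose);

	free(net.gen[dccp_net_id]);
	return 0;
}

The same rewrite is applied to GRE in the next file: once .size is declared, the error paths that freed the manually allocated struct simply disappear.
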
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_proto_gre.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_proto_gre.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_proto_gre.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_proto_gre.c	2015-01-21 12:02:51.209014390 +0300
@@ -300,32 +300,24 @@ static struct nf_conntrack_l4proto nf_co
 
 static int proto_gre_net_init(struct net *net)
 {
-	struct netns_proto_gre *net_gre;
-	int rv;
+	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
 
-	net_gre = kmalloc(sizeof(struct netns_proto_gre), GFP_KERNEL);
-	if (!net_gre)
-		return -ENOMEM;
 	rwlock_init(&net_gre->keymap_lock);
 	INIT_LIST_HEAD(&net_gre->keymap_list);
 
-	rv = net_assign_generic(net, proto_gre_net_id, net_gre);
-	if (rv < 0)
-		kfree(net_gre);
-	return rv;
+	return 0;
 }
 
 static void proto_gre_net_exit(struct net *net)
 {
-	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
-
 	nf_ct_gre_keymap_flush(net);
-	kfree(net_gre);
 }
 
 static struct pernet_operations proto_gre_net_ops = {
 	.init = proto_gre_net_init,
 	.exit = proto_gre_net_exit,
+	.id   = &proto_gre_net_id,
+	.size = sizeof(struct netns_proto_gre),
 };
 
 static int __init nf_ct_proto_gre_init(void)
@@ -335,7 +327,7 @@ static int __init nf_ct_proto_gre_init(v
 	rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4);
 	if (rv < 0)
 		return rv;
-	rv = register_pernet_gen_subsys(&proto_gre_net_id, &proto_gre_net_ops);
+	rv = register_pernet_subsys(&proto_gre_net_ops);
 	if (rv < 0)
 		nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4);
 	return rv;
@@ -344,7 +336,7 @@ static int __init nf_ct_proto_gre_init(v
 static void __exit nf_ct_proto_gre_fini(void)
 {
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4);
-	unregister_pernet_gen_subsys(proto_gre_net_id, &proto_gre_net_ops);
+	unregister_pernet_subsys(&proto_gre_net_ops);
 }
 
 module_init(nf_ct_proto_gre_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_standalone.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_standalone.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_conntrack_standalone.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_conntrack_standalone.c	2015-01-21 12:02:51.375009984 +0300
@@ -29,6 +29,10 @@
 
 MODULE_LICENSE("GPL");
 
+int ip_conntrack_disable_ve0 = 0;
+module_param(ip_conntrack_disable_ve0, int, 0440);
+EXPORT_SYMBOL(ip_conntrack_disable_ve0);
+
 #ifdef CONFIG_PROC_FS
 int
 print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
@@ -342,10 +346,11 @@ static ctl_table nf_ct_sysctl_table[] = 
 	{
 		.ctl_name	= NET_NF_CONNTRACK_MAX,
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
+		.strategy	= sysctl_intvec,
 	},
 	{
 		.ctl_name	= NET_NF_CONNTRACK_COUNT,
@@ -354,6 +359,7 @@ static ctl_table nf_ct_sysctl_table[] = 
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
+		.strategy	= sysctl_intvec,
 	},
 	{
 		.ctl_name       = NET_NF_CONNTRACK_BUCKETS,
@@ -362,6 +368,7 @@ static ctl_table nf_ct_sysctl_table[] = 
 		.maxlen         = sizeof(unsigned int),
 		.mode           = 0444,
 		.proc_handler   = proc_dointvec,
+		.strategy	= sysctl_intvec,
 	},
 	{
 		.ctl_name	= NET_NF_CONNTRACK_CHECKSUM,
@@ -370,6 +377,7 @@ static ctl_table nf_ct_sysctl_table[] = 
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
+		.strategy	= sysctl_intvec,
 	},
 	{
 		.ctl_name	= NET_NF_CONNTRACK_LOG_INVALID,
@@ -385,10 +393,11 @@ static ctl_table nf_ct_sysctl_table[] = 
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nf_conntrack_expect_max",
-		.data		= &nf_ct_expect_max,
+		.data		= &init_net.ct.expect_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
+		.strategy	= sysctl_intvec,
 	},
 	{ .ctl_name = 0 }
 };
@@ -399,7 +408,7 @@ static ctl_table nf_ct_netfilter_table[]
 	{
 		.ctl_name	= NET_NF_CONNTRACK_MAX,
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -412,6 +421,8 @@ static struct ctl_path nf_ct_path[] = {
 	{ }
 };
 
+static int zero;
+
 static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
 	struct ctl_table *table;
@@ -428,10 +439,18 @@ static int nf_conntrack_standalone_init_
 	if (!table)
 		goto out_kmemdup;
 
+	table[0].data = &net->ct.max;
 	table[1].data = &net->ct.count;
 	table[2].data = &net->ct.htable_size;
 	table[3].data = &net->ct.sysctl_checksum;
 	table[4].data = &net->ct.sysctl_log_invalid;
+	table[5].data = &net->ct.expect_max;
+
+	if (!net_eq(net, &init_net)) {
+		table[0].proc_handler = proc_dointvec_minmax;
+		table[0].extra1 = &zero;
+		table[0].extra2 = &init_net.ct.max;
+	}
 
 	net->ct.sysctl_header = register_net_sysctl_table(net,
 					nf_net_netfilter_sysctl_path, table);
@@ -496,20 +515,35 @@ out_init:
 	return ret;
 }
 
-static void nf_conntrack_net_exit(struct net *net)
+static void nf_conntrack_net_exit(struct list_head *net_exit_list)
 {
-	nf_conntrack_standalone_fini_sysctl(net);
-	nf_conntrack_standalone_fini_proc(net);
-	nf_conntrack_cleanup(net);
+	struct net *net;
+	struct ve_struct *old_env;
+
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		old_env = set_exec_env(net->owner_ve);
+		nf_conntrack_standalone_fini_sysctl(net);
+		nf_conntrack_standalone_fini_proc(net);
+		set_exec_env(old_env);
+	}
+	nf_conntrack_cleanup_list(net_exit_list);
 }
 
 static struct pernet_operations nf_conntrack_net_ops = {
 	.init = nf_conntrack_net_init,
-	.exit = nf_conntrack_net_exit,
+	.exit_batch = nf_conntrack_net_exit,
 };
 
 static int __init nf_conntrack_standalone_init(void)
 {
+	if (ip_conntrack_disable_ve0) {
+		printk("Disabling conntracks and NAT for ve0\n");
+		get_ve0()->ipt_mask &= ~(VE_NF_CONNTRACK_MOD | VE_IP_IPTABLE_NAT_MOD);
+	} else {
+		printk("Enabling conntracks and NAT for ve0\n");
+		get_ve0()->ipt_mask |= VE_NF_CONNTRACK_MOD | VE_IP_IPTABLE_NAT_MOD;
+	}
+
 	return register_pernet_subsys(&nf_conntrack_net_ops);
 }
 
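
One detail worth noting in the standalone hunks: for every namespace other than init_net, table[0] (nf_conntrack_max) is switched to proc_dointvec_minmax with extra1 = &zero and extra2 = &init_net.ct.max, so a container may lower its conntrack limit but can never raise it above the host's. A runnable sketch of that clamp (simplified: the real handler parses the proc write and rejects with -EINVAL):

#include <stdio.h>

/* accept a write only inside [*min, *max]; models proc_dointvec_minmax */
static int clamp_write(int *target, int val, const int *min, const int *max)
{
	if ((min && val < *min) || (max && val > *max))
		return -1;	/* the real handler returns -EINVAL */
	*target = val;
	return 0;
}

int main(void)
{
	int zero = 0, host_max = 65536;	/* init_net.ct.max on the host */
	int ct_max = host_max;		/* the container's private copy */

	printf("%d\n", clamp_write(&ct_max, 1024, &zero, &host_max));	 /* 0: lowering is fine */
	printf("%d\n", clamp_write(&ct_max, 1 << 20, &zero, &host_max)); /* -1: above the host cap */
	printf("ct_max=%d\n", ct_max);	/* still 1024 */
	return 0;
}
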
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_sockopt.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_sockopt.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nf_sockopt.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nf_sockopt.c	2015-01-21 12:02:45.471166717 +0300
@@ -6,6 +6,11 @@
 #include <linux/mutex.h>
 #include <net/sock.h>
 
+#ifdef CONFIG_VE_IPTABLES
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif /* CONFIG_VE_IPTABLES */
+
 #include "nf_internals.h"
 
 /* Sockopts only registered and called from user context, so
@@ -91,6 +96,73 @@ out:
 	mutex_unlock(&nf_sockopt_mutex);
 	return ops;
 }
+#ifdef CONFIG_VE_IPTABLES
+static int sockopt_module_fits(u_int8_t pf, int val, int get,
+			       u_int8_t mod_pf,
+			       int set_optmin, int set_optmax,
+			       int get_optmin, int get_optmax)
+{
+	if (pf != mod_pf)
+		return 0;
+	if (get)
+		return val >= get_optmin && val < get_optmax;
+	else
+		return val >= set_optmin && val < set_optmax;
+}
+
+static int ve0_load_sockopt_module(struct net *net, u8 pf, int val, int get)
+{
+	const char *name;
+	int ret = -EPERM;
+
+	if (!capable(CAP_VE_NET_ADMIN))
+		goto out;
+
+	if (sockopt_module_fits(pf, val, get, PF_INET,
+				     IPT_BASE_CTL, IPT_SO_SET_MAX + 1,
+				     IPT_BASE_CTL, IPT_SO_GET_MAX + 1)) {
+		name = "ip_tables";
+	} else if (sockopt_module_fits(pf, val, get, PF_INET6,
+				     IP6T_BASE_CTL, IP6T_SO_SET_MAX + 1,
+				     IP6T_BASE_CTL, IP6T_SO_GET_MAX + 1)) {
+		name = "ip6_tables";
+	} else {
+		ret = -EINVAL;
+		goto out;
+	}
+	/*
+	 * The modules loaded here take no locks during their
+	 * initialization. If you add another module, research it
+	 * first: you may have to use a nowait module request in
+	 * the function below.
+	 */
+	ret = ve0_request_module(name);
+out:
+	return ret;
+}
+
+static struct nf_sockopt_ops *nf_sockopt_find_ve(struct sock *sk, u_int8_t pf,
+		int val, int get)
+{
+	struct nf_sockopt_ops *ops = nf_sockopt_find(sk, pf, val, get);
+
+	if (!IS_ERR(ops) || ve_is_super(get_exec_env()))
+		return ops;
+
+	/*
+	 * Containers cannot load the appropriate modules from
+	 * userspace, so we quietly do it for them here. To a
+	 * container it looks as if the module were already loaded
+	 * or the driver were built into the kernel.
+	 */
+	if (ve0_load_sockopt_module(sock_net(sk), pf, val, get) != 0)
+		return ops;
+
+	return nf_sockopt_find(sk, pf, val, get);
+}
+#else /* !CONFIG_VE_IPTABLES */
+#define nf_sockopt_find_ve(sk, pf, val, get)	nf_sockopt_find(sk, pf, val, get)
+#endif /* !CONFIG_VE_IPTABLES */
 
 /* Call get/setsockopt() */
 static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
@@ -99,7 +171,7 @@ static int nf_sockopt(struct sock *sk, u
 	struct nf_sockopt_ops *ops;
 	int ret;
 
-	ops = nf_sockopt_find(sk, pf, val, get);
+	ops = nf_sockopt_find_ve(sk, pf, val, get);
 	if (IS_ERR(ops))
 		return PTR_ERR(ops);
 
@@ -133,7 +205,7 @@ static int compat_nf_sockopt(struct sock
 	struct nf_sockopt_ops *ops;
 	int ret;
 
-	ops = nf_sockopt_find(sk, pf, val, get);
+	ops = nf_sockopt_find_ve(sk, pf, val, get);
 	if (IS_ERR(ops))
 		return PTR_ERR(ops);
 
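
The nf_sockopt changes above let the host transparently load ip_tables/ip6_tables when a container issues a matching {get,set}sockopt. The range test that picks the module is small enough to lift out; here it is as a standalone program with assumed option numbers (the real values come from ip_tables.h and ip6_tables.h):

#include <stdio.h>

#define PF_INET  2
#define PF_INET6 10

/* assumed example values, for illustration only */
#define IPT_BASE_CTL	64
#define IPT_SO_SET_MAX	65
#define IPT_SO_GET_MAX	68

static int fits(int pf, int val, int get, int mod_pf,
		int set_min, int set_max, int get_min, int get_max)
{
	if (pf != mod_pf)
		return 0;
	return get ? (val >= get_min && val < get_max)
		   : (val >= set_min && val < set_max);
}

int main(void)
{
	/* a getsockopt in the IPT_SO_GET_* range on IPv4 selects ip_tables */
	if (fits(PF_INET, 66, 1, PF_INET,
		 IPT_BASE_CTL, IPT_SO_SET_MAX + 1,
		 IPT_BASE_CTL, IPT_SO_GET_MAX + 1))
		printf("load ip_tables\n");
	return 0;
}
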
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nfnetlink.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nfnetlink.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nfnetlink.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nfnetlink.c	2015-01-21 12:02:45.437167621 +0300
@@ -40,7 +40,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NE
 
 static char __initdata nfversion[] = "0.30";
 
-static struct sock *nfnl = NULL;
 static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
 static DEFINE_MUTEX(nfnl_mutex);
 
@@ -101,39 +100,41 @@ nfnetlink_find_client(u_int16_t type, co
 	return &ss->cb[cb_id];
 }
 
-int nfnetlink_has_listeners(unsigned int group)
+int nfnetlink_has_listeners(struct net *net, unsigned int group)
 {
-	return netlink_has_listeners(nfnl, group);
+	return netlink_has_listeners(net->nfnl, group);
 }
 EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
 
-int nfnetlink_send(struct sk_buff *skb, u32 pid,
+int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid,
 		   unsigned group, int echo, gfp_t flags)
 {
-	return nlmsg_notify(nfnl, skb, pid, group, echo, flags);
+	return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags);
 }
 EXPORT_SYMBOL_GPL(nfnetlink_send);
 
-void nfnetlink_set_err(u32 pid, u32 group, int error)
+void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error)
 {
-	netlink_set_err(nfnl, pid, group, error);
+	netlink_set_err(net->nfnl, pid, group, error);
 }
 EXPORT_SYMBOL_GPL(nfnetlink_set_err);
 
-int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags)
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags)
 {
-	return netlink_unicast(nfnl, skb, pid, flags);
+	return netlink_unicast(net->nfnl, skb, pid, flags);
 }
 EXPORT_SYMBOL_GPL(nfnetlink_unicast);
 
 /* Process one complete nfnetlink message. */
 static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
+	struct net *net = sock_net(skb->sk);
 	const struct nfnl_callback *nc;
 	const struct nfnetlink_subsystem *ss;
 	int type, err;
 
-	if (!netlink_capable(skb, CAP_NET_ADMIN))
+	if (!netlink_capable(skb, CAP_NET_ADMIN) &&
+	    !netlink_capable(skb, CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	/* All the messages must at least contain nfgenmsg */
@@ -170,7 +171,7 @@ replay:
 		if (err < 0)
 			return err;
 
-		err = nc->call(nfnl, skb, nlh, (const struct nlattr **)cda);
+		err = nc->call(net->nfnl, skb, nlh, (const struct nlattr **)cda);
 		if (err == -EAGAIN)
 			goto replay;
 		return err;
@@ -184,26 +185,40 @@ static void nfnetlink_rcv(struct sk_buff
 	nfnl_unlock();
 }
 
-static void __exit nfnetlink_exit(void)
+static int __net_init nfnetlink_net_init(struct net *net)
 {
-	printk("Removing netfilter NETLINK layer.\n");
-	netlink_kernel_release(nfnl);
-	return;
+	struct sock *nfnl;
+	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX,
+				     nfnetlink_rcv, NULL, THIS_MODULE);
+	if (!nfnl)
+		return -ENOMEM;
+	net->nfnl_stash = nfnl;
+	rcu_assign_pointer(net->nfnl, nfnl);
+	return 0;
 }
 
-static int __init nfnetlink_init(void)
+static void __net_exit nfnetlink_net_exit(struct net *net)
 {
-	printk("Netfilter messages via NETLINK v%s.\n", nfversion);
+	rcu_assign_pointer(net->nfnl, NULL);
+	synchronize_net();
+	netlink_kernel_release(net->nfnl_stash);
+}
 
-	nfnl = netlink_kernel_create(&init_net, NETLINK_NETFILTER, NFNLGRP_MAX,
-				     nfnetlink_rcv, NULL, THIS_MODULE);
-	if (!nfnl) {
-		printk(KERN_ERR "cannot initialize nfnetlink!\n");
-		return -ENOMEM;
-	}
+static struct pernet_operations nfnetlink_net_ops = {
+	.init		= nfnetlink_net_init,
+	.exit		= nfnetlink_net_exit,
+};
 
-	return 0;
+static int __init nfnetlink_init(void)
+{
+	printk("Netfilter messages via NETLINK v%s.\n", nfversion);
+	return register_pernet_subsys(&nfnetlink_net_ops);
 }
 
+static void __exit nfnetlink_exit(void)
+{
+	printk("Removing netfilter NETLINK layer.\n");
+	unregister_pernet_subsys(&nfnetlink_net_ops);
+}
 module_init(nfnetlink_init);
 module_exit(nfnetlink_exit);
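
nfnetlink_net_exit() above is careful about ordering: it unpublishes net->nfnl, waits out a grace period, and only then releases the socket kept in nfnl_stash. A userspace caricature of why that order matters (the synchronize stub stands in for RCU and is an assumption of this sketch):

#include <stdio.h>
#include <stdlib.h>

struct sock { int refs; };

struct net {
	struct sock *nfnl;	 /* what readers dereference */
	struct sock *nfnl_stash; /* private copy used only for release */
};

static void synchronize_net(void)
{
	/* stand-in for RCU: by the time this returns, nobody who saw
	 * the old net->nfnl value is still using it */
}

static void netlink_kernel_release(struct sock *sk)
{
	free(sk);
}

static void nfnetlink_net_exit(struct net *net)
{
	net->nfnl = NULL;	/* 1. new readers now see NULL        */
	synchronize_net();	/* 2. in-flight readers have drained  */
	netlink_kernel_release(net->nfnl_stash); /* 3. safe to free  */
}

int main(void)
{
	struct net net;

	net.nfnl_stash = net.nfnl = calloc(1, sizeof(struct sock));
	nfnetlink_net_exit(&net);
	printf("released in the safe order\n");
	return 0;
}

Freeing before the wait would let a reader that loaded the old pointer dereference freed memory; that is the race the nfnl/nfnl_stash split avoids.
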
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nfnetlink_log.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nfnetlink_log.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nfnetlink_log.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nfnetlink_log.c	2015-01-21 12:02:42.479246153 +0300
@@ -323,7 +323,8 @@ __nfulnl_send(struct nfulnl_instance *in
 			  NLMSG_DONE,
 			  sizeof(struct nfgenmsg));
 
-	status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT);
+	status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid,
+				   MSG_DONTWAIT);
 
 	inst->qlen = 0;
 	inst->skb = NULL;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/nfnetlink_queue.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nfnetlink_queue.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/nfnetlink_queue.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/nfnetlink_queue.c	2015-01-21 12:02:45.413168256 +0300
@@ -432,7 +432,7 @@ nfqnl_enqueue_packet(struct nf_queue_ent
 	}
 
 	/* nfnetlink_unicast will either free the nskb or add it to a socket */
-	err = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT);
+	err = nfnetlink_unicast(nskb, &init_net, queue->peer_pid, MSG_DONTWAIT);
 	if (err < 0) {
 		queue->queue_user_dropped++;
 		goto err_out_unlock;
@@ -569,9 +569,6 @@ nfqnl_rcv_dev_event(struct notifier_bloc
 {
 	struct net_device *dev = ptr;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		return NOTIFY_DONE;
-
 	/* Drop any packets associated with the downed device */
 	if (event == NETDEV_DOWN)
 		nfqnl_dev_drop(dev->ifindex);
@@ -600,8 +597,7 @@ nfqnl_rcv_nl_event(struct notifier_block
 			struct hlist_head *head = &instance_table[i];
 
 			hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
-				if ((n->net == &init_net) &&
-				    (n->pid == inst->peer_pid))
+				if (n->pid == inst->peer_pid)
 					__instance_destroy(inst);
 			}
 		}
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/x_tables.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/x_tables.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/x_tables.c	2014-12-12 23:29:03.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/x_tables.c	2015-01-21 12:02:45.521165391 +0300
@@ -27,6 +27,8 @@
 #endif
 #include <net/net_namespace.h>
 
+#include <bc/kmem.h>
+
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp.h>
 
@@ -40,7 +42,7 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tabl
 struct compat_delta {
 	struct compat_delta *next;
 	unsigned int offset;
-	short delta;
+	int delta;
 };
 
 struct xt_af {
@@ -69,6 +71,51 @@ static const char *const xt_prefix[NFPRO
 	[NFPROTO_IPV6]   = "ip6",
 };
 
+#ifdef CONFIG_BEANCOUNTERS
+static inline struct user_beancounter *xt_table_ub(struct xt_table_info *info)
+{
+	return mem_ub(info);
+}
+
+static void uncharge_xtables(struct xt_table_info *info, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = xt_table_ub(info);
+	uncharge_beancounter(ub, UB_NUMXTENT, size);
+}
+
+static int recharge_xtables(struct xt_table_info *new, struct xt_table_info *old)
+{
+	struct user_beancounter *ub, *old_ub;
+	long change;
+
+	ub = xt_table_ub(new);
+	old_ub = old->number ? xt_table_ub(old) : ub;
+	change = (long)new->number - (long)old->number;
+	if (old_ub != ub) {
+		printk(KERN_WARNING "iptables resources are charged"
+				" from different UB (%d -> %d)\n",
+				old_ub->ub_uid, ub->ub_uid);
+		change = new->number;
+	}
+
+	if (change > 0) {
+		if (charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT))
+			return -ENOMEM;
+	} else if (change < 0)
+		uncharge_beancounter(ub, UB_NUMXTENT, -change);
+
+	if (old_ub != ub)
+		uncharge_beancounter(old_ub, UB_NUMXTENT, old->number);
+
+	return 0;
+}
+#else
+#define recharge_xtables(new, old)	(0)
+#define uncharge_xtables(info, s)	do { } while (0)
+#endif	/* CONFIG_BEANCOUNTERS */
+
 /* Registration hooks for targets. */
 int
 xt_register_target(struct xt_target *target)
@@ -184,6 +231,28 @@ xt_unregister_matches(struct xt_match *m
 }
 EXPORT_SYMBOL(xt_unregister_matches);
 
+/*
+ * Convert xt_name to a full module name and check whether loading
+ * it is allowed.
+ *
+ * xt_name is the module name without the table prefix.
+ */
+static bool xt_name_allowed(u8 af, const char *xt_name)
+{
+	char module_name[MODULE_NAME_LEN] = {'\0'};
+	const char *prefix = xt_prefix[af];
+	int len = strlen(prefix) + strlen("t_");
+
+	if (len + strnlen(xt_name, MODULE_NAME_LEN) >= MODULE_NAME_LEN)
+		return false;
+
+	/* Fallback targets (ipt_standard_target etc) have zero len names */
+	if (strlen(xt_name) == 0)
+		return true;
+
+	sprintf(module_name, "%st_%s", prefix, xt_name);
+
+	return module_payload_allowed(module_name);
+}
 
 /*
  * These are weird, but module loading must not be done with mutex
@@ -197,6 +266,9 @@ struct xt_match *xt_find_match(u8 af, co
 	struct xt_match *m;
 	int err = 0;
 
+	if (!xt_name_allowed(af, name))
+		return ERR_PTR(err);
+
 	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
 		return ERR_PTR(-EINTR);
 
@@ -227,6 +299,9 @@ struct xt_target *xt_find_target(u8 af, 
 	struct xt_target *t;
 	int err = 0;
 
+	if (!xt_name_allowed(af, name))
+		return ERR_PTR(err);
+
 	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
 		return ERR_PTR(-EINTR);
 
@@ -255,8 +330,8 @@ struct xt_target *xt_request_find_target
 {
 	struct xt_target *target;
 
-	target = try_then_request_module(xt_find_target(af, name, revision),
-					 "%st_%s", xt_prefix[af], name);
+	target = ve0_try_then_request_module(xt_find_target(af, name, revision),
+					     "%st_%s", xt_prefix[af], name);
 	if (IS_ERR(target) || !target)
 		return NULL;
 	return target;
@@ -367,14 +442,14 @@ int xt_check_match(struct xt_mtchk_param
 		 * ebt_among is exempt from centralized matchsize checking
 		 * because it uses a dynamic-size data set.
 		 */
-		pr_err("%s_tables: %s match: invalid size %Zu != %u\n",
+		ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: invalid size %Zu != %u\n",
 		       xt_prefix[par->family], par->match->name,
 		       XT_ALIGN(par->match->matchsize), size);
 		return -EINVAL;
 	}
 	if (par->match->table != NULL &&
 	    strcmp(par->match->table, par->table) != 0) {
-		pr_err("%s_tables: %s match: only valid in %s table, not %s\n",
+		ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: only valid in %s table, not %s\n",
 		       xt_prefix[par->family], par->match->name,
 		       par->match->table, par->table);
 		return -EINVAL;
@@ -382,7 +457,7 @@ int xt_check_match(struct xt_mtchk_param
 	if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
 		char used[64], allow[64];
 
-		pr_err("%s_tables: %s match: used from hooks %s, but only "
+		ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: used from hooks %s, but only "
 		       "valid from %s\n",
 		       xt_prefix[par->family], par->match->name,
 		       textify_hooks(used, sizeof(used), par->hook_mask),
@@ -390,7 +465,7 @@ int xt_check_match(struct xt_mtchk_param
 		return -EINVAL;
 	}
 	if (par->match->proto && (par->match->proto != proto || inv_proto)) {
-		pr_err("%s_tables: %s match: only valid for protocol %u\n",
+		ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: only valid for protocol %u\n",
 		       xt_prefix[par->family], par->match->name,
 		       par->match->proto);
 		return -EINVAL;
@@ -438,10 +513,10 @@ void xt_compat_flush_offsets(u_int8_t af
 }
 EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);
 
-short xt_compat_calc_jump(u_int8_t af, unsigned int offset)
+int xt_compat_calc_jump(u_int8_t af, unsigned int offset)
 {
 	struct compat_delta *tmp;
-	short delta;
+	int delta;
 
 	for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next)
 		if (tmp->offset < offset)
@@ -623,19 +698,19 @@ struct xt_table_info *xt_alloc_table_inf
 	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
 		return NULL;
 
-	newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
+	newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL_UBC);
 	if (!newinfo)
 		return NULL;
 
-	newinfo->size = size;
+	newinfo->alloc_size = newinfo->size = size;
 
 	for_each_possible_cpu(cpu) {
 		if (size <= PAGE_SIZE)
 			newinfo->entries[cpu] = kmalloc_node(size,
-							GFP_KERNEL,
+							GFP_KERNEL_UBC,
 							cpu_to_node(cpu));
 		else
-			newinfo->entries[cpu] = vmalloc_node(size,
+			newinfo->entries[cpu] = ub_vmalloc_node(size,
 							cpu_to_node(cpu));
 
 		if (newinfo->entries[cpu] == NULL) {
@@ -653,7 +728,7 @@ void xt_free_table_info(struct xt_table_
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		if (info->size <= PAGE_SIZE)
+		if (info->alloc_size <= PAGE_SIZE)
 			kfree(info->entries[cpu]);
 		else
 			vfree(info->entries[cpu]);
@@ -724,6 +799,12 @@ xt_replace_table(struct xt_table *table,
 		return NULL;
 	}
 
+	if (recharge_xtables(newinfo, private)) {
+		local_bh_enable();
+		*error = -ENOMEM;
+		return NULL;
+	}
+
 	table->private = newinfo;
 	newinfo->initial_entries = private->initial_entries;
 
@@ -816,6 +897,7 @@ void *xt_unregister_table(struct xt_tabl
 	list_del(&table->list);
 	mutex_unlock(&xt[table->af].mutex);
 	kfree(table);
+	uncharge_xtables(private, private->number);
 
 	return private;
 }
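
xt_name_allowed() above builds the would-be module name ("<prefix>t_<xt_name>") and length-checks it before consulting module_payload_allowed(). An extracted, compilable rendering of the name construction follows; the allow-list itself is stubbed out, and MODULE_NAME_LEN here is an assumed mirror of the kernel constant:

#include <stdio.h>
#include <string.h>

#define MODULE_NAME_LEN 56	/* assumed; mirrors the kernel value */

static int name_allowed(const char *prefix, const char *xt_name)
{
	char module_name[MODULE_NAME_LEN];
	size_t len = strlen(prefix) + strlen("t_");

	if (len + strnlen(xt_name, MODULE_NAME_LEN) >= MODULE_NAME_LEN)
		return 0;		/* would not fit, reject early */
	if (xt_name[0] == '\0')
		return 1;		/* fallback targets have empty names */

	snprintf(module_name, sizeof(module_name), "%st_%s", prefix, xt_name);
	/* the kernel consults module_payload_allowed(); just print here */
	printf("checking %s\n", module_name);
	return 1;
}

int main(void)
{
	name_allowed("ip", "mark");	/* -> checking ipt_mark  */
	name_allowed("ip6", "mark");	/* -> checking ip6t_mark */
	return 0;
}
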
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_CONNMARK.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_CONNMARK.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_CONNMARK.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_CONNMARK.c	2015-01-21 12:02:45.413168256 +0300
@@ -36,6 +36,45 @@ MODULE_ALIAS("ip6t_CONNMARK");
 #include <net/netfilter/nf_conntrack_ecache.h>
 
 static unsigned int
+connmark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
+{
+	const struct xt_connmark_target_info *markinfo = par->targinfo;
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int32_t diff;
+	u_int32_t mark;
+	u_int32_t newmark;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct) {
+		switch(markinfo->mode) {
+		case XT_CONNMARK_SET:
+			newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
+			if (newmark != ct->mark) {
+				ct->mark = newmark;
+				nf_conntrack_event_cache(IPCT_MARK, ct);
+			}
+			break;
+		case XT_CONNMARK_SAVE:
+			newmark = (ct->mark & ~markinfo->mask) |
+				  (skb->mark & markinfo->mask);
+			if (ct->mark != newmark) {
+				ct->mark = newmark;
+				nf_conntrack_event_cache(IPCT_MARK, ct);
+			}
+			break;
+		case XT_CONNMARK_RESTORE:
+			mark = skb->mark;
+			diff = (ct->mark ^ mark) & markinfo->mask;
+			skb->mark = mark ^ diff;
+			break;
+		}
+	}
+
+	return XT_CONTINUE;
+}
+
+static unsigned int
 connmark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	const struct xt_connmark_tginfo1 *info = par->targinfo;
@@ -73,6 +112,30 @@ connmark_tg(struct sk_buff *skb, const s
 	return XT_CONTINUE;
 }
 
+static bool connmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+	const struct xt_connmark_target_info *matchinfo = par->targinfo;
+
+	if (matchinfo->mode == XT_CONNMARK_RESTORE) {
+		if (strcmp(par->table, "mangle") != 0) {
+			printk(KERN_WARNING "CONNMARK: restore can only be "
+			       "called from \"mangle\" table, not \"%s\"\n",
+			       par->table);
+			return false;
+		}
+	}
+	if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+		return false;
+	}
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
+		printk(KERN_WARNING "can't load conntrack support for "
+				    "proto=%u\n", par->family);
+		return false;
+	}
+	return true;
+}
+
 static bool connmark_tg_check(const struct xt_tgchk_param *par)
 {
 	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
@@ -88,25 +151,74 @@ static void connmark_tg_destroy(const st
 	nf_ct_l3proto_module_put(par->family);
 }
 
-static struct xt_target connmark_tg_reg __read_mostly = {
-	.name           = "CONNMARK",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.checkentry     = connmark_tg_check,
-	.target         = connmark_tg,
-	.targetsize     = sizeof(struct xt_connmark_tginfo1),
-	.destroy        = connmark_tg_destroy,
-	.me             = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+struct compat_xt_connmark_target_info {
+	compat_ulong_t	mark, mask;
+	u_int8_t	mode;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void connmark_tg_compat_from_user_v0(void *dst, void *src)
+{
+	const struct compat_xt_connmark_target_info *cm = src;
+	struct xt_connmark_target_info m = {
+		.mark	= cm->mark,
+		.mask	= cm->mask,
+		.mode	= cm->mode,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int connmark_tg_compat_to_user_v0(void __user *dst, void *src)
+{
+	const struct xt_connmark_target_info *m = src;
+	struct compat_xt_connmark_target_info cm = {
+		.mark	= m->mark,
+		.mask	= m->mask,
+		.mode	= m->mode,
+	};
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target connmark_tg_reg[] __read_mostly = {
+	{
+		.name		= "CONNMARK",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connmark_tg_check_v0,
+		.destroy	= connmark_tg_destroy,
+		.target		= connmark_tg_v0,
+		.targetsize	= sizeof(struct xt_connmark_target_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_connmark_target_info),
+		.compat_from_user = connmark_tg_compat_from_user_v0,
+		.compat_to_user	= connmark_tg_compat_to_user_v0,
+#endif
+		.me		= THIS_MODULE
+	},
+	{
+		.name           = "CONNMARK",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.checkentry     = connmark_tg_check,
+		.target         = connmark_tg,
+		.targetsize     = sizeof(struct xt_connmark_tginfo1),
+		.destroy        = connmark_tg_destroy,
+		.me             = THIS_MODULE,
+	},
 };
 
 static int __init connmark_tg_init(void)
 {
-	return xt_register_target(&connmark_tg_reg);
+	return xt_register_targets(connmark_tg_reg,
+	       ARRAY_SIZE(connmark_tg_reg));
 }
 
 static void __exit connmark_tg_exit(void)
 {
-	xt_unregister_target(&connmark_tg_reg);
+	xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg));
 }
 
 module_init(connmark_tg_init);
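
The compat handlers added above exist because a 32-bit iptables binary lays out xt_connmark_target_info with 4-byte longs while a 64-bit kernel uses 8-byte ones, so the two blobs differ in size and must be converted field by field. A small program with illustrative equivalents of the two layouts (not the kernel headers) makes the mismatch visible:

#include <stdio.h>
#include <stdint.h>

struct xt_connmark_target_info {	/* native 64-bit layout */
	unsigned long mark, mask;
	uint8_t  mode;
	uint8_t  __pad1;
	uint16_t __pad2;
};

struct compat_xt_connmark_target_info { /* what 32-bit userspace sends */
	uint32_t mark, mask;		 /* compat_ulong_t is 32 bits */
	uint8_t  mode;
	uint8_t  __pad1;
	uint16_t __pad2;
};

int main(void)
{
	printf("native: %zu bytes, compat: %zu bytes\n",
	       sizeof(struct xt_connmark_target_info),
	       sizeof(struct compat_xt_connmark_target_info));
	return 0;
}
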
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_DSCP.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_DSCP.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_DSCP.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_DSCP.c	2015-01-21 12:02:45.414168230 +0300
@@ -18,6 +18,7 @@
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_DSCP.h>
+#include <linux/netfilter_ipv4/ipt_TOS.h>
 
 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification");
@@ -65,13 +66,48 @@ static bool dscp_tg_check(const struct x
 	const struct xt_DSCP_info *info = par->targinfo;
 
 	if (info->dscp > XT_DSCP_MAX) {
-		printk(KERN_WARNING "DSCP: dscp %x out of range\n", info->dscp);
+		ve_printk(VE_LOG, KERN_WARNING "DSCP: dscp %x out of range\n", info->dscp);
 		return false;
 	}
 	return true;
 }
 
 static unsigned int
+tos_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
+{
+	const struct ipt_tos_target_info *info = par->targinfo;
+	struct iphdr *iph = ip_hdr(skb);
+	u_int8_t oldtos;
+
+	if ((iph->tos & IPTOS_TOS_MASK) != info->tos) {
+		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+			return NF_DROP;
+
+		iph      = ip_hdr(skb);
+		oldtos   = iph->tos;
+		iph->tos = (iph->tos & IPTOS_PREC_MASK) | info->tos;
+		csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
+	}
+
+	return XT_CONTINUE;
+}
+
+static bool tos_tg_check_v0(const struct xt_tgchk_param *par)
+{
+	const struct ipt_tos_target_info *info = par->targinfo;
+	const uint8_t tos = info->tos;
+
+	if (tos != IPTOS_LOWDELAY && tos != IPTOS_THROUGHPUT &&
+	    tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST &&
+	    tos != IPTOS_NORMALSVC) {
+		printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
+		return false;
+	}
+
+	return true;
+}
+
+static unsigned int
 tos_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	const struct xt_tos_target_info *info = par->targinfo;
@@ -132,6 +168,16 @@ static struct xt_target dscp_tg_reg[] __
 	},
 	{
 		.name		= "TOS",
+		.revision	= 0,
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tos_tg_v0,
+		.targetsize	= sizeof(struct ipt_tos_target_info),
+		.checkentry	= tos_tg_check_v0,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "TOS",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
 		.table		= "mangle",
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_MARK.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_MARK.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_MARK.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_MARK.c	2015-01-21 12:02:45.418168124 +0300
@@ -25,6 +25,39 @@ MODULE_ALIAS("ipt_MARK");
 MODULE_ALIAS("ip6t_MARK");
 
 static unsigned int
+mark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
+{
+	const struct xt_mark_target_info *markinfo = par->targinfo;
+
+	skb->mark = markinfo->mark;
+	return XT_CONTINUE;
+}
+
+static unsigned int
+mark_tg_v1(struct sk_buff *skb, const struct xt_target_param *par)
+{
+	const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
+	int mark = 0;
+
+	switch (markinfo->mode) {
+	case XT_MARK_SET:
+		mark = markinfo->mark;
+		break;
+
+	case XT_MARK_AND:
+		mark = skb->mark & markinfo->mark;
+		break;
+
+	case XT_MARK_OR:
+		mark = skb->mark | markinfo->mark;
+		break;
+	}
+
+	skb->mark = mark;
+	return XT_CONTINUE;
+}
+
+static unsigned int
 mark_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	const struct xt_mark_tginfo2 *info = par->targinfo;
@@ -33,23 +66,149 @@ mark_tg(struct sk_buff *skb, const struc
 	return XT_CONTINUE;
 }
 
-static struct xt_target mark_tg_reg __read_mostly = {
-	.name           = "MARK",
-	.revision       = 2,
-	.family         = NFPROTO_UNSPEC,
-	.target         = mark_tg,
-	.targetsize     = sizeof(struct xt_mark_tginfo2),
-	.me             = THIS_MODULE,
+static bool mark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+	const struct xt_mark_target_info *markinfo = par->targinfo;
+
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return false;
+	}
+	return true;
+}
+
+static bool mark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+	const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
+
+	if (markinfo->mode != XT_MARK_SET
+	    && markinfo->mode != XT_MARK_AND
+	    && markinfo->mode != XT_MARK_OR) {
+		printk(KERN_WARNING "MARK: unknown mode %u\n",
+		       markinfo->mode);
+		return false;
+	}
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return false;
+	}
+	return true;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_mark_target_info {
+	compat_ulong_t	mark;
+};
+
+static void mark_tg_compat_from_user_v0(void *dst, void *src)
+{
+	const struct compat_xt_mark_target_info *cm = src;
+	struct xt_mark_target_info m = {
+		.mark	= cm->mark,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_tg_compat_to_user_v0(void __user *dst, void *src)
+{
+	const struct xt_mark_target_info *m = src;
+	struct compat_xt_mark_target_info cm = {
+		.mark	= m->mark,
+	};
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+
+struct compat_xt_mark_target_info_v1 {
+	compat_ulong_t	mark;
+	u_int8_t	mode;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void mark_tg_compat_from_user_v1(void *dst, void *src)
+{
+	const struct compat_xt_mark_target_info_v1 *cm = src;
+	struct xt_mark_target_info_v1 m = {
+		.mark	= cm->mark,
+		.mode	= cm->mode,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_tg_compat_to_user_v1(void __user *dst, void *src)
+{
+	const struct xt_mark_target_info_v1 *m = src;
+	struct compat_xt_mark_target_info_v1 cm = {
+		.mark	= m->mark,
+		.mode	= m->mode,
+	};
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target mark_tg_reg[] __read_mostly = {
+	{
+		.name		= "MARK",
+		.family		= NFPROTO_UNSPEC,
+		.revision	= 0,
+		.checkentry	= mark_tg_check_v0,
+		.target		= mark_tg_v0,
+		.targetsize	= sizeof(struct xt_mark_target_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_mark_target_info),
+		.compat_from_user = mark_tg_compat_from_user_v0,
+		.compat_to_user	= mark_tg_compat_to_user_v0,
+#endif
+		/*
+		 * To support RHEL5 containers, which use the iptables 1.3.5
+		 * series (which in turn exploits @revision = 1), we drop
+		 * @table here so the kernel won't complain when a MARK rule
+		 * is set up in the style of the iptables 1.4.2 series
+		 * (which exploits @revision = 2).
+		 */
+		/* .table		= "mangle", */
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "MARK",
+		.family		= NFPROTO_UNSPEC,
+		.revision	= 1,
+		.checkentry	= mark_tg_check_v1,
+		.target		= mark_tg_v1,
+		.targetsize	= sizeof(struct xt_mark_target_info_v1),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_mark_target_info_v1),
+		.compat_from_user = mark_tg_compat_from_user_v1,
+		.compat_to_user	= mark_tg_compat_to_user_v1,
+#endif
+		/*
+		 * To support RHEL5 containers, which use the iptables 1.3.5
+		 * series (which in turn exploits @revision = 1), we drop
+		 * @table here so the kernel won't complain when a MARK rule
+		 * is set up in the style of the iptables 1.4.2 series
+		 * (which exploits @revision = 2).
+		 */
+		/* .table		= "mangle", */
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "MARK",
+		.revision       = 2,
+		.family         = NFPROTO_UNSPEC,
+		.target         = mark_tg,
+		.targetsize     = sizeof(struct xt_mark_tginfo2),
+		.me             = THIS_MODULE,
+	},
 };
 
 static int __init mark_tg_init(void)
 {
-	return xt_register_target(&mark_tg_reg);
+	return xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
 }
 
 static void __exit mark_tg_exit(void)
 {
-	xt_unregister_target(&mark_tg_reg);
+	xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
 }
 
 module_init(mark_tg_init);
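
As with CONNMARK, the MARK hunks keep three revisions of the same target registered at once so old iptables binaries keep working; the kernel then resolves a rule by its (name, revision) pair. A minimal model of that dispatch (assumed shape, not the real xt_find_target()):

#include <stdio.h>
#include <string.h>

struct target {
	const char *name;
	int revision;
	const char *info_struct;	/* which targinfo layout it expects */
};

static const struct target targets[] = {
	{ "MARK", 0, "xt_mark_target_info"    },
	{ "MARK", 1, "xt_mark_target_info_v1" },
	{ "MARK", 2, "xt_mark_tginfo2"        },
};

static const struct target *find_target(const char *name, int rev)
{
	for (size_t i = 0; i < sizeof(targets) / sizeof(targets[0]); i++)
		if (!strcmp(targets[i].name, name) &&
		    targets[i].revision == rev)
			return &targets[i];
	return NULL;
}

int main(void)
{
	const struct target *t = find_target("MARK", 1); /* iptables 1.3.5 era */

	if (t)
		printf("MARK rev %d uses %s\n", t->revision, t->info_struct);
	return 0;
}
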
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_TCPMSS.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_TCPMSS.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_TCPMSS.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_TCPMSS.c	2015-01-21 12:02:45.414168230 +0300
@@ -67,7 +67,7 @@ tcpmss_mangle_packet(struct sk_buff *skb
 	   badly. --RR */
 	if (tcplen != tcph->doff*4) {
 		if (net_ratelimit())
-			printk(KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n",
+			ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n",
 			       skb->len);
 		return -1;
 	}
@@ -75,14 +75,14 @@ tcpmss_mangle_packet(struct sk_buff *skb
 	if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
 		if (dst_mtu(skb_dst(skb)) <= minlen) {
 			if (net_ratelimit())
-				printk(KERN_ERR "xt_TCPMSS: "
+				ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: "
 				       "unknown or invalid path-MTU (%u)\n",
 				       dst_mtu(skb_dst(skb)));
 			return -1;
 		}
 		if (in_mtu <= minlen) {
 			if (net_ratelimit())
-				printk(KERN_ERR "xt_TCPMSS: unknown or "
+				ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: unknown or "
 				       "invalid path-MTU (%u)\n", in_mtu);
 			return -1;
 		}
@@ -147,7 +147,7 @@ tcpmss_mangle_packet(struct sk_buff *skb
 	return TCPOLEN_MSS;
 }
 
-static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
+static u_int32_t tcpmss_reverse_mtu(struct net *net, const struct sk_buff *skb,
 				    unsigned int family)
 {
 	struct flowi fl = {};
@@ -163,7 +163,7 @@ static u_int32_t tcpmss_reverse_mtu(cons
 	rcu_read_lock();
 	ai = nf_get_afinfo(family);
 	if (ai != NULL)
-		ai->route((struct dst_entry **)&rt, &fl);
+		ai->route(net, (struct dst_entry **)&rt, &fl);
 	rcu_read_unlock();
 
 	if (rt != NULL) {
@@ -177,11 +177,13 @@ static unsigned int
 tcpmss_tg4(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct iphdr *iph = ip_hdr(skb);
+	struct net *net;
 	__be16 newlen;
 	int ret;
 
+	net = dev_net(par->in ? par->in : par->out);
 	ret = tcpmss_mangle_packet(skb, par->targinfo,
-				   tcpmss_reverse_mtu(skb, PF_INET),
+				   tcpmss_reverse_mtu(net, skb, PF_INET),
 				   iph->ihl * 4,
 				   sizeof(*iph) + sizeof(struct tcphdr));
 	if (ret < 0)
@@ -200,6 +202,7 @@ static unsigned int
 tcpmss_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct net *net;
 	u8 nexthdr;
 	int tcphoff;
 	int ret;
@@ -208,8 +211,10 @@ tcpmss_tg6(struct sk_buff *skb, const st
 	tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr);
 	if (tcphoff < 0)
 		return NF_DROP;
+
+	net = dev_net(par->in ? par->in : par->out);
 	ret = tcpmss_mangle_packet(skb, par->targinfo,
-				   tcpmss_reverse_mtu(skb, PF_INET6),
+				   tcpmss_reverse_mtu(net, skb, PF_INET6),
 				   tcphoff,
 				   sizeof(*ipv6h) + sizeof(struct tcphdr));
 	if (ret < 0)
@@ -246,13 +251,13 @@ static bool tcpmss_tg4_check(const struc
 	    (par->hook_mask & ~((1 << NF_INET_FORWARD) |
 			   (1 << NF_INET_LOCAL_OUT) |
 			   (1 << NF_INET_POST_ROUTING))) != 0) {
-		printk("xt_TCPMSS: path-MTU clamping only supported in "
+		ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in "
 		       "FORWARD, OUTPUT and POSTROUTING hooks\n");
 		return false;
 	}
 	if (IPT_MATCH_ITERATE(e, find_syn_match))
 		return true;
-	printk("xt_TCPMSS: Only works on TCP SYN packets\n");
+	ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n");
 	return false;
 }
 
@@ -266,13 +271,13 @@ static bool tcpmss_tg6_check(const struc
 	    (par->hook_mask & ~((1 << NF_INET_FORWARD) |
 			   (1 << NF_INET_LOCAL_OUT) |
 			   (1 << NF_INET_POST_ROUTING))) != 0) {
-		printk("xt_TCPMSS: path-MTU clamping only supported in "
+		ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in "
 		       "FORWARD, OUTPUT and POSTROUTING hooks\n");
 		return false;
 	}
 	if (IP6T_MATCH_ITERATE(e, find_syn_match))
 		return true;
-	printk("xt_TCPMSS: Only works on TCP SYN packets\n");
+	ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n");
 	return false;
 }
 #endif
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_connbytes.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_connbytes.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_connbytes.c	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_connbytes.c	2015-01-21 12:02:45.455167142 +0300
@@ -95,6 +95,7 @@ connbytes_mt(const struct sk_buff *skb, 
 static bool connbytes_mt_check(const struct xt_mtchk_param *par)
 {
 	const struct xt_connbytes_info *sinfo = par->matchinfo;
+	struct net *net;
 
 	if (sinfo->what != XT_CONNBYTES_PKTS &&
 	    sinfo->what != XT_CONNBYTES_BYTES &&
@@ -116,9 +117,10 @@ static bool connbytes_mt_check(const str
 	 * This filter cannot function correctly unless connection tracking
 	 * accounting is enabled, so complain in the hope that someone notices.
 	 */
-	if (!nf_ct_acct_enabled(&init_net)) {
+	net = get_exec_env()->ve_netns;
+	if (!nf_ct_acct_enabled(net)) {
 		pr_warning("Forcing CT accounting to be enabled\n");
-		nf_ct_set_acct(&init_net, true);
+		nf_ct_set_acct(net, true);
 	}
 
 	return true;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_connlimit.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_connlimit.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_connlimit.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_connlimit.c	2015-01-21 12:02:42.553244187 +0300
@@ -99,7 +99,8 @@ same_source_net(const union nf_inet_addr
 	}
 }
 
-static int count_them(struct xt_connlimit_data *data,
+static int count_them(struct net *net,
+		      struct xt_connlimit_data *data,
 		      const struct nf_conntrack_tuple *tuple,
 		      const union nf_inet_addr *addr,
 		      const union nf_inet_addr *mask,
@@ -122,7 +123,7 @@ static int count_them(struct xt_connlimi
 
 	/* check the saved connections */
 	list_for_each_entry_safe(conn, tmp, hash, list) {
-		found    = nf_conntrack_find_get(&init_net, &conn->tuple);
+		found    = nf_conntrack_find_get(net, &conn->tuple);
 		found_ct = NULL;
 
 		if (found != NULL)
@@ -180,6 +181,7 @@ static int count_them(struct xt_connlimi
 static bool
 connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
+	struct net *net = dev_net(par->in ? par->in : par->out);
 	const struct xt_connlimit_info *info = par->matchinfo;
 	union nf_inet_addr addr;
 	struct nf_conntrack_tuple tuple;
@@ -204,7 +206,7 @@ connlimit_mt(const struct sk_buff *skb, 
 	}
 
 	spin_lock_bh(&info->data->lock);
-	connections = count_them(info->data, tuple_ptr, &addr,
+	connections = count_them(net, info->data, tuple_ptr, &addr,
 	                         &info->mask, par->family);
 	spin_unlock_bh(&info->data->lock);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_connmark.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_connmark.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_connmark.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_connmark.c	2015-01-21 12:02:45.414168230 +0300
@@ -47,6 +47,36 @@ connmark_mt(const struct sk_buff *skb, c
 	return ((ct->mark & info->mask) == info->mark) ^ info->invert;
 }
 
+static bool
+connmark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	const struct xt_connmark_info *info = par->matchinfo;
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return false;
+
+	return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static bool connmark_mt_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct xt_connmark_info *cm = par->matchinfo;
+
+	if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+		printk(KERN_WARNING "connmark: only support 32bit mark\n");
+		return false;
+	}
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
+		printk(KERN_WARNING "can't load conntrack support for "
+				    "proto=%u\n", par->family);
+		return false;
+	}
+	return true;
+}
+
 static bool connmark_mt_check(const struct xt_mtchk_param *par)
 {
 	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
@@ -62,25 +92,74 @@ static void connmark_mt_destroy(const st
 	nf_ct_l3proto_module_put(par->family);
 }
 
-static struct xt_match connmark_mt_reg __read_mostly = {
-	.name           = "connmark",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.checkentry     = connmark_mt_check,
-	.match          = connmark_mt,
-	.matchsize      = sizeof(struct xt_connmark_mtinfo1),
-	.destroy        = connmark_mt_destroy,
-	.me             = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+struct compat_xt_connmark_info {
+	compat_ulong_t	mark, mask;
+	u_int8_t	invert;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void connmark_mt_compat_from_user_v0(void *dst, void *src)
+{
+	const struct compat_xt_connmark_info *cm = src;
+	struct xt_connmark_info m = {
+		.mark	= cm->mark,
+		.mask	= cm->mask,
+		.invert	= cm->invert,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int connmark_mt_compat_to_user_v0(void __user *dst, void *src)
+{
+	const struct xt_connmark_info *m = src;
+	struct compat_xt_connmark_info cm = {
+		.mark	= m->mark,
+		.mask	= m->mask,
+		.invert	= m->invert,
+	};
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match connmark_mt_reg[] __read_mostly = {
+	{
+		.name		= "connmark",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connmark_mt_check_v0,
+		.match		= connmark_mt_v0,
+		.destroy	= connmark_mt_destroy,
+		.matchsize	= sizeof(struct xt_connmark_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_connmark_info),
+		.compat_from_user = connmark_mt_compat_from_user_v0,
+		.compat_to_user	= connmark_mt_compat_to_user_v0,
+#endif
+		.me		= THIS_MODULE
+	},
+	{
+		.name           = "connmark",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.checkentry     = connmark_mt_check,
+		.match          = connmark_mt,
+		.matchsize      = sizeof(struct xt_connmark_mtinfo1),
+		.destroy        = connmark_mt_destroy,
+		.me             = THIS_MODULE,
+	},
 };
 
 static int __init connmark_mt_init(void)
 {
-	return xt_register_match(&connmark_mt_reg);
+	return xt_register_matches(connmark_mt_reg,
+	       ARRAY_SIZE(connmark_mt_reg));
 }
 
 static void __exit connmark_mt_exit(void)
 {
-	xt_unregister_match(&connmark_mt_reg);
+	xt_unregister_matches(connmark_mt_reg, ARRAY_SIZE(connmark_mt_reg));
 }
 
 module_init(connmark_mt_init);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_conntrack.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_conntrack.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_conntrack.c	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_conntrack.c	2015-01-21 12:02:45.414168230 +0300
@@ -25,6 +25,95 @@ MODULE_ALIAS("ipt_conntrack");
 MODULE_ALIAS("ip6t_conntrack");
 
 static bool
+conntrack_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	const struct xt_conntrack_info *sinfo = par->matchinfo;
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	ct = nf_ct_get(skb, &ctinfo);
+
+#define FWINV(bool, invflg) ((bool) ^ !!(sinfo->invflags & (invflg)))
+
+	if (ct == &nf_conntrack_untracked)
+		statebit = XT_CONNTRACK_STATE_UNTRACKED;
+	else if (ct)
+		statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+	else
+		statebit = XT_CONNTRACK_STATE_INVALID;
+
+	if (sinfo->flags & XT_CONNTRACK_STATE) {
+		if (ct) {
+			if (test_bit(IPS_SRC_NAT_BIT, &ct->status))
+				statebit |= XT_CONNTRACK_STATE_SNAT;
+			if (test_bit(IPS_DST_NAT_BIT, &ct->status))
+				statebit |= XT_CONNTRACK_STATE_DNAT;
+		}
+		if (FWINV((statebit & sinfo->statemask) == 0,
+			  XT_CONNTRACK_STATE))
+			return false;
+	}
+
+	if (ct == NULL) {
+		if (sinfo->flags & ~XT_CONNTRACK_STATE)
+			return false;
+		return true;
+	}
+
+	if (sinfo->flags & XT_CONNTRACK_PROTO &&
+	    FWINV(nf_ct_protonum(ct) !=
+		  sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum,
+		  XT_CONNTRACK_PROTO))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_ORIGSRC &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip &
+		   sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip,
+		  XT_CONNTRACK_ORIGSRC))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_ORIGDST &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip &
+		   sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip,
+		  XT_CONNTRACK_ORIGDST))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_REPLSRC &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip &
+		   sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_REPLY].src.ip,
+		  XT_CONNTRACK_REPLSRC))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_REPLDST &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip &
+		   sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_REPLY].dst.ip,
+		  XT_CONNTRACK_REPLDST))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_STATUS &&
+	    FWINV((ct->status & sinfo->statusmask) == 0,
+		  XT_CONNTRACK_STATUS))
+		return false;
+
+	if(sinfo->flags & XT_CONNTRACK_EXPIRES) {
+		unsigned long expires = timer_pending(&ct->timeout) ?
+					(ct->timeout.expires - jiffies)/HZ : 0;
+
+		if (FWINV(!(expires >= sinfo->expires_min &&
+			    expires <= sinfo->expires_max),
+			  XT_CONNTRACK_EXPIRES))
+			return false;
+	}
+	return true;
+#undef FWINV
+}
+
+static bool
 conntrack_addrcmp(const union nf_inet_addr *kaddr,
                   const union nf_inet_addr *uaddr,
                   const union nf_inet_addr *umask, unsigned int l3proto)
@@ -112,6 +201,55 @@ ct_proto_port_check(const struct xt_conn
 	return true;
 }
 
+#ifdef CONFIG_COMPAT
+struct compat_xt_conntrack_info
+{
+	compat_uint_t			statemask;
+	compat_uint_t			statusmask;
+	struct ip_conntrack_old_tuple	tuple[IP_CT_DIR_MAX];
+	struct in_addr			sipmsk[IP_CT_DIR_MAX];
+	struct in_addr			dipmsk[IP_CT_DIR_MAX];
+	compat_ulong_t			expires_min;
+	compat_ulong_t			expires_max;
+	u_int8_t			flags;
+	u_int8_t			invflags;
+};
+
+static void conntrack_mt_compat_from_user_v0(void *dst, void *src)
+{
+	const struct compat_xt_conntrack_info *cm = src;
+	struct xt_conntrack_info m = {
+		.statemask	= cm->statemask,
+		.statusmask	= cm->statusmask,
+		.expires_min	= cm->expires_min,
+		.expires_max	= cm->expires_max,
+		.flags		= cm->flags,
+		.invflags	= cm->invflags,
+	};
+	memcpy(m.tuple, cm->tuple, sizeof(m.tuple));
+	memcpy(m.sipmsk, cm->sipmsk, sizeof(m.sipmsk));
+	memcpy(m.dipmsk, cm->dipmsk, sizeof(m.dipmsk));
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int conntrack_mt_compat_to_user_v0(void __user *dst, void *src)
+{
+	const struct xt_conntrack_info *m = src;
+	struct compat_xt_conntrack_info cm = {
+		.statemask	= m->statemask,
+		.statusmask	= m->statusmask,
+		.expires_min	= m->expires_min,
+		.expires_max	= m->expires_max,
+		.flags		= m->flags,
+		.invflags	= m->invflags,
+	};
+	memcpy(cm.tuple, m->tuple, sizeof(cm.tuple));
+	memcpy(cm.sipmsk, m->sipmsk, sizeof(cm.sipmsk));
+	memcpy(cm.dipmsk, m->dipmsk, sizeof(cm.dipmsk));
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif
+
 static bool
 conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par,
              u16 state_mask, u16 status_mask)
@@ -224,6 +362,21 @@ static void conntrack_mt_destroy(const s
 static struct xt_match conntrack_mt_reg[] __read_mostly = {
 	{
 		.name       = "conntrack",
+		.revision   = 0,
+		.family     = NFPROTO_UNSPEC,
+		.match      = conntrack_mt_v0,
+		.checkentry = conntrack_mt_check,
+		.destroy    = conntrack_mt_destroy,
+		.matchsize  = sizeof(struct xt_conntrack_info),
+		.me         = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+		.compatsize       = sizeof(struct compat_xt_conntrack_info),
+		.compat_from_user = conntrack_mt_compat_from_user_v0,
+		.compat_to_user   = conntrack_mt_compat_to_user_v0,
+#endif
+	},
+	{
+		.name       = "conntrack",
 		.revision   = 1,
 		.family     = NFPROTO_UNSPEC,
 		.matchsize  = sizeof(struct xt_conntrack_mtinfo1),
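The two v0 compat handlers registered above exist because a 32-bit iptables binary on a 64-bit kernel lays out its `unsigned long` fields as 4 bytes, so the kernel must widen each field individually rather than copy the structure wholesale. Below is a minimal user-space model of that conversion; the struct and function names are illustrative, not the kernel's.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* 32-bit userland layout: "unsigned long" is 4 bytes (compat_ulong_t). */
	struct compat_info {
		uint32_t expires_min;
		uint32_t expires_max;
		uint8_t  flags;
	};

	/* Native 64-bit kernel layout of the same match info. */
	struct native_info {
		unsigned long expires_min;
		unsigned long expires_max;
		uint8_t       flags;
	};

	/* Field-by-field widening, the same shape as compat_from_user above. */
	static void from_user(struct native_info *dst, const struct compat_info *src)
	{
		struct native_info m = {
			.expires_min = src->expires_min,
			.expires_max = src->expires_max,
			.flags       = src->flags,
		};
		memcpy(dst, &m, sizeof(m));
	}

	int main(void)
	{
		struct compat_info c = { .expires_min = 10, .expires_max = 600, .flags = 1 };
		struct native_info n;

		from_user(&n, &c);
		printf("compat=%zu bytes, native=%zu bytes, min=%lu max=%lu\n",
		       sizeof(c), sizeof(n), n.expires_min, n.expires_max);
		return 0;
	}

On x86-64 the two sizeof values differ, which is exactly why the registration carries a separate .compatsize next to .matchsize.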
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_dscp.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_dscp.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_dscp.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_dscp.c	2015-01-21 12:02:45.415168204 +0300
@@ -15,6 +15,7 @@
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_dscp.h>
+#include <linux/netfilter_ipv4/ipt_tos.h>
 
 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: DSCP/TOS field match");
@@ -54,6 +55,14 @@ static bool dscp_mt_check(const struct x
 	return true;
 }
 
+static bool
+tos_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	const struct ipt_tos_info *info = par->matchinfo;
+
+	return (ip_hdr(skb)->tos == info->tos) ^ info->invert;
+}
+
 static bool tos_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const struct xt_tos_match_info *info = par->matchinfo;
@@ -85,6 +94,14 @@ static struct xt_match dscp_mt_reg[] __r
 	},
 	{
 		.name		= "tos",
+		.revision	= 0,
+		.family		= NFPROTO_IPV4,
+		.match		= tos_mt_v0,
+		.matchsize	= sizeof(struct ipt_tos_info),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "tos",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
 		.match		= tos_mt,
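The restored revision-0 `tos` match is a plain equality test on the IPv4 TOS byte, with the invert flag folded in by XOR. A runnable sketch of the same truth table, as ordinary user-space C:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Model of tos_mt_v0: exact TOS byte compare, optionally inverted. */
	static bool tos_match(uint8_t pkt_tos, uint8_t want, bool invert)
	{
		return (pkt_tos == want) ^ invert;
	}

	int main(void)
	{
		printf("%d\n", tos_match(0x10, 0x10, false)); /* 1: match */
		printf("%d\n", tos_match(0x10, 0x10, true));  /* 0: inverted hit */
		printf("%d\n", tos_match(0x00, 0x10, true));  /* 1: inverted miss */
		return 0;
	}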
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_hashlimit.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_hashlimit.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_hashlimit.c	2014-12-12 23:28:58.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_hashlimit.c	2015-01-21 12:02:45.415168204 +0300
@@ -15,6 +15,7 @@
 #include <linux/vmalloc.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/nsproxy.h>
 #include <linux/list.h>
 #include <linux/skbuff.h>
 #include <linux/mm.h>
@@ -41,8 +42,13 @@ MODULE_ALIAS("ipt_hashlimit");
 MODULE_ALIAS("ip6t_hashlimit");
 
 /* need to declare this at the top */
+#ifdef CONFIG_VE_IPTABLES
+#define hashlimit_procdir4 (get_exec_env()->_xt_hashlimit->hashlimit_procdir4)
+#define hashlimit_procdir6 (get_exec_env()->_xt_hashlimit->hashlimit_procdir6)
+#else
 static struct proc_dir_entry *hashlimit_procdir4;
 static struct proc_dir_entry *hashlimit_procdir6;
+#endif
 static const struct file_operations dl_file_ops;
 
 /* hash table crap */
@@ -99,9 +105,16 @@ struct xt_hashlimit_htable {
 
 static DEFINE_SPINLOCK(hashlimit_lock);	/* protects htables list */
 static DEFINE_MUTEX(hlimit_mutex);	/* additional checkentry protection */
+#ifdef CONFIG_VE_IPTABLES
+#define hashlimit_htables (get_exec_env()->_xt_hashlimit->hashlimit_htables)
+#else
 static HLIST_HEAD(hashlimit_htables);
+#endif
 static struct kmem_cache *hashlimit_cachep __read_mostly;
 
+static int init_xt_hashlimit(void);
+static void fini_xt_hashlimit(void);
+
 static inline bool dst_cmp(const struct dsthash_ent *ent,
 			   const struct dsthash_dst *b)
 {
@@ -687,6 +700,9 @@ static bool hashlimit_mt_check_v0(const 
 	if (r->name[sizeof(r->name) - 1] != '\0')
 		return false;
 
+	if (init_xt_hashlimit())
+		return false;
+
 	/* This is the best we've got: We cannot release and re-grab lock,
 	 * since checkentry() is called before x_tables.c grabs xt_mutex.
 	 * We also cannot grab the hashtable spinlock, since htable_create will
@@ -728,6 +744,9 @@ static bool hashlimit_mt_check(const str
 			return false;
 	}
 
+	if (init_xt_hashlimit())
+		return false;
+
 	/* This is the best we've got: We cannot release and re-grab lock,
 	 * since checkentry() is called before x_tables.c grabs xt_mutex.
 	 * We also cannot grab the hashtable spinlock, since htable_create will
@@ -750,6 +769,8 @@ hashlimit_mt_destroy_v0(const struct xt_
 	const struct xt_hashlimit_info *r = par->matchinfo;
 
 	htable_put(r->hinfo);
+	if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables))
+		fini_xt_hashlimit();
 }
 
 static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
@@ -757,6 +778,8 @@ static void hashlimit_mt_destroy(const s
 	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 
 	htable_put(info->hinfo);
+	if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables))
+		fini_xt_hashlimit();
 }
 
 #ifdef CONFIG_COMPAT
@@ -958,6 +981,78 @@ static const struct file_operations dl_f
 	.release = seq_release
 };
 
+static inline struct proc_dir_entry *proc_from_netns(void)
+{
+#if defined(CONFIG_VE)
+	return get_exec_env()->ve_netns->proc_net;
+#else
+	return init_net.proc_net;
+#endif
+}
+
+static int init_xt_hashlimit(void)
+{
+	struct proc_dir_entry *proc_net = proc_from_netns();
+
+#if defined(CONFIG_VE_IPTABLES)
+	struct ve_struct *ve = get_exec_env();
+
+	if (ve->_xt_hashlimit)
+		return 0;
+
+	ve->_xt_hashlimit = kzalloc(sizeof(struct ve_xt_hashlimit), GFP_KERNEL);
+	if (!ve->_xt_hashlimit)
+		goto err1;
+#endif
+	INIT_HLIST_HEAD(&hashlimit_htables);
+
+	hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", proc_net);
+	if (!hashlimit_procdir4) {
+		printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
+				"entry\n");
+		goto err2;
+	}
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", proc_net);
+	if (!hashlimit_procdir6) {
+		printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
+				"entry\n");
+		goto err3;
+	}
+#endif
+
+	return 0;
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+err3:
+	remove_proc_entry("ipt_hashlimit", proc_net);
+#endif
+err2:
+#if defined(CONFIG_VE_IPTABLES)
+	kfree(ve->_xt_hashlimit);
+	ve->_xt_hashlimit = NULL;
+err1:
+#endif
+	return -ENOMEM;
+}
+
+static void fini_xt_hashlimit(void)
+{
+	struct proc_dir_entry *proc_net = proc_from_netns();
+#ifdef CONFIG_VE_IPTABLES
+	struct ve_struct *ve = get_exec_env();
+#endif
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	remove_proc_entry("ip6t_hashlimit", proc_net);
+#endif
+	remove_proc_entry("ipt_hashlimit", proc_net);
+
+#if defined(CONFIG_VE_IPTABLES)
+	kfree(ve->_xt_hashlimit);
+	ve->_xt_hashlimit = NULL;
+#endif
+}
+
 static int __init hashlimit_mt_init(void)
 {
 	int err;
@@ -975,24 +1070,11 @@ static int __init hashlimit_mt_init(void
 		printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n");
 		goto err2;
 	}
-	hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", init_net.proc_net);
-	if (!hashlimit_procdir4) {
-		printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
-				"entry\n");
+	err = init_xt_hashlimit();
+	if (err)
 		goto err3;
-	}
-	err = 0;
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
-	hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", init_net.proc_net);
-	if (!hashlimit_procdir6) {
-		printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
-				"entry\n");
-		err = -ENOMEM;
-	}
-#endif
 	if (!err)
 		return 0;
-	remove_proc_entry("ipt_hashlimit", init_net.proc_net);
 err3:
 	kmem_cache_destroy(hashlimit_cachep);
 err2:
@@ -1004,10 +1086,7 @@ err1:
 
 static void __exit hashlimit_mt_exit(void)
 {
-	remove_proc_entry("ipt_hashlimit", init_net.proc_net);
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
-	remove_proc_entry("ip6t_hashlimit", init_net.proc_net);
-#endif
+	fini_xt_hashlimit();
 	kmem_cache_destroy(hashlimit_cachep);
 	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
 }
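The hashlimit changes route every piece of formerly-global state (the proc directories and the htable list) through get_exec_env(), allocate it lazily on the first rule inserted inside a container, and free it again when the container's last htable goes away. A toy model of that lazy init/teardown lifecycle; the names are illustrative stand-ins for the VE API, not the real calls:

	#include <stdio.h>
	#include <stdlib.h>

	/* cur_ve stands in for get_exec_env()->_xt_hashlimit. */
	struct ve_state { int tables; };
	static struct ve_state *cur_ve;

	static int init_state(void)
	{
		if (cur_ve)
			return 0;               /* already set up: idempotent */
		cur_ve = calloc(1, sizeof(*cur_ve));
		return cur_ve ? 0 : -1;
	}

	static void fini_state(void)
	{
		free(cur_ve);
		cur_ve = NULL;
	}

	int main(void)
	{
		init_state(); cur_ve->tables++;   /* first rule allocates */
		init_state();                     /* second rule: no-op */
		if (--cur_ve->tables == 0)        /* last rule removed */
			fini_state();
		printf("state freed: %s\n", cur_ve ? "no" : "yes");
		return 0;
	}

The ve_is_super() test in the destroy paths above keeps the host's state alive permanently; only container instances are torn down on last use.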
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_iprange.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_iprange.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_iprange.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_iprange.c	2015-01-21 12:02:45.415168204 +0300
@@ -14,6 +14,40 @@
 #include <linux/ipv6.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_iprange.h>
+#include <linux/netfilter_ipv4/ipt_iprange.h>
+
+static bool
+iprange_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	const struct ipt_iprange_info *info = par->matchinfo;
+	const struct iphdr *iph = ip_hdr(skb);
+
+	if (info->flags & IPRANGE_SRC) {
+		if ((ntohl(iph->saddr) < ntohl(info->src.min_ip)
+			  || ntohl(iph->saddr) > ntohl(info->src.max_ip))
+			 ^ !!(info->flags & IPRANGE_SRC_INV)) {
+			pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",
+				 &iph->saddr,
+				 info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
+				 &info->src.min_ip,
+				 &info->src.max_ip);
+			return false;
+		}
+	}
+	if (info->flags & IPRANGE_DST) {
+		if ((ntohl(iph->daddr) < ntohl(info->dst.min_ip)
+			  || ntohl(iph->daddr) > ntohl(info->dst.max_ip))
+			 ^ !!(info->flags & IPRANGE_DST_INV)) {
+			pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n",
+				 &iph->daddr,
+				 info->flags & IPRANGE_DST_INV ? "(INV) " : "",
+				 &info->dst.min_ip,
+				 &info->dst.max_ip);
+			return false;
+		}
+	}
+	return true;
+}
 
 static bool
 iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par)
@@ -93,6 +127,14 @@ iprange_mt6(const struct sk_buff *skb, c
 static struct xt_match iprange_mt_reg[] __read_mostly = {
 	{
 		.name      = "iprange",
+		.revision  = 0,
+		.family    = NFPROTO_IPV4,
+		.match     = iprange_mt_v0,
+		.matchsize = sizeof(struct ipt_iprange_info),
+		.me        = THIS_MODULE,
+	},
+	{
+		.name      = "iprange",
 		.revision  = 1,
 		.family    = NFPROTO_IPV4,
 		.match     = iprange_mt4,
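iprange_mt_v0 converts both the packet address and the configured bounds to host byte order before comparing, since network-byte-order values do not order numerically on little-endian machines. A self-contained model of the range test with inversion, assuming a POSIX userland for inet_addr():

	#include <arpa/inet.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Model of the v0 range test: out-of-range XOR invert rejects. */
	static bool in_range(uint32_t addr_be, uint32_t min_be, uint32_t max_be,
			     bool invert)
	{
		uint32_t a = ntohl(addr_be);
		bool out = a < ntohl(min_be) || a > ntohl(max_be);
		return !(out ^ invert);
	}

	int main(void)
	{
		uint32_t min = inet_addr("10.0.0.1"), max = inet_addr("10.0.0.100");
		printf("%d\n", in_range(inet_addr("10.0.0.50"), min, max, false)); /* 1 */
		printf("%d\n", in_range(inet_addr("10.0.1.1"),  min, max, false)); /* 0 */
		return 0;
	}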
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_limit.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_limit.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_limit.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_limit.c	2015-01-21 12:02:45.415168204 +0300
@@ -105,7 +105,7 @@ static bool limit_mt_check(const struct 
 	/* Check for overflow. */
 	if (r->burst == 0
 	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
-		printk("Overflow in xt_limit, try lower: %u/%u\n",
+		ve_printk(VE_LOG, "Overflow in xt_limit, try lower: %u/%u\n",
 		       r->avg, r->burst);
 		return false;
 	}
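For context, the check surrounding the ve_printk() conversion rejects rules whose avg * burst product wraps in 32-bit arithmetic. A small model of that overflow test; credits() here is an identity stand-in for the kernel's user2credits():

	#include <stdint.h>
	#include <stdio.h>

	/* avg * burst is computed in 32 bits, so a huge burst can wrap the
	 * product below avg itself; such a rule is rejected at checkentry. */
	static uint32_t credits(uint32_t user) { return user; }

	static int limit_ok(uint32_t avg, uint32_t burst)
	{
		return burst != 0 && credits(avg * burst) >= credits(avg);
	}

	int main(void)
	{
		printf("%d\n", limit_ok(10000, 5));       /* 1: sane */
		printf("%d\n", limit_ok(0x40000000, 8));  /* 0: product wraps to 0 */
		return 0;
	}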
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_mark.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_mark.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_mark.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_mark.c	2015-01-21 12:02:45.415168204 +0300
@@ -23,6 +23,14 @@ MODULE_ALIAS("ipt_mark");
 MODULE_ALIAS("ip6t_mark");
 
 static bool
+mark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	const struct xt_mark_info *info = par->matchinfo;
+
+	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static bool
 mark_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const struct xt_mark_mtinfo1 *info = par->matchinfo;
@@ -30,23 +38,81 @@ mark_mt(const struct sk_buff *skb, const
 	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
 }
 
-static struct xt_match mark_mt_reg __read_mostly = {
-	.name           = "mark",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.match          = mark_mt,
-	.matchsize      = sizeof(struct xt_mark_mtinfo1),
-	.me             = THIS_MODULE,
+static bool mark_mt_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct xt_mark_info *minfo = par->matchinfo;
+
+	if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "mark: only supports 32bit mark\n");
+		return false;
+	}
+	return true;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_mark_info {
+	compat_ulong_t	mark, mask;
+	u_int8_t	invert;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void mark_mt_compat_from_user_v0(void *dst, void *src)
+{
+	const struct compat_xt_mark_info *cm = src;
+	struct xt_mark_info m = {
+		.mark	= cm->mark,
+		.mask	= cm->mask,
+		.invert	= cm->invert,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_mt_compat_to_user_v0(void __user *dst, void *src)
+{
+	const struct xt_mark_info *m = src;
+	struct compat_xt_mark_info cm = {
+		.mark	= m->mark,
+		.mask	= m->mask,
+		.invert	= m->invert,
+	};
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match mark_mt_reg[] __read_mostly = {
+	{
+		.name		= "mark",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= mark_mt_check_v0,
+		.match		= mark_mt_v0,
+		.matchsize	= sizeof(struct xt_mark_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_mark_info),
+		.compat_from_user = mark_mt_compat_from_user_v0,
+		.compat_to_user	= mark_mt_compat_to_user_v0,
+#endif
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "mark",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.match          = mark_mt,
+		.matchsize      = sizeof(struct xt_mark_mtinfo1),
+		.me             = THIS_MODULE,
+	},
 };
 
 static int __init mark_mt_init(void)
 {
-	return xt_register_match(&mark_mt_reg);
+	return xt_register_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg));
 }
 
 static void __exit mark_mt_exit(void)
 {
-	xt_unregister_match(&mark_mt_reg);
+	xt_unregister_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg));
 }
 
 module_init(mark_mt_init);
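mark_mt_v0 masks the packet mark before comparing and, like the tos match, folds the invert flag in with a single XOR. A runnable model of the `--mark value/mask` semantics:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Model of mark_mt_v0: mask first, then compare, then invert. */
	static bool mark_match(uint32_t skb_mark, uint32_t mark, uint32_t mask,
			       bool invert)
	{
		return ((skb_mark & mask) == mark) ^ invert;
	}

	int main(void)
	{
		/* --mark 0x1/0xff: the low byte must equal 0x01 */
		printf("%d\n", mark_match(0xab01, 0x01, 0xff, false)); /* 1 */
		printf("%d\n", mark_match(0xab02, 0x01, 0xff, false)); /* 0 */
		return 0;
	}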
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_osf.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_osf.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_osf.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_osf.c	2015-01-21 12:02:42.551244240 +0300
@@ -427,5 +427,7 @@ module_exit(xt_osf_fini);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
+MODULE_ALIAS("ipt_osf");
+MODULE_ALIAS("ip6t_osf");
 MODULE_DESCRIPTION("Passive OS fingerprint matching.");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_owner.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_owner.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_owner.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_owner.c	2015-01-21 12:02:45.416168178 +0300
@@ -16,6 +16,60 @@
 #include <net/sock.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_owner.h>
+#include <linux/netfilter_ipv4/ipt_owner.h>
+#include <linux/netfilter_ipv6/ip6t_owner.h>
+
+static bool
+owner_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	const struct ipt_owner_info *info = par->matchinfo;
+	const struct file *filp;
+
+	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+		return false;
+
+	filp = skb->sk->sk_socket->file;
+	if (filp == NULL)
+		return false;
+
+	if (info->match & IPT_OWNER_UID)
+		if ((filp->f_cred->fsuid != info->uid) ^
+		    !!(info->invert & IPT_OWNER_UID))
+			return false;
+
+	if (info->match & IPT_OWNER_GID)
+		if ((filp->f_cred->fsgid != info->gid) ^
+		    !!(info->invert & IPT_OWNER_GID))
+			return false;
+
+	return true;
+}
+
+static bool
+owner_mt6_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	const struct ip6t_owner_info *info = par->matchinfo;
+	const struct file *filp;
+
+	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+		return false;
+
+	filp = skb->sk->sk_socket->file;
+	if (filp == NULL)
+		return false;
+
+	if (info->match & IP6T_OWNER_UID)
+		if ((filp->f_cred->fsuid != info->uid) ^
+		    !!(info->invert & IP6T_OWNER_UID))
+			return false;
+
+	if (info->match & IP6T_OWNER_GID)
+		if ((filp->f_cred->fsgid != info->gid) ^
+		    !!(info->invert & IP6T_OWNER_GID))
+			return false;
+
+	return true;
+}
 
 static bool
 owner_mt(const struct sk_buff *skb, const struct xt_match_param *par)
@@ -52,25 +106,76 @@ owner_mt(const struct sk_buff *skb, cons
 	return true;
 }
 
-static struct xt_match owner_mt_reg __read_mostly = {
-	.name       = "owner",
-	.revision   = 1,
-	.family     = NFPROTO_UNSPEC,
-	.match      = owner_mt,
-	.matchsize  = sizeof(struct xt_owner_match_info),
-	.hooks      = (1 << NF_INET_LOCAL_OUT) |
-	              (1 << NF_INET_POST_ROUTING),
-	.me         = THIS_MODULE,
+static bool owner_mt_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct ipt_owner_info *info = par->matchinfo;
+
+	if (info->match & (IPT_OWNER_PID | IPT_OWNER_SID | IPT_OWNER_COMM)) {
+		printk(KERN_WARNING KBUILD_MODNAME
+		       ": PID, SID and command matching is not "
+		       "supported anymore\n");
+		return false;
+	}
+
+	return true;
+}
+
+static bool owner_mt6_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_owner_info *info = par->matchinfo;
+
+	if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) {
+		printk(KERN_WARNING KBUILD_MODNAME
+		       ": PID and SID matching is not supported anymore\n");
+		return false;
+	}
+
+	return true;
+}
+
+static struct xt_match owner_mt_reg[] __read_mostly = {
+	{
+		.name       = "owner",
+		.revision   = 0,
+		.family     = NFPROTO_IPV4,
+		.match      = owner_mt_v0,
+		.matchsize  = sizeof(struct ipt_owner_info),
+		.checkentry = owner_mt_check_v0,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+		              (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "owner",
+		.revision   = 0,
+		.family     = NFPROTO_IPV6,
+		.match      = owner_mt6_v0,
+		.matchsize  = sizeof(struct ip6t_owner_info),
+		.checkentry = owner_mt6_check_v0,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+		              (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "owner",
+		.revision   = 1,
+		.family     = NFPROTO_UNSPEC,
+		.match      = owner_mt,
+		.matchsize  = sizeof(struct xt_owner_match_info),
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+		              (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
 };
 
 static int __init owner_mt_init(void)
 {
-	return xt_register_match(&owner_mt_reg);
+	return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 static void __exit owner_mt_exit(void)
 {
-	xt_unregister_match(&owner_mt_reg);
+	xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 module_init(owner_mt_init);
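owner_mt_v0 evaluates each enabled criterion independently, each with its own invert bit, so one rule can for instance match a UID while excluding a GID. A user-space model of that per-criterion logic; the flag values are illustrative:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define OWNER_UID 0x01
	#define OWNER_GID 0x02

	/* Every enabled criterion must hold, each with its own invert bit,
	 * mirroring the XOR tests in owner_mt_v0 above. */
	static bool owner_match(uint32_t fsuid, uint32_t fsgid,
				uint32_t want_uid, uint32_t want_gid,
				uint8_t match, uint8_t invert)
	{
		if (match & OWNER_UID)
			if ((fsuid != want_uid) ^ !!(invert & OWNER_UID))
				return false;
		if (match & OWNER_GID)
			if ((fsgid != want_gid) ^ !!(invert & OWNER_GID))
				return false;
		return true;
	}

	int main(void)
	{
		printf("%d\n", owner_match(1000, 1000, 1000, 0, OWNER_UID, 0));   /* 1 */
		printf("%d\n", owner_match(0, 0, 1000, 0, OWNER_UID, OWNER_UID)); /* 1: inverted */
		return 0;
	}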
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_recent.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_recent.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_recent.c	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_recent.c	2015-01-21 12:02:47.561111232 +0300
@@ -17,6 +17,8 @@
 #include <linux/ipv6.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/nsproxy.h>
+#include <linux/sched.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/string.h>
@@ -58,6 +60,9 @@ MODULE_PARM_DESC(ip_list_perms, "permiss
 MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/xt_recent/* files");
 MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/xt_recent/* files");
 
+static int init_ipt_recent(struct ve_struct *ve);
+static void fini_ipt_recent(struct ve_struct *ve);
+
 struct recent_entry {
 	struct list_head	list;
 	struct list_head	lru_list;
@@ -78,15 +83,27 @@ struct recent_table {
 	struct list_head	iphash[0];
 };
 
+#if defined(CONFIG_VE_IPTABLES)
+#define tables		(get_exec_env()->_ipt_recent->tables)
+#else
 static LIST_HEAD(tables);
+#endif
 static DEFINE_SPINLOCK(recent_lock);
 static DEFINE_MUTEX(recent_mutex);
 
 #ifdef CONFIG_PROC_FS
 #ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+#if defined(CONFIG_VE_IPTABLES)
+#define proc_old_dir	(get_exec_env()->_ipt_recent->proc_old_dir)
+#else
 static struct proc_dir_entry *proc_old_dir;
 #endif
+#endif
+#if defined(CONFIG_VE_IPTABLES)
+#define recent_proc_dir (get_exec_env()->_ipt_recent->proc_dir)
+#else
 static struct proc_dir_entry *recent_proc_dir;
+#endif
 static const struct file_operations recent_old_fops, recent_mt_fops;
 #endif
 
@@ -300,6 +317,9 @@ static bool recent_mt_check(const struct
 	    strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN)
 		return false;
 
+	if (init_ipt_recent(get_exec_env()))
+		return false;
+
 	mutex_lock(&recent_mutex);
 	t = recent_table_lookup(info->name);
 	if (t != NULL) {
@@ -351,6 +371,13 @@ static void recent_mt_destroy(const stru
 {
 	const struct xt_recent_mtinfo *info = par->matchinfo;
 	struct recent_table *t;
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+#ifdef CONFIG_VE_IPTABLES
+	if (!ve->_ipt_recent)
+		return;
+#endif
 
 	mutex_lock(&recent_mutex);
 	t = recent_table_lookup(info->name);
@@ -368,6 +395,8 @@ static void recent_mt_destroy(const stru
 		kfree(t);
 	}
 	mutex_unlock(&recent_mutex);
+	if (!ve_is_super(ve) && list_empty(&tables))
+		fini_ipt_recent(ve);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -636,47 +665,84 @@ static struct xt_match recent_mt_reg[] _
 	},
 };
 
-static int __init recent_mt_init(void)
+static int init_ipt_recent(struct ve_struct *ve)
 {
-	int err;
+	int err = 0;
 
-	if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
-		return -EINVAL;
-	ip_list_hash_size = 1 << fls(ip_list_tot);
+#ifdef CONFIG_VE_IPTABLES
+	if (ve->_ipt_recent)
+		return 0;
 
-	err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+	ve->_ipt_recent = kzalloc(sizeof(struct ve_ipt_recent), GFP_KERNEL);
+	if (!ve->_ipt_recent) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&tables);
+#endif
 #ifdef CONFIG_PROC_FS
-	if (err)
-		return err;
-	recent_proc_dir = proc_mkdir("xt_recent", init_net.proc_net);
+	recent_proc_dir = proc_mkdir("xt_recent", ve->ve_netns->proc_net);
 	if (recent_proc_dir == NULL) {
-		xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
 		err = -ENOMEM;
+		goto out_mem;
 	}
 #ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
-	if (err < 0)
-		return err;
-	proc_old_dir = proc_mkdir("ipt_recent", init_net.proc_net);
+	proc_old_dir = proc_mkdir("ipt_recent", ve->ve_netns->proc_net);
 	if (proc_old_dir == NULL) {
-		remove_proc_entry("xt_recent", init_net.proc_net);
-		xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
 		err = -ENOMEM;
+		remove_proc_entry("xt_recent", ve->ve_netns->proc_net);
+		goto out_mem;
 	}
 #endif
 #endif
+out:
 	return err;
+
+out_mem:
+#ifdef CONFIG_VE_IPTABLES
+	kfree(ve->_ipt_recent);
+	ve->_ipt_recent = NULL;
+#endif
+	goto out;
 }
 
-static void __exit recent_mt_exit(void)
+static void fini_ipt_recent(struct ve_struct *ve)
 {
-	BUG_ON(!list_empty(&tables));
-	xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
 #ifdef CONFIG_PROC_FS
 #ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
-	remove_proc_entry("ipt_recent", init_net.proc_net);
+	remove_proc_entry("ipt_recent", ve->ve_netns->proc_net);
 #endif
-	remove_proc_entry("xt_recent", init_net.proc_net);
+	remove_proc_entry("xt_recent", ve->ve_netns->proc_net);
 #endif
+#ifdef CONFIG_VE_IPTABLES
+	kfree(ve->_ipt_recent);
+	ve->_ipt_recent = NULL;
+#endif
+}
+
+static int __init recent_mt_init(void)
+{
+	int err;
+
+	if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
+		return -EINVAL;
+	ip_list_hash_size = 1 << fls(ip_list_tot);
+
+	err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+	if (err)
+		return err;
+	err = init_ipt_recent(&ve0);
+	if (err)
+		xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+	return err;
+}
+
+static void __exit recent_mt_exit(void)
+{
+	BUG_ON(!list_empty(&tables));
+	xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+	fini_ipt_recent(&ve0);
 }
 
 module_init(recent_mt_init);
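One detail carried over into the reshuffled recent_mt_init() above: the hash size is computed as `1 << fls(ip_list_tot)`, i.e. the smallest power of two strictly greater than the entry limit. A quick demonstration with a portable fls():

	#include <stdio.h>

	/* fls = "find last set": 1-based index of the highest 1 bit. */
	static int fls32(unsigned int x)
	{
		int r = 0;
		while (x) { x >>= 1; r++; }
		return r;
	}

	int main(void)
	{
		unsigned int tot[] = { 100, 256, 1000 };
		for (int i = 0; i < 3; i++)
			printf("ip_list_tot=%u -> hash_size=%u\n",
			       tot[i], 1u << fls32(tot[i]));
		return 0;
	}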
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_wdog_tmo.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_wdog_tmo.c
--- linux-2.6.32-504.3.3.el6.orig/net/netfilter/xt_wdog_tmo.c	2015-01-21 12:02:58.549819532 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netfilter/xt_wdog_tmo.c	2015-01-21 12:02:58.557819320 +0300
@@ -0,0 +1,52 @@
+/*
+ *  net/netfilter/xt_wdog_tmo.c
+ *
+ *  Copyright (C) 2013, Parallels inc.
+ *  All rights reserved.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <net/sock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/fence-watchdog.h>
+
+static bool
+wdog_tmo_mt(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	return fence_wdog_tmo_match();
+}
+
+static bool wdog_tmo_mt_check(const struct xt_mtchk_param *par)
+{
+	return ve_is_super(get_exec_env());
+}
+
+static struct xt_match wdog_tmo_mt_reg __read_mostly = {
+		.name       = "wdog_tmo",
+		.revision   = 0,
+		.family     = NFPROTO_UNSPEC,
+		.match      = wdog_tmo_mt,
+		.checkentry = wdog_tmo_mt_check,
+		.matchsize  = 0,
+		.me         = THIS_MODULE,
+};
+
+static int __init wdog_tmo_mt_init(void)
+{
+	return xt_register_match(&wdog_tmo_mt_reg);
+}
+
+static void __exit wdog_tmo_mt_exit(void)
+{
+	xt_unregister_match(&wdog_tmo_mt_reg);
+}
+
+module_init(wdog_tmo_mt_init);
+module_exit(wdog_tmo_mt_exit);
+MODULE_AUTHOR("Dmitry Guryanov <dguryanov@parallels.com>");
+MODULE_DESCRIPTION("Xtables: fence watchdog timeout matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_wdog_tmo");
+MODULE_ALIAS("ip6t_wdog_tmo");
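The checkentry hook of this new module is a pure policy gate: the match can only be installed from the host environment, never from inside a container. A trivial model of that gate, with a boolean standing in for ve_is_super(get_exec_env()):

	#include <stdbool.h>
	#include <stdio.h>

	static bool is_host_env = true;   /* stand-in for ve_is_super() */

	/* Model of wdog_tmo_mt_check: rule insertion allowed only on host. */
	static bool wdog_check(void) { return is_host_env; }

	int main(void)
	{
		printf("insert from host: %s\n", wdog_check() ? "allowed" : "denied");
		is_host_env = false;
		printf("insert from CT:   %s\n", wdog_check() ? "allowed" : "denied");
		return 0;
	}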
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netlink/af_netlink.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netlink/af_netlink.c
--- linux-2.6.32-504.3.3.el6.orig/net/netlink/af_netlink.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netlink/af_netlink.c	2015-01-21 12:02:47.887102577 +0300
@@ -60,29 +60,14 @@
 #include <net/sock.h>
 #include <net/scm.h>
 #include <net/netlink.h>
+#include <net/netlink_sock.h>
+
+#include <bc/beancounter.h>
+#include <bc/net.h>
 
 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
 
-struct netlink_sock {
-	/* struct sock has to be the first member of netlink_sock */
-	struct sock		sk;
-	u32			pid;
-	u32			dst_pid;
-	u32			dst_group;
-	u32			flags;
-	u32			subscriptions;
-	u32			ngroups;
-	unsigned long		*groups;
-	unsigned long		state;
-	wait_queue_head_t	wait;
-	struct netlink_callback	*cb;
-	struct mutex		*cb_mutex;
-	struct mutex		cb_def_mutex;
-	void			(*netlink_rcv)(struct sk_buff *skb);
-	struct module		*module;
-};
-
 struct listeners_rcu_head {
 	struct rcu_head rcu_head;
 	void *ptr;
@@ -409,7 +394,7 @@ static struct proto netlink_proto = {
 };
 
 static int __netlink_create(struct net *net, struct socket *sock,
-			    struct mutex *cb_mutex, int protocol)
+			    struct mutex *cb_mutex, int protocol, int kern)
 {
 	struct sock *sk;
 	struct netlink_sock *nlk;
@@ -419,6 +404,8 @@ static int __netlink_create(struct net *
 	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
 	if (!sk)
 		return -ENOMEM;
+	if (ub_other_sock_charge(sk, kern))
+		goto out_free;
 
 	sock_init_data(sock, sk);
 
@@ -434,6 +421,10 @@ static int __netlink_create(struct net *
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
 	return 0;
+
+out_free:
+	sk_free(sk);
+	return -ENOMEM;
 }
 
 static int netlink_create(struct net *net, struct socket *sock, int protocol,
@@ -471,7 +462,7 @@ static int netlink_create(struct net *ne
 	if (err < 0)
 		goto out;
 
-	err = __netlink_create(net, sock, cb_mutex, protocol);
+	err = __netlink_create(net, sock, cb_mutex, protocol, kern);
 	if (err < 0)
 		goto out_module;
 
@@ -553,7 +544,7 @@ static int netlink_autobind(struct socke
 	struct hlist_head *head;
 	struct sock *osk;
 	struct hlist_node *node;
-	s32 pid = current->tgid;
+	s32 pid = task_tgid_vnr(current);
 	int err;
 	static s32 rover = -4097;
 
@@ -624,7 +615,7 @@ EXPORT_SYMBOL(netlink_capable);
 static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
 {
 	return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) ||
-	       capable(CAP_NET_ADMIN);
+	       capable(CAP_VE_NET_ADMIN) || capable(CAP_NET_ADMIN);
 }
 
 static void
@@ -834,12 +825,20 @@ int netlink_attachskb(struct sock *sk, s
 		      long *timeo, struct sock *ssk)
 {
 	struct netlink_sock *nlk;
+	unsigned long chargesize;
+	int no_ubc;
 
 	nlk = nlk_sk(sk);
 
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	chargesize = skb_charge_fullsize(skb);
+	no_ubc = ub_sock_getwres_other(sk, chargesize);
+	if (no_ubc || atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
 	    test_bit(0, &nlk->state)) {
 		DECLARE_WAITQUEUE(wait, current);
+
+		if (!no_ubc)
+			ub_sock_retwres_other(sk, chargesize,
+					      SOCK_MIN_UBCSPACE_CH);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
 				netlink_overrun(sk);
@@ -851,13 +850,20 @@ int netlink_attachskb(struct sock *sk, s
 		__set_current_state(TASK_INTERRUPTIBLE);
 		add_wait_queue(&nlk->wait, &wait);
 
+		/* this if can't be moved upper because ub_sock_snd_queue_add()
+		 * may change task state to TASK_RUNNING */
+		if (no_ubc)
+			ub_sock_sndqueueadd_other(sk, chargesize);
+
 		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-		     test_bit(0, &nlk->state)) &&
+		     test_bit(0, &nlk->state) || no_ubc) &&
 		    !sock_flag(sk, SOCK_DEAD))
 			*timeo = schedule_timeout(*timeo);
 
 		__set_current_state(TASK_RUNNING);
 		remove_wait_queue(&nlk->wait, &wait);
+		if (no_ubc)
+			ub_sock_sndqueuedel(sk);
 		sock_put(sk);
 
 		if (signal_pending(current)) {
@@ -867,6 +873,7 @@ int netlink_attachskb(struct sock *sk, s
 		return 1;
 	}
 	skb_set_owner_r(skb, sk);
+	ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF);
 	return 0;
 }
 
@@ -1037,8 +1044,13 @@ static inline int do_one_broadcast(struc
 	    !test_bit(p->group - 1, nlk->groups))
 		goto out;
 
+	if (!ve_accessible_strict(get_exec_env(), sk->owner_env))
+		goto out;
+
+#ifndef CONFIG_VE
 	if (!net_eq(sock_net(sk), p->net))
 		goto out;
+#endif
 
 	if (p->failure) {
 		netlink_overrun(sk);
@@ -1542,7 +1554,7 @@ netlink_kernel_create(struct net *net, i
 	 * So we create one inside init_net and the move it to net.
 	 */
 
-	if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)
+	if (__netlink_create(&init_net, sock, cb_mutex, unit, 1) < 0)
 		goto out_sock_release_nosk;
 
 	sk = sock->sk;
@@ -1731,7 +1743,11 @@ static int netlink_dump(struct sock *sk)
 	skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL);
 	if (!skb) {
 		mutex_unlock(nlk->cb_mutex);
-		goto errout;
+		goto errout_skb;
+	}
+	if (ub_nlrcvbuf_charge(skb, sk) < 0) {
+		err = -EACCES;
+		goto errout_skb;
 	}
 
 	len = cb->dump(skb, cb);
@@ -1775,7 +1791,6 @@ static int netlink_dump(struct sock *sk)
 errout_skb:
 	mutex_unlock(nlk->cb_mutex);
 	kfree_skb(skb);
-errout:
 	return err;
 }
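The af_netlink changes charge each skb against the receiver's beancounter before it is queued — in netlink_attachskb() for unicast, and via ub_nlrcvbuf_charge() on the dump path — and put the sender to sleep on the UB send queue when the charge fails, mirroring how an over-full sk_rcvbuf is handled. A toy model of the charge/uncharge accounting; the ub_* names echo, but are not, the kernel calls:

	#include <stdbool.h>
	#include <stdio.h>

	struct ub { long held, limit; };

	/* Charge must succeed before an skb may be attached to the socket;
	 * on failure the kernel's sender waits (the no_ubc path above). */
	static bool ub_charge(struct ub *ub, long size)
	{
		if (ub->held + size > ub->limit)
			return false;
		ub->held += size;
		return true;
	}

	static void ub_uncharge(struct ub *ub, long size) { ub->held -= size; }

	int main(void)
	{
		struct ub ub = { .held = 0, .limit = 4096 };
		printf("charge 3000: %d\n", ub_charge(&ub, 3000)); /* 1 */
		printf("charge 3000: %d\n", ub_charge(&ub, 3000)); /* 0: over limit */
		ub_uncharge(&ub, 3000);                            /* skb consumed */
		printf("charge 3000: %d\n", ub_charge(&ub, 3000)); /* 1 again */
		return 0;
	}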
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/netlink/genetlink.c linux-2.6.32-504.3.3.el6-042stab103_6/net/netlink/genetlink.c
--- linux-2.6.32-504.3.3.el6.orig/net/netlink/genetlink.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/netlink/genetlink.c	2015-01-21 12:02:45.437167621 +0300
@@ -541,7 +541,8 @@ static int genl_family_rcv_msg(struct ge
 		return -EOPNOTSUPP;
 
 	if ((ops->flags & GENL_ADMIN_PERM) &&
-	    !netlink_capable(skb, CAP_NET_ADMIN))
+	    !netlink_capable(skb, CAP_NET_ADMIN) &&
+	    !netlink_capable(skb, CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
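The genetlink hunk relaxes GENL_ADMIN_PERM so that the per-container CAP_VE_NET_ADMIN is accepted as an alternative to the global CAP_NET_ADMIN, letting container root drive admin-only generic netlink ops. Modeled as a simple capability-mask check (the bit values are illustrative):

	#include <stdbool.h>
	#include <stdio.h>

	#define CAP_NET_ADMIN     (1 << 0)
	#define CAP_VE_NET_ADMIN  (1 << 1)   /* OpenVZ per-container capability */

	/* Either capability now satisfies a GENL_ADMIN_PERM op. */
	static bool genl_admin_ok(unsigned int caps)
	{
		return (caps & CAP_NET_ADMIN) || (caps & CAP_VE_NET_ADMIN);
	}

	int main(void)
	{
		printf("%d\n", genl_admin_ok(CAP_NET_ADMIN));    /* 1: host root */
		printf("%d\n", genl_admin_ok(CAP_VE_NET_ADMIN)); /* 1: container root */
		printf("%d\n", genl_admin_ok(0));                /* 0 */
		return 0;
	}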
diff -upr linux-2.6.32-504.3.3.el6.orig/net/openvswitch/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/net/openvswitch/Kconfig	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/Kconfig	2015-01-21 12:02:58.458821948 +0300
@@ -28,6 +28,15 @@ config OPENVSWITCH
 
 	  If unsure, say N.
 
+config OVS_BRCOMPAT
+	tristate "Open vSwitch bridge compatibility"
+	depends on OPENVSWITCH
+	---help---
+	  Enable this option if you want to control Open vSwitch with bridge
+	  control utilities like brctl.
+
+	  This option adds a compatibility layer that forwards bridge ioctls.
+
 config OPENVSWITCH_GRE
 	bool "Open vSwitch GRE tunneling support"
 	depends on INET
diff -upr linux-2.6.32-504.3.3.el6.orig/net/openvswitch/Makefile linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/Makefile
--- linux-2.6.32-504.3.3.el6.orig/net/openvswitch/Makefile	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/Makefile	2015-01-21 12:02:58.465821761 +0300
@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_OPENVSWITCH) += openvswitch.o
+obj-$(CONFIG_OVS_BRCOMPAT) += brcompat.o
 
 openvswitch-y := \
 	actions.o \
diff -upr linux-2.6.32-504.3.3.el6.orig/net/openvswitch/brcompat-nl.h linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/brcompat-nl.h
--- linux-2.6.32-504.3.3.el6.orig/net/openvswitch/brcompat-nl.h	2015-01-21 12:02:58.465821761 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/brcompat-nl.h	2015-01-21 12:02:58.465821761 +0300
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2008, 2009, 2011 Nicira Networks.
+ *
+ * This file is offered under your choice of two licenses: Apache 2.0 or GNU
+ * GPL 2.0 or later.  The permission statements for each of these licenses is
+ * given below.  You may license your modifications to this file under either
+ * of these licenses or both.  If you wish to license your modifications under
+ * only one of these licenses, delete the permission text for the other
+ * license.
+ *
+ * ----------------------------------------------------------------------
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ----------------------------------------------------------------------
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * ----------------------------------------------------------------------
+ */
+
+#ifndef OPENVSWITCH_BRCOMPAT_NETLINK_H
+#define OPENVSWITCH_BRCOMPAT_NETLINK_H 1
+
+#define BRC_GENL_FAMILY_NAME "brcompat"
+
+/* Attributes that can be attached to the datapath's netlink messages. */
+enum {
+	BRC_GENL_A_UNSPEC,
+
+	/*
+	 * "K:" attributes appear in messages from the kernel to userspace.
+	 * "U:" attributes appear in messages from userspace to the kernel.
+	 */
+
+	/* BRC_GENL_C_DP_ADD, BRC_GENL_C_DP_DEL. */
+	BRC_GENL_A_DP_NAME,		/* K: Datapath name. */
+
+	/* BRC_GENL_C_DP_ADD, BRC_GENL_C_DP_DEL,
+	   BRC_GENL_C_PORT_ADD, BRC_GENL_C_PORT_DEL. */
+	BRC_GENL_A_PORT_NAME,	/* K: Interface name. */
+
+	/* BRC_GENL_C_DP_RESULT. */
+	BRC_GENL_A_ERR_CODE,	/* U: Positive error code. */
+
+	/* BRC_GENL_C_QUERY_MC. */
+	BRC_GENL_A_MC_GROUP,	/* K: Generic netlink multicast group. */
+
+	/* BRC_GENL_C_FDB_QUERY. */
+	BRC_GENL_A_FDB_COUNT,	/* K: Number of FDB entries to read. */
+	BRC_GENL_A_FDB_SKIP,	/* K: Record offset into FDB to start reading. */
+
+	/* BRC_GENL_C_DP_RESULT. */
+	BRC_GENL_A_FDB_DATA,    /* U: FDB records. */
+	BRC_GENL_A_IFINDEXES,   /* U: "int" ifindexes of bridges or ports. */
+
+	__BRC_GENL_A_MAX,
+	BRC_GENL_A_MAX = __BRC_GENL_A_MAX - 1
+};
+
+/* Commands that can be executed on the datapath's netlink interface. */
+enum brc_genl_command {
+	BRC_GENL_C_UNSPEC,
+
+	/*
+	 * "K:" messages are sent by the kernel to userspace.
+	 * "U:" messages are sent by userspace to the kernel.
+	 */
+	BRC_GENL_C_DP_ADD,		/* K: Datapath created. */
+	BRC_GENL_C_DP_DEL,		/* K: Datapath destroyed. */
+	BRC_GENL_C_DP_RESULT,	/* U: Return code from ovs-brcompatd. */
+	BRC_GENL_C_PORT_ADD,	/* K: Port added to datapath. */
+	BRC_GENL_C_PORT_DEL,	/* K: Port removed from datapath. */
+	BRC_GENL_C_QUERY_MC,	/* U: Get multicast group for brcompat. */
+	BRC_GENL_C_FDB_QUERY,	/* K: Read records from forwarding database. */
+	BRC_GENL_C_GET_BRIDGES, /* K: Get ifindexes of all bridges. */
+	BRC_GENL_C_GET_PORTS,   /* K: Get ifindexes of all ports on a bridge. */
+
+	__BRC_GENL_C_MAX,
+	BRC_GENL_C_MAX = __BRC_GENL_C_MAX - 1
+};
+#endif /* openvswitch/brcompat-netlink.h */
diff -upr linux-2.6.32-504.3.3.el6.orig/net/openvswitch/brcompat.c linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/brcompat.c
--- linux-2.6.32-504.3.3.el6.orig/net/openvswitch/brcompat.c	2015-01-21 12:02:58.465821761 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/brcompat.c	2015-01-21 12:02:58.465821761 +0300
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2007-2012 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/completion.h>
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/genetlink.h>
+
+#include "brcompat-nl.h"
+#include "datapath.h"
+
+static struct genl_family brc_genl_family;
+static struct genl_multicast_group brc_mc_group;
+
+/* Time to wait for ovs-vswitchd to respond to a datapath action, in
+ * jiffies. */
+#define BRC_TIMEOUT (HZ * 5)
+
+/* Mutex to serialize ovs-brcompatd callbacks.  (Some callbacks naturally hold
+ * br_ioctl_mutex, others hold rtnl_lock, but we can't take the former
+ * ourselves and we don't want to hold the latter over a potentially long
+ * period of time.) */
+static DEFINE_MUTEX(brc_serial);
+
+/* Userspace communication. */
+static DEFINE_SPINLOCK(brc_lock);    /* Ensure atomic access to these vars. */
+static DECLARE_COMPLETION(brc_done); /* Userspace signaled operation done? */
+static struct sk_buff *brc_reply;    /* Reply from userspace. */
+static u32 brc_seq;		     /* Sequence number for current op. */
+
+static struct sk_buff *brc_send_command(struct net *,
+					struct sk_buff *,
+					struct nlattr **attrs);
+static int brc_send_simple_command(struct net *, struct sk_buff *);
+
+static struct sk_buff *brc_make_request(int op, const char *bridge,
+					const char *port)
+{
+	struct sk_buff *skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		goto error;
+
+	genlmsg_put(skb, 0, 0, &brc_genl_family, 0, op);
+	if (bridge)
+		NLA_PUT_STRING(skb, BRC_GENL_A_DP_NAME, bridge);
+
+	if (port)
+		NLA_PUT_STRING(skb, BRC_GENL_A_PORT_NAME, port);
+
+	return skb;
+
+nla_put_failure:
+	kfree_skb(skb);
+error:
+	return NULL;
+}
+
+static int brc_send_simple_command(struct net *net, struct sk_buff *request)
+{
+	struct nlattr *attrs[BRC_GENL_A_MAX + 1];
+	struct sk_buff *reply;
+	int error;
+
+	reply = brc_send_command(net, request, attrs);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	error = nla_get_u32(attrs[BRC_GENL_A_ERR_CODE]);
+	kfree_skb(reply);
+	return -error;
+}
+
+static int brc_add_del_bridge(struct net *net, char __user *uname, int add)
+{
+	struct sk_buff *request;
+	char name[IFNAMSIZ];
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(name, uname, IFNAMSIZ))
+		return -EFAULT;
+
+	name[IFNAMSIZ - 1] = 0;
+	request = brc_make_request(add ? BRC_GENL_C_DP_ADD : BRC_GENL_C_DP_DEL,
+				   name, NULL);
+	if (!request)
+		return -ENOMEM;
+
+	return brc_send_simple_command(net, request);
+}
+
+static int brc_get_indices(struct net *net,
+			   int op, const char *br_name,
+			   int __user *uindices, int n)
+{
+	struct nlattr *attrs[BRC_GENL_A_MAX + 1];
+	struct sk_buff *request, *reply;
+	int *indices;
+	int ret;
+	int len;
+
+	if (n < 0)
+		return -EINVAL;
+	if (n >= 2048)
+		return -ENOMEM;
+
+	request = brc_make_request(op, br_name, NULL);
+	if (!request)
+		return -ENOMEM;
+
+	reply = brc_send_command(net, request, attrs);
+	ret = PTR_ERR(reply);
+	if (IS_ERR(reply))
+		goto exit;
+
+	ret = -nla_get_u32(attrs[BRC_GENL_A_ERR_CODE]);
+	if (ret < 0)
+		goto exit_free_skb;
+
+	ret = -EINVAL;
+	if (!attrs[BRC_GENL_A_IFINDEXES])
+		goto exit_free_skb;
+
+	len = nla_len(attrs[BRC_GENL_A_IFINDEXES]);
+	indices = nla_data(attrs[BRC_GENL_A_IFINDEXES]);
+	if (len % sizeof(int))
+		goto exit_free_skb;
+
+	n = min_t(int, n, len / sizeof(int));
+	ret = copy_to_user(uindices, indices, n * sizeof(int)) ? -EFAULT : n;
+
+exit_free_skb:
+	kfree_skb(reply);
+exit:
+	return ret;
+}
+
+/* Called with br_ioctl_mutex. */
+static int brc_get_bridges(struct net *net, int __user *uindices, int n)
+{
+	return brc_get_indices(net, BRC_GENL_C_GET_BRIDGES, NULL, uindices, n);
+}
+
+/* Legacy deviceless bridge ioctl's.  Called with br_ioctl_mutex. */
+static int old_deviceless(struct net *net, void __user *uarg)
+{
+	unsigned long args[3];
+
+	if (copy_from_user(args, uarg, sizeof(args)))
+		return -EFAULT;
+
+	switch (args[0]) {
+	case BRCTL_GET_BRIDGES:
+		return brc_get_bridges(net, (int __user *)args[1], args[2]);
+
+	case BRCTL_ADD_BRIDGE:
+		return brc_add_del_bridge(net, (void __user *)args[1], 1);
+	case BRCTL_DEL_BRIDGE:
+		return brc_add_del_bridge(net, (void __user *)args[1], 0);
+	}
+
+	return -EOPNOTSUPP;
+}
+
+/* Called with the br_ioctl_mutex. */
+static int brc_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
+				     void __user *uarg)
+{
+	switch (cmd) {
+	case SIOCGIFBR:
+	case SIOCSIFBR:
+		return old_deviceless(net, uarg);
+
+	case SIOCBRADDBR:
+		return brc_add_del_bridge(net, uarg, 1);
+	case SIOCBRDELBR:
+		return brc_add_del_bridge(net, uarg, 0);
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int brc_add_del_port(struct net_device *dev, int port_ifindex, int add)
+{
+	struct sk_buff *request;
+	struct net_device *port;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	port = __dev_get_by_index(dev_net(dev), port_ifindex);
+	if (!port)
+		return -EINVAL;
+
+	/* Save name of dev and port because there's a race between the
+	 * rtnl_unlock() and the brc_send_simple_command(). */
+	request = brc_make_request(add ? BRC_GENL_C_PORT_ADD : BRC_GENL_C_PORT_DEL,
+				   dev->name, port->name);
+	if (!request)
+		return -ENOMEM;
+
+	rtnl_unlock();
+	err = brc_send_simple_command(dev_net(dev), request);
+	rtnl_lock();
+
+	return err;
+}
+
+static int brc_get_bridge_info(struct net_device *dev,
+			       struct __bridge_info __user *ub)
+{
+	struct __bridge_info b;
+
+	memset(&b, 0, sizeof(struct __bridge_info));
+
+	/* First two bytes are the priority, which we should skip.  This comes
+	 * from struct bridge_id in br_private.h, which is unavailable to us.
+	 */
+	memcpy((u8 *)&b.bridge_id + 2, dev->dev_addr, ETH_ALEN);
+	b.stp_enabled = 0;
+
+	if (copy_to_user(ub, &b, sizeof(struct __bridge_info)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int brc_get_port_list(struct net_device *dev, int __user *uindices,
+			     int num)
+{
+	int retval;
+
+	rtnl_unlock();
+	retval = brc_get_indices(dev_net(dev), BRC_GENL_C_GET_PORTS, dev->name,
+				 uindices, num);
+	rtnl_lock();
+
+	return retval;
+}
+
+/*
+ * Format up to a page worth of forwarding table entries
+ * userbuf -- where to copy result
+ * maxnum  -- maximum number of entries desired
+ *            (limited to a page for sanity)
+ * offset  -- number of records to skip
+ */
+static int brc_get_fdb_entries(struct net_device *dev, void __user *userbuf,
+			       unsigned long maxnum, unsigned long offset)
+{
+	struct nlattr *attrs[BRC_GENL_A_MAX + 1];
+	struct sk_buff *request, *reply;
+	int retval;
+	int len;
+
+	/* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */
+	if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry))
+		maxnum = PAGE_SIZE/sizeof(struct __fdb_entry);
+
+	request = brc_make_request(BRC_GENL_C_FDB_QUERY, dev->name, NULL);
+	if (!request)
+		return -ENOMEM;
+	NLA_PUT_U64(request, BRC_GENL_A_FDB_COUNT, maxnum);
+	NLA_PUT_U64(request, BRC_GENL_A_FDB_SKIP, offset);
+
+	rtnl_unlock();
+	reply = brc_send_command(dev_net(dev), request, attrs);
+	retval = PTR_ERR(reply);
+	if (IS_ERR(reply))
+		goto exit;
+
+	retval = -nla_get_u32(attrs[BRC_GENL_A_ERR_CODE]);
+	if (retval < 0)
+		goto exit_free_skb;
+
+	retval = -EINVAL;
+	if (!attrs[BRC_GENL_A_FDB_DATA])
+		goto exit_free_skb;
+	len = nla_len(attrs[BRC_GENL_A_FDB_DATA]);
+	if (len % sizeof(struct __fdb_entry) ||
+	    len / sizeof(struct __fdb_entry) > maxnum)
+		goto exit_free_skb;
+
+	retval = len / sizeof(struct __fdb_entry);
+	if (copy_to_user(userbuf, nla_data(attrs[BRC_GENL_A_FDB_DATA]), len))
+		retval = -EFAULT;
+
+exit_free_skb:
+	kfree_skb(reply);
+exit:
+	rtnl_lock();
+	return retval;
+
+nla_put_failure:
+	kfree_skb(request);
+	return -ENOMEM;
+}
+
+/* Legacy ioctl's through SIOCDEVPRIVATE.  Called with rtnl_lock. */
+static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	unsigned long args[4];
+
+	if (copy_from_user(args, rq->ifr_data, sizeof(args)))
+		return -EFAULT;
+
+	switch (args[0]) {
+	case BRCTL_ADD_IF:
+		return brc_add_del_port(dev, args[1], 1);
+	case BRCTL_DEL_IF:
+		return brc_add_del_port(dev, args[1], 0);
+
+	case BRCTL_GET_BRIDGE_INFO:
+		return brc_get_bridge_info(dev, (struct __bridge_info __user *)args[1]);
+
+	case BRCTL_GET_PORT_LIST:
+		return brc_get_port_list(dev, (int __user *)args[1], args[2]);
+
+	case BRCTL_GET_FDB_ENTRIES:
+		return brc_get_fdb_entries(dev, (void __user *)args[1],
+					   args[2], args[3]);
+	}
+
+	return -EOPNOTSUPP;
+}
+
+/* Called with the rtnl_lock. */
+static int brc_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	int err;
+
+	switch (cmd) {
+	case SIOCDEVPRIVATE:
+		err = old_dev_ioctl(dev, rq, cmd);
+		break;
+
+	case SIOCBRADDIF:
+		return brc_add_del_port(dev, rq->ifr_ifindex, 1);
+	case SIOCBRDELIF:
+		return brc_add_del_port(dev, rq->ifr_ifindex, 0);
+
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+
+static struct genl_family brc_genl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = BRC_GENL_FAMILY_NAME,
+	.version = 1,
+	.maxattr = BRC_GENL_A_MAX,
+	.netnsok = true,
+};
+
+static int brc_genl_query(struct sk_buff *skb, struct genl_info *info)
+{
+	int err = -EINVAL;
+	struct sk_buff *ans_skb;
+	void *data;
+
+	ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!ans_skb)
+		return -ENOMEM;
+
+	data = genlmsg_put_reply(ans_skb, info, &brc_genl_family,
+				 0, BRC_GENL_C_QUERY_MC);
+	if (data == NULL) {
+		err = -ENOMEM;
+		goto err;
+	}
+	NLA_PUT_U32(ans_skb, BRC_GENL_A_MC_GROUP, brc_mc_group.id);
+
+	genlmsg_end(ans_skb, data);
+	return genlmsg_reply(ans_skb, info);
+
+nla_put_failure:
+err:
+	kfree_skb(ans_skb);
+	return err;
+}
+
+/* Attribute policy: what each attribute may contain.  */
+static struct nla_policy brc_genl_policy[BRC_GENL_A_MAX + 1] = {
+	[BRC_GENL_A_ERR_CODE] = { .type = NLA_U32 },
+	[BRC_GENL_A_FDB_DATA] = { .type = NLA_UNSPEC },
+};
+
+static int brc_genl_dp_result(struct sk_buff *skb, struct genl_info *info)
+{
+	unsigned long int flags;
+	int err;
+
+	if (!info->attrs[BRC_GENL_A_ERR_CODE])
+		return -EINVAL;
+
+	skb = skb_clone(skb, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&brc_lock, flags);
+	if (brc_seq == info->snd_seq) {
+		brc_seq++;
+
+		kfree_skb(brc_reply);
+		brc_reply = skb;
+
+		complete(&brc_done);
+		err = 0;
+	} else {
+		kfree_skb(skb);
+		err = -ESTALE;
+	}
+	spin_unlock_irqrestore(&brc_lock, flags);
+
+	return err;
+}
+
+static struct genl_ops brc_genl_ops[] = {
+	{ .cmd = BRC_GENL_C_QUERY_MC,
+	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .policy = NULL,
+	  .doit = brc_genl_query,
+	},
+	{ .cmd = BRC_GENL_C_DP_RESULT,
+	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .policy = brc_genl_policy,
+	  .doit = brc_genl_dp_result,
+	},
+};
+
+static struct sk_buff *brc_send_command(struct net *net,
+					struct sk_buff *request,
+					struct nlattr **attrs)
+{
+	unsigned long int flags;
+	struct sk_buff *reply;
+	int error;
+
+	mutex_lock(&brc_serial);
+
+	/* Increment sequence number first, so that we ignore any replies
+	 * to stale requests. */
+	spin_lock_irqsave(&brc_lock, flags);
+	nlmsg_hdr(request)->nlmsg_seq = ++brc_seq;
+	INIT_COMPLETION(brc_done);
+	spin_unlock_irqrestore(&brc_lock, flags);
+
+	nlmsg_end(request, nlmsg_hdr(request));
+
+	/* Send message. */
+	error = genlmsg_multicast_netns(net, request, 0,
+					brc_mc_group.id, GFP_KERNEL);
+	if (error < 0)
+		goto error;
+
+	/* Wait for reply. */
+	error = -ETIMEDOUT;
+	if (!wait_for_completion_timeout(&brc_done, BRC_TIMEOUT)) {
+		pr_warn("timed out waiting for userspace\n");
+		goto error;
+	}
+
+	/* Grab reply. */
+	spin_lock_irqsave(&brc_lock, flags);
+	reply = brc_reply;
+	brc_reply = NULL;
+	spin_unlock_irqrestore(&brc_lock, flags);
+
+	mutex_unlock(&brc_serial);
+
+	/* Re-parse message.  Can't fail, since it parsed correctly once
+	 * already. */
+	error = nlmsg_parse(nlmsg_hdr(reply), GENL_HDRLEN,
+			    attrs, BRC_GENL_A_MAX, brc_genl_policy);
+	WARN_ON(error);
+
+	return reply;
+
+error:
+	mutex_unlock(&brc_serial);
+	return ERR_PTR(error);
+}
+
+static int __init brc_init(void)
+{
+	int err;
+
+	pr_info("Open vSwitch Bridge Compatibility, built "__DATE__" "__TIME__"\n");
+
+	/* Set the bridge ioctl handler */
+	brioctl_set(brc_ioctl_deviceless_stub);
+
+	/* Set the openvswitch_mod device ioctl handler */
+	ovs_dp_ioctl_hook = brc_dev_ioctl;
+
+	/* Randomize the initial sequence number.  This is not a security
+	 * feature; it only helps avoid crossed wires between userspace and
+	 * the kernel when the module is unloaded and reloaded. */
+	brc_seq = net_random();
+
+	/* Register generic netlink family to communicate changes to
+	 * userspace. */
+	err = genl_register_family_with_ops(&brc_genl_family,
+					    brc_genl_ops, ARRAY_SIZE(brc_genl_ops));
+	if (err)
+		goto error;
+
+	strcpy(brc_mc_group.name, "brcompat");
+	err = genl_register_mc_group(&brc_genl_family, &brc_mc_group);
+	if (err < 0)
+		goto err_unregister;
+
+	return 0;
+
+err_unregister:
+	genl_unregister_family(&brc_genl_family);
+error:
+	pr_emerg("failed to install!\n");
+	return err;
+}
+
+static void brc_cleanup(void)
+{
+	/* Unregister ioctl hooks */
+	ovs_dp_ioctl_hook = NULL;
+	brioctl_set(NULL);
+
+	genl_unregister_family(&brc_genl_family);
+}
+
+module_init(brc_init);
+module_exit(brc_cleanup);
+
+MODULE_DESCRIPTION("Open vSwitch bridge compatibility");
+MODULE_AUTHOR("Nicira Networks");
+MODULE_LICENSE("GPL");
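brc_send_command() and brc_genl_dp_result() pair every request with a sequence number, so a reply that arrives after the five-second timeout is dropped with -ESTALE instead of satisfying the wrong caller. A compact model of that stale-reply filter:

	#include <stdio.h>

	/* The sender bumps brc_seq before each request; a reply is accepted
	 * only if it carries the current number, then the number is consumed
	 * so a duplicate reply is stale too. */
	static unsigned int brc_seq;

	static unsigned int send_request(void) { return ++brc_seq; }

	static int accept_reply(unsigned int reply_seq)
	{
		if (reply_seq != brc_seq)
			return -1;      /* -ESTALE in the patch */
		brc_seq++;
		return 0;
	}

	int main(void)
	{
		unsigned int s1 = send_request();
		unsigned int s2 = send_request();            /* s1 is now stale */
		printf("reply(s1): %d\n", accept_reply(s1)); /* -1 */
		printf("reply(s2): %d\n", accept_reply(s2)); /* 0 */
		return 0;
	}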
diff -upr linux-2.6.32-504.3.3.el6.orig/net/openvswitch/datapath.c linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/datapath.c
--- linux-2.6.32-504.3.3.el6.orig/net/openvswitch/datapath.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/datapath.c	2015-01-21 12:02:58.458821948 +0300
@@ -61,6 +61,13 @@
 #include "vport-internal_dev.h"
 #include "vport-netdev.h"
 
+
+#if defined(CONFIG_OVS_BRCOMPAT) || defined(CONFIG_OVS_BRCOMPAT_MODULE)
+/* Allow brcompat module be loaded and hooked to bridge */
+int (*ovs_dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd);
+EXPORT_SYMBOL_GPL(ovs_dp_ioctl_hook);
+#endif
+
 int ovs_net_id __read_mostly;
 
 static void ovs_notify(struct sk_buff *skb, struct genl_info *info,
@@ -1843,18 +1850,7 @@ error:
 
 static int __net_init ovs_init_net(struct net *net)
 {
-	struct ovs_net *ovs_net;
-	int err;
-
-	ovs_net = kzalloc(sizeof(struct ovs_net), GFP_KERNEL);
-	if (ovs_net == NULL)
-		return -ENOMEM;
-
-	err = net_assign_generic(net, ovs_net_id, ovs_net);
-	if (err < 0) {
-		kfree(ovs_net);
-		return err;
-	}
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
 	INIT_LIST_HEAD(&ovs_net->dps);
 	INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
@@ -1872,12 +1868,13 @@ static void __net_exit ovs_exit_net(stru
 	ovs_unlock();
 
 	cancel_work_sync(&ovs_net->dp_notify_work);
-	kfree(ovs_net);
 }
 
 static struct pernet_operations ovs_net_ops = {
 	.init = ovs_init_net,
 	.exit = ovs_exit_net,
+	.id   = &ovs_net_id,
+	.size = sizeof(struct ovs_net),
 };
 
 static int __init dp_init(void)
@@ -1896,7 +1893,7 @@ static int __init dp_init(void)
 	if (err)
 		goto error_flow_exit;
 
-	err = register_pernet_gen_device(&ovs_net_id, &ovs_net_ops);
+	err = register_pernet_device(&ovs_net_ops);
 	if (err)
 		goto error_vport_exit;
 
@@ -1913,7 +1910,7 @@ static int __init dp_init(void)
 error_unreg_notifier:
 	unregister_netdevice_notifier(&ovs_dp_device_notifier);
 error_netns_exit:
-	unregister_pernet_gen_device(ovs_net_id, &ovs_net_ops);
+	unregister_pernet_device(&ovs_net_ops);
 error_vport_exit:
 	ovs_vport_exit();
 error_flow_exit:
@@ -1926,7 +1923,7 @@ static void dp_cleanup(void)
 {
 	dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
 	unregister_netdevice_notifier(&ovs_dp_device_notifier);
-	unregister_pernet_gen_device(ovs_net_id, &ovs_net_ops);
+	unregister_pernet_device(&ovs_net_ops);
 	rcu_barrier();
 	ovs_vport_exit();
 	ovs_flow_exit();
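The datapath conversion replaces the hand-rolled kzalloc()/net_assign_generic() pair with pernet_operations carrying .id and .size, so the pernet core allocates and zeroes the per-namespace area before .init runs and frees it after .exit. A toy model of that inversion of responsibility:

	#include <stdio.h>
	#include <stdlib.h>

	/* The "core" owns allocation: the module only declares its size and
	 * receives pre-zeroed storage, as with .id/.size above. */
	struct pernet_ops {
		size_t size;
		int (*init)(void *state);
	};

	static void *register_pernet(const struct pernet_ops *ops)
	{
		void *state = calloc(1, ops->size);
		if (state && ops->init(state) < 0) {
			free(state);
			return NULL;
		}
		return state;
	}

	struct ovs_net { int ndps; };

	static int ovs_init(void *state)
	{
		struct ovs_net *on = state;   /* already zeroed by the core */
		on->ndps = 0;
		return 0;
	}

	int main(void)
	{
		struct pernet_ops ops = { .size = sizeof(struct ovs_net),
					  .init = ovs_init };
		void *net_state = register_pernet(&ops);
		printf("per-net state: %p\n", net_state);
		free(net_state);
		return 0;
	}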
diff -upr linux-2.6.32-504.3.3.el6.orig/net/openvswitch/datapath.h linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/datapath.h
--- linux-2.6.32-504.3.3.el6.orig/net/openvswitch/datapath.h	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/datapath.h	2015-01-21 12:02:58.458821948 +0300
@@ -177,7 +177,9 @@ static inline struct vport *ovs_vport_ov
 
 extern struct notifier_block ovs_dp_device_notifier;
 extern struct genl_multicast_group ovs_dp_vport_multicast_group;
-
+#if defined(CONFIG_OVS_BRCOMPAT) || defined(CONFIG_OVS_BRCOMPAT_MODULE)
+extern int (*ovs_dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd);
+#endif
 void ovs_dp_process_received_packet(struct vport *, struct sk_buff *);
 void ovs_dp_detach_port(struct vport *);
 int ovs_dp_upcall(struct datapath *, struct sk_buff *,
diff -upr linux-2.6.32-504.3.3.el6.orig/net/openvswitch/vport-internal_dev.c linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/vport-internal_dev.c
--- linux-2.6.32-504.3.3.el6.orig/net/openvswitch/vport-internal_dev.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/openvswitch/vport-internal_dev.c	2015-01-21 12:02:58.459821922 +0300
@@ -107,6 +107,17 @@ static int internal_dev_change_mtu(struc
 	return 0;
 }
 
+#if defined(CONFIG_OVS_BRCOMPAT) || defined(CONFIG_OVS_BRCOMPAT_MODULE)
+static int internal_dev_do_ioctl(struct net_device *dev,
+				 struct ifreq *ifr, int cmd)
+{
+	if (ovs_dp_ioctl_hook)
+		return ovs_dp_ioctl_hook(dev, ifr, cmd);
+
+	return -EOPNOTSUPP;
+}
+#endif
+
 static void internal_dev_destructor(struct net_device *dev)
 {
 	struct vport *vport = ovs_internal_dev_get_vport(dev);
@@ -122,6 +133,9 @@ static const struct net_device_ops inter
 	.ndo_set_mac_address = eth_mac_addr,
 	.ndo_change_mtu = internal_dev_change_mtu,
 	.ndo_get_stats = internal_dev_get_stats,
+#if defined(CONFIG_OVS_BRCOMPAT) || defined(CONFIG_OVS_BRCOMPAT_MODULE)
+	.ndo_do_ioctl = internal_dev_do_ioctl,
+#endif
 };
 
 static void do_setup(struct net_device *netdev)
diff -upr linux-2.6.32-504.3.3.el6.orig/net/packet/af_packet.c linux-2.6.32-504.3.3.el6-042stab103_6/net/packet/af_packet.c
--- linux-2.6.32-504.3.3.el6.orig/net/packet/af_packet.c	2014-12-12 23:29:34.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/packet/af_packet.c	2015-01-21 12:02:50.860023656 +0300
@@ -83,10 +83,14 @@
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
 
+#include <bc/net.h>
+
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
 
+#include <linux/cpt_image.h>
+
 /*
    Assumptions:
    - if device has no dev->hard_header routine, it adds and removes ll header
@@ -563,6 +567,8 @@ static int packet_rcv(struct sk_buff *sk
 	if (dev_net(dev) != sock_net(sk))
 		goto drop;
 
+	skb_orphan(skb);
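+	/* skb_orphan() detaches the skb from the sending socket, so the
+	 * receive-buffer charge below lands on this packet socket. */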
+
 	skb->dev = dev;
 
 	if (dev->header_ops) {
@@ -625,6 +631,9 @@ static int packet_rcv(struct sk_buff *sk
 	if (pskb_trim(skb, snaplen))
 		goto drop_n_acct;
 
+	if (ub_sockrcvbuf_charge(sk, skb))
+		goto drop_n_acct;
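+	/* ub_sockrcvbuf_charge() accounts the skb to the container's
+	 * receive-buffer beancounter; over-limit packets are dropped. */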
+
 	skb_set_owner_r(skb, sk);
 	skb->dev = NULL;
 	skb_dst_drop(skb);
@@ -684,6 +693,8 @@ static int tpacket_rcv(struct sk_buff *s
 	if (dev_net(dev) != sock_net(sk))
 		goto drop;
 
+	skb_orphan(skb);
+
 	if (dev->header_ops) {
 		if (sk->sk_type != SOCK_DGRAM)
 			skb_push(skb, skb->data - skb_mac_header(skb));
@@ -732,6 +743,12 @@ static int tpacket_rcv(struct sk_buff *s
 			snaplen = 0;
 	}
 
+	if (copy_skb &&
+	    ub_sockrcvbuf_charge(sk, copy_skb)) {
+		spin_lock(&sk->sk_receive_queue.lock);
+		goto ring_is_full;
+	}
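+	/* The ring_is_full path expects sk_receive_queue.lock to be
+	 * held, hence the lock is taken before jumping there on a
+	 * failed beancounter charge. */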
+
 	spin_lock(&sk->sk_receive_queue.lock);
 	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
 	if (!h.raw)
@@ -1088,12 +1105,8 @@ static inline struct sk_buff *packet_all
 {
 	struct sk_buff *skb;
 
-	/* Under a page?  Don't bother with paged skb. */
-	if (prepad + len < PAGE_SIZE || !linear)
-		linear = len;
-
-	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   err);
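+	/* Always allocate a fully linear skb (no paged data) here;
+	 * assumed to simplify the beancounter and checkpoint/restore
+	 * paths this patch introduces. */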
+	linear = len;
+	skb = sock_alloc_send_skb(sk, prepad + linear, noblock, err);
 	if (!skb)
 		return NULL;
 
@@ -1425,6 +1438,8 @@ static int packet_create(struct net *net
 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
 	if (sk == NULL)
 		goto out;
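+	/* Charge the new socket to the container's beancounter; if the
+	 * container is over its limit, free the sock and fail. */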
+	if (ub_other_sock_charge(sk, kern))
+		goto out_free;
 
 	sock->ops = &packet_ops;
 	if (sock->type == SOCK_PACKET)
@@ -1464,6 +1479,9 @@ static int packet_create(struct net *net
 	sock_prot_inuse_add(net, &packet_proto, 1);
 	write_unlock_bh(&net->packet.sklist_lock);
 	return 0;
+
+out_free:
+	sk_free(sk);
 out:
 	return err;
 }
@@ -2215,10 +2233,11 @@ static void packet_mm_close(struct vm_ar
 		atomic_dec(&pkt_sk(sk)->mapped);
 }
 
-static const struct vm_operations_struct packet_mmap_ops = {
+const struct vm_operations_struct packet_mmap_ops = {
 	.open	=	packet_mm_open,
 	.close	=	packet_mm_close,
 };
+EXPORT_SYMBOL(packet_mmap_ops);
 
 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
 {
@@ -2233,7 +2252,7 @@ static void free_pg_vec(char **pg_vec, u
 
 static inline char *alloc_one_pg_vec_page(unsigned long order)
 {
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
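+	/* GFP_KERNEL_UBC charges the allocation to the container's
+	 * kernel-memory beancounter instead of the global pool. */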
+	gfp_t gfp_flags = GFP_KERNEL_UBC | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
 
 	return (char *) __get_free_pages(gfp_flags, order);
 }
@@ -2244,7 +2263,7 @@ static char **alloc_pg_vec(struct tpacke
 	char **pg_vec;
 	int i;
 
-	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL_UBC);
 	if (unlikely(!pg_vec))
 		goto out;
 
@@ -2451,6 +2470,121 @@ out:
 }
 #endif
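+/*
+ * OpenVZ checkpoint/restore (CPT) helpers: dump a packet socket's
+ * state (stats, options, mmap ring geometry, multicast memberships)
+ * into cpt_sock_packet_* images and restore it from them.
+ */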
 
+void sock_packet_cpt_attr(struct sock *sk, struct cpt_sock_packet_image *v)
+{
+	struct packet_sock *po = pkt_sk(sk);
+#ifdef CONFIG_PACKET_MMAP
+	struct cpt_sock_packet_ring_image *ri;
+	struct packet_ring_buffer *rb;
+#endif
+
+	v->cpt_stats_tp_packets = po->stats.tp_packets;
+	v->cpt_stats_tp_drops = po->stats.tp_drops;
+
+	v->cpt_auxdata = po->auxdata;
+	v->cpt_origdev = po->origdev;
+	v->cpt_tp_tstamp = po->tp_tstamp;
+
+#ifdef CONFIG_PACKET_MMAP
+	v->cpt_copy_thresh = po->copy_thresh;
+	v->cpt_tp_version = po->tp_version;
+	v->cpt_tp_reserve = po->tp_reserve;
+	v->cpt_tp_loss = po->tp_loss;
+
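+	/* Walk rx_ring then tx_ring; this relies on the two ring members
+	 * (and the matching image fields) being adjacent in memory. */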
+	for (rb = &po->rx_ring, ri = &v->cpt_rx_ring;
+	     rb <= &po->tx_ring; rb++, ri++) {
+		memset(ri, 0, sizeof(*ri));
+		if (!rb->pg_vec)
+			continue;
+		ri->cpt_tp_block_size = rb->pg_vec_pages * PAGE_SIZE;
+		ri->cpt_tp_block_nr = rb->pg_vec_len;
+		ri->cpt_tp_frame_size = rb->frame_size;
+		ri->cpt_tp_frame_nr = rb->frame_max + 1;
+	}
+#endif
+}
+EXPORT_SYMBOL(sock_packet_cpt_attr);
+
+int sock_packet_rst_attr(struct sock *sk, struct cpt_sock_packet_image *v)
+{
+	int err = 0;
+	struct packet_sock *po = pkt_sk(sk);
+#ifdef CONFIG_PACKET_MMAP
+	struct cpt_sock_packet_ring_image *ri;
+#endif
+
+	spin_lock_bh(&sk->sk_receive_queue.lock);
+	po->stats.tp_packets = v->cpt_stats_tp_packets;
+	po->stats.tp_drops = v->cpt_stats_tp_drops;
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+	po->auxdata = v->cpt_auxdata;
+	po->origdev = v->cpt_origdev;
+	po->tp_tstamp = v->cpt_tp_tstamp;
+
+#ifdef CONFIG_PACKET_MMAP
+	po->copy_thresh = v->cpt_copy_thresh;
+	po->tp_version = v->cpt_tp_version;
+	po->tp_reserve = v->cpt_tp_reserve;
+	po->tp_loss = v->cpt_tp_loss;
+
+	for (ri = &v->cpt_rx_ring; ri <= &v->cpt_tx_ring; ri++) {
+		struct tpacket_req req;
+
+		req.tp_block_size = ri->cpt_tp_block_size;
+		req.tp_block_nr = ri->cpt_tp_block_nr;
+		req.tp_frame_size = ri->cpt_tp_frame_size;
+		req.tp_frame_nr = ri->cpt_tp_frame_nr;
+
+		err = packet_set_ring(sk, &req, 0, ri == &v->cpt_tx_ring);
+		if (err)
+			break;
+	}
+#endif
+	return err;
+}
+EXPORT_SYMBOL(sock_packet_rst_attr);
+
+void *sock_packet_cpt_one_mc(struct sock *sk,
+		struct cpt_sock_packet_mc_image *mi, void *prev)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_mclist *mc;
+
+	mc = prev ? ((struct packet_mclist *)prev)->next : po->mclist;
+	if (!mc)
+		return NULL;
+
+	mi->cpt_ifindex = mc->ifindex;
+	mi->cpt_count = mc->count;
+	mi->cpt_type = mc->type;
+	mi->cpt_alen = mc->alen;
+	memcpy(mi->cpt_addr, mc->addr, sizeof(mi->cpt_addr));
+
+	return mc;
+}
+EXPORT_SYMBOL(sock_packet_cpt_one_mc);
+
+int sock_packet_rst_one_mc(struct sock *sk,
+		struct cpt_sock_packet_mc_image *mi)
+{
+	struct packet_mreq_max mreq;
+	int i;
+	int err;
+
+	mreq.mr_ifindex = mi->cpt_ifindex;
+	mreq.mr_type = mi->cpt_type;
+	mreq.mr_alen = mi->cpt_alen;
+	memcpy(mreq.mr_address, mi->cpt_addr, sizeof(mreq.mr_address));
+
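+	/* Re-add the membership cpt_count times so its kernel reference
+	 * count matches the checkpointed value. */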
+	for (i = 0; i < mi->cpt_count; i++) {
+		err = packet_mc_add(sk, &mreq);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(sock_packet_rst_one_mc);
 
 static const struct proto_ops packet_ops_spkt = {
 	.family =	PF_PACKET,
diff -upr linux-2.6.32-504.3.3.el6.orig/net/phonet/pn_dev.c linux-2.6.32-504.3.3.el6-042stab103_6/net/phonet/pn_dev.c
--- linux-2.6.32-504.3.3.el6.orig/net/phonet/pn_dev.c	2014-12-12 23:28:54.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/phonet/pn_dev.c	2015-01-21 12:02:51.211014338 +0300
@@ -251,24 +251,18 @@ static struct notifier_block phonet_devi
 /* Per-namespace Phonet devices handling */
 static int phonet_init_net(struct net *net)
 {
-	struct phonet_net *pnn = kmalloc(sizeof(*pnn), GFP_KERNEL);
-	if (!pnn)
-		return -ENOMEM;
+	struct phonet_net *pnn = net_generic(net, phonet_net_id);
 
-	if (!proc_net_fops_create(net, "phonet", 0, &pn_sock_seq_fops)) {
-		kfree(pnn);
+	if (!proc_net_fops_create(net, "phonet", 0, &pn_sock_seq_fops))
 		return -ENOMEM;
-	}
 
 	INIT_LIST_HEAD(&pnn->pndevs.list);
 	spin_lock_init(&pnn->pndevs.lock);
-	net_assign_generic(net, phonet_net_id, pnn);
 	return 0;
 }
 
 static void phonet_exit_net(struct net *net)
 {
-	struct phonet_net *pnn = net_generic(net, phonet_net_id);
 	struct net_device *dev;
 
 	rtnl_lock();
@@ -277,19 +271,19 @@ static void phonet_exit_net(struct net *
 	rtnl_unlock();
 
 	proc_net_remove(net, "phonet");
-	net_assign_generic(net, phonet_net_id, NULL);
-	kfree(pnn);
 }
 
 static struct pernet_operations phonet_net_ops = {
 	.init = phonet_init_net,
 	.exit = phonet_exit_net,
+	.id   = &phonet_net_id,
+	.size = sizeof(struct phonet_net),
 };
 
 /* Initialize Phonet devices list */
 int __init phonet_device_init(void)
 {
-	int err = register_pernet_gen_device(&phonet_net_id, &phonet_net_ops);
+	int err = register_pernet_device(&phonet_net_ops);
 	if (err)
 		return err;
 
@@ -304,5 +298,5 @@ void phonet_device_exit(void)
 {
 	rtnl_unregister_all(PF_PHONET);
 	unregister_netdevice_notifier(&phonet_device_notifier);
-	unregister_pernet_gen_device(phonet_net_id, &phonet_net_ops);
+	unregister_pernet_device(&phonet_net_ops);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sched/act_api.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/act_api.c
--- linux-2.6.32-504.3.3.el6.orig/net/sched/act_api.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/act_api.c	2015-01-21 12:02:47.549111551 +0300
@@ -667,7 +667,8 @@ nlmsg_failure:
 }
 
 static int
-act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event)
+act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n,
+	       struct tc_action *a, int event)
 {
 	struct sk_buff *skb;
 
@@ -679,7 +680,7 @@ act_get_notify(u32 pid, struct nlmsghdr 
 		return -EINVAL;
 	}
 
-	return rtnl_unicast(skb, &init_net, pid);
+	return rtnl_unicast(skb, net, pid);
 }
 
 static struct tc_action *
@@ -749,7 +750,8 @@ static struct tc_action *create_a(int i)
 	return act;
 }
 
-static int tca_action_flush(struct nlattr *nla, struct nlmsghdr *n, u32 pid)
+static int tca_action_flush(struct net *net, struct nlattr *nla,
+			    struct nlmsghdr *n, u32 pid)
 {
 	struct sk_buff *skb;
 	unsigned char *b;
@@ -808,7 +810,7 @@ static int tca_action_flush(struct nlatt
 	nlh->nlmsg_flags |= NLM_F_ROOT;
 	module_put(a->ops->owner);
 	kfree(a);
-	err = rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+	err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
 	if (err > 0)
 		return 0;
 
@@ -825,7 +827,8 @@ noflush_out:
 }
 
 static int
-tca_action_gd(struct nlattr *nla, struct nlmsghdr *n, u32 pid, int event)
+tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
+	      u32 pid, int event)
 {
 	int i, ret;
 	struct nlattr *tb[TCA_ACT_MAX_PRIO+1];
@@ -837,7 +840,7 @@ tca_action_gd(struct nlattr *nla, struct
 
 	if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) {
 		if (tb[1] != NULL)
-			return tca_action_flush(tb[1], n, pid);
+			return tca_action_flush(net, tb[1], n, pid);
 		else
 			return -EINVAL;
 	}
@@ -858,7 +861,7 @@ tca_action_gd(struct nlattr *nla, struct
 	}
 
 	if (event == RTM_GETACTION)
-		ret = act_get_notify(pid, n, head, event);
+		ret = act_get_notify(net, pid, n, head, event);
 	else { /* delete */
 		struct sk_buff *skb;
 
@@ -877,7 +880,7 @@ tca_action_gd(struct nlattr *nla, struct
 
 		/* now do the delete */
 		tcf_action_destroy(head, 0);
-		ret = rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC,
+		ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
 				     n->nlmsg_flags&NLM_F_ECHO);
 		if (ret > 0)
 			return 0;
@@ -888,8 +891,8 @@ err:
 	return ret;
 }
 
-static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
-			  u16 flags)
+static int tcf_add_notify(struct net *net, struct tc_action *a,
+			  u32 pid, u32 seq, int event, u16 flags)
 {
 	struct tcamsg *t;
 	struct nlmsghdr *nlh;
@@ -922,7 +925,7 @@ static int tcf_add_notify(struct tc_acti
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	NETLINK_CB(skb).dst_group = RTNLGRP_TC;
 
-	err = rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, flags&NLM_F_ECHO);
+	err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags&NLM_F_ECHO);
 	if (err > 0)
 		err = 0;
 	return err;
@@ -935,7 +938,8 @@ nlmsg_failure:
 
 
 static int
-tcf_action_add(struct nlattr *nla, struct nlmsghdr *n, u32 pid, int ovr)
+tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
+	       u32 pid, int ovr)
 {
 	int ret = 0;
 	struct tc_action *act;
@@ -953,7 +957,7 @@ tcf_action_add(struct nlattr *nla, struc
 	/* dump then free all the actions after update; inserted policy
 	 * stays intact
 	 * */
-	ret = tcf_add_notify(act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
+	ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
 	for (a = act; a; a = act) {
 		act = a->next;
 		kfree(a);
@@ -969,9 +973,6 @@ static int tc_ctl_action(struct sk_buff 
 	u32 pid = skb ? NETLINK_CB(skb).pid : 0;
 	int ret = 0, ovr = 0;
 
-	if (net != &init_net)
-		return -EINVAL;
-
 	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);
 	if (ret < 0)
 		return ret;
@@ -994,15 +995,17 @@ static int tc_ctl_action(struct sk_buff 
 		if (n->nlmsg_flags&NLM_F_REPLACE)
 			ovr = 1;
 replay:
-		ret = tcf_action_add(tca[TCA_ACT_TAB], n, pid, ovr);
+		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr);
 		if (ret == -EAGAIN)
 			goto replay;
 		break;
 	case RTM_DELACTION:
-		ret = tca_action_gd(tca[TCA_ACT_TAB], n, pid, RTM_DELACTION);
+		ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
+				    pid, RTM_DELACTION);
 		break;
 	case RTM_GETACTION:
-		ret = tca_action_gd(tca[TCA_ACT_TAB], n, pid, RTM_GETACTION);
+		ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
+				    pid, RTM_GETACTION);
 		break;
 	default:
 		BUG();
@@ -1042,7 +1045,6 @@ find_dump_kind(const struct nlmsghdr *n)
 static int
 tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct net *net = sock_net(skb->sk);
 	struct nlmsghdr *nlh;
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
@@ -1052,9 +1054,6 @@ tc_dump_action(struct sk_buff *skb, stru
 	struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh);
 	struct nlattr *kind = find_dump_kind(cb->nlh);
 
-	if (net != &init_net)
-		return 0;
-
 	if (kind == NULL) {
 		pr_info("tc_dump_action: action bad kind\n");
 		return 0;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sched/cls_api.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/cls_api.c
--- linux-2.6.32-504.3.3.el6.orig/net/sched/cls_api.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/cls_api.c	2015-01-21 12:02:47.550111524 +0300
@@ -98,8 +98,9 @@ out:
 }
 EXPORT_SYMBOL(unregister_tcf_proto_ops);
 
-static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
-			  struct tcf_proto *tp, unsigned long fh, int event);
+static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+			  struct nlmsghdr *n, struct tcf_proto *tp,
+			  unsigned long fh, int event);
 
 
 /* Select new prio value from the range, managed by kernel. */
@@ -137,9 +138,6 @@ static int tc_ctl_tfilter(struct sk_buff
 	int err;
 	int tp_created = 0;
 
-	if (net != &init_net)
-		return -EINVAL;
-
 replay:
 	t = NLMSG_DATA(n);
 	protocol = TC_H_MIN(t->tcm_info);
@@ -158,7 +156,7 @@ replay:
 	/* Find head of filter chain. */
 
 	/* Find link */
-	dev = __dev_get_by_index(&init_net, t->tcm_ifindex);
+	dev = __dev_get_by_index(net, t->tcm_ifindex);
 	if (dev == NULL)
 		return -ENODEV;
 
@@ -282,7 +280,7 @@ replay:
 			*back = tp->next;
 			spin_unlock_bh(root_lock);
 
-			tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
+			tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
 			tcf_destroy(tp);
 			err = 0;
 			goto errout;
@@ -305,10 +303,10 @@ replay:
 		case RTM_DELTFILTER:
 			err = tp->ops->delete(tp, fh);
 			if (err == 0)
-				tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
+				tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
 			goto errout;
 		case RTM_GETTFILTER:
-			err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
+			err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
 			goto errout;
 		default:
 			err = -EINVAL;
@@ -324,7 +322,7 @@ replay:
 			*back = tp;
 			spin_unlock_bh(root_lock);
 		}
-		tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
+		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
 	} else {
 		if (tp_created)
 			tcf_destroy(tp);
@@ -370,8 +368,9 @@ nla_put_failure:
 	return -1;
 }
 
-static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
-			  struct tcf_proto *tp, unsigned long fh, int event)
+static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+			  struct nlmsghdr *n, struct tcf_proto *tp,
+			  unsigned long fh, int event)
 {
 	struct sk_buff *skb;
 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
@@ -385,7 +384,7 @@ static int tfilter_notify(struct sk_buff
 		return -EINVAL;
 	}
 
-	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC,
+	return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
 			      n->nlmsg_flags & NLM_F_ECHO);
 }
 
@@ -417,12 +416,9 @@ static int tc_dump_tfilter(struct sk_buf
 	const struct Qdisc_class_ops *cops;
 	struct tcf_dump_args arg;
 
-	if (net != &init_net)
-		return 0;
-
 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
 		return skb->len;
-	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
+	if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
 		return skb->len;
 
 	if (!tcm->tcm_parent)
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sched/sch_api.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/sch_api.c
--- linux-2.6.32-504.3.3.el6.orig/net/sched/sch_api.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/sch_api.c	2015-01-21 12:02:47.550111524 +0300
@@ -34,10 +34,12 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 
-static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
+static int qdisc_notify(struct net *net, struct sk_buff *oskb,
+			struct nlmsghdr *n, u32 clid,
 			struct Qdisc *old, struct Qdisc *new);
-static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
-			 struct Qdisc *q, unsigned long cl, int event);
+static int tclass_notify(struct net *net, struct sk_buff *oskb,
+			 struct nlmsghdr *n, struct Qdisc *q,
+			 unsigned long cl, int event);
 
 /*
 
@@ -654,11 +656,12 @@ void qdisc_tree_decrease_qlen(struct Qdi
 }
 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 
-static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
+static void notify_and_destroy(struct net *net, struct sk_buff *skb,
+			       struct nlmsghdr *n, u32 clid,
 			       struct Qdisc *old, struct Qdisc *new)
 {
 	if (new || old)
-		qdisc_notify(skb, n, clid, old, new);
+		qdisc_notify(net, skb, n, clid, old, new);
 
 	if (old)
 		qdisc_destroy(old);
@@ -678,6 +681,7 @@ static int qdisc_graft(struct net_device
 		       struct Qdisc *new, struct Qdisc *old)
 {
 	struct Qdisc *q = old;
+	struct net *net = dev_net(dev);
 	int err = 0;
 
 	if (parent == NULL) {
@@ -714,12 +718,13 @@ static int qdisc_graft(struct net_device
 		}
 
 		if (!ingress) {
-			notify_and_destroy(skb, n, classid, dev->qdisc, new);
+			notify_and_destroy(net, skb, n, classid,
+					   dev->qdisc, new);
 			if (new && !new->ops->attach)
 				atomic_inc(&new->refcnt);
 			dev->qdisc = new ? : &noop_qdisc;
 		} else {
-			notify_and_destroy(skb, n, classid, old, new);
+			notify_and_destroy(net, skb, n, classid, old, new);
 		}
 
 		if (dev->flags & IFF_UP)
@@ -737,7 +742,7 @@ static int qdisc_graft(struct net_device
 				err = -ENOENT;
 		}
 		if (!err)
-			notify_and_destroy(skb, n, classid, old, new);
+			notify_and_destroy(net, skb, n, classid, old, new);
 	}
 	return err;
 }
@@ -963,10 +968,7 @@ static int tc_get_qdisc(struct sk_buff *
 	struct Qdisc *p = NULL;
 	int err;
 
-	if (net != &init_net)
-		return -EINVAL;
-
-	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
+	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
 		return -ENODEV;
 
 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -1006,7 +1008,7 @@ static int tc_get_qdisc(struct sk_buff *
 		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
 			return err;
 	} else {
-		qdisc_notify(skb, n, clid, NULL, q);
+		qdisc_notify(net, skb, n, clid, NULL, q);
 	}
 	return 0;
 }
@@ -1025,16 +1027,13 @@ static int tc_modify_qdisc(struct sk_buf
 	struct Qdisc *q, *p;
 	int err;
 
-	if (net != &init_net)
-		return -EINVAL;
-
 replay:
 	/* Reinit, just in case something touches this. */
 	tcm = NLMSG_DATA(n);
 	clid = tcm->tcm_parent;
 	q = p = NULL;
 
-	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
+	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
 		return -ENODEV;
 
 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -1121,7 +1120,7 @@ replay:
 		return -EINVAL;
 	err = qdisc_change(q, tca);
 	if (err == 0)
-		qdisc_notify(skb, n, clid, NULL, q);
+		qdisc_notify(net, skb, n, clid, NULL, q);
 	return err;
 
 create_n_graft:
@@ -1216,8 +1215,9 @@ static bool tc_qdisc_dump_ignore(struct 
 	return (q->flags & TCQ_F_BUILTIN) ? true : false;
 }
 
-static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
-			u32 clid, struct Qdisc *old, struct Qdisc *new)
+static int qdisc_notify(struct net *net, struct sk_buff *oskb,
+			struct nlmsghdr *n, u32 clid,
+			struct Qdisc *old, struct Qdisc *new)
 {
 	struct sk_buff *skb;
 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
@@ -1236,7 +1236,7 @@ static int qdisc_notify(struct sk_buff *
 	}
 
 	if (skb->len)
-		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+		return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
 
 err_out:
 	kfree_skb(skb);
@@ -1290,14 +1290,11 @@ static int tc_dump_qdisc(struct sk_buff 
 	int s_idx, s_q_idx;
 	struct net_device *dev;
 
-	if (net != &init_net)
-		return 0;
-
 	s_idx = cb->args[0];
 	s_q_idx = q_idx = cb->args[1];
 	read_lock(&dev_base_lock);
 	idx = 0;
-	for_each_netdev(&init_net, dev) {
+	for_each_netdev(net, dev) {
 		struct netdev_queue *dev_queue;
 
 		if (idx < s_idx)
@@ -1349,10 +1346,7 @@ static int tc_ctl_tclass(struct sk_buff 
 	u32 qid = TC_H_MAJ(clid);
 	int err;
 
-	if (net != &init_net)
-		return -EINVAL;
-
-	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
+	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
 		return -ENODEV;
 
 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -1433,10 +1427,10 @@ static int tc_ctl_tclass(struct sk_buff 
 			if (cops->delete)
 				err = cops->delete(q, cl);
 			if (err == 0)
-				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
+				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
 			goto out;
 		case RTM_GETTCLASS:
-			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
+			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
 			goto out;
 		default:
 			err = -EINVAL;
@@ -1449,7 +1443,7 @@ static int tc_ctl_tclass(struct sk_buff 
 	if (cops->change)
 		err = cops->change(q, clid, pid, tca, &new_cl);
 	if (err == 0)
-		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
+		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
 
 out:
 	if (cl)
@@ -1501,8 +1495,9 @@ nla_put_failure:
 	return -1;
 }
 
-static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
-			  struct Qdisc *q, unsigned long cl, int event)
+static int tclass_notify(struct net *net, struct sk_buff *oskb,
+			 struct nlmsghdr *n, struct Qdisc *q,
+			 unsigned long cl, int event)
 {
 	struct sk_buff *skb;
 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
@@ -1516,7 +1511,7 @@ static int tclass_notify(struct sk_buff 
 		return -EINVAL;
 	}
 
-	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+	return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
 }
 
 struct qdisc_dump_args
@@ -1591,12 +1586,9 @@ static int tc_dump_tclass(struct sk_buff
 	struct net_device *dev;
 	int t, s_t;
 
-	if (net != &init_net)
-		return 0;
-
 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
 		return 0;
-	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
+	if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
 		return 0;
 
 	s_t = cb->args[0];
@@ -1716,14 +1708,53 @@ static const struct file_operations psch
 	.llseek = seq_lseek,
 	.release = single_release,
 };
+
+static int __net_init psched_net_init(struct net *net)
+{
+	struct proc_dir_entry *e;
+
+	e = proc_net_fops_create(net, "psched", 0, &psched_fops);
+	if (e == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __net_exit psched_net_exit(struct net *net)
+{
+	proc_net_remove(net, "psched");
+}
+#else
+static int __net_init psched_net_init(struct net *net)
+{
+	return 0;
+}
+
+static void __net_exit psched_net_exit(struct net *net)
+{
+}
 #endif
 
+static struct pernet_operations psched_net_ops = {
+	.init = psched_net_init,
+	.exit = psched_net_exit,
+};
+
 static int __init pktsched_init(void)
 {
+	int err;
+
+	err = register_pernet_subsys(&psched_net_ops);
+	if (err) {
+		printk(KERN_ERR "pktsched_init: "
+		       "cannot initialize per netns operations\n");
+		return err;
+	}
+
 	register_qdisc(&pfifo_qdisc_ops);
 	register_qdisc(&bfifo_qdisc_ops);
 	register_qdisc(&mq_qdisc_ops);
-	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
 
 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sched/sch_cbq.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/sch_cbq.c
--- linux-2.6.32-504.3.3.el6.orig/net/sched/sch_cbq.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/sch_cbq.c	2015-01-21 12:02:45.373169320 +0300
@@ -160,7 +160,6 @@ struct cbq_sched_data
 	struct cbq_class	*tx_borrowed;
 	int			tx_len;
 	psched_time_t		now;		/* Cached timestamp */
-	psched_time_t		now_rt;		/* Cached real time */
 	unsigned		pmask;
 
 	struct hrtimer		delay_timer;
@@ -350,12 +349,7 @@ cbq_mark_toplevel(struct cbq_sched_data 
 	int toplevel = q->toplevel;
 
 	if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
-		psched_time_t now;
-		psched_tdiff_t incr;
-
-		now = psched_get_time();
-		incr = now - q->now_rt;
-		now = q->now + incr;
+		psched_time_t now = psched_get_time();
 
 		do {
 			if (cl->undertime < now) {
@@ -702,8 +696,13 @@ cbq_update(struct cbq_sched_data *q)
 	struct cbq_class *this = q->tx_class;
 	struct cbq_class *cl = this;
 	int len = q->tx_len;
+	psched_time_t now;
 
 	q->tx_class = NULL;
+	/* Time integrator. We calculate EOS time
+	 * by adding expected packet transmission time.
+	 */
+	now = q->now + L2T(&q->link, len);
 
 	for ( ; cl; cl = cl->share) {
 		long avgidle = cl->avgidle;
@@ -719,7 +718,7 @@ cbq_update(struct cbq_sched_data *q)
 			 idle = (now - last) - last_pktlen/rate
 		 */
 
-		idle = q->now - cl->last;
+		idle = now - cl->last;
 		if ((unsigned long)idle > 128*1024*1024) {
 			avgidle = cl->maxidle;
 		} else {
@@ -763,7 +762,7 @@ cbq_update(struct cbq_sched_data *q)
 			idle -= L2T(&q->link, len);
 			idle += L2T(cl, len);
 
-			cl->undertime = q->now + idle;
+			cl->undertime = now + idle;
 		} else {
 			/* Underlimit */
 
@@ -773,7 +772,8 @@ cbq_update(struct cbq_sched_data *q)
 			else
 				cl->avgidle = avgidle;
 		}
-		cl->last = q->now;
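+		/* Only move cl->last forward; the signed comparison keeps
+		 * the timestamp monotonic across wraparound. */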
+		if ((s64)(now - cl->last) > 0)
+			cl->last = now;
 	}
 
 	cbq_update_toplevel(q, this, q->tx_borrowed);
@@ -873,8 +873,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int 
 
 			if (cl->deficit <= 0) {
 				q->active[prio] = cl;
-				cl = cl->next_alive;
 				cl->deficit += cl->quantum;
+				cl = cl->next_alive;
 			}
 			return skb;
 
@@ -944,28 +944,13 @@ cbq_dequeue(struct Qdisc *sch)
 	struct sk_buff *skb;
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	psched_time_t now;
-	psched_tdiff_t incr;
 
 	now = psched_get_time();
-	incr = now - q->now_rt;
 
-	if (q->tx_class) {
-		psched_tdiff_t incr2;
-		/* Time integrator. We calculate EOS time
-		   by adding expected packet transmission time.
-		   If real time is greater, we warp artificial clock,
-		   so that:
-
-		   cbq_time = max(real_time, work);
-		 */
-		incr2 = L2T(&q->link, q->tx_len);
-		q->now += incr2;
+	if (q->tx_class)
 		cbq_update(q);
-		if ((incr -= incr2) < 0)
-			incr = 0;
-	}
-	q->now += incr;
-	q->now_rt = now;
+
+	q->now = now;
 
 	for (;;) {
 		q->wd_expires = 0;
@@ -1047,17 +1032,19 @@ static void cbq_normalize_quanta(struct 
 
 	for (h = 0; h < q->clhash.hashsize; h++) {
 		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+			long mtu;
 			/* BUGGGG... Beware! This expression suffer of
 			   arithmetic overflows!
 			 */
 			if (cl->priority == prio) {
-				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
-					q->quanta[prio];
-			}
-			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
-				printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
-				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
+				cl->quantum = (cl->weight * cl->allot) /
+					(q->quanta[prio] / q->nclasses[prio]);
 			}
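+			/* Clamp quantum to (mtu/2, 32*mtu] silently rather
+			 * than the old printk-and-repair behaviour. */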
+			mtu = qdisc_dev(cl->qdisc)->mtu;
+			if (cl->quantum <= mtu/2)
+				cl->quantum = mtu/2 + 1;
+			else if (cl->quantum > 32*mtu)
+				cl->quantum = 32*mtu;
 		}
 	}
 }
@@ -1217,7 +1204,6 @@ cbq_reset(struct Qdisc* sch)
 	hrtimer_cancel(&q->delay_timer);
 	q->toplevel = TC_CBQ_MAXLEVEL;
 	q->now = psched_get_time();
-	q->now_rt = q->now;
 
 	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
 		q->active[prio] = NULL;
@@ -1401,7 +1387,6 @@ static int cbq_init(struct Qdisc *sch, s
 	q->delay_timer.function = cbq_undelay;
 	q->toplevel = TC_CBQ_MAXLEVEL;
 	q->now = psched_get_time();
-	q->now_rt = q->now;
 
 	cbq_link_class(&q->link);
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sched/sch_generic.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/sch_generic.c
--- linux-2.6.32-504.3.3.el6.orig/net/sched/sch_generic.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/sch_generic.c	2015-01-21 12:02:51.119016779 +0300
@@ -179,17 +179,23 @@ static inline int qdisc_restart(struct Q
 	struct net_device *dev;
 	spinlock_t *root_lock;
 	struct sk_buff *skb;
+	int ret;
+	struct ve_struct *old_ve;
 
 	/* Dequeue packet */
 	skb = dequeue_skb(q);
 	if (unlikely(!skb))
 		return 0;
 
+	old_ve = set_exec_env(skb->owner_env);
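+	/* The switch above makes the transmit run in the skb owner's VE
+	 * (qdisc_restart() itself may run in ve0, e.g. from softirq);
+	 * the caller's environment is restored after sch_direct_xmit(). */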
 	root_lock = qdisc_lock(q);
 	dev = qdisc_dev(q);
 	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
 
-	return sch_direct_xmit(skb, q, dev, txq, root_lock);
+	ret = sch_direct_xmit(skb, q, dev, txq, root_lock);
+	(void)set_exec_env(old_ve);
+
+	return ret;
 }
 
 void __qdisc_run(struct Qdisc *q)
@@ -805,15 +811,18 @@ static bool some_qdisc_is_busy(struct ne
 
 void dev_deactivate_many(struct list_head *head)
 {
+	struct ve_struct *old_env;
 	struct net_device *dev;
 	struct net_device_extended *nde;
 
 	list_for_each_entry(nde, head, unreg_list) {
 		dev = nde->dev;
+		old_env = set_exec_env(dev->owner_env);
 		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
 					 &noop_qdisc);
 		dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc);
 		dev_watchdog_down(dev);
+		set_exec_env(old_env);
 	}
 
 	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sched/sch_teql.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/sch_teql.c
--- linux-2.6.32-504.3.3.el6.orig/net/sched/sch_teql.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sched/sch_teql.c	2015-01-21 12:02:45.373169320 +0300
@@ -178,6 +178,9 @@ static int teql_qdisc_init(struct Qdisc 
 	struct teql_master *m = (struct teql_master*)sch->ops;
 	struct teql_sched_data *q = qdisc_priv(sch);
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
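+	/* The CAP_NET_ADMIN check above keeps unprivileged (e.g.
+	 * container) users from binding devices to a shared teql
+	 * master. */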
+
 	if (dev->hard_header_len > m->dev->hard_header_len)
 		return -EINVAL;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sctp/ulpevent.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sctp/ulpevent.c
--- linux-2.6.32-504.3.3.el6.orig/net/sctp/ulpevent.c	2014-12-12 23:29:16.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sctp/ulpevent.c	2015-01-21 12:02:43.295224488 +0300
@@ -701,7 +701,7 @@ struct sctp_ulpevent *sctp_ulpevent_make
 	if (rx_count >= asoc->base.sk->sk_rcvbuf) {
 
 		if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
-		    (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize)))
+		    (!sk_rmem_schedule(asoc->base.sk, chunk->skb)))
 			goto fail;
 	}
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/socket.c linux-2.6.32-504.3.3.el6-042stab103_6/net/socket.c
--- linux-2.6.32-504.3.3.el6.orig/net/socket.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/socket.c	2015-01-21 12:02:49.581057608 +0300
@@ -85,6 +85,7 @@
 #include <linux/kmod.h>
 #include <linux/audit.h>
 #include <linux/wireless.h>
+#include <linux/in.h>
 #include <linux/nsproxy.h>
 #include <linux/magic.h>
 
@@ -171,15 +172,6 @@ static DEFINE_PER_CPU(int, sockets_in_us
  * divide and look after the messy bits.
  */
 
-#define MAX_SOCK_ADDR	128		/* 108 for Unix domain -
-					   16 for IP, 16 for IPX,
-					   24 for IPv6,
-					   about 80 for AX.25
-					   must be at least one bigger than
-					   the AF_UNIX size (see net/unix/af_unix.c
-					   :unix_mkname()).
-					 */
-
 /**
  *	move_addr_to_kernel	-	copy a socket address into kernel space
  *	@uaddr: Address in user space
@@ -201,6 +193,7 @@ int move_addr_to_kernel(void __user *uad
 		return -EFAULT;
 	return audit_sockaddr(ulen, kaddr);
 }
+EXPORT_SYMBOL(move_addr_to_kernel);
 
 /**
  *	move_addr_to_user	-	copy an address to user space
@@ -526,6 +519,9 @@ const struct file_operations bad_sock_fo
 
 void sock_release(struct socket *sock)
 {
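+	/* Remove the socket from beancounter send-queue accounting
+	 * before the protocol release path runs. */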
+	if (sock->sk)
+		ub_sock_sndqueuedel(sock->sk);
+
 	if (sock->ops) {
 		struct module *owner = sock->ops->owner;
 
@@ -1211,6 +1207,54 @@ call_kill:
 	return 0;
 }
 
+int vz_security_family_check(int family)
+{
+#ifdef CONFIG_VE
+	if (ve_is_super(get_exec_env()))
+		return 0;
+
+	switch (family) {
+	case PF_UNSPEC:
+	case PF_PACKET:
+	case PF_NETLINK:
+	case PF_UNIX:
+	case PF_INET:
+	case PF_INET6:
+	case PF_PPPOX:
+	case PF_KEY:
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+#endif
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vz_security_family_check);
+
+int vz_security_protocol_check(int protocol)
+{
+#ifdef CONFIG_VE
+	if (ve_is_super(get_exec_env()))
+		return 0;
+
+	switch (protocol) {
+	case  IPPROTO_IP:
+	case  IPPROTO_TCP:
+	case  IPPROTO_UDP:
+	case  IPPROTO_RAW:
+	case  IPPROTO_DCCP:
+	case  IPPROTO_GRE:
+	case  IPPROTO_ESP:
+	case  IPPROTO_AH:
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+#endif
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vz_security_protocol_check);
+
 int __sock_create(struct net *net, int family, int type, int protocol,
 			 struct socket **res, int kern)
 {
@@ -1241,6 +1285,11 @@ int __sock_create(struct net *net, int f
 		family = PF_PACKET;
 	}
 
+	/* VZ compatibility layer */
+	err = vz_security_family_check(family);
+	if (err < 0)
+		return err;
+
 	err = security_socket_create(family, type, protocol, kern);
 	if (err)
 		return err;
@@ -2510,6 +2559,19 @@ int sock_register(const struct net_proto
 	return err;
 }
 
+int is_sock_registered(int family)
+{
+	const struct net_proto_family *ops;
+
+	BUG_ON(family < 0 || family >= NPROTO);
+
+	spin_lock(&net_family_lock);
+	ops = net_families[family];
+	spin_unlock(&net_family_lock);
+
+	return ops ? 1 : 0;
+}
+
 /**
  *	sock_unregister - remove a protocol handler
  *	@family: protocol family to remove
@@ -2702,9 +2764,12 @@ int kernel_sock_ioctl(struct socket *soc
 {
 	mm_segment_t oldfs = get_fs();
 	int err;
+	struct ve_struct *old_env;
 
 	set_fs(KERNEL_DS);
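+	/* Run the ioctl in the socket owner's VE so per-container
+	 * network state is resolved correctly. */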
+	old_env = set_exec_env(sock->sk->owner_env);
 	err = sock->ops->ioctl(sock, cmd, arg);
+	(void)set_exec_env(old_env);
 	set_fs(oldfs);
 
 	return err;
@@ -2722,6 +2787,7 @@ EXPORT_SYMBOL(sock_map_fd);
 EXPORT_SYMBOL(sock_recvmsg);
 EXPORT_SYMBOL(sock_register);
 EXPORT_SYMBOL(sock_release);
+EXPORT_SYMBOL(is_sock_registered);
 EXPORT_SYMBOL(sock_sendmsg);
 EXPORT_SYMBOL(sock_unregister);
 EXPORT_SYMBOL(sock_wake_async);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/cache.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/cache.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/cache.c	2014-12-12 23:29:10.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/cache.c	2015-01-21 12:02:47.642109082 +0300
@@ -35,6 +35,8 @@
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <net/net_namespace.h>
 
+#include "ve.h"
+
 #define	 RPCDBG_FACILITY RPCDBG_CACHE
 
 static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
@@ -344,6 +346,34 @@ static int current_index;
 static void do_cache_clean(struct work_struct *work);
 static struct delayed_work cache_cleaner;
 
+struct cache_detail *cache_alloc(struct cache_detail *orig, int hsize)
+{
+	struct cache_detail *n;
+	struct cache_head **table;
+
+	n = kmemdup(orig, sizeof(struct cache_detail), GFP_KERNEL);
+	if (n == NULL)
+		return NULL;
+
+	table = kzalloc(hsize * sizeof(struct cache_head *), GFP_KERNEL);
+	if (table == NULL) {
+		kfree(n);
+		return NULL;
+	}
+
+	n->hash_table = table;
+	return n;
+}
+EXPORT_SYMBOL(cache_alloc);
+
+void cache_free(struct cache_detail *cd)
+{
+	cache_unregister(cd);
+	kfree(cd->hash_table);
+	kfree(cd);
+}
+EXPORT_SYMBOL(cache_free);
+
 static void sunrpc_init_cache_detail(struct cache_detail *cd)
 {
 	rwlock_init(&cd->hash_lock);
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/clnt.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/clnt.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/clnt.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/clnt.c	2015-01-21 12:02:47.692107754 +0300
@@ -35,6 +35,8 @@
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/un.h>
+#include <linux/ve_proto.h>
+#include <linux/vzcalluser.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
@@ -645,6 +647,9 @@ static const struct rpc_call_ops rpc_def
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
 {
 	struct rpc_task *task;
+	struct ve_struct *ve;
+
+	ve = set_exec_env(task_setup_data->rpc_client->cl_xprt->owner_env);
 
 	task = rpc_new_task(task_setup_data);
 	if (IS_ERR(task))
@@ -659,6 +664,7 @@ struct rpc_task *rpc_run_task(const stru
 	atomic_inc(&task->tk_count);
 	rpc_execute(task);
 out:
+	(void)set_exec_env(ve);
 	return task;
 }
 EXPORT_SYMBOL_GPL(rpc_run_task);
@@ -1528,8 +1534,8 @@ call_status(struct rpc_task *task)
 		break;
 	default:
 		if (clnt->cl_chatty)
-			printk("%s: RPC call returned error %d\n",
-			       clnt->cl_protname, -status);
+			printk("ct%d %s: RPC call returned error %d\n",
+			       get_exec_env()->veid, clnt->cl_protname, -status);
 		rpc_exit(task, status);
 	}
 }
@@ -1558,8 +1564,8 @@ call_timeout(struct rpc_task *task)
 	}
 	if (RPC_IS_SOFT(task)) {
 		if (clnt->cl_chatty)
-			printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
-				clnt->cl_protname, clnt->cl_server);
+			printk(KERN_NOTICE "ct%d %s: server %s not responding, timed out\n",
+				get_exec_env()->veid, clnt->cl_protname, clnt->cl_server);
 		if (task->tk_flags & RPC_TASK_TIMEOUT)
 			rpc_exit(task, -ETIMEDOUT);
 		else
@@ -1570,8 +1576,8 @@ call_timeout(struct rpc_task *task)
 	if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) {
 		task->tk_flags |= RPC_CALL_MAJORSEEN;
 		if (clnt->cl_chatty)
-			printk(KERN_NOTICE "%s: server %s not responding, still trying\n",
-			clnt->cl_protname, clnt->cl_server);
+			printk(KERN_NOTICE "ct%d %s: server %s not responding, still trying\n",
+			get_exec_env()->veid, clnt->cl_protname, clnt->cl_server);
 	}
 	rpc_force_rebind(clnt);
 	/*
@@ -1602,8 +1608,8 @@ call_decode(struct rpc_task *task)
 
 	if (task->tk_flags & RPC_CALL_MAJORSEEN) {
 		if (clnt->cl_chatty)
-			printk(KERN_NOTICE "%s: server %s OK\n",
-				clnt->cl_protname, clnt->cl_server);
+			printk(KERN_NOTICE "ct%d %s: server %s OK\n",
+				get_exec_env()->veid, clnt->cl_protname, clnt->cl_server);
 		task->tk_flags &= ~RPC_CALL_MAJORSEEN;
 	}
 
@@ -1931,3 +1937,106 @@ void rpc_show_tasks(void)
 	spin_unlock(&rpc_client_lock);
 }
 #endif
+
+#ifdef CONFIG_VE
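+/*
+ * Per-VE SUNRPC lifecycle hooks: on start of a container with the
+ * NFS/NFSD feature, allocate its private RPC state, proc entries,
+ * pipefs and rpciod; on stop, kill outstanding tasks and tear it
+ * all down.
+ */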
+static int ve_sunrpc_start(void *data)
+{
+	struct ve_struct *ve = data;
+	int err;
+
+	if (!(ve->features & (VE_FEATURE_NFS | VE_FEATURE_NFSD)))
+		return 0;
+
+	err = -ENOMEM;
+	ve->ve_rpc_data = kzalloc(sizeof(struct ve_rpc_data), GFP_KERNEL);
+	if (ve->ve_rpc_data == NULL)
+		goto err_rd;
+	ve_rpc_data_init();
+
+	if (rpc_proc_init() == NULL)
+		goto err_proc;
+
+	err = ve_ip_map_init();
+	if (err)
+		goto err_map;
+
+	err = register_rpc_pipefs();
+	if (err)
+		goto err_pipefs;
+
+	/* rpciod_start() returns nonzero on success; convert to an
+	 * errno so the failure path below does not return 0. */
+	err = rpciod_start() ? 0 : -ENOMEM;
+	if (err)
+		goto err_rpciod;
+
+	return 0;
+
+err_rpciod:
+	unregister_rpc_pipefs();
+err_pipefs:
+	ve_ip_map_exit();
+err_map:
+	rpc_proc_exit();
+err_proc:
+	kfree(ve->ve_rpc_data);
+err_rd:
+	return err;
+}
+
+void ve_sunrpc_stop(void *data)
+{
+	struct ve_struct *ve = (struct ve_struct *)data;
+	struct rpc_clnt *clnt;
+
+	if (ve->ve_rpc_data == NULL)
+		return;
+
+	dprintk("RPC:       killing all tasks for VE %d\n", ve->veid);
+
+	spin_lock(&rpc_client_lock);
+	list_for_each_entry(clnt, &all_clients, cl_clients) {
+		if (clnt->cl_xprt->owner_env != ve)
+			continue;
+
+		rpc_killall_tasks(clnt);
+		if (!wait_event_timeout(destroy_wait,
+			list_empty(&clnt->cl_tasks), 1*HZ))
+			printk(KERN_WARNING "CT%d: SUNRPC client %p shutdown: "
+					"timed out\n", ve->veid, clnt);
+
+	}
+	spin_unlock(&rpc_client_lock);
+
+	unregister_rpc_pipefs();
+	ve_ip_map_exit();
+	rpc_proc_exit();
+	if (!ve_rpc_data_put(ve))
+		printk(KERN_WARNING "CT%d: SUNRPC transports used outside CT. "
+				"Release all external references to CT's SUNRPC "
+				"data to continue shutdown.\n", ve->veid);
+}
+
+static struct ve_hook sunrpc_hook = {
+	.init	  = ve_sunrpc_start,
+	.fini	  = ve_sunrpc_stop,
+	.owner	  = THIS_MODULE,
+	.priority = HOOK_PRIO_NET_PRE,
+};
+
+void ve_sunrpc_hook_register(void)
+{
+	ve_hook_register(VE_SS_CHAIN, &sunrpc_hook);
+}
+
+void ve_sunrpc_hook_unregister(void)
+{
+	ve_hook_unregister(&sunrpc_hook);
+}
+#else
+void ve_sunrpc_hook_register(void)
+{
+}
+
+void ve_sunrpc_hook_unregister(void)
+{
+}
+#endif
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/rpc_pipe.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/rpc_pipe.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/rpc_pipe.c	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/rpc_pipe.c	2015-01-21 12:02:47.642109082 +0300
@@ -28,6 +28,8 @@
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/sunrpc/cache.h>
 
+#include "ve.h"
+
 static struct vfsmount *rpc_mount __read_mostly;
 static int rpc_mount_count;
 
@@ -1054,6 +1056,12 @@ init_once(void *foo)
 int register_rpc_pipefs(void)
 {
 	int err;
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (!ve_is_super(ve))
+		return register_ve_fs_type(ve, &rpc_pipe_fs_type,
+				&rpc_pipefs_fstype, NULL);
 
 	rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
 				sizeof(struct rpc_inode),
@@ -1073,6 +1081,14 @@ int register_rpc_pipefs(void)
 
 void unregister_rpc_pipefs(void)
 {
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (!ve_is_super(ve)) {
+		unregister_ve_fs_type(rpc_pipefs_fstype, NULL);
+		return;
+	}
+
 	kmem_cache_destroy(rpc_inode_cachep);
 	unregister_filesystem(&rpc_pipe_fs_type);
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/rpcb_clnt.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/rpcb_clnt.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/rpcb_clnt.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/rpcb_clnt.c	2015-01-21 12:02:47.643109055 +0300
@@ -22,12 +22,15 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/mutex.h>
+#include <linux/nsproxy.h>
 #include <net/ipv6.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/sunrpc/xprtsock.h>
 
+#include "ve.h"
+
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY	RPCDBG_BIND
 #endif
@@ -114,11 +117,12 @@ static void			rpcb_getport_done(struct r
 static void			rpcb_map_release(void *data);
 static struct rpc_program	rpcb_program;
 
-static struct rpc_clnt *	rpcb_local_clnt;
-static struct rpc_clnt *	rpcb_local_clnt4;
-
-DEFINE_SPINLOCK(rpcb_clnt_lock);
-unsigned int			rpcb_users;
+#ifndef CONFIG_VE
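+/* Single global instances when VE support is off; with CONFIG_VE,
+ * per-VE equivalents are presumably provided via macros in "ve.h". */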
+static struct rpc_clnt *	_rpcb_local_clnt;
+static struct rpc_clnt *	_rpcb_local_clnt4;
+DEFINE_SPINLOCK(_rpcb_clnt_lock);
+unsigned int			_rpcb_users;
+#endif
 
 struct rpcbind_args {
 	struct rpc_xprt *	r_xprt;
@@ -279,7 +283,7 @@ static int rpcb_create_local_net(void)
 		.sin_port		= htons(RPCBIND_PORT),
 	};
 	struct rpc_create_args args = {
-		.net		= &init_net,
+		.net		= current->nsproxy->net_ns,
 		.protocol	= XPRT_TRANSPORT_TCP,
 		.address	= (struct sockaddr *)&rpcb_inaddr_loopback,
 		.addrsize	= sizeof(rpcb_inaddr_loopback),
@@ -347,7 +351,7 @@ static struct rpc_clnt *rpcb_create(char
 				    size_t salen, int proto, u32 version)
 {
 	struct rpc_create_args args = {
-		.net		= &init_net,
+		.net		= current->nsproxy->net_ns,
 		.protocol	= proto,
 		.address	= srvaddr,
 		.addrsize	= salen,
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/sched.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/sched.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/sched.c	2014-12-12 23:29:25.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/sched.c	2015-01-21 12:02:47.643109055 +0300
@@ -28,6 +28,9 @@
 #define RPCDBG_FACILITY		RPCDBG_SCHED
 #endif
 
+static int rpc_serialize;
+module_param(rpc_serialize, int, 0440);
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/sunrpc.h>
 
@@ -54,7 +57,9 @@ static struct rpc_wait_queue delay_queue
 /*
  * rpciod-related stuff
  */
-struct workqueue_struct *rpciod_workqueue;
+#ifndef CONFIG_VE
+struct workqueue_struct *_rpciod_workqueue;
+#endif
 
 /*
  * Disable the timer for a given RPC task. Should be called with
@@ -254,7 +259,7 @@ static int rpc_wait_bit_killable(void *w
 {
 	if (fatal_signal_pending(current))
 		return -ERESTARTSYS;
-	freezable_schedule();
+	schedule();
 	return 0;
 }
 
@@ -318,6 +323,16 @@ int __rpc_wait_for_completion_task(struc
 }
 EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
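+/* Find the VE owning an RPC task via its client or, for server-side
+ * tasks, its request's transport. */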
 
+static struct ve_struct *rpc_task_ve(struct rpc_task *task)
+{
+	if (task->tk_client)
+		return task->tk_client->cl_xprt->owner_env;
+	else if (task->tk_rqstp)
+		return task->tk_rqstp->rq_xprt->owner_env;
+	else
+		BUG();
+}
+
 /*
  * Make an RPC task runnable.
  *
@@ -331,7 +346,11 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_complet
  */
 static void rpc_make_runnable(struct rpc_task *task)
 {
-	bool need_wakeup = !rpc_test_and_set_running(task);
+	bool need_wakeup;
+
+	BUG_ON(rpc_task_ve(task) != get_exec_env());
+
+	need_wakeup = !rpc_test_and_set_running(task);
 
 	rpc_clear_queued(task);
 	if (!need_wakeup)
@@ -427,11 +446,15 @@ static void __rpc_do_wake_up_task(struct
  */
 static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
+	struct ve_struct *ve;
+
+	ve = set_exec_env(rpc_task_ve(task));
 	if (RPC_IS_QUEUED(task)) {
 		smp_rmb();
 		if (task->tk_waitqueue == queue)
 			__rpc_do_wake_up_task(queue, task);
 	}
+	(void)set_exec_env(ve);
 }
 
 /*
@@ -623,11 +646,21 @@ static void __rpc_queue_timer_fn(unsigne
 	spin_lock(&queue->lock);
 	expires = now = jiffies;
 	list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
+		struct ve_struct *ve;
+
+		ve = rpc_task_ve(task);
 		timeo = task->u.tk_wait.expires;
 		if (time_after_eq(now, timeo)) {
 			dprintk("RPC: %5u timeout\n", task->tk_pid);
 			task->tk_status = -ETIMEDOUT;
+			/*
+			 * Timer callbacks run in ve0, so switch to the
+			 * task's execution environment before waking it.
+			 */
+			ve = set_exec_env(ve);
 			rpc_wake_up_task_queue_locked(queue, task);
+			(void)set_exec_env(ve);
 			continue;
 		}
 		if (expires == now || time_after(expires, timeo))
@@ -692,6 +725,16 @@ void rpc_release_calldata(const struct r
 		ops->rpc_release(calldata);
 }
 
+static inline int rpc_abort_task_ve(struct rpc_task *task)
+{
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	BUG_ON(rpc_task_ve(task) != ve);
+
+	return !(ve->is_running || (task->tk_flags & RPC_TASK_KILLED));
+}
+
 /*
  * This is the RPC `scheduler' (or rather, the finite state machine).
  */
@@ -706,6 +749,13 @@ static void __rpc_execute(struct rpc_tas
 
 	BUG_ON(RPC_IS_QUEUED(task));
 
+	if (rpc_abort_task_ve(task)) {
+		dprintk("RPC: VE%d is not running, dropping task %d with EIO\n",
+				get_exec_env()->veid, task->tk_pid);
+		task->tk_flags |= RPC_TASK_KILLED;
+		rpc_exit(task, -EIO);
+	}
+
 	for (;;) {
 		void (*do_action)(struct rpc_task *);
 
@@ -1042,7 +1092,7 @@ void rpciod_down(void)
 /*
  * Start up the rpciod workqueue.
  */
-static int rpciod_start(void)
+int rpciod_start(void)
 {
 	struct workqueue_struct *wq;
 
@@ -1050,12 +1100,17 @@ static int rpciod_start(void)
 	 * Create the rpciod thread and wait for it to start.
 	 */
 	dprintk("RPC:       creating workqueue rpciod\n");
-	wq = create_workqueue("rpciod");
+	if (rpc_serialize)
+		wq = create_singlethread_workqueue_ve("rpciod", get_exec_env());
+	else
+		wq = create_workqueue_ve("rpciod", get_exec_env());
+
 	rpciod_workqueue = wq;
 	return rpciod_workqueue != NULL;
 }
 
-static void rpciod_stop(void)
+void rpciod_stop(void)
 {
 	struct workqueue_struct *wq = NULL;
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/stats.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/stats.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/stats.c	2014-12-12 23:29:30.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/stats.c	2015-01-21 12:02:47.651108843 +0300
@@ -23,9 +23,13 @@
 #include <linux/sunrpc/metrics.h>
 #include <net/net_namespace.h>
 
+#include "ve.h"
+
 #define RPCDBG_FACILITY	RPCDBG_MISC
 
-struct proc_dir_entry	*proc_net_rpc = NULL;
+#ifndef CONFIG_VE
+struct proc_dir_entry	*_proc_net_rpc = NULL;
+#endif
 
 /*
  * Get RPC client stats
@@ -215,7 +219,9 @@ EXPORT_SYMBOL_GPL(rpc_print_iostats);
 static inline struct proc_dir_entry *
 do_register(const char *name, void *data, const struct file_operations *fops)
 {
-	rpc_proc_init();
+	if (rpc_proc_init() == NULL)
+		return NULL;
+
 	dprintk("RPC:       registering /proc/net/rpc/%s\n", name);
 
 	return proc_create_data(name, 0, proc_net_rpc, fops, data);
@@ -249,12 +255,12 @@ svc_proc_unregister(const char *name)
 }
 EXPORT_SYMBOL_GPL(svc_proc_unregister);
 
-void
-rpc_proc_init(void)
+struct proc_dir_entry *rpc_proc_init(void)
 {
 	dprintk("RPC:       registering /proc/net/rpc\n");
-	if (!proc_net_rpc)
-		proc_net_rpc = proc_mkdir("rpc", init_net.proc_net);
+	if (!proc_net_rpc && get_exec_env()->ve_netns)
+		proc_net_rpc = proc_mkdir("rpc", get_exec_env()->ve_netns->proc_net);
+	return proc_net_rpc;
 }
 
 void
@@ -263,7 +269,7 @@ rpc_proc_exit(void)
 	dprintk("RPC:       unregistering /proc/net/rpc\n");
 	if (proc_net_rpc) {
 		proc_net_rpc = NULL;
-		remove_proc_entry("rpc", init_net.proc_net);
+		remove_proc_entry("rpc", get_exec_env()->ve_netns->proc_net);
 	}
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/sunrpc.h linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/sunrpc.h
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/sunrpc.h	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/sunrpc.h	2015-01-21 12:02:47.643109055 +0300
@@ -29,6 +29,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBI
 
 #include <linux/net.h>
 
+#include "ve.h"
+
 /*
  * Header for dynamically allocated rpc buffers.
  */
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/sunrpc_syms.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/sunrpc_syms.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/sunrpc_syms.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/sunrpc_syms.c	2015-01-21 12:02:47.651108843 +0300
@@ -22,12 +22,26 @@
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/sunrpc/xprtsock.h>
 
-extern struct cache_detail ip_map_cache, unix_gid_cache;
+#include "ve.h"
+
+extern struct cache_detail unix_gid_cache;
+
+extern void ve_sunrpc_hook_register(void);
+extern void ve_sunrpc_hook_unregister(void);
+extern int ve_ip_map_init(void);
+extern void ve_ip_map_exit(void);
+
+static struct ve_rpc_data ve0_rpcd;
 
 static int __init
 init_sunrpc(void)
 {
-	int err = register_rpc_pipefs();
+	int err;
+
+	get_ve0()->ve_rpc_data = &ve0_rpcd;
+	ve_rpc_data_init();
+
+	err = register_rpc_pipefs();
 	if (err)
 		goto out;
 	err = rpc_init_mempool();
@@ -36,18 +50,22 @@ init_sunrpc(void)
 	err = rpcauth_init_module();
 	if (err)
 		goto out3;
+#ifdef CONFIG_PROC_FS
+	/* rpc_proc_init() returns the proc dir or NULL; report the
+	 * failure instead of falling through with err == 0. */
+	err = rpc_proc_init() ? 0 : -ENOMEM;
+	if (err)
+		goto out4;
+#endif
 #ifdef RPC_DEBUG
 	rpc_register_sysctl();
 #endif
-#ifdef CONFIG_PROC_FS
-	rpc_proc_init();
-#endif
 	cache_initialize();
-	cache_register(&ip_map_cache);
+	ve_ip_map_init();
 	cache_register(&unix_gid_cache);
 	svc_init_xprt_sock();	/* svc sock transport */
 	init_socket_xprt();	/* clnt sock transport */
+	ve_sunrpc_hook_register();
 	return 0;
+out4:
+	rpcauth_remove_module();
 out3:
 	rpc_destroy_mempool();
 out2:
@@ -59,12 +77,13 @@ out:
 static void __exit
 cleanup_sunrpc(void)
 {
+	ve_sunrpc_hook_unregister();
 	rpcauth_remove_module();
 	cleanup_socket_xprt();
 	svc_cleanup_xprt_sock();
 	unregister_rpc_pipefs();
 	rpc_destroy_mempool();
-	cache_unregister(&ip_map_cache);
+	ve_ip_map_exit();
 	cache_unregister(&unix_gid_cache);
 #ifdef RPC_DEBUG
 	rpc_unregister_sysctl();
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/svc.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/svc.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/svc.c	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/svc.c	2015-01-21 12:02:46.939127745 +0300
@@ -398,7 +398,7 @@ static int svc_uses_rpcbind(struct svc_s
  */
 static struct svc_serv *
 __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
-	     void (*shutdown)(struct svc_serv *serv))
+	     void (*shutdown)(struct svc_serv *serv), struct svc_stat *stats)
 {
 	struct svc_serv	*serv;
 	unsigned int vers;
@@ -410,7 +410,7 @@ __svc_create(struct svc_program *prog, u
 	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
 	serv->sv_nrthreads = 1;
-	serv->sv_stats     = prog->pg_stats;
+	serv->sv_stats     = stats;
 	if (bufsize > RPCSVC_MAXPAYLOAD)
 		bufsize = RPCSVC_MAXPAYLOAD;
 	serv->sv_max_payload = bufsize? bufsize : 4096;
@@ -458,7 +458,7 @@ __svc_create(struct svc_program *prog, u
 	}
 
 	if (svc_uses_rpcbind(serv)) {
-	       	if (svc_rpcb_setup(serv) < 0) {
+	       	if (svc_rpcb_setup(serv)) {
 			kfree(serv->sv_pools);
 			kfree(serv);
 			return NULL;
@@ -474,19 +474,21 @@ struct svc_serv *
 svc_create(struct svc_program *prog, unsigned int bufsize,
 	   void (*shutdown)(struct svc_serv *serv))
 {
-	return __svc_create(prog, bufsize, /*npools*/1, shutdown);
+	return __svc_create(prog, bufsize, /*npools*/1, shutdown,
+			prog->pg_stats);
 }
 EXPORT_SYMBOL_GPL(svc_create);
 
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 		  void (*shutdown)(struct svc_serv *serv),
-		  svc_thread_fn func, struct module *mod)
+		  svc_thread_fn func, struct module *mod,
+		  struct svc_stat *stat)
 {
 	struct svc_serv *serv;
 	unsigned int npools = svc_pool_map_get();
 
-	serv = __svc_create(prog, bufsize, npools, shutdown);
+	serv = __svc_create(prog, bufsize, npools, shutdown, stat);
 
 	if (serv != NULL) {
 		serv->sv_function = func;
@@ -716,7 +718,7 @@ svc_set_num_threads(struct svc_serv *ser
 		}
 
 		__module_get(serv->sv_module);
-		task = kthread_create(serv->sv_function, rqstp, serv->sv_name);
+		task = kthread_create_ve(get_exec_env(), serv->sv_function, rqstp, serv->sv_name);
 		if (IS_ERR(task)) {
 			error = PTR_ERR(task);
 			module_put(serv->sv_module);
@@ -891,7 +893,7 @@ static int __svc_register(const char *pr
 	}
 
 	if (error < 0)
-		printk(KERN_WARNING "svc: failed to register %sv%u RPC "
+		ve_printk(VE_LOG, KERN_WARNING "svc: failed to register %sv%u RPC "
 			"service (errno %d).\n", progname, version, -error);
 	return error;
 }
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/svcauth_unix.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/svcauth_unix.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/svcauth_unix.c	2014-12-12 23:29:05.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/svcauth_unix.c	2015-01-21 12:02:47.644109029 +0300
@@ -17,6 +17,8 @@
 
 #include <linux/sunrpc/clnt.h>
 
+#include "ve.h"
+
 /*
  * AUTHUNIX and AUTHNULL credentials are both handled here.
  * AUTHNULL is treated just like AUTHUNIX except that the uid/gid
@@ -291,8 +293,7 @@ static int ip_map_show(struct seq_file *
 	return 0;
 }
 
-
-struct cache_detail ip_map_cache = {
+struct cache_detail __ip_map_cache = {
 	.owner		= THIS_MODULE,
 	.hash_size	= IP_HASHMAX,
 	.hash_table	= ip_table,
@@ -307,6 +308,26 @@ struct cache_detail ip_map_cache = {
 	.alloc		= ip_map_alloc,
 };
 
+int ve_ip_map_init(void)
+{
+	struct cache_detail *cd;
+
+	cd = cache_alloc(&__ip_map_cache, IP_HASHMAX);
+	if (cd == NULL)
+		return -ENOMEM;
+
+	cache_register(cd);
+	ip_map_cache = cd;
+	return 0;
+}
+
+void ve_ip_map_exit(void)
+{
+	if (ip_map_cache)
+		cache_free(ip_map_cache);
+}
+
+
 static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
 		struct in6_addr *addr)
 {
@@ -328,7 +349,7 @@ static struct ip_map *__ip_map_lookup(st
 static inline struct ip_map *ip_map_lookup(struct net *net, char *class,
 		struct in6_addr *addr)
 {
-	return __ip_map_lookup(&ip_map_cache, class, addr);
+	return __ip_map_lookup(ip_map_cache, class, addr);
 }
 
 static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
@@ -362,7 +383,7 @@ static int __ip_map_update(struct cache_
 static inline int ip_map_update(struct net *net, struct ip_map *ipm,
 		struct unix_domain *udom, time_t expiry)
 {
-	return __ip_map_update(&ip_map_cache, ipm, udom, expiry);
+	return __ip_map_update(ip_map_cache, ipm, udom, expiry);
 }
 
 int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom)
@@ -403,24 +424,24 @@ struct auth_domain *auth_unix_lookup(str
 
 	if (!ipm)
 		return NULL;
-	if (cache_check(&ip_map_cache, &ipm->h, NULL))
+	if (cache_check(ip_map_cache, &ipm->h, NULL))
 		return NULL;
 
 	if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) {
-		sunrpc_invalidate(&ipm->h, &ip_map_cache);
+		sunrpc_invalidate(&ipm->h, ip_map_cache);
 		rv = NULL;
 	} else {
 		rv = &ipm->m_client->h;
 		kref_get(&rv->ref);
 	}
-	cache_put(&ipm->h, &ip_map_cache);
+	cache_put(&ipm->h, ip_map_cache);
 	return rv;
 }
 EXPORT_SYMBOL_GPL(auth_unix_lookup);
 
 void svcauth_unix_purge(void)
 {
-	cache_purge(&ip_map_cache);
+	cache_purge(ip_map_cache);
 }
 EXPORT_SYMBOL_GPL(svcauth_unix_purge);
 
@@ -441,7 +462,7 @@ ip_map_cached_get(struct svc_xprt *xprt)
 				 */
 				xprt->xpt_auth_cache = NULL;
 				spin_unlock(&xprt->xpt_lock);
-				cache_put(&ipm->h, &ip_map_cache);
+				cache_put(&ipm->h, ip_map_cache);
 				return NULL;
 			}
 			cache_get(&ipm->h);
@@ -464,7 +485,7 @@ ip_map_cached_put(struct svc_xprt *xprt,
 		spin_unlock(&xprt->xpt_lock);
 	}
 	if (ipm)
-		cache_put(&ipm->h, &ip_map_cache);
+		cache_put(&ipm->h, ip_map_cache);
 }
 
 void
@@ -474,7 +495,7 @@ svcauth_unix_info_release(struct svc_xpr
 
 	ipm = xpt->xpt_auth_cache;
 	if (ipm != NULL)
-		cache_put(&ipm->h, &ip_map_cache);
+		cache_put(&ipm->h, ip_map_cache);
 }
 
 /****************************************************************************
@@ -731,7 +752,7 @@ svcauth_unix_set_client(struct svc_rqst 
 	if (ipm == NULL)
 		return SVC_DENIED;
 
-	switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
+	switch (cache_check(ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
 		default:
 			BUG();
 		case -ETIMEDOUT:
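
[Note] In svcauth_unix.c the static ip_map_cache singleton becomes a template (__ip_map_cache) plus a per-VE pointer: ve_ip_map_init() clones the template, and every former &ip_map_cache turns into a plain pointer use. A hedged userspace sketch of that singleton-to-instance conversion (cache_alloc/cache_free stand-ins are hypothetical, not the kernel's cache API):

#include <stdio.h>
#include <stdlib.h>

struct cache_detail { int hash_size; /* ... ops, hash tables ... */ };

/* Template describing the cache; never registered directly any more. */
static const struct cache_detail cache_template = { .hash_size = 256 };

/* Per-context instance, analogous to the per-VE ip_map_cache pointer. */
static struct cache_detail *ip_cache;

static int ip_cache_init(void)
{
	struct cache_detail *cd = malloc(sizeof(*cd));
	if (!cd)
		return -1;
	*cd = cache_template;	/* copy the template, as cache_alloc() would */
	ip_cache = cd;
	return 0;
}

static void ip_cache_exit(void)
{
	free(ip_cache);		/* the kernel side also unregisters first */
	ip_cache = NULL;
}

int main(void)
{
	if (ip_cache_init())
		return 1;
	/* call sites now dereference the pointer instead of taking &singleton */
	printf("cache with %d buckets\n", ip_cache->hash_size);
	ip_cache_exit();
	return 0;
}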
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/svcsock.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/svcsock.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/svcsock.c	2014-12-12 23:29:35.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/svcsock.c	2015-01-21 12:02:46.940127718 +0300
@@ -1573,6 +1573,8 @@ static void svc_sock_detach(struct svc_x
 
 	dprintk("svc: svc_sock_detach(%p)\n", svsk);
 
+	/* XXX: serialization? */
+	sk->sk_user_data = NULL;
 	/* put back the old socket callbacks */
 	sk->sk_state_change = svsk->sk_ostate;
 	sk->sk_data_ready = svsk->sk_odata;
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/ve.h linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/ve.h
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/ve.h	2015-01-21 12:02:47.644109029 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/ve.h	2015-01-21 12:02:47.693107728 +0300
@@ -0,0 +1,103 @@
+/*
+ * net/sunrpc/ve.h
+ *
+ * VE context for SUNRPC
+ *
+ * Copyright (C) 2007 SWsoft
+ */
+
+#ifndef __VE_SUNRPC_H__
+#define __VE_SUNRPC_H__
+
+struct ve_rpc_data {
+	struct proc_dir_entry	*_proc_net_rpc;
+	struct cache_detail	*_ip_map_cache;
+	struct file_system_type	*_rpc_pipefs_fstype;
+	struct rpc_clnt		*_rpcb_local;
+	struct rpc_clnt		*_rpcb_local4;
+	spinlock_t		_rpcb_clnt_lock;
+	int			_rpcb_users;
+	struct workqueue_struct *_rpciod_workqueue;
+	atomic_t		_users;
+};
+
+#ifdef CONFIG_VE
+extern void rpcb_put_local(void);
+extern void rpciod_stop(void);
+
+static void destroy_rpc_data(struct work_struct *work)
+{
+	struct ve_struct *ve = container_of(work, struct ve_struct, rpc_destroy_work);
+
+	BUG_ON(!ve_is_super(get_exec_env()));
+
+	set_exec_env(ve);
+
+	rpciod_stop();
+	kfree(ve->ve_rpc_data);
+	ve->ve_rpc_data = NULL;
+
+	set_exec_env(&ve0);
+}
+
+static inline bool ve_rpc_data_put(struct ve_struct *ve)
+{
+	if (atomic_dec_and_test(&ve->ve_rpc_data->_users)) {
+		/*
+		 * The RPC data usage counter has reached zero, so we
+		 * have to stop the rpciod queue and release the
+		 * virtualized data. Why do we release it from an async
+		 * work item? Because we can get here from rpciod itself:
+		 * rpc_async_schedule -> __rpc_execute ->
+		 * rpc_release_task -> rpc_final_put_task ->
+		 * rpc_free_task -> rpc_release_calldata ->
+		 * rpcb_map_release -> xprt_put -> xprt_destroy ->
+		 * xs_destroy -> xprt_free -> ve_rpc_data_put ->
+		 * rpciod_stop
+		 * The only simple solution here is to schedule the same task
+		 * in another workqueue.
+		 */
+		queue_work(ve0.khelper_wq, &ve->rpc_destroy_work);
+		return true;
+	}
+	return false;
+}
+
+static inline void ve_rpc_data_init(void)
+{
+	atomic_set(&get_exec_env()->ve_rpc_data->_users, 1);
+	spin_lock_init(&get_exec_env()->ve_rpc_data->_rpcb_clnt_lock);
+	INIT_WORK(&get_exec_env()->rpc_destroy_work, destroy_rpc_data);
+}
+
+static inline void ve_rpc_data_get(void)
+{
+	atomic_inc(&get_exec_env()->ve_rpc_data->_users);
+}
+
+#define RPC_CTX_FIELD(arg)	(get_exec_env()->ve_rpc_data->_##arg)
+
+#else /* CONFIG_VE */
+
+#define RPC_CTX_FIELD(arg)	_##arg
+
+static inline void ve_rpc_data_init(void)
+{}
+static inline void ve_rpc_data_get(void)
+{}
+static inline bool ve_rpc_data_put(struct ve_struct *ve)
+{ return true; }
+
+#endif /* CONFIG_VE */
+
+
+#define ip_map_cache		RPC_CTX_FIELD(ip_map_cache)
+#define proc_net_rpc		RPC_CTX_FIELD(proc_net_rpc)
+#define rpciod_workqueue	RPC_CTX_FIELD(rpciod_workqueue)
+#define rpc_pipefs_fstype	RPC_CTX_FIELD(rpc_pipefs_fstype)
+#define rpcb_local_clnt		RPC_CTX_FIELD(rpcb_local)
+#define rpcb_local_clnt4	RPC_CTX_FIELD(rpcb_local4)
+#define rpcb_clnt_lock		RPC_CTX_FIELD(rpcb_clnt_lock)
+#define rpcb_users		RPC_CTX_FIELD(rpcb_users)
+
+#endif /* __VE_SUNRPC_H__ */
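
[Note] The trick in this header is that `#define ip_map_cache RPC_CTX_FIELD(ip_map_cache)` textually rewrites every existing use of the old globals into a lookup through the current execution environment, so the bulk of sunrpc compiles unchanged. A small self-contained illustration of the same macro pattern (all names hypothetical); note the macro must be defined after any struct that declares a member of the same name, since later literal uses of the identifier would expand too:

#include <stdio.h>

struct rpc_data { int users; };
struct env { struct rpc_data rpc; };

static struct env env0 = { { 1 } }, env1 = { { 5 } };
static struct env *cur = &env0;

static struct env *get_exec_env(void) { return cur; }

/* From here on, "users" expands to a per-environment field.  The inner
 * "users" in the expansion is not re-expanded (standard C macro rules). */
#define CTX_FIELD(f)	(get_exec_env()->rpc.f)
#define users		CTX_FIELD(users)

int main(void)
{
	printf("env0 users = %d\n", users);	/* reads env0.rpc.users */
	cur = &env1;
	printf("env1 users = %d\n", users);	/* same source, other instance */
	return 0;
}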
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/xprt.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/xprt.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/xprt.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/xprt.c	2015-01-21 12:02:47.644109029 +0300
@@ -44,6 +44,7 @@
 #include <linux/workqueue.h>
 #include <linux/net.h>
 #include <linux/ktime.h>
+#include <linux/sched.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/metrics.h>
@@ -231,6 +232,7 @@ EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
 
 static void xprt_clear_locked(struct rpc_xprt *xprt)
 {
+	BUG_ON(xprt->owner_env != get_exec_env());
 	xprt->snd_task = NULL;
 	if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
 		smp_mb__before_clear_bit();
@@ -637,6 +639,7 @@ EXPORT_SYMBOL_GPL(xprt_disconnect_done);
  */
 void xprt_force_disconnect(struct rpc_xprt *xprt)
 {
+	BUG_ON(xprt->owner_env != get_exec_env());
 	/* Don't race with the test_bit() in xprt_clear_locked() */
 	spin_lock_bh(&xprt->transport_lock);
 	set_bit(XPRT_CLOSE_WAIT, &xprt->state);
@@ -660,6 +663,7 @@ void xprt_force_disconnect(struct rpc_xp
  */
 void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
 {
+	BUG_ON(xprt->owner_env != get_exec_env());
 	/* Don't race with the test_bit() in xprt_clear_locked() */
 	spin_lock_bh(&xprt->transport_lock);
 	if (cookie != xprt->connect_cookie)
@@ -679,7 +683,13 @@ static void
 xprt_init_autodisconnect(unsigned long data)
 {
 	struct rpc_xprt *xprt = (struct rpc_xprt *)data;
+	struct ve_struct *ve;
 
+	/*
+	 * Here we have to switch the execution environment, since this
+	 * function is called from timer handling code, which runs in ve0.
+	 */
+	ve = set_exec_env(xprt->owner_env);
 	spin_lock(&xprt->transport_lock);
 	if (!list_empty(&xprt->recv))
 		goto out_abort;
@@ -688,9 +698,11 @@ xprt_init_autodisconnect(unsigned long d
 	spin_unlock(&xprt->transport_lock);
 	set_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
 	queue_work(rpciod_workqueue, &xprt->task_cleanup);
+	(void)set_exec_env(ve);
 	return;
 out_abort:
 	spin_unlock(&xprt->transport_lock);
+	(void)set_exec_env(ve);
 }
 
 /**
@@ -1117,6 +1129,7 @@ EXPORT_SYMBOL_GPL(xprt_alloc);
 
 void xprt_free(struct rpc_xprt *xprt)
 {
+	ve_rpc_data_put(xprt->owner_env);
 	put_net(xprt->xprt_net);
 	xprt_free_all_slots(xprt);
 	kfree(xprt);
@@ -1277,6 +1290,8 @@ static void xprt_init(struct rpc_xprt *x
 	xprt_init_xid(xprt);
 
 	xprt->xprt_net = get_net(net);
+	xprt->owner_env = get_exec_env();
+	ve_rpc_data_get();
 }
 
 /**
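
[Note] xprt_init_autodisconnect() runs from the timer softirq, which executes in ve0, so the patch brackets the body with set_exec_env(xprt->owner_env) and a restore on every exit path. The general save-switch-restore shape, sketched in userspace with a thread-local "current" pointer (names hypothetical; __thread is a GCC extension):

#include <stdio.h>

struct env { const char *name; };

static struct env ve0 = { "ve0" };
static __thread struct env *cur_env = &ve0;

/* Returns the previous environment so the caller can restore it. */
static struct env *set_exec_env(struct env *new_env)
{
	struct env *old = cur_env;
	cur_env = new_env;
	return old;
}

struct timer { struct env *owner; };

static void timer_callback(struct timer *t)
{
	/* Timers fire in ve0; switch to the owner for the duration. */
	struct env *old = set_exec_env(t->owner);

	printf("running in %s\n", cur_env->name);

	(void)set_exec_env(old);	/* restore on every exit path */
}

int main(void)
{
	struct env ct101 = { "ct101" };
	struct timer t = { &ct101 };

	timer_callback(&t);
	printf("back in %s\n", cur_env->name);
	return 0;
}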
diff -upr linux-2.6.32-504.3.3.el6.orig/net/sunrpc/xprtsock.c linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/xprtsock.c
--- linux-2.6.32-504.3.3.el6.orig/net/sunrpc/xprtsock.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/sunrpc/xprtsock.c	2015-01-21 12:02:47.645109003 +0300
@@ -826,18 +826,23 @@ static void xs_restore_old_callbacks(str
 
 static void xs_reset_transport(struct sock_xprt *transport)
 {
-	struct socket *sock = transport->sock;
-	struct sock *sk = transport->inet;
+	struct rpc_xprt *xprt = &transport->xprt;
+	struct socket *sock;
+	struct sock *sk;
 
-	if (sk == NULL)
+	spin_lock_bh(&xprt->transport_lock);
+	if (transport->sock == NULL) {
+		spin_unlock_bh(&xprt->transport_lock);
 		return;
+	}
-
 	transport->srcport = 0;
-
-	write_lock_bh(&sk->sk_callback_lock);
+	sock = transport->sock;
+	sk = transport->inet;
 	transport->inet = NULL;
 	transport->sock = NULL;
+	spin_unlock_bh(&xprt->transport_lock);
 
+	write_lock_bh(&sk->sk_callback_lock);
 	sk->sk_user_data = NULL;
 
 	xs_restore_old_callbacks(transport, sk);
@@ -1469,6 +1474,7 @@ static void xs_tcp_schedule_linger_timeo
 {
 	struct sock_xprt *transport;
 
+	BUG_ON(xprt->owner_env != get_exec_env());
 	if (xprt_test_and_set_connecting(xprt))
 		return;
 	set_bit(XPRT_CONNECTION_ABORT, &xprt->state);
@@ -1853,6 +1859,7 @@ static struct socket *xs_create_sock(str
 				protocol, -err);
 		goto out;
 	}
+	sk_change_net_get(sock->sk, xprt->owner_env->ve_netns);
 	xs_reclassify_socket(family, sock);
 
 	err = xs_bind(transport, sock);
@@ -2215,6 +2222,7 @@ static void xs_connect(struct rpc_task *
 	struct rpc_xprt *xprt = task->tk_xprt;
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 
+	BUG_ON(xprt->owner_env != get_exec_env());
 	if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) {
 		dprintk("RPC:       xs_connect delayed xprt %p for %lu "
 				"seconds\n",
@@ -2569,8 +2577,10 @@ static struct rpc_xprt *xs_setup_xprt(st
 		int err;
 		err = xs_init_anyaddr(args->dstaddr->sa_family,
 					(struct sockaddr *)&new->srcaddr);
-		if (err != 0)
+		if (err != 0) {
+			xprt_free(xprt);
 			return ERR_PTR(err);
+		}
 	}
 
 	return xprt;
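
[Note] The reworked xs_reset_transport() above takes transport_lock, bails out if another reset already ran, detaches the sock/sk pointers while still holding the lock, and only then performs the teardown unlocked. A pthread sketch of that "steal under the lock, release outside it" shape (illustrative names; compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct transport {
	pthread_mutex_t lock;
	char *sock;		/* stands in for the socket pair */
};

static void transport_reset(struct transport *t)
{
	char *sock;

	pthread_mutex_lock(&t->lock);
	if (t->sock == NULL) {		/* lost the race: nothing to do */
		pthread_mutex_unlock(&t->lock);
		return;
	}
	sock = t->sock;			/* steal the pointer under the lock */
	t->sock = NULL;
	pthread_mutex_unlock(&t->lock);

	/* Expensive teardown happens outside the lock, on a private copy. */
	printf("releasing %s\n", sock);
	free(sock);
}

int main(void)
{
	struct transport t = { PTHREAD_MUTEX_INITIALIZER, strdup("sock0") };

	transport_reset(&t);
	transport_reset(&t);	/* second call sees NULL and returns early */
	return 0;
}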
diff -upr linux-2.6.32-504.3.3.el6.orig/net/unix/af_unix.c linux-2.6.32-504.3.3.el6-042stab103_6/net/unix/af_unix.c
--- linux-2.6.32-504.3.3.el6.orig/net/unix/af_unix.c	2014-12-12 23:29:40.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/unix/af_unix.c	2015-01-21 12:02:58.741814437 +0300
@@ -115,6 +115,9 @@
 #include <net/checksum.h>
 #include <linux/security.h>
 
+#include <bc/net.h>
+#include <bc/beancounter.h>
+
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
 static atomic_long_t unix_nr_socks;
@@ -282,27 +285,38 @@ static inline struct sock *unix_find_soc
 	return s;
 }
 
-static struct sock *unix_find_socket_byinode(struct inode *i)
+static inline struct sock *__unix_find_socket_byinode(struct inode *i, int check_listen)
 {
 	struct sock *s;
 	struct hlist_node *node;
 
-	spin_lock(&unix_table_lock);
 	sk_for_each(s, node,
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->dentry;
 
-		if (dentry && dentry->d_inode == i) {
-			sock_hold(s);
+		if (check_listen && unix_sk(s)->sk.sk_state != TCP_LISTEN)
+			continue;
+
+		if (dentry && dentry->d_inode == i)
 			goto found;
-		}
 	}
 	s = NULL;
 found:
-	spin_unlock(&unix_table_lock);
 	return s;
 }
 
+static struct sock *unix_find_socket_byinode(struct inode *i)
+{
+	struct sock *s;
+
+	spin_lock(&unix_table_lock);
+	s = __unix_find_socket_byinode(i, 0);
+	if (s != NULL)
+		sock_hold(s);
+	spin_unlock(&unix_table_lock);
+	return s;
+}
+
 static inline int unix_writable(struct sock *sk)
 {
 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
@@ -602,7 +616,7 @@ static struct proto unix_proto = {
  */
 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 
-static struct sock *unix_create1(struct net *net, struct socket *sock)
+static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 {
 	struct sock *sk = NULL;
 	struct unix_sock *u;
@@ -614,6 +628,8 @@ static struct sock *unix_create1(struct 
 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
 	if (!sk)
 		goto out;
+	if (ub_other_sock_charge(sk, kern))
+		goto out_sk_free;
 
 	sock_init_data(sock, sk);
 	lockdep_set_class(&sk->sk_receive_queue.lock,
@@ -640,6 +656,10 @@ out:
 		local_bh_enable();
 	}
 	return sk;
+out_sk_free:
+	sk_free(sk);
+	atomic_long_dec(&unix_nr_socks);
+	return NULL;
 }
 
 static int unix_create(struct net *net, struct socket *sock, int protocol,
@@ -670,7 +690,7 @@ static int unix_create(struct net *net, 
 		return -ESOCKTNOSUPPORT;
 	}
 
-	return unix_create1(net, sock) ? 0 : -ENOMEM;
+	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 }
 
 static int unix_release(struct socket *sock)
@@ -790,6 +810,62 @@ fail:
 	return NULL;
 }
 
+int unix_attach_addr(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len)
+{
+	int err;
+	unsigned hash;
+	struct unix_address *addr;
+
+	err = unix_mkname(sunaddr, addr_len, &hash);
+	if (err < 0)
+		return err;
+
+	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
+	if (addr == NULL)
+		return -ENOMEM;
+
+	memcpy(addr->name, sunaddr, addr_len);
+	addr->len = addr_len;
+	addr->hash = hash ^ sk->sk_type;
+	atomic_set(&addr->refcnt, 1);
+	unix_sk(sk)->addr = addr;
+
+	return 0;
+}
+EXPORT_SYMBOL(unix_attach_addr);
+
+int unix_bind_path(struct sock *sk, struct dentry *dentry, struct vfsmount *mnt)
+{
+	struct hlist_head *list;
+	struct unix_sock *u;
+
+	u = unix_sk(sk);
+	BUG_ON(u->addr == NULL);
+
+	spin_lock(&unix_table_lock);
+
+	if (sk->sk_state == TCP_LISTEN) {
+		if (__unix_find_socket_byinode(dentry->d_inode, 1)) {
+			spin_unlock(&unix_table_lock);
+			dput(dentry);
+			mntput(mnt);
+			return -EBUSY;
+		}
+	}
+
+	list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
+
+	u->dentry = dentry;
+	u->mnt = mnt;
+
+	__unix_remove_socket(sk);
+	__unix_insert_socket(list, sk);
+
+	spin_unlock(&unix_table_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(unix_bind_path);
 
 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
@@ -1056,6 +1132,7 @@ static int unix_stream_connect(struct so
 	int st;
 	int err;
 	long timeo;
+	unsigned long chargesize;
 
 	err = unix_mkname(sunaddr, addr_len, &hash);
 	if (err < 0)
@@ -1076,7 +1153,7 @@ static int unix_stream_connect(struct so
 	err = -ENOMEM;
 
 	/* create new sock for complete connection */
-	newsk = unix_create1(sock_net(sk), NULL);
+	newsk = unix_create1(sock_net(sk), NULL, 0);
 	if (newsk == NULL)
 		goto out;
 
@@ -1084,6 +1161,10 @@ static int unix_stream_connect(struct so
 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
 	if (skb == NULL)
 		goto out;
+	chargesize = skb_charge_fullsize(skb);
+	if (ub_sock_getwres_other(newsk, chargesize) < 0)
+		goto out;
+	ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF);
 
 restart:
 	/*  Find listening sock. */
@@ -1328,7 +1409,7 @@ static void unix_detach_fds(struct scm_c
 		unix_notinflight(scm->fp->fp[i]);
 }
 
-static void unix_destruct_scm(struct sk_buff *skb)
+void unix_destruct_scm(struct sk_buff *skb)
 {
 	struct scm_cookie scm;
 	memset(&scm, 0, sizeof(scm));
@@ -1342,6 +1423,7 @@ static void unix_destruct_scm(struct sk_
 	scm_destroy(&scm);
 	sock_wfree(skb);
 }
+EXPORT_SYMBOL(unix_destruct_scm);
 
 #define MAX_RECURSION_LEVEL 4
 
@@ -1594,7 +1676,8 @@ static int unix_stream_sendmsg(struct ki
 	struct sock *sk = sock->sk;
 	struct sock *other = NULL;
 	struct sockaddr_un *sunaddr = msg->msg_name;
-	int err, size;
+	unsigned int size, data_len;
+	int err;
 	struct sk_buff *skb;
 	int sent = 0;
 	struct scm_cookie tmp_scm;
@@ -1634,6 +1717,16 @@ static int unix_stream_sendmsg(struct ki
 
 		size = len-sent;
 
+		if (msg->msg_flags & MSG_DONTWAIT)
+			ub_sock_makewres_other(sk, skb_charge_size(size));
+		if (sock_bc(sk) != NULL) {
+			unsigned long res = sock_bc(sk)->poll_reserv;
+
+			if (res >= SOCK_MIN_UBCSPACE &&
+			    skb_charge_size(size) > res)
+				size = skb_charge_datalen(res);
+		}
+
 		/* Keep two messages in the pipe so it schedules better */
 		if (size > ((sk->sk_sndbuf >> 1) - 64))
 			size = (sk->sk_sndbuf >> 1) - 64;
@@ -1641,26 +1734,20 @@ static int unix_stream_sendmsg(struct ki
 		if (size > SKB_MAX_ALLOC)
 			size = SKB_MAX_ALLOC;
 
+		if (size <= SKB_MAX_HEAD(0))
+			data_len = 0;
+		else
+			data_len = size;
+
 		/*
 		 *	Grab a buffer
 		 */
 
-		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
-					  &err);
-
-		if (skb == NULL)
+		skb = sock_alloc_send_skb2(sk, size-data_len, data_len, SOCK_MIN_UBCSPACE,
+					   msg->msg_flags&MSG_DONTWAIT, &err);
+		if (!skb)
 			goto out_err;
 
-		/*
-		 *	If you pass two values to the sock_alloc_send_skb
-		 *	it tries to grab the large buffer with GFP_NOFS
-		 *	(which can fail easily), and if it fails grab the
-		 *	fallback size buffer which is under a page and will
-		 *	succeed. [Alan]
-		 */
-		size = min_t(int, size, skb_tailroom(skb));
-
-
 		/* Only send the fds in the first buffer */
 		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
 		if (err < 0) {
@@ -1670,7 +1757,17 @@ static int unix_stream_sendmsg(struct ki
 		max_level = err + 1;
 		fds_sent = true;
 
-		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+		/*
+		 *  We could allocate less or more than requested. Calculate sizes again.
+		 */
+
+		data_len = skb_pagelen(skb);
+		size = min(size, data_len + (skb->end - skb->tail));
+
+		skb_put(skb, size - data_len);
+		skb->data_len = data_len;
+		skb->len = size;
+		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, sent, size);
 		if (err) {
 			kfree_skb(skb);
 			goto out_err;
@@ -1866,7 +1963,10 @@ static long unix_stream_data_wait(struct
 	return timeo;
 }
 
-
+static unsigned int unix_skb_len(const struct sk_buff *skb)
+{
+	return skb->len - UNIXCB(skb).consumed;
+}
 
 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			       struct msghdr *msg, size_t size,
@@ -1967,8 +2067,9 @@ static int unix_stream_recvmsg(struct ki
 			sunaddr = NULL;
 		}
 
-		chunk = min_t(unsigned int, skb->len, size);
-		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+		chunk = min_t(unsigned int, unix_skb_len(skb), size);
+		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed,
+					    msg->msg_iov, chunk)) {
 			skb_queue_head(&sk->sk_receive_queue, skb);
 			if (copied == 0)
 				copied = -EFAULT;
@@ -1979,13 +2080,13 @@ static int unix_stream_recvmsg(struct ki
 
 		/* Mark read part of skb as used */
 		if (!(flags & MSG_PEEK)) {
-			skb_pull(skb, chunk);
+			UNIXCB(skb).consumed += chunk;
 
 			if (UNIXCB(skb).fp)
 				unix_detach_fds(siocb->scm, skb);
 
 			/* put the skb back if we didn't use it up.. */
-			if (skb->len) {
+			if (unix_skb_len(skb)) {
 				skb_queue_head(&sk->sk_receive_queue, skb);
 				break;
 			}
@@ -2078,7 +2179,7 @@ static int unix_ioctl(struct socket *soc
 			if (sk->sk_type == SOCK_STREAM ||
 			    sk->sk_type == SOCK_SEQPACKET) {
 				skb_queue_walk(&sk->sk_receive_queue, skb)
-					amount += skb->len;
+					amount += unix_skb_len(skb);
 			} else {
 				skb = skb_peek(&sk->sk_receive_queue);
 				if (skb)
@@ -2100,6 +2201,7 @@ static unsigned int unix_poll(struct fil
 {
 	struct sock *sk = sock->sk;
 	unsigned int mask;
+	int no_ub_res;
 
 	sock_poll_wait(file, sk->sk_sleep, wait);
 	mask = 0;
@@ -2112,6 +2214,10 @@ static unsigned int unix_poll(struct fil
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= POLLRDHUP;
 
+	no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
+	if (no_ub_res)
+		ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
+
 	/* readable? */
 	if (!skb_queue_empty(&sk->sk_receive_queue) ||
 	    (sk->sk_shutdown & RCV_SHUTDOWN))
@@ -2126,7 +2232,7 @@ static unsigned int unix_poll(struct fil
 	 * we set writable also when the other side has shut down the
 	 * connection. This prevents stuck sockets.
 	 */
-	if (unix_writable(sk))
+	if (!no_ub_res && unix_writable(sk))
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 
 	return mask;
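
[Note] The af_unix recvmsg hunks replace skb_pull() with a per-skb UNIXCB(skb).consumed counter (unix_skb_len()): once stream data may live in paged fragments rather than the linear head, the buffer can no longer be trimmed in place, so a read offset is tracked instead and MSG_PEEK simply leaves it untouched. A self-contained sketch of the consumed-offset pattern:

#include <stdio.h>
#include <string.h>

struct buf {
	const char *data;
	size_t len;
	size_t consumed;	/* read offset; the data itself never moves */
};

static size_t buf_remaining(const struct buf *b)
{
	return b->len - b->consumed;
}

/* Copy up to n unread bytes; advance only when peek == 0. */
static size_t buf_read(struct buf *b, char *out, size_t n, int peek)
{
	size_t chunk = buf_remaining(b) < n ? buf_remaining(b) : n;

	memcpy(out, b->data + b->consumed, chunk);
	if (!peek)
		b->consumed += chunk;
	return chunk;
}

int main(void)
{
	struct buf b = { "hello world", 11, 0 };
	char tmp[6] = {0};

	buf_read(&b, tmp, 5, 1);	/* MSG_PEEK-like: no advance */
	printf("peek: %s, remaining %zu\n", tmp, buf_remaining(&b));
	buf_read(&b, tmp, 5, 0);	/* consuming read */
	printf("read: %s, remaining %zu\n", tmp, buf_remaining(&b));
	return 0;
}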
diff -upr linux-2.6.32-504.3.3.el6.orig/net/xfrm/xfrm_user.c linux-2.6.32-504.3.3.el6-042stab103_6/net/xfrm/xfrm_user.c
--- linux-2.6.32-504.3.3.el6.orig/net/xfrm/xfrm_user.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/net/xfrm/xfrm_user.c	2015-01-21 12:02:51.258013091 +0300
@@ -2180,7 +2180,8 @@ static int xfrm_user_rcv_msg(struct sk_b
 	link = &xfrm_dispatch[type];
 
 	/* All operations require privileges, even GET */
-	if (!netlink_capable(skb, CAP_NET_ADMIN))
+	if (!netlink_capable(skb, CAP_NET_ADMIN) &&
+	    !netlink_capable(skb, CAP_VE_NET_ADMIN))
 		return -EPERM;
 
 	if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
@@ -2815,22 +2816,24 @@ static int __net_init xfrm_user_net_init
 				     xfrm_netlink_rcv, NULL, THIS_MODULE);
 	if (nlsk == NULL)
 		return -ENOMEM;
+	net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */
 	rcu_assign_pointer(net->xfrm.nlsk, nlsk);
 	return 0;
 }
 
-static void __net_exit xfrm_user_net_exit(struct net *net)
+static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list)
 {
-	struct sock *nlsk = net->xfrm.nlsk;
-
-	rcu_assign_pointer(net->xfrm.nlsk, NULL);
-	synchronize_rcu();
-	netlink_kernel_release(nlsk);
+	struct net *net;
+	list_for_each_entry(net, net_exit_list, exit_list)
+		rcu_assign_pointer(net->xfrm.nlsk, NULL);
+	synchronize_net();
+	list_for_each_entry(net, net_exit_list, exit_list)
+		netlink_kernel_release(net->xfrm.nlsk_stash);
 }
 
 static struct pernet_operations xfrm_user_net_ops = {
-	.init = xfrm_user_net_init,
-	.exit = xfrm_user_net_exit,
+	.init	    = xfrm_user_net_init,
+	.exit_batch = xfrm_user_net_exit,
 };
 
 static int __init xfrm_user_init(void)
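
[Note] xfrm_user_net_exit becomes an .exit_batch handler: first unpublish the netlink socket pointer for every dying namespace, pay for a single synchronize_net() covering the whole batch, then release the stashed sockets. A userspace-flavoured sketch of the two-phase batching (synchronize() stands in for the RCU grace period; names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct sock { int id; };
struct ns { struct sock *nlsk, *nlsk_stash; };

static void synchronize(void) { /* stands in for synchronize_net() */ }

static void exit_batch(struct ns **list, int n)
{
	int i;

	for (i = 0; i < n; i++)		/* phase 1: unpublish everywhere */
		list[i]->nlsk = NULL;

	synchronize();			/* one grace period for the batch */

	for (i = 0; i < n; i++) {	/* phase 2: now safe to free */
		free(list[i]->nlsk_stash);
		list[i]->nlsk_stash = NULL;
	}
}

int main(void)
{
	struct ns a = {0}, b = {0};
	struct ns *batch[2] = { &a, &b };

	a.nlsk = a.nlsk_stash = malloc(sizeof(struct sock));
	b.nlsk = b.nlsk_stash = malloc(sizeof(struct sock));
	exit_batch(batch, 2);
	printf("batch torn down with a single grace period\n");
	return 0;
}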
diff -upr linux-2.6.32-504.3.3.el6.orig/scripts/Makefile.modpost linux-2.6.32-504.3.3.el6-042stab103_6/scripts/Makefile.modpost
--- linux-2.6.32-504.3.3.el6.orig/scripts/Makefile.modpost	2014-12-12 23:29:41.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/scripts/Makefile.modpost	2015-01-21 12:02:41.360275861 +0300
@@ -83,7 +83,7 @@ modpost = scripts/mod/modpost           
  $(if $(KBUILD_EXTRA_SYMBOLS), $(patsubst %, -e %,$(KBUILD_EXTRA_SYMBOLS))) \
  $(if $(KBUILD_EXTMOD),-o $(modulesymfile))      \
  $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S)      \
- $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w) \
+ $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),$(if $(KBUILD_MODPOST_FAIL),,-w)) \
  $(if $(cross_build),-c)
 
 quiet_cmd_modpost = MODPOST $(words $(filter-out vmlinux FORCE, $^)) modules
diff -upr linux-2.6.32-504.3.3.el6.orig/security/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/security/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/security/Kconfig	2014-12-12 23:29:00.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/Kconfig	2015-01-21 12:02:44.028205026 +0300
@@ -53,7 +53,7 @@ config SECURITY_DMESG_RESTRICT
 
 config SECURITY
 	bool "Enable different security models"
-	depends on SYSFS
+	depends on SYSFS && !VE
 	help
 	  This allows you to choose different security modules to be
 	  configured into your kernel.
diff -upr linux-2.6.32-504.3.3.el6.orig/security/commoncap.c linux-2.6.32-504.3.3.el6-042stab103_6/security/commoncap.c
--- linux-2.6.32-504.3.3.el6.orig/security/commoncap.c	2014-12-12 23:29:39.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/commoncap.c	2015-01-21 12:02:44.731186363 +0300
@@ -625,7 +625,7 @@ int cap_inode_setxattr(struct dentry *de
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 		     sizeof(XATTR_SECURITY_PREFIX) - 1)  &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
 		return -EPERM;
 	return 0;
 }
@@ -651,7 +651,7 @@ int cap_inode_removexattr(struct dentry 
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 		     sizeof(XATTR_SECURITY_PREFIX) - 1)  &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
 		return -EPERM;
 	return 0;
 }
@@ -969,11 +969,13 @@ error:
  */
 int cap_syslog(int type)
 {
-	if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	if (dmesg_restrict && !capable(CAP_SYS_ADMIN) &&
+	    ve_is_super(get_exec_env()))
+		return -EPERM;
 
-	if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	if ((type != 3 && type != 10) &&
+	    !capable(CAP_VE_SYS_ADMIN) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
 	return 0;
 }
 
diff -upr linux-2.6.32-504.3.3.el6.orig/security/device_cgroup.c linux-2.6.32-504.3.3.el6-042stab103_6/security/device_cgroup.c
--- linux-2.6.32-504.3.3.el6.orig/security/device_cgroup.c	2014-12-12 23:29:23.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/device_cgroup.c	2015-01-21 12:02:47.517112401 +0300
@@ -12,11 +12,24 @@
 #include <linux/seq_file.h>
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
+#include <linux/ve.h>
+#include <linux/vzcalluser.h>
+#include <linux/major.h>
 
 #define ACC_MKNOD 1
 #define ACC_READ  2
 #define ACC_WRITE 4
-#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE)
+#define ACC_QUOTA 8
+#define ACC_HIDDEN 16
+#define ACC_MOUNT 64
+#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_QUOTA | ACC_MOUNT)
+
+static inline int convert_bits(int acc)
+{
+	/* ...10x <-> ...01x: the two encodings keep R/W in opposite bits */
+	return ((((acc & 06) == 00) || ((acc & 06) == 06)) ? acc : acc ^ 06) &
+		(ACC_READ | ACC_WRITE | ACC_QUOTA | ACC_MOUNT);
+}
 
 #define DEV_BLOCK 1
 #define DEV_CHAR  2
@@ -130,12 +143,53 @@ static int dev_exception_add(struct dev_
 /*
  * called under devcgroup_mutex
  */
+static int dev_exception_change(struct dev_cgroup *dev_cgroup,
+			struct dev_exception_item *ex)
+{
+	struct dev_exception_item *excopy, *walk, *tmp;
+
+	if (ex->access != 0) {
+		excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
+		if (!excopy)
+			return -ENOMEM;
+	} else
+		excopy = NULL;
+
+	list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
+		if (walk->type != ex->type)
+			continue;
+		if (walk->major != ex->major)
+			continue;
+		if (walk->minor != ex->minor)
+			continue;
+
+		if (ex->access == 0) {
+			list_del_rcu(&walk->list);
+			kfree_rcu(walk, rcu);
+		} else {
+			walk->access = ex->access;
+			kfree(excopy);
+			excopy = NULL;
+		}
+	}
+
+	if (excopy != NULL)
+		list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions);
+
+	return 0;
+}
+
+/*
+ * called under devcgroup_mutex
+ */
 static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
 			     struct dev_exception_item *ex)
 {
 	struct dev_exception_item *walk, *tmp;
 
 	list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
+		if (walk->type == DEV_ALL)
+			goto remove;
 		if (walk->type != ex->type)
 			continue;
 		if (walk->major != ex->major)
@@ -143,6 +197,7 @@ static void dev_exception_rm(struct dev_
 		if (walk->minor != ex->minor)
 			continue;
 
+remove:
 		walk->access &= ~ex->access;
 		if (!walk->access) {
 			list_del_rcu(&walk->list);
@@ -183,9 +238,9 @@ static struct cgroup_subsys_state *devcg
 	INIT_LIST_HEAD(&dev_cgroup->exceptions);
 	parent_cgroup = cgroup->parent;
 
-	if (parent_cgroup == NULL)
+	if (parent_cgroup == NULL) {
 		dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
-	else {
+	} else {
 		parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
 		mutex_lock(&devcgroup_mutex);
 		ret = dev_exceptions_copy(&dev_cgroup->exceptions,
@@ -274,8 +329,23 @@ static int devcgroup_seq_read(struct cgr
 			set_access(acc, ex->access);
 			set_majmin(maj, ex->major);
 			set_majmin(min, ex->minor);
-			seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type),
-				   maj, min, acc);
+
+			if (cft != NULL)
+				seq_printf(m, "%c %s:%s %s\n",
+					   type_to_char(ex->type),
+					   maj, min, acc);
+			else if (!(ex->access & ACC_HIDDEN)) {
+				int access;
+
+				access = convert_bits(ex->access);
+				if (access & (ACC_READ | ACC_WRITE))
+					access |= S_IXOTH;
+
+				seq_printf(m, "%10u %c %03o %s:%s\n",
+					   (unsigned)(unsigned long)m->private,
+					   type_to_char(ex->type),
+					   access, maj, min);
+			}
 		}
 	}
 	rcu_read_unlock();
@@ -299,6 +369,11 @@ static int may_access(struct dev_cgroup 
 	bool match = false;
 
 	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
+		short mismatched_bits;
+		bool allowed_mount;
+
+		if (ex->type & DEV_ALL)
+			goto found;
 		if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
 			continue;
 		if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
@@ -307,7 +382,13 @@ static int may_access(struct dev_cgroup 
 			continue;
 		if (ex->minor != ~0 && ex->minor != refex->minor)
 			continue;
-		if (refex->access & (~ex->access))
+found:
+		mismatched_bits = refex->access & (~ex->access) & ~ACC_MOUNT;
+		allowed_mount = !(mismatched_bits & ~ACC_WRITE) &&
+				(ex->access & ACC_MOUNT) &&
+				(refex->access & ACC_MOUNT);
+
+		if (mismatched_bits && !allowed_mount)
 			continue;
 		match = true;
 		break;
@@ -610,11 +691,85 @@ int __devcgroup_inode_permission(struct 
 		access |= ACC_WRITE;
 	if (mask & MAY_READ)
 		access |= ACC_READ;
+	if (mask & MAY_MOUNT)
+		access |= ACC_MOUNT;
 
 	return __devcgroup_check_permission(type, imajor(inode), iminor(inode),
 			access);
 }
 
+/* Returns 1 if exists, 0 otherwise */
+int devcgroup_device_exist(struct cgroup *cgrp, unsigned type, dev_t device)
+{
+	struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgrp);
+	struct dev_exception_item *ex;
+
+	/*
+	 * Let's pretend that the device exists if minor (or major) was not set
+	 * in the rule. This will prevent the caller from mangling devtmpfs and
+	 * sysfs.
+	 */
+	if ((type & VE_USE_MASK) != VE_USE_MINOR)
+		return 1;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
+		if (ex->type & DEV_ALL)
+			continue;
+		if ((ex->type & DEV_BLOCK) && (type == S_IFCHR))
+			continue;
+		if ((ex->type & DEV_CHAR) && (type == S_IFBLK))
+			continue;
+		if (ex->major != MAJOR(device))
+			continue;
+		if (ex->minor != MINOR(device))
+			continue;
+
+		rcu_read_unlock();
+		return 1;
+	}
+
+	rcu_read_unlock();
+	return 0;
+}
+
+int devcgroup_device_visible(int type, int major, int start_minor, int nr_minors)
+{
+	struct dev_cgroup *dev_cgroup;
+	struct dev_exception_item *ex;
+
+	rcu_read_lock();
+	dev_cgroup = task_devcgroup(current);
+
+	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
+		rcu_read_unlock();
+		return 1;
+	}
+
+	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
+		if (ex->type & DEV_ALL)
+			goto found;
+		if ((ex->type & DEV_BLOCK) && (type == S_IFCHR))
+			continue;
+		if ((ex->type & DEV_CHAR) && (type == S_IFBLK))
+			continue;
+		if (ex->major != ~0 && ex->major != major)
+			continue;
+		if (ex->minor != ~0 && !(start_minor <= ex->minor &&
+					ex->minor < start_minor + nr_minors))
+			continue;
+found:
+		if (!(ex->access & (ACC_READ | ACC_WRITE | ACC_QUOTA)))
+			continue;
+		rcu_read_unlock();
+		return 1;
+	}
+
+	rcu_read_unlock();
+	return 0;
+}
+
 int devcgroup_inode_mknod(int mode, dev_t dev)
 {
 	short type;
@@ -631,3 +786,132 @@ int devcgroup_inode_mknod(int mode, dev_
 			ACC_MKNOD);
 
 }
+
+#ifdef CONFIG_VE
+
+static struct dev_exception_item ve_devcgroup_ex_items[] = {
+	{ ~0,				~0,	DEV_ALL,  ACC_MKNOD		},
+	{ UNIX98_PTY_MASTER_MAJOR,	~0,	DEV_CHAR, ACC_READ | ACC_WRITE	},
+	{ UNIX98_PTY_SLAVE_MAJOR,	~0,	DEV_CHAR, ACC_READ | ACC_WRITE	},
+	{ PTY_MASTER_MAJOR,		~0,	DEV_CHAR, ACC_READ | ACC_WRITE	},
+	{ PTY_SLAVE_MAJOR,		~0,	DEV_CHAR, ACC_READ | ACC_WRITE	},
+	{ MEM_MAJOR,			3,	DEV_CHAR, ACC_READ | ACC_WRITE	}, /* null */
+	{ MEM_MAJOR,			5,	DEV_CHAR, ACC_READ | ACC_WRITE	}, /* zero */
+	{ MEM_MAJOR,			7,	DEV_CHAR, ACC_READ | ACC_WRITE	}, /* full */
+	{ TTYAUX_MAJOR,			0,	DEV_CHAR, ACC_READ | ACC_WRITE	}, /* tty */
+	{ TTYAUX_MAJOR,			1,	DEV_CHAR, ACC_READ | ACC_WRITE	}, /* console */
+	{ TTYAUX_MAJOR,			2,	DEV_CHAR, ACC_READ | ACC_WRITE	}, /* ptmx */
+	{ MEM_MAJOR,			8,	DEV_CHAR, ACC_READ		}, /* random */
+	{ MEM_MAJOR,			9,	DEV_CHAR, ACC_READ | ACC_WRITE	}, /* urandom */
+	{ MEM_MAJOR,			11,	DEV_CHAR, ACC_WRITE		}, /* kmsg */
+};
+
+static LIST_HEAD(ve_devcgroup_ex_list);
+
+int ve_prep_devcgroup(struct ve_struct *ve)
+{
+	struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(ve->ve_cgroup);
+	size_t i;
+	int ret;
+
+	if (unlikely(list_empty(&ve_devcgroup_ex_list))) {
+		for (i = 0; i < ARRAY_SIZE(ve_devcgroup_ex_items); i++) {
+			ve_devcgroup_ex_items[i].access |= ACC_HIDDEN;
+			list_add(&ve_devcgroup_ex_items[i].list,
+				 &ve_devcgroup_ex_list);
+		}
+	}
+
+	/*
+	 * When allowing a device cgroup inside a container,
+	 * we apply _very_ strict rules to it:
+	 *
+	 *  - DEVCG_DEFAULT_DENY is used for the child's behaviour
+	 *  - we ship predefined "exception" items which are known
+	 *    to be virtualized
+	 */
+	mutex_lock(&devcgroup_mutex);
+
+	dev_cgroup->behavior = DEVCG_DEFAULT_DENY;
+
+	dev_exception_clean(dev_cgroup);
+	ret = dev_exceptions_copy(&dev_cgroup->exceptions,
+				  &ve_devcgroup_ex_list);
+
+	mutex_unlock(&devcgroup_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ve_prep_devcgroup);
+
+int get_device_perms_ve(int dev_type, dev_t dev, int access_mode)
+{
+	short access = 0;
+	short type;
+
+	if (dev_type == S_IFBLK)
+		type = DEV_BLOCK;
+	else
+		type = DEV_CHAR;
+
+	access |= (access_mode & FMODE_READ ? ACC_READ : 0);
+	access |= (access_mode & FMODE_WRITE ? ACC_WRITE : 0);
+	access |= (access_mode & FMODE_QUOTACTL ? ACC_QUOTA : 0);
+
+	return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev),
+					    access);
+}
+EXPORT_SYMBOL(get_device_perms_ve);
+
+int set_device_perms_ve(struct ve_struct *ve,
+		unsigned type, dev_t dev, unsigned mask)
+{
+	int err = -EINVAL;
+	struct dev_exception_item new;
+
+	if ((type & S_IFMT) == S_IFBLK)
+		new.type = DEV_BLOCK;
+	else if ((type & S_IFMT) == S_IFCHR)
+		new.type = DEV_CHAR;
+	else
+		return -EINVAL;
+
+	new.access = convert_bits(mask);
+	new.major = new.minor = ~0;
+
+	switch (type & VE_USE_MASK) {
+	default:
+		new.minor = MINOR(dev);		/* fall through */
+	case VE_USE_MAJOR:
+		new.major = MAJOR(dev);		/* fall through */
+	case 0:
+		;
+	}
+
+	mutex_lock(&devcgroup_mutex);
+	err = dev_exception_change(cgroup_to_devcgroup(ve->ve_cgroup), &new);
+	mutex_unlock(&devcgroup_mutex);
+	return err;
+}
+EXPORT_SYMBOL(set_device_perms_ve);
+
+#ifdef CONFIG_PROC_FS
+int devperms_seq_show(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list);
+
+	if (m->private == (void *)0) {
+		seq_printf(m, "Version: 2.7\n");
+		m->private = (void *)-1;
+	}
+
+	if (ve_is_super(ve)) {
+		seq_printf(m, "%10u b 016 *:*\n%10u c 006 *:*\n", 0, 0);
+		return 0;
+	}
+
+	m->private = (void *)(unsigned long)ve->veid;
+	return devcgroup_seq_read(ve->ve_cgroup, NULL, m);
+}
+EXPORT_SYMBOL(devperms_seq_show);
+#endif
+#endif
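
[Note] convert_bits() above swaps the read and write bits (mask 06) unless they are equal, then masks to the transferable flags; applied twice it is the identity on those flags. A self-contained check of exactly that transform (mirroring the patch's constants):

#include <assert.h>
#include <stdio.h>

#define ACC_READ  2
#define ACC_WRITE 4
#define ACC_QUOTA 8
#define ACC_MOUNT 64

/* Mirror of the patch's convert_bits(): swap the R/W bits (mask 06)
 * unless they are equal, then keep only the transferable bits. */
static int convert_bits(int acc)
{
	return ((((acc & 06) == 00) || ((acc & 06) == 06)) ? acc : acc ^ 06) &
		(ACC_READ | ACC_WRITE | ACC_QUOTA | ACC_MOUNT);
}

int main(void)
{
	int mask = ACC_READ | ACC_WRITE | ACC_QUOTA | ACC_MOUNT;
	int m;

	/* Swapping twice gives back the masked input: an involution. */
	for (m = 0; m < 0200; m++)
		assert(convert_bits(convert_bits(m)) == (m & mask));

	printf("R-only <-> W-only: %d -> %d, %d -> %d\n",
	       ACC_READ, convert_bits(ACC_READ),
	       ACC_WRITE, convert_bits(ACC_WRITE));
	return 0;
}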
diff -upr linux-2.6.32-504.3.3.el6.orig/security/integrity/ima/ima_main.c linux-2.6.32-504.3.3.el6-042stab103_6/security/integrity/ima/ima_main.c
--- linux-2.6.32-504.3.3.el6.orig/security/integrity/ima/ima_main.c	2014-12-12 23:28:57.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/integrity/ima/ima_main.c	2015-01-21 12:02:57.998834158 +0300
@@ -177,6 +177,7 @@ out:
 
 	kref_put(&iint->refcount, iint_free);
 }
+EXPORT_SYMBOL(ima_counts_get);
 
 /*
  * Decrement ima counts
diff -upr linux-2.6.32-504.3.3.el6.orig/security/keys/internal.h linux-2.6.32-504.3.3.el6-042stab103_6/security/keys/internal.h
--- linux-2.6.32-504.3.3.el6.orig/security/keys/internal.h	2014-12-12 23:29:15.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/keys/internal.h	2015-01-21 12:02:42.729239514 +0300
@@ -130,6 +130,7 @@ extern struct key *find_keyring_by_name(
 extern int install_user_keyrings(void);
 extern int install_thread_keyring_to_cred(struct cred *);
 extern int install_process_keyring_to_cred(struct cred *);
+extern int install_session_keyring_to_cred(struct cred *, struct key *);
 
 extern struct key *request_key_and_link(struct key_type *type,
 					const char *description,
diff -upr linux-2.6.32-504.3.3.el6.orig/security/keys/process_keys.c linux-2.6.32-504.3.3.el6-042stab103_6/security/keys/process_keys.c
--- linux-2.6.32-504.3.3.el6.orig/security/keys/process_keys.c	2014-12-12 23:29:24.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/keys/process_keys.c	2015-01-21 12:02:42.730239487 +0300
@@ -221,8 +221,7 @@ static int install_process_keyring(void)
 /*
  * Install a session keyring directly to a credentials struct.
  */
-static int install_session_keyring_to_cred(struct cred *cred,
-					   struct key *keyring)
+int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
 {
 	unsigned long flags;
 	struct key *old;
diff -upr linux-2.6.32-504.3.3.el6.orig/security/keys/request_key.c linux-2.6.32-504.3.3.el6-042stab103_6/security/keys/request_key.c
--- linux-2.6.32-504.3.3.el6.orig/security/keys/request_key.c	2014-12-12 23:29:33.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/keys/request_key.c	2015-01-21 12:02:42.746239063 +0300
@@ -65,6 +65,44 @@ void complete_request_key(struct key_con
 EXPORT_SYMBOL(complete_request_key);
 
 /*
+ * Initialise a usermode helper that is going to have a specific session
+ * keyring.
+ *
+ * This is called in the context of a freshly forked kthread before
+ * kernel_execve(), so we can simply install the desired session_keyring here.
+ */
+static int umh_keys_init(struct subprocess_info *info, struct cred *cred)
+{
+	struct key *keyring = info->data;
+	/*
+	 * This is called in the context of a freshly forked kthread before
+	 * kernel_execve(), so we can just change our ->session_keyring.
+	 */
+	return install_session_keyring_to_cred(cred, keyring);
+}
+
+static void umh_keys_cleanup(struct subprocess_info *info)
+{
+	struct key *keyring = info->data;
+	key_put(keyring);
+}
+
+static int call_usermodehelper_keys(char *path, char **argv, char **envp,
+			 struct key *session_keyring, enum umh_wait wait)
+{
+	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+	struct subprocess_info *info =
+		call_usermodehelper_setup(path, argv, envp, gfp_mask);
+
+	if (!info)
+		return -ENOMEM;
+
+	call_usermodehelper_setfns(info, umh_keys_init, umh_keys_cleanup,
+					key_get(session_keyring));
+	return call_usermodehelper_exec(info, wait);
+}
+
+/*
  * Request userspace finish the construction of a key
  * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring>"
  */
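
[Note] call_usermodehelper_keys() above is assembled from the generic setup/setfns/exec pipeline: the init callback runs in the freshly forked helper before exec (installing the session keyring on its credentials), and the cleanup callback drops the key reference. The userspace analog is a fork with a pre-exec hook; a hedged sketch under that assumption (set_env_hook stands in for install_session_keyring_to_cred()):

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

/* Run a setup hook in the child before exec, mirroring umh_keys_init(). */
static int spawn_with_hook(const char *path, char *const argv[],
			   int (*init)(void *), void *data)
{
	int status;
	pid_t pid = fork();

	if (pid < 0)
		return -1;
	if (pid == 0) {
		if (init && init(data))		/* child-side setup, pre-exec */
			_exit(127);
		execv(path, argv);
		_exit(127);
	}
	/* parent-side cleanup would drop its reference here, as
	 * umh_keys_cleanup() does with key_put() */
	waitpid(pid, &status, 0);
	return status;
}

static int set_env_hook(void *data)
{
	/* stand-in for installing the session keyring on the child */
	return setenv("HELPER_CONTEXT", (const char *)data, 1);
}

int main(void)
{
	char *argv[] = { "/bin/sh", "-c", "echo context=$HELPER_CONTEXT", NULL };

	return spawn_with_hook("/bin/sh", argv, set_env_hook, "keyring-42");
}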
diff -upr linux-2.6.32-504.3.3.el6.orig/security/selinux/Kconfig linux-2.6.32-504.3.3.el6-042stab103_6/security/selinux/Kconfig
--- linux-2.6.32-504.3.3.el6.orig/security/selinux/Kconfig	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/selinux/Kconfig	2015-01-21 12:02:44.028205026 +0300
@@ -1,6 +1,6 @@
 config SECURITY_SELINUX
 	bool "NSA SELinux Support"
-	depends on SECURITY_NETWORK && AUDIT && NET && INET
+	depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE
 	select NETWORK_SECMARK
 	default n
 	help
diff -upr linux-2.6.32-504.3.3.el6.orig/security/selinux/hooks.c linux-2.6.32-504.3.3.el6-042stab103_6/security/selinux/hooks.c
--- linux-2.6.32-504.3.3.el6.orig/security/selinux/hooks.c	2014-12-12 23:29:42.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/selinux/hooks.c	2015-01-21 12:02:42.854236195 +0300
@@ -2286,8 +2286,9 @@ static inline void flush_unauthorized_fi
 
 	tty = get_current_tty();
 	if (tty) {
-		file_list_lock();
+		spin_lock(&tty_files_lock);
 		if (!list_empty(&tty->tty_files)) {
+			struct tty_file_private *file_priv;
 			struct inode *inode;
 
 			/* Revalidate access to controlling tty.
@@ -2295,14 +2296,16 @@ static inline void flush_unauthorized_fi
 			   than using file_has_perm, as this particular open
 			   file may belong to another process and we are only
 			   interested in the inode-based check here. */
-			file = list_first_entry(&tty->tty_files, struct file, f_u.fu_list);
+			file_priv = list_first_entry(&tty->tty_files,
+						struct tty_file_private, list);
+			file = file_priv->file;
 			inode = file->f_path.dentry->d_inode;
 			if (inode_has_perm(cred, inode,
 					   FILE__READ | FILE__WRITE, NULL)) {
 				drop_tty = 1;
 			}
 		}
-		file_list_unlock();
+		spin_unlock(&tty_files_lock);
 		tty_kref_put(tty);
 	}
 	/* Reset controlling tty. */
diff -upr linux-2.6.32-504.3.3.el6.orig/security/tomoyo/realpath.c linux-2.6.32-504.3.3.el6-042stab103_6/security/tomoyo/realpath.c
--- linux-2.6.32-504.3.3.el6.orig/security/tomoyo/realpath.c	2009-12-03 06:51:21.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/security/tomoyo/realpath.c	2015-01-21 12:02:42.155254754 +0300
@@ -92,10 +92,7 @@ int tomoyo_realpath_from_path2(struct pa
 		struct path ns_root = { };
 		struct path tmp;
 
-		read_lock(&current->fs->lock);
-		root = current->fs->root;
-		path_get(&root);
-		read_unlock(&current->fs->lock);
+		get_fs_root(current->fs, &root);
 		spin_lock(&vfsmount_lock);
 		if (root.mnt && root.mnt->mnt_ns)
 			ns_root.mnt = mntget(root.mnt->mnt_ns->root);
diff -upr linux-2.6.32-504.3.3.el6.orig/sound/core/init.c linux-2.6.32-504.3.3.el6-042stab103_6/sound/core/init.c
--- linux-2.6.32-504.3.3.el6.orig/sound/core/init.c	2014-12-12 23:29:38.000000000 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/sound/core/init.c	2015-01-21 12:02:41.798264233 +0300
@@ -392,12 +392,11 @@ int snd_card_disconnect(struct snd_card 
 		snd_printk(KERN_ERR "not all devices for card %i can be disconnected\n", card->number);
 
 	snd_info_card_disconnect(card);
-#ifndef CONFIG_SYSFS_DEPRECATED
-	if (card->card_dev) {
+
+	if (!sysfs_deprecated && card->card_dev) {
 		device_unregister(card->card_dev);
 		card->card_dev = NULL;
 	}
-#endif
 #ifdef CONFIG_PM
 	wake_up(&card->power_sleep);
 #endif
@@ -570,7 +569,6 @@ void snd_card_set_id(struct snd_card *ca
 }
 EXPORT_SYMBOL(snd_card_set_id);
 
-#ifndef CONFIG_SYSFS_DEPRECATED
 static ssize_t
 card_id_show_attr(struct device *dev,
 		  struct device_attribute *attr, char *buf)
@@ -632,7 +630,6 @@ card_number_show_attr(struct device *dev
 
 static struct device_attribute card_number_attrs =
 	__ATTR(number, S_IRUGO, card_number_show_attr, NULL);
-#endif /* CONFIG_SYSFS_DEPRECATED */
 
 /**
  *  snd_card_register - register the soundcard
@@ -651,15 +648,15 @@ int snd_card_register(struct snd_card *c
 
 	if (snd_BUG_ON(!card))
 		return -EINVAL;
-#ifndef CONFIG_SYSFS_DEPRECATED
-	if (!card->card_dev) {
+
+	if (!sysfs_deprecated && !card->card_dev) {
 		card->card_dev = device_create(sound_class, card->dev,
 					       MKDEV(0, 0), card,
 					       "card%i", card->number);
 		if (IS_ERR(card->card_dev))
 			card->card_dev = NULL;
 	}
-#endif
+
 	if ((err = snd_device_register_all(card)) < 0)
 		return err;
 	mutex_lock(&snd_card_mutex);
@@ -676,8 +673,7 @@ int snd_card_register(struct snd_card *c
 	if (snd_mixer_oss_notify_callback)
 		snd_mixer_oss_notify_callback(card, SND_MIXER_OSS_NOTIFY_REGISTER);
 #endif
-#ifndef CONFIG_SYSFS_DEPRECATED
-	if (card->card_dev) {
+	if (!sysfs_deprecated && card->card_dev) {
 		err = device_create_file(card->card_dev, &card_id_attrs);
 		if (err < 0)
 			return err;
@@ -685,7 +681,7 @@ int snd_card_register(struct snd_card *c
 		if (err < 0)
 			return err;
 	}
-#endif
+
 	return 0;
 }
 
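[Note] The sound/core hunks replace compile-time #ifndef CONFIG_SYSFS_DEPRECATED blocks with a runtime sysfs_deprecated flag, so a single kernel binary can serve both sysfs layouts. The shape of that conversion, sketched (flag and function names are illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Before: behaviour fixed at build time.
 *   #ifndef CONFIG_SYSFS_DEPRECATED
 *           register_card_device();
 *   #endif
 * After: one binary, decided at runtime (e.g. by a boot parameter). */
static bool sysfs_deprecated;	/* stand-in for the kernel's flag */

static void register_card_device(void) { puts("card device registered"); }

static void card_register(void)
{
	if (!sysfs_deprecated)
		register_card_device();
	else
		puts("deprecated layout: skipping card device");
}

int main(void)
{
	card_register();		/* modern layout */
	sysfs_deprecated = true;
	card_register();		/* legacy layout, same binary */
	return 0;
}
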
diff -upr linux-2.6.32-504.3.3.el6.orig/tools/perf/scripts/python/bin/sched-stat-record linux-2.6.32-504.3.3.el6-042stab103_6/tools/perf/scripts/python/bin/sched-stat-record
--- linux-2.6.32-504.3.3.el6.orig/tools/perf/scripts/python/bin/sched-stat-record	2015-01-21 12:02:58.406823328 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/tools/perf/scripts/python/bin/sched-stat-record	2015-01-21 12:02:58.406823328 +0300
@@ -0,0 +1,65 @@
+#!/bin/bash
+# perf script record -- sched:sched_stat_[smth] -- CMD
+# perf script record -- -e sched:sched_stat_[smth]
+#
+set -o monitor
+
+usage()
+{
+	echo "Usage:"
+	echo "	perf script record sched-stat -- sched:sched_stat_[smth] -- CMD"
+	echo "	perf script record sched-stat -- [PERF OPTS] -e sched:sched_stat_[smth]"
+	exit 1;
+}
+
+declare -a opt
+declare -a cmd
+f=0;
+for i in "${@:2}"; do
+	if [ "$i" == "--" ]; then
+		f=1
+		continue
+	fi
+	if [ $f -eq 1 ]; then
+		cmd[${#cmd[*]}]="$i"
+	else
+		opt[${#opt[*]}]="$i"
+	fi
+done
+
+if [[ "${opt[@]}" != *sched_stat_* ]]; then
+	usage;
+fi
+
+if [ ${#cmd[@]} -eq 0 ]; then
+	if [ ${#opt[@]} -eq 0 ]; then
+		usage;
+	fi
+	exec perf record -agP \
+		-e sched:sched_switch \
+		--filter "prev_state == 1 || prev_state == 2" \
+		"${opt[@]}"
+fi
+
+if [ ${#opt[@]} -ne 1 ]; then
+	usage;
+fi
+
+# Wait until the target process stops itself (wait status 147 = 128 + SIGSTOP).
+bash -c 'kill -STOP $$; exec "$@"' -- "${cmd[@]}" &
+pid=$!
+wait %1
+[ $? -eq 147 ] || exit 1;
+
+perf record -agP \
+		-e sched:sched_switch \
+		--filter "prev_pid == $pid && prev_state == 1 || prev_state == 2" \
+		-e sched:sched_process_exit -e "${opt[@]}" --filter "pid == $pid" &
+pid_perf=$!
+kill -CONT %1
+while :; do
+	wait %1
+	[ $? -eq 127 ] && break;	# 127: job gone, the command has exited
+done
+kill -INT %2
+wait %2
diff -upr linux-2.6.32-504.3.3.el6.orig/tools/perf/scripts/python/bin/sched-stat-report linux-2.6.32-504.3.3.el6-042stab103_6/tools/perf/scripts/python/bin/sched-stat-report
--- linux-2.6.32-504.3.3.el6.orig/tools/perf/scripts/python/bin/sched-stat-report	2015-01-21 12:02:58.406823328 +0300
+++ linux-2.6.32-504.3.3.el6-042stab103_6/tools/perf/scripts/python/bin/sched-stat-report	2015-01-21 12:02:58.406823328 +0300
@@ -0,0 +1,5 @@
+#!/bin/bash
+# description: profiling sleep times
+perf inject -s -i perf.data -o perf.data.d || exit
+perf report -i perf.data.d || exit
+unlink perf.data.d
